commit 2fc0d000b6 (parent e1b817252c)
2025-09-07 22:09:54 +02:00
7796 changed files with 2159515 additions and 933 deletions


@@ -0,0 +1,225 @@
import shlex
import subprocess
import time
import uuid
import pytest
from pandas.compat import (
is_ci_environment,
is_platform_arm,
is_platform_mac,
is_platform_windows,
)
import pandas.util._test_decorators as td
import pandas.io.common as icom
from pandas.io.parsers import read_csv
@pytest.fixture
def compression_to_extension():
return {value: key for key, value in icom.extension_to_compression.items()}
@pytest.fixture
def tips_file(datapath):
"""Path to the tips dataset"""
return datapath("io", "data", "csv", "tips.csv")
@pytest.fixture
def jsonl_file(datapath):
"""Path to a JSONL dataset"""
return datapath("io", "parser", "data", "items.jsonl")
@pytest.fixture
def salaries_table(datapath):
"""DataFrame with the salaries dataset"""
return read_csv(datapath("io", "parser", "data", "salaries.csv"), sep="\t")
@pytest.fixture
def feather_file(datapath):
    """Path to the feather dataset"""
    return datapath("io", "data", "feather", "feather-0_3_1.feather")
@pytest.fixture
def xml_file(datapath):
    """Path to the books.xml dataset"""
    return datapath("io", "data", "xml", "books.xml")
@pytest.fixture
def s3_base(worker_id, monkeypatch):
"""
Fixture for mocking S3 interaction.
    Starts a moto server in a separate process when running locally.
    In CI, returns the URL of the motoserver/moto container service.
"""
pytest.importorskip("s3fs")
pytest.importorskip("boto3")
# temporary workaround as moto fails for botocore >= 1.11 otherwise,
# see https://github.com/spulec/moto/issues/1924 & 1952
monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key")
monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret")
if is_ci_environment():
if is_platform_arm() or is_platform_mac() or is_platform_windows():
# NOT RUN on Windows/macOS, only Ubuntu
# - subprocess in CI can cause timeouts
# - GitHub Actions do not support
# container services for the above OSs
pytest.skip(
"S3 tests do not have a corresponding service on "
"Windows or macOS platforms"
)
else:
# set in .github/workflows/unit-tests.yml
yield "http://localhost:5000"
else:
requests = pytest.importorskip("requests")
pytest.importorskip("moto")
pytest.importorskip("flask") # server mode needs flask too
# Launching moto in server mode, i.e., as a separate process
# with an S3 endpoint on localhost
worker_id = "5" if worker_id == "master" else worker_id.lstrip("gw")
endpoint_port = f"555{worker_id}"
endpoint_uri = f"http://127.0.0.1:{endpoint_port}/"
# pipe to null to avoid logging in terminal
with subprocess.Popen(
shlex.split(f"moto_server s3 -p {endpoint_port}"),
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
) as proc:
timeout = 5
while timeout > 0:
try:
# OK to go once server is accepting connections
r = requests.get(endpoint_uri)
if r.ok:
break
except Exception:
pass
timeout -= 0.1
time.sleep(0.1)
yield endpoint_uri
proc.terminate()
@pytest.fixture
def s3so(s3_base):
return {"client_kwargs": {"endpoint_url": s3_base}}
@pytest.fixture
def s3_resource(s3_base):
import boto3
s3 = boto3.resource("s3", endpoint_url=s3_base)
return s3
@pytest.fixture
def s3_public_bucket(s3_resource):
bucket = s3_resource.Bucket(f"pandas-test-{uuid.uuid4()}")
bucket.create()
yield bucket
bucket.objects.delete()
bucket.delete()
@pytest.fixture
def s3_public_bucket_with_data(
s3_public_bucket, tips_file, jsonl_file, feather_file, xml_file
):
"""
    The following datasets are loaded into the public bucket:
    - tips#1.csv
    - tips.csv
    - tips.csv.gz
    - tips.csv.bz2
    - items.jsonl
    - simple_dataset.feather
    - books.xml
"""
test_s3_files = [
("tips#1.csv", tips_file),
("tips.csv", tips_file),
("tips.csv.gz", tips_file + ".gz"),
("tips.csv.bz2", tips_file + ".bz2"),
("items.jsonl", jsonl_file),
("simple_dataset.feather", feather_file),
("books.xml", xml_file),
]
for s3_key, file_name in test_s3_files:
with open(file_name, "rb") as f:
s3_public_bucket.put_object(Key=s3_key, Body=f)
return s3_public_bucket
@pytest.fixture
def s3_private_bucket(s3_resource):
bucket = s3_resource.Bucket(f"cant_get_it-{uuid.uuid4()}")
bucket.create(ACL="private")
yield bucket
bucket.objects.delete()
bucket.delete()
@pytest.fixture
def s3_private_bucket_with_data(
s3_private_bucket, tips_file, jsonl_file, feather_file, xml_file
):
"""
    The following datasets are loaded into the private bucket:
    - tips#1.csv
    - tips.csv
    - tips.csv.gz
    - tips.csv.bz2
    - items.jsonl
    - simple_dataset.feather
    - books.xml
"""
test_s3_files = [
("tips#1.csv", tips_file),
("tips.csv", tips_file),
("tips.csv.gz", tips_file + ".gz"),
("tips.csv.bz2", tips_file + ".bz2"),
("items.jsonl", jsonl_file),
("simple_dataset.feather", feather_file),
("books.xml", xml_file),
]
for s3_key, file_name in test_s3_files:
with open(file_name, "rb") as f:
s3_private_bucket.put_object(Key=s3_key, Body=f)
return s3_private_bucket
_compression_formats_params = [
(".no_compress", None),
("", None),
(".gz", "gzip"),
(".GZ", "gzip"),
(".bz2", "bz2"),
(".BZ2", "bz2"),
(".zip", "zip"),
(".ZIP", "zip"),
(".xz", "xz"),
(".XZ", "xz"),
pytest.param((".zst", "zstd"), marks=td.skip_if_no("zstandard")),
pytest.param((".ZST", "zstd"), marks=td.skip_if_no("zstandard")),
]
@pytest.fixture(params=_compression_formats_params[1:])
def compression_format(request):
return request.param
@pytest.fixture(params=_compression_formats_params)
def compression_ext(request):
return request.param[0]
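For orientation, a minimal sketch (not part of the diff) of how the S3 and storage-options fixtures defined in this conftest might be combined in a downstream test; the test name and assertion are illustrative:

def test_read_csv_from_mock_s3(s3_public_bucket_with_data, s3so):
    # tips.csv was uploaded by the fixture above; s3so points s3fs/fsspec
    # at the local moto endpoint instead of real AWS
    from pandas.io.parsers import read_csv

    df = read_csv(
        f"s3://{s3_public_bucket_with_data.name}/tips.csv",
        storage_options=s3so,
    )
    assert not df.empty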


@@ -0,0 +1,77 @@
import functools
import numpy as np
import pytest
from pandas.compat import is_platform_windows
import pandas as pd
import pandas._testing as tm
pytest.importorskip("odf")
if is_platform_windows():
pytestmark = pytest.mark.single_cpu
@pytest.fixture(autouse=True)
def cd_and_set_engine(monkeypatch, datapath):
func = functools.partial(pd.read_excel, engine="odf")
monkeypatch.setattr(pd, "read_excel", func)
monkeypatch.chdir(datapath("io", "data", "excel"))
def test_read_invalid_types_raises():
    # the invalid_value_type.ods file required manual editing
    # of the included content.xml file
with pytest.raises(ValueError, match="Unrecognized type awesome_new_type"):
pd.read_excel("invalid_value_type.ods")
def test_read_writer_table():
    # Also test reading tables from a text OpenDocument file
    # (.odt)
index = pd.Index(["Row 1", "Row 2", "Row 3"], name="Header")
expected = pd.DataFrame(
[[1, np.nan, 7], [2, np.nan, 8], [3, np.nan, 9]],
index=index,
columns=["Column 1", "Unnamed: 2", "Column 3"],
)
result = pd.read_excel("writertable.odt", sheet_name="Table1", index_col=0)
tm.assert_frame_equal(result, expected)
def test_read_newlines_between_xml_elements_table():
# GH#45598
expected = pd.DataFrame(
[[1.0, 4.0, 7], [np.nan, np.nan, 8], [3.0, 6.0, 9]],
columns=["Column 1", "Column 2", "Column 3"],
)
result = pd.read_excel("test_newlines.ods")
tm.assert_frame_equal(result, expected)
def test_read_unempty_cells():
expected = pd.DataFrame(
[1, np.nan, 3, np.nan, 5],
columns=["Column 1"],
)
result = pd.read_excel("test_unempty_cells.ods")
tm.assert_frame_equal(result, expected)
def test_read_cell_annotation():
expected = pd.DataFrame(
["test", np.nan, "test 3"],
columns=["Column 1"],
)
result = pd.read_excel("test_cell_annotation.ods")
tm.assert_frame_equal(result, expected)
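Because of the autouse fixture above, these tests call pd.read_excel with no engine argument and with relative paths. Written out explicitly, the equivalent call would look like the sketch below (not part of the diff, path illustrative), assuming odfpy is installed:

import pandas as pd

# explicit form of what the monkeypatched pd.read_excel does in this module
result = pd.read_excel(
    "pandas/tests/io/data/excel/writertable.odt",  # illustrative path
    engine="odf",
    sheet_name="Table1",
    index_col=0,
)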


@@ -0,0 +1,106 @@
from datetime import (
date,
datetime,
)
import re
import pytest
from pandas.compat import is_platform_windows
import pandas as pd
import pandas._testing as tm
from pandas.io.excel import ExcelWriter
odf = pytest.importorskip("odf")
if is_platform_windows():
pytestmark = pytest.mark.single_cpu
@pytest.fixture
def ext():
return ".ods"
def test_write_append_mode_raises(ext):
msg = "Append mode is not supported with odf!"
with tm.ensure_clean(ext) as f:
with pytest.raises(ValueError, match=msg):
ExcelWriter(f, engine="odf", mode="a")
@pytest.mark.parametrize("engine_kwargs", [None, {"kwarg": 1}])
def test_engine_kwargs(ext, engine_kwargs):
# GH 42286
# GH 43445
# test for error: OpenDocumentSpreadsheet does not accept any arguments
with tm.ensure_clean(ext) as f:
if engine_kwargs is not None:
error = re.escape(
"OpenDocumentSpreadsheet() got an unexpected keyword argument 'kwarg'"
)
with pytest.raises(
TypeError,
match=error,
):
ExcelWriter(f, engine="odf", engine_kwargs=engine_kwargs)
else:
with ExcelWriter(f, engine="odf", engine_kwargs=engine_kwargs) as _:
pass
def test_book_and_sheets_consistent(ext):
# GH#45687 - Ensure sheets is updated if user modifies book
with tm.ensure_clean(ext) as f:
with ExcelWriter(f) as writer:
assert writer.sheets == {}
table = odf.table.Table(name="test_name")
writer.book.spreadsheet.addElement(table)
assert writer.sheets == {"test_name": table}
@pytest.mark.parametrize(
["value", "cell_value_type", "cell_value_attribute", "cell_value"],
argvalues=[
(True, "boolean", "boolean-value", "true"),
("test string", "string", "string-value", "test string"),
(1, "float", "value", "1"),
(1.5, "float", "value", "1.5"),
(
datetime(2010, 10, 10, 10, 10, 10),
"date",
"date-value",
"2010-10-10T10:10:10",
),
(date(2010, 10, 10), "date", "date-value", "2010-10-10"),
],
)
def test_cell_value_type(ext, value, cell_value_type, cell_value_attribute, cell_value):
# GH#54994 ODS: cell attributes should follow specification
# http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#refTable13
from odf.namespaces import OFFICENS
from odf.table import (
TableCell,
TableRow,
)
table_cell_name = TableCell().qname
with tm.ensure_clean(ext) as f:
pd.DataFrame([[value]]).to_excel(f, header=False, index=False)
with pd.ExcelFile(f) as wb:
sheet = wb._reader.get_sheet_by_index(0)
sheet_rows = sheet.getElementsByType(TableRow)
sheet_cells = [
x
for x in sheet_rows[0].childNodes
if hasattr(x, "qname") and x.qname == table_cell_name
]
cell = sheet_cells[0]
assert cell.attributes.get((OFFICENS, "value-type")) == cell_value_type
assert cell.attributes.get((OFFICENS, cell_value_attribute)) == cell_value


@@ -0,0 +1,432 @@
import contextlib
from pathlib import Path
import re
import numpy as np
import pytest
from pandas.compat import is_platform_windows
import pandas as pd
from pandas import DataFrame
import pandas._testing as tm
from pandas.io.excel import (
ExcelWriter,
_OpenpyxlWriter,
)
from pandas.io.excel._openpyxl import OpenpyxlReader
openpyxl = pytest.importorskip("openpyxl")
if is_platform_windows():
pytestmark = pytest.mark.single_cpu
@pytest.fixture
def ext():
return ".xlsx"
def test_to_excel_styleconverter():
from openpyxl import styles
hstyle = {
"font": {"color": "00FF0000", "bold": True},
"borders": {"top": "thin", "right": "thin", "bottom": "thin", "left": "thin"},
"alignment": {"horizontal": "center", "vertical": "top"},
"fill": {"patternType": "solid", "fgColor": {"rgb": "006666FF", "tint": 0.3}},
"number_format": {"format_code": "0.00"},
"protection": {"locked": True, "hidden": False},
}
font_color = styles.Color("00FF0000")
font = styles.Font(bold=True, color=font_color)
side = styles.Side(style=styles.borders.BORDER_THIN)
border = styles.Border(top=side, right=side, bottom=side, left=side)
alignment = styles.Alignment(horizontal="center", vertical="top")
fill_color = styles.Color(rgb="006666FF", tint=0.3)
fill = styles.PatternFill(patternType="solid", fgColor=fill_color)
number_format = "0.00"
protection = styles.Protection(locked=True, hidden=False)
kw = _OpenpyxlWriter._convert_to_style_kwargs(hstyle)
assert kw["font"] == font
assert kw["border"] == border
assert kw["alignment"] == alignment
assert kw["fill"] == fill
assert kw["number_format"] == number_format
assert kw["protection"] == protection
def test_write_cells_merge_styled(ext):
from pandas.io.formats.excel import ExcelCell
sheet_name = "merge_styled"
sty_b1 = {"font": {"color": "00FF0000"}}
sty_a2 = {"font": {"color": "0000FF00"}}
initial_cells = [
ExcelCell(col=1, row=0, val=42, style=sty_b1),
ExcelCell(col=0, row=1, val=99, style=sty_a2),
]
sty_merged = {"font": {"color": "000000FF", "bold": True}}
sty_kwargs = _OpenpyxlWriter._convert_to_style_kwargs(sty_merged)
openpyxl_sty_merged = sty_kwargs["font"]
merge_cells = [
ExcelCell(
col=0, row=0, val="pandas", mergestart=1, mergeend=1, style=sty_merged
)
]
with tm.ensure_clean(ext) as path:
with _OpenpyxlWriter(path) as writer:
writer._write_cells(initial_cells, sheet_name=sheet_name)
writer._write_cells(merge_cells, sheet_name=sheet_name)
wks = writer.sheets[sheet_name]
xcell_b1 = wks["B1"]
xcell_a2 = wks["A2"]
assert xcell_b1.font == openpyxl_sty_merged
assert xcell_a2.font == openpyxl_sty_merged
@pytest.mark.parametrize("iso_dates", [True, False])
def test_engine_kwargs_write(ext, iso_dates):
# GH 42286 GH 43445
engine_kwargs = {"iso_dates": iso_dates}
with tm.ensure_clean(ext) as f:
with ExcelWriter(f, engine="openpyxl", engine_kwargs=engine_kwargs) as writer:
assert writer.book.iso_dates == iso_dates
# ExcelWriter won't allow us to close without writing something
DataFrame().to_excel(writer)
def test_engine_kwargs_append_invalid(ext):
# GH 43445
    # test whether an invalid engine kwarg actually raises
with tm.ensure_clean(ext) as f:
DataFrame(["hello", "world"]).to_excel(f)
with pytest.raises(
TypeError,
match=re.escape(
"load_workbook() got an unexpected keyword argument 'apple_banana'"
),
):
with ExcelWriter(
f, engine="openpyxl", mode="a", engine_kwargs={"apple_banana": "fruit"}
) as writer:
# ExcelWriter needs us to write something to close properly
DataFrame(["good"]).to_excel(writer, sheet_name="Sheet2")
@pytest.mark.parametrize("data_only, expected", [(True, 0), (False, "=1+1")])
def test_engine_kwargs_append_data_only(ext, data_only, expected):
# GH 43445
# tests whether the data_only engine_kwarg actually works well for
# openpyxl's load_workbook
with tm.ensure_clean(ext) as f:
DataFrame(["=1+1"]).to_excel(f)
with ExcelWriter(
f, engine="openpyxl", mode="a", engine_kwargs={"data_only": data_only}
) as writer:
assert writer.sheets["Sheet1"]["B2"].value == expected
            # ExcelWriter needs us to write something to close properly
DataFrame().to_excel(writer, sheet_name="Sheet2")
# ensure that data_only also works for reading
# and that formulas/values roundtrip
assert (
pd.read_excel(
f,
sheet_name="Sheet1",
engine="openpyxl",
engine_kwargs={"data_only": data_only},
).iloc[0, 1]
== expected
)
@pytest.mark.parametrize("kwarg_name", ["read_only", "data_only"])
@pytest.mark.parametrize("kwarg_value", [True, False])
def test_engine_kwargs_append_reader(datapath, ext, kwarg_name, kwarg_value):
# GH 55027
# test that `read_only` and `data_only` can be passed to
# `openpyxl.reader.excel.load_workbook` via `engine_kwargs`
filename = datapath("io", "data", "excel", "test1" + ext)
with contextlib.closing(
OpenpyxlReader(filename, engine_kwargs={kwarg_name: kwarg_value})
) as reader:
assert getattr(reader.book, kwarg_name) == kwarg_value
@pytest.mark.parametrize(
"mode,expected", [("w", ["baz"]), ("a", ["foo", "bar", "baz"])]
)
def test_write_append_mode(ext, mode, expected):
df = DataFrame([1], columns=["baz"])
with tm.ensure_clean(ext) as f:
wb = openpyxl.Workbook()
wb.worksheets[0].title = "foo"
wb.worksheets[0]["A1"].value = "foo"
wb.create_sheet("bar")
wb.worksheets[1]["A1"].value = "bar"
wb.save(f)
with ExcelWriter(f, engine="openpyxl", mode=mode) as writer:
df.to_excel(writer, sheet_name="baz", index=False)
with contextlib.closing(openpyxl.load_workbook(f)) as wb2:
result = [sheet.title for sheet in wb2.worksheets]
assert result == expected
for index, cell_value in enumerate(expected):
assert wb2.worksheets[index]["A1"].value == cell_value
@pytest.mark.parametrize(
"if_sheet_exists,num_sheets,expected",
[
("new", 2, ["apple", "banana"]),
("replace", 1, ["pear"]),
("overlay", 1, ["pear", "banana"]),
],
)
def test_if_sheet_exists_append_modes(ext, if_sheet_exists, num_sheets, expected):
# GH 40230
df1 = DataFrame({"fruit": ["apple", "banana"]})
df2 = DataFrame({"fruit": ["pear"]})
with tm.ensure_clean(ext) as f:
df1.to_excel(f, engine="openpyxl", sheet_name="foo", index=False)
with ExcelWriter(
f, engine="openpyxl", mode="a", if_sheet_exists=if_sheet_exists
) as writer:
df2.to_excel(writer, sheet_name="foo", index=False)
with contextlib.closing(openpyxl.load_workbook(f)) as wb:
assert len(wb.sheetnames) == num_sheets
assert wb.sheetnames[0] == "foo"
result = pd.read_excel(wb, "foo", engine="openpyxl")
assert list(result["fruit"]) == expected
if len(wb.sheetnames) == 2:
result = pd.read_excel(wb, wb.sheetnames[1], engine="openpyxl")
tm.assert_frame_equal(result, df2)
@pytest.mark.parametrize(
"startrow, startcol, greeting, goodbye",
[
(0, 0, ["poop", "world"], ["goodbye", "people"]),
(0, 1, ["hello", "world"], ["poop", "people"]),
(1, 0, ["hello", "poop"], ["goodbye", "people"]),
(1, 1, ["hello", "world"], ["goodbye", "poop"]),
],
)
def test_append_overlay_startrow_startcol(ext, startrow, startcol, greeting, goodbye):
df1 = DataFrame({"greeting": ["hello", "world"], "goodbye": ["goodbye", "people"]})
df2 = DataFrame(["poop"])
with tm.ensure_clean(ext) as f:
df1.to_excel(f, engine="openpyxl", sheet_name="poo", index=False)
with ExcelWriter(
f, engine="openpyxl", mode="a", if_sheet_exists="overlay"
) as writer:
# use startrow+1 because we don't have a header
df2.to_excel(
writer,
index=False,
header=False,
startrow=startrow + 1,
startcol=startcol,
sheet_name="poo",
)
result = pd.read_excel(f, sheet_name="poo", engine="openpyxl")
expected = DataFrame({"greeting": greeting, "goodbye": goodbye})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"if_sheet_exists,msg",
[
(
"invalid",
"'invalid' is not valid for if_sheet_exists. Valid options "
"are 'error', 'new', 'replace' and 'overlay'.",
),
(
"error",
"Sheet 'foo' already exists and if_sheet_exists is set to 'error'.",
),
(
None,
"Sheet 'foo' already exists and if_sheet_exists is set to 'error'.",
),
],
)
def test_if_sheet_exists_raises(ext, if_sheet_exists, msg):
# GH 40230
df = DataFrame({"fruit": ["pear"]})
with tm.ensure_clean(ext) as f:
with pytest.raises(ValueError, match=re.escape(msg)):
df.to_excel(f, sheet_name="foo", engine="openpyxl")
with ExcelWriter(
f, engine="openpyxl", mode="a", if_sheet_exists=if_sheet_exists
) as writer:
df.to_excel(writer, sheet_name="foo")
def test_to_excel_with_openpyxl_engine(ext):
# GH 29854
with tm.ensure_clean(ext) as filename:
df1 = DataFrame({"A": np.linspace(1, 10, 10)})
df2 = DataFrame({"B": np.linspace(1, 20, 10)})
df = pd.concat([df1, df2], axis=1)
styled = df.style.map(
lambda val: f"color: {'red' if val < 0 else 'black'}"
).highlight_max()
styled.to_excel(filename, engine="openpyxl")
@pytest.mark.parametrize("read_only", [True, False])
def test_read_workbook(datapath, ext, read_only):
# GH 39528
filename = datapath("io", "data", "excel", "test1" + ext)
with contextlib.closing(
openpyxl.load_workbook(filename, read_only=read_only)
) as wb:
result = pd.read_excel(wb, engine="openpyxl")
expected = pd.read_excel(filename)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"header, expected_data",
[
(
0,
{
"Title": [np.nan, "A", 1, 2, 3],
"Unnamed: 1": [np.nan, "B", 4, 5, 6],
"Unnamed: 2": [np.nan, "C", 7, 8, 9],
},
),
(2, {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}),
],
)
@pytest.mark.parametrize(
"filename", ["dimension_missing", "dimension_small", "dimension_large"]
)
# When read_only is None, use read_excel instead of a workbook
@pytest.mark.parametrize("read_only", [True, False, None])
def test_read_with_bad_dimension(
datapath, ext, header, expected_data, filename, read_only
):
# GH 38956, 39001 - no/incorrect dimension information
path = datapath("io", "data", "excel", f"{filename}{ext}")
if read_only is None:
result = pd.read_excel(path, header=header)
else:
with contextlib.closing(
openpyxl.load_workbook(path, read_only=read_only)
) as wb:
result = pd.read_excel(wb, engine="openpyxl", header=header)
expected = DataFrame(expected_data)
tm.assert_frame_equal(result, expected)
def test_append_mode_file(ext):
# GH 39576
df = DataFrame()
with tm.ensure_clean(ext) as f:
df.to_excel(f, engine="openpyxl")
with ExcelWriter(
f, mode="a", engine="openpyxl", if_sheet_exists="new"
) as writer:
df.to_excel(writer)
        # ensure the zip archives were not simply concatenated, by checking that
        # "docProps/app.xml" occurs exactly twice in the file
data = Path(f).read_bytes()
first = data.find(b"docProps/app.xml")
second = data.find(b"docProps/app.xml", first + 1)
third = data.find(b"docProps/app.xml", second + 1)
assert second != -1 and third == -1
# When read_only is None, use read_excel instead of a workbook
@pytest.mark.parametrize("read_only", [True, False, None])
def test_read_with_empty_trailing_rows(datapath, ext, read_only):
# GH 39181
path = datapath("io", "data", "excel", f"empty_trailing_rows{ext}")
if read_only is None:
result = pd.read_excel(path)
else:
with contextlib.closing(
openpyxl.load_workbook(path, read_only=read_only)
) as wb:
result = pd.read_excel(wb, engine="openpyxl")
expected = DataFrame(
{
"Title": [np.nan, "A", 1, 2, 3],
"Unnamed: 1": [np.nan, "B", 4, 5, 6],
"Unnamed: 2": [np.nan, "C", 7, 8, 9],
}
)
tm.assert_frame_equal(result, expected)
# When read_only is None, use read_excel instead of a workbook
@pytest.mark.parametrize("read_only", [True, False, None])
def test_read_empty_with_blank_row(datapath, ext, read_only):
# GH 39547 - empty excel file with a row that has no data
path = datapath("io", "data", "excel", f"empty_with_blank_row{ext}")
if read_only is None:
result = pd.read_excel(path)
else:
with contextlib.closing(
openpyxl.load_workbook(path, read_only=read_only)
) as wb:
result = pd.read_excel(wb, engine="openpyxl")
expected = DataFrame()
tm.assert_frame_equal(result, expected)
def test_book_and_sheets_consistent(ext):
# GH#45687 - Ensure sheets is updated if user modifies book
with tm.ensure_clean(ext) as f:
with ExcelWriter(f, engine="openpyxl") as writer:
assert writer.sheets == {}
sheet = writer.book.create_sheet("test_name", 0)
assert writer.sheets == {"test_name": sheet}
def test_ints_spelled_with_decimals(datapath, ext):
# GH 46988 - openpyxl returns this sheet with floats
path = datapath("io", "data", "excel", f"ints_spelled_with_decimals{ext}")
result = pd.read_excel(path)
expected = DataFrame(range(2, 12), columns=[1])
tm.assert_frame_equal(result, expected)
def test_read_multiindex_header_no_index_names(datapath, ext):
# GH#47487
path = datapath("io", "data", "excel", f"multiindex_no_index_names{ext}")
result = pd.read_excel(path, index_col=[0, 1, 2], header=[0, 1, 2])
expected = DataFrame(
[[np.nan, "x", "x", "x"], ["x", np.nan, np.nan, np.nan]],
columns=pd.MultiIndex.from_tuples(
[("X", "Y", "A1"), ("X", "Y", "A2"), ("XX", "YY", "B1"), ("XX", "YY", "B2")]
),
index=pd.MultiIndex.from_tuples([("A", "AA", "AAA"), ("A", "BB", "BBB")]),
)
tm.assert_frame_equal(result, expected)
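For reference, a minimal sketch (not part of the diff) of the append-mode API these tests exercise, with an illustrative file name and assuming openpyxl is installed:

from pandas import DataFrame
from pandas.io.excel import ExcelWriter

df1 = DataFrame({"fruit": ["apple", "banana"]})
df1.to_excel("report.xlsx", sheet_name="foo", index=False)  # illustrative path

# reopen the workbook in append mode and overwrite cells of sheet "foo" in place
with ExcelWriter(
    "report.xlsx", engine="openpyxl", mode="a", if_sheet_exists="overlay"
) as writer:
    DataFrame({"fruit": ["pear"]}).to_excel(writer, sheet_name="foo", index=False)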

File diff suppressed because it is too large.


@@ -0,0 +1,298 @@
import contextlib
import time
import numpy as np
import pytest
from pandas.compat import is_platform_windows
import pandas.util._test_decorators as td
from pandas import (
DataFrame,
read_excel,
)
import pandas._testing as tm
from pandas.io.excel import ExcelWriter
from pandas.io.formats.excel import ExcelFormatter
pytest.importorskip("jinja2")
# jinja2 is currently required for Styler.__init__(). Technically Styler.to_excel
# could compute styles and render to excel without jinja2, since there is no
# 'template' file, but this would need the import error to be delayed until render time.
if is_platform_windows():
pytestmark = pytest.mark.single_cpu
def assert_equal_cell_styles(cell1, cell2):
# TODO: should find a better way to check equality
assert cell1.alignment.__dict__ == cell2.alignment.__dict__
assert cell1.border.__dict__ == cell2.border.__dict__
assert cell1.fill.__dict__ == cell2.fill.__dict__
assert cell1.font.__dict__ == cell2.font.__dict__
assert cell1.number_format == cell2.number_format
assert cell1.protection.__dict__ == cell2.protection.__dict__
@pytest.mark.parametrize(
"engine",
["xlsxwriter", "openpyxl"],
)
def test_styler_to_excel_unstyled(engine):
# compare DataFrame.to_excel and Styler.to_excel when no styles applied
pytest.importorskip(engine)
df = DataFrame(np.random.default_rng(2).standard_normal((2, 2)))
with tm.ensure_clean(".xlsx") as path:
with ExcelWriter(path, engine=engine) as writer:
df.to_excel(writer, sheet_name="dataframe")
df.style.to_excel(writer, sheet_name="unstyled")
openpyxl = pytest.importorskip("openpyxl") # test loading only with openpyxl
with contextlib.closing(openpyxl.load_workbook(path)) as wb:
for col1, col2 in zip(wb["dataframe"].columns, wb["unstyled"].columns):
assert len(col1) == len(col2)
for cell1, cell2 in zip(col1, col2):
assert cell1.value == cell2.value
assert_equal_cell_styles(cell1, cell2)
shared_style_params = [
(
"background-color: #111222",
["fill", "fgColor", "rgb"],
{"xlsxwriter": "FF111222", "openpyxl": "00111222"},
),
(
"color: #111222",
["font", "color", "value"],
{"xlsxwriter": "FF111222", "openpyxl": "00111222"},
),
("font-family: Arial;", ["font", "name"], "arial"),
("font-weight: bold;", ["font", "b"], True),
("font-style: italic;", ["font", "i"], True),
("text-decoration: underline;", ["font", "u"], "single"),
("number-format: $??,???.00;", ["number_format"], "$??,???.00"),
("text-align: left;", ["alignment", "horizontal"], "left"),
(
"vertical-align: bottom;",
["alignment", "vertical"],
{"xlsxwriter": None, "openpyxl": "bottom"}, # xlsxwriter Fails
),
("vertical-align: middle;", ["alignment", "vertical"], "center"),
# Border widths
("border-left: 2pt solid red", ["border", "left", "style"], "medium"),
("border-left: 1pt dotted red", ["border", "left", "style"], "dotted"),
("border-left: 2pt dotted red", ["border", "left", "style"], "mediumDashDotDot"),
("border-left: 1pt dashed red", ["border", "left", "style"], "dashed"),
("border-left: 2pt dashed red", ["border", "left", "style"], "mediumDashed"),
("border-left: 1pt solid red", ["border", "left", "style"], "thin"),
("border-left: 3pt solid red", ["border", "left", "style"], "thick"),
# Border expansion
(
"border-left: 2pt solid #111222",
["border", "left", "color", "rgb"],
{"xlsxwriter": "FF111222", "openpyxl": "00111222"},
),
("border: 1pt solid red", ["border", "top", "style"], "thin"),
(
"border: 1pt solid #111222",
["border", "top", "color", "rgb"],
{"xlsxwriter": "FF111222", "openpyxl": "00111222"},
),
("border: 1pt solid red", ["border", "right", "style"], "thin"),
(
"border: 1pt solid #111222",
["border", "right", "color", "rgb"],
{"xlsxwriter": "FF111222", "openpyxl": "00111222"},
),
("border: 1pt solid red", ["border", "bottom", "style"], "thin"),
(
"border: 1pt solid #111222",
["border", "bottom", "color", "rgb"],
{"xlsxwriter": "FF111222", "openpyxl": "00111222"},
),
("border: 1pt solid red", ["border", "left", "style"], "thin"),
(
"border: 1pt solid #111222",
["border", "left", "color", "rgb"],
{"xlsxwriter": "FF111222", "openpyxl": "00111222"},
),
# Border styles
(
"border-left-style: hair; border-left-color: black",
["border", "left", "style"],
"hair",
),
]
@pytest.mark.parametrize(
"engine",
["xlsxwriter", "openpyxl"],
)
@pytest.mark.parametrize("css, attrs, expected", shared_style_params)
def test_styler_to_excel_basic(engine, css, attrs, expected):
pytest.importorskip(engine)
df = DataFrame(np.random.default_rng(2).standard_normal((1, 1)))
styler = df.style.map(lambda x: css)
with tm.ensure_clean(".xlsx") as path:
with ExcelWriter(path, engine=engine) as writer:
df.to_excel(writer, sheet_name="dataframe")
styler.to_excel(writer, sheet_name="styled")
openpyxl = pytest.importorskip("openpyxl") # test loading only with openpyxl
with contextlib.closing(openpyxl.load_workbook(path)) as wb:
# test unstyled data cell does not have expected styles
# test styled cell has expected styles
u_cell, s_cell = wb["dataframe"].cell(2, 2), wb["styled"].cell(2, 2)
for attr in attrs:
u_cell, s_cell = getattr(u_cell, attr, None), getattr(s_cell, attr)
if isinstance(expected, dict):
assert u_cell is None or u_cell != expected[engine]
assert s_cell == expected[engine]
else:
assert u_cell is None or u_cell != expected
assert s_cell == expected
@pytest.mark.parametrize(
"engine",
["xlsxwriter", "openpyxl"],
)
@pytest.mark.parametrize("css, attrs, expected", shared_style_params)
def test_styler_to_excel_basic_indexes(engine, css, attrs, expected):
pytest.importorskip(engine)
df = DataFrame(np.random.default_rng(2).standard_normal((1, 1)))
styler = df.style
styler.map_index(lambda x: css, axis=0)
styler.map_index(lambda x: css, axis=1)
null_styler = df.style
null_styler.map(lambda x: "null: css;")
null_styler.map_index(lambda x: "null: css;", axis=0)
null_styler.map_index(lambda x: "null: css;", axis=1)
with tm.ensure_clean(".xlsx") as path:
with ExcelWriter(path, engine=engine) as writer:
null_styler.to_excel(writer, sheet_name="null_styled")
styler.to_excel(writer, sheet_name="styled")
openpyxl = pytest.importorskip("openpyxl") # test loading only with openpyxl
with contextlib.closing(openpyxl.load_workbook(path)) as wb:
# test null styled index cells does not have expected styles
# test styled cell has expected styles
ui_cell, si_cell = wb["null_styled"].cell(2, 1), wb["styled"].cell(2, 1)
uc_cell, sc_cell = wb["null_styled"].cell(1, 2), wb["styled"].cell(1, 2)
for attr in attrs:
ui_cell, si_cell = getattr(ui_cell, attr, None), getattr(si_cell, attr)
uc_cell, sc_cell = getattr(uc_cell, attr, None), getattr(sc_cell, attr)
if isinstance(expected, dict):
assert ui_cell is None or ui_cell != expected[engine]
assert si_cell == expected[engine]
assert uc_cell is None or uc_cell != expected[engine]
assert sc_cell == expected[engine]
else:
assert ui_cell is None or ui_cell != expected
assert si_cell == expected
assert uc_cell is None or uc_cell != expected
assert sc_cell == expected
# From https://openpyxl.readthedocs.io/en/stable/api/openpyxl.styles.borders.html
# Note: Leaving behavior of "width"-type styles undefined; user should use border-width
# instead
excel_border_styles = [
# "thin",
"dashed",
"mediumDashDot",
"dashDotDot",
"hair",
"dotted",
"mediumDashDotDot",
# "medium",
"double",
"dashDot",
"slantDashDot",
# "thick",
"mediumDashed",
]
@pytest.mark.parametrize(
"engine",
["xlsxwriter", "openpyxl"],
)
@pytest.mark.parametrize("border_style", excel_border_styles)
def test_styler_to_excel_border_style(engine, border_style):
css = f"border-left: {border_style} black thin"
attrs = ["border", "left", "style"]
expected = border_style
pytest.importorskip(engine)
df = DataFrame(np.random.default_rng(2).standard_normal((1, 1)))
styler = df.style.map(lambda x: css)
with tm.ensure_clean(".xlsx") as path:
with ExcelWriter(path, engine=engine) as writer:
df.to_excel(writer, sheet_name="dataframe")
styler.to_excel(writer, sheet_name="styled")
openpyxl = pytest.importorskip("openpyxl") # test loading only with openpyxl
with contextlib.closing(openpyxl.load_workbook(path)) as wb:
# test unstyled data cell does not have expected styles
# test styled cell has expected styles
u_cell, s_cell = wb["dataframe"].cell(2, 2), wb["styled"].cell(2, 2)
for attr in attrs:
u_cell, s_cell = getattr(u_cell, attr, None), getattr(s_cell, attr)
if isinstance(expected, dict):
assert u_cell is None or u_cell != expected[engine]
assert s_cell == expected[engine]
else:
assert u_cell is None or u_cell != expected
assert s_cell == expected
def test_styler_custom_converter():
openpyxl = pytest.importorskip("openpyxl")
def custom_converter(css):
return {"font": {"color": {"rgb": "111222"}}}
df = DataFrame(np.random.default_rng(2).standard_normal((1, 1)))
styler = df.style.map(lambda x: "color: #888999")
with tm.ensure_clean(".xlsx") as path:
with ExcelWriter(path, engine="openpyxl") as writer:
ExcelFormatter(styler, style_converter=custom_converter).write(
writer, sheet_name="custom"
)
with contextlib.closing(openpyxl.load_workbook(path)) as wb:
assert wb["custom"].cell(2, 2).font.color.value == "00111222"
@pytest.mark.single_cpu
@td.skip_if_not_us_locale
def test_styler_to_s3(s3_public_bucket, s3so):
# GH#46381
mock_bucket_name, target_file = s3_public_bucket.name, "test.xlsx"
df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
styler = df.style.set_sticky(axis="index")
styler.to_excel(f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so)
timeout = 5
while True:
if target_file in (obj.key for obj in s3_public_bucket.objects.all()):
break
time.sleep(0.1)
timeout -= 0.1
assert timeout > 0, "Timed out waiting for file to appear on moto"
result = read_excel(
f"s3://{mock_bucket_name}/{target_file}", index_col=0, storage_options=s3so
)
tm.assert_frame_equal(result, df)
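A minimal sketch (not part of the diff) of the Styler.to_excel path these tests compare against plain DataFrame.to_excel, with an illustrative file name and assuming openpyxl is installed:

import pandas as pd

df = pd.DataFrame({"x": [1, -2, 3]})
styled = df.style.map(
    lambda v: "font-weight: bold; background-color: #111222" if v < 0 else ""
)
# the CSS string is translated into an openpyxl font/fill when the sheet is written
styled.to_excel("styled.xlsx", engine="openpyxl")  # illustrative path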

File diff suppressed because it is too large.


@@ -0,0 +1,76 @@
import io
import numpy as np
import pytest
from pandas.compat import is_platform_windows
import pandas as pd
import pandas._testing as tm
from pandas.io.excel import ExcelFile
from pandas.io.excel._base import inspect_excel_format
xlrd = pytest.importorskip("xlrd")
if is_platform_windows():
pytestmark = pytest.mark.single_cpu
@pytest.fixture(params=[".xls"])
def read_ext_xlrd(request):
"""
Valid extensions for reading Excel files with xlrd.
    Similar to read_ext, but excludes .ods and .xlsb, and, for xlrd >= 2.0, also .xlsx and .xlsm.
"""
return request.param
def test_read_xlrd_book(read_ext_xlrd, datapath):
engine = "xlrd"
sheet_name = "Sheet1"
pth = datapath("io", "data", "excel", "test1.xls")
with xlrd.open_workbook(pth) as book:
with ExcelFile(book, engine=engine) as xl:
result = pd.read_excel(xl, sheet_name=sheet_name, index_col=0)
expected = pd.read_excel(
book, sheet_name=sheet_name, engine=engine, index_col=0
)
tm.assert_frame_equal(result, expected)
def test_read_xlsx_fails(datapath):
# GH 29375
from xlrd.biffh import XLRDError
path = datapath("io", "data", "excel", "test1.xlsx")
with pytest.raises(XLRDError, match="Excel xlsx file; not supported"):
pd.read_excel(path, engine="xlrd")
def test_nan_in_xls(datapath):
# GH 54564
path = datapath("io", "data", "excel", "test6.xls")
expected = pd.DataFrame({0: np.r_[0, 2].astype("int64"), 1: np.r_[1, np.nan]})
result = pd.read_excel(path, header=None)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"file_header",
[
b"\x09\x00\x04\x00\x07\x00\x10\x00",
b"\x09\x02\x06\x00\x00\x00\x10\x00",
b"\x09\x04\x06\x00\x00\x00\x10\x00",
b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1",
],
)
def test_read_old_xls_files(file_header):
# GH 41226
f = io.BytesIO(file_header)
assert inspect_excel_format(f) == "xls"
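As the read_ext_xlrd docstring notes, xlrd 2.x only reads the legacy .xls format; the call being exercised is essentially the sketch below (not part of the diff, path illustrative):

import pandas as pd

# newer formats (.xlsx, .xlsm, .xlsb, .ods) need a different engine with xlrd >= 2.0
result = pd.read_excel(
    "pandas/tests/io/data/excel/test1.xls", engine="xlrd", index_col=0
)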


@@ -0,0 +1,86 @@
import contextlib
import pytest
from pandas.compat import is_platform_windows
from pandas import DataFrame
import pandas._testing as tm
from pandas.io.excel import ExcelWriter
xlsxwriter = pytest.importorskip("xlsxwriter")
if is_platform_windows():
pytestmark = pytest.mark.single_cpu
@pytest.fixture
def ext():
return ".xlsx"
def test_column_format(ext):
# Test that column formats are applied to cells. Test for issue #9167.
# Applicable to xlsxwriter only.
openpyxl = pytest.importorskip("openpyxl")
with tm.ensure_clean(ext) as path:
frame = DataFrame({"A": [123456, 123456], "B": [123456, 123456]})
with ExcelWriter(path) as writer:
frame.to_excel(writer)
# Add a number format to col B and ensure it is applied to cells.
num_format = "#,##0"
write_workbook = writer.book
write_worksheet = write_workbook.worksheets()[0]
col_format = write_workbook.add_format({"num_format": num_format})
write_worksheet.set_column("B:B", None, col_format)
with contextlib.closing(openpyxl.load_workbook(path)) as read_workbook:
try:
read_worksheet = read_workbook["Sheet1"]
except TypeError:
# compat
read_worksheet = read_workbook.get_sheet_by_name(name="Sheet1")
# Get the number format from the cell.
try:
cell = read_worksheet["B2"]
except TypeError:
# compat
cell = read_worksheet.cell("B2")
try:
read_num_format = cell.number_format
except AttributeError:
read_num_format = cell.style.number_format._format_code
assert read_num_format == num_format
def test_write_append_mode_raises(ext):
msg = "Append mode is not supported with xlsxwriter!"
with tm.ensure_clean(ext) as f:
with pytest.raises(ValueError, match=msg):
ExcelWriter(f, engine="xlsxwriter", mode="a")
@pytest.mark.parametrize("nan_inf_to_errors", [True, False])
def test_engine_kwargs(ext, nan_inf_to_errors):
# GH 42286
engine_kwargs = {"options": {"nan_inf_to_errors": nan_inf_to_errors}}
with tm.ensure_clean(ext) as f:
with ExcelWriter(f, engine="xlsxwriter", engine_kwargs=engine_kwargs) as writer:
assert writer.book.nan_inf_to_errors == nan_inf_to_errors
def test_book_and_sheets_consistent(ext):
# GH#45687 - Ensure sheets is updated if user modifies book
with tm.ensure_clean(ext) as f:
with ExcelWriter(f, engine="xlsxwriter") as writer:
assert writer.sheets == {}
sheet = writer.book.add_worksheet("test_name")
assert writer.sheets == {"test_name": sheet}
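A minimal sketch (not part of the diff) of forwarding constructor options to xlsxwriter via engine_kwargs, as the test above does, with an illustrative file name:

from pandas import DataFrame
from pandas.io.excel import ExcelWriter

with ExcelWriter(
    "out.xlsx",  # illustrative path
    engine="xlsxwriter",
    engine_kwargs={"options": {"nan_inf_to_errors": True}},
) as writer:
    # the options dict is passed straight to the xlsxwriter Workbook constructor
    DataFrame({"A": [1.0, 2.0]}).to_excel(writer, index=False)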


@@ -0,0 +1,359 @@
import io
import numpy as np
import pytest
from pandas import (
NA,
DataFrame,
read_csv,
)
pytest.importorskip("jinja2")
def bar_grad(a=None, b=None, c=None, d=None):
"""Used in multiple tests to simplify formatting of expected result"""
ret = [("width", "10em")]
if all(x is None for x in [a, b, c, d]):
return ret
return ret + [
(
"background",
f"linear-gradient(90deg,{','.join([x for x in [a, b, c, d] if x])})",
)
]
def no_bar():
return bar_grad()
def bar_to(x, color="#d65f5f"):
return bar_grad(f" {color} {x:.1f}%", f" transparent {x:.1f}%")
def bar_from_to(x, y, color="#d65f5f"):
return bar_grad(
f" transparent {x:.1f}%",
f" {color} {x:.1f}%",
f" {color} {y:.1f}%",
f" transparent {y:.1f}%",
)
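# Worked example (not part of the test module), derived from the helpers above:
#   no_bar()   == [("width", "10em")]
#   bar_to(50) == [
#       ("width", "10em"),
#       ("background", "linear-gradient(90deg, #d65f5f 50.0%, transparent 50.0%)"),
#   ]
# i.e. each expected ctx entry is the CSS that Styler.bar is expected to emit.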
@pytest.fixture
def df_pos():
return DataFrame([[1], [2], [3]])
@pytest.fixture
def df_neg():
return DataFrame([[-1], [-2], [-3]])
@pytest.fixture
def df_mix():
return DataFrame([[-3], [1], [2]])
@pytest.mark.parametrize(
"align, exp",
[
("left", [no_bar(), bar_to(50), bar_to(100)]),
("right", [bar_to(100), bar_from_to(50, 100), no_bar()]),
("mid", [bar_to(33.33), bar_to(66.66), bar_to(100)]),
("zero", [bar_from_to(50, 66.7), bar_from_to(50, 83.3), bar_from_to(50, 100)]),
("mean", [bar_to(50), no_bar(), bar_from_to(50, 100)]),
(2.0, [bar_to(50), no_bar(), bar_from_to(50, 100)]),
(np.median, [bar_to(50), no_bar(), bar_from_to(50, 100)]),
],
)
def test_align_positive_cases(df_pos, align, exp):
# test different align cases for all positive values
result = df_pos.style.bar(align=align)._compute().ctx
expected = {(0, 0): exp[0], (1, 0): exp[1], (2, 0): exp[2]}
assert result == expected
@pytest.mark.parametrize(
"align, exp",
[
("left", [bar_to(100), bar_to(50), no_bar()]),
("right", [no_bar(), bar_from_to(50, 100), bar_to(100)]),
("mid", [bar_from_to(66.66, 100), bar_from_to(33.33, 100), bar_to(100)]),
("zero", [bar_from_to(33.33, 50), bar_from_to(16.66, 50), bar_to(50)]),
("mean", [bar_from_to(50, 100), no_bar(), bar_to(50)]),
(-2.0, [bar_from_to(50, 100), no_bar(), bar_to(50)]),
(np.median, [bar_from_to(50, 100), no_bar(), bar_to(50)]),
],
)
def test_align_negative_cases(df_neg, align, exp):
# test different align cases for all negative values
result = df_neg.style.bar(align=align)._compute().ctx
expected = {(0, 0): exp[0], (1, 0): exp[1], (2, 0): exp[2]}
assert result == expected
@pytest.mark.parametrize(
"align, exp",
[
("left", [no_bar(), bar_to(80), bar_to(100)]),
("right", [bar_to(100), bar_from_to(80, 100), no_bar()]),
("mid", [bar_to(60), bar_from_to(60, 80), bar_from_to(60, 100)]),
("zero", [bar_to(50), bar_from_to(50, 66.66), bar_from_to(50, 83.33)]),
("mean", [bar_to(50), bar_from_to(50, 66.66), bar_from_to(50, 83.33)]),
(-0.0, [bar_to(50), bar_from_to(50, 66.66), bar_from_to(50, 83.33)]),
(np.nanmedian, [bar_to(50), no_bar(), bar_from_to(50, 62.5)]),
],
)
@pytest.mark.parametrize("nans", [True, False])
def test_align_mixed_cases(df_mix, align, exp, nans):
# test different align cases for mixed positive and negative values
# also test no impact of NaNs and no_bar
expected = {(0, 0): exp[0], (1, 0): exp[1], (2, 0): exp[2]}
if nans:
df_mix.loc[3, :] = np.nan
expected.update({(3, 0): no_bar()})
result = df_mix.style.bar(align=align)._compute().ctx
assert result == expected
@pytest.mark.parametrize(
"align, exp",
[
(
"left",
{
"index": [[no_bar(), no_bar()], [bar_to(100), bar_to(100)]],
"columns": [[no_bar(), bar_to(100)], [no_bar(), bar_to(100)]],
"none": [[no_bar(), bar_to(33.33)], [bar_to(66.66), bar_to(100)]],
},
),
(
"mid",
{
"index": [[bar_to(33.33), bar_to(50)], [bar_to(100), bar_to(100)]],
"columns": [[bar_to(50), bar_to(100)], [bar_to(75), bar_to(100)]],
"none": [[bar_to(25), bar_to(50)], [bar_to(75), bar_to(100)]],
},
),
(
"zero",
{
"index": [
[bar_from_to(50, 66.66), bar_from_to(50, 75)],
[bar_from_to(50, 100), bar_from_to(50, 100)],
],
"columns": [
[bar_from_to(50, 75), bar_from_to(50, 100)],
[bar_from_to(50, 87.5), bar_from_to(50, 100)],
],
"none": [
[bar_from_to(50, 62.5), bar_from_to(50, 75)],
[bar_from_to(50, 87.5), bar_from_to(50, 100)],
],
},
),
(
2,
{
"index": [
[bar_to(50), no_bar()],
[bar_from_to(50, 100), bar_from_to(50, 100)],
],
"columns": [
[bar_to(50), no_bar()],
[bar_from_to(50, 75), bar_from_to(50, 100)],
],
"none": [
[bar_from_to(25, 50), no_bar()],
[bar_from_to(50, 75), bar_from_to(50, 100)],
],
},
),
],
)
@pytest.mark.parametrize("axis", ["index", "columns", "none"])
def test_align_axis(align, exp, axis):
# test all axis combinations with positive values and different aligns
data = DataFrame([[1, 2], [3, 4]])
result = (
data.style.bar(align=align, axis=None if axis == "none" else axis)
._compute()
.ctx
)
expected = {
(0, 0): exp[axis][0][0],
(0, 1): exp[axis][0][1],
(1, 0): exp[axis][1][0],
(1, 1): exp[axis][1][1],
}
assert result == expected
@pytest.mark.parametrize(
"values, vmin, vmax",
[
("positive", 1.5, 2.5),
("negative", -2.5, -1.5),
("mixed", -2.5, 1.5),
],
)
@pytest.mark.parametrize("nullify", [None, "vmin", "vmax"]) # test min/max separately
@pytest.mark.parametrize("align", ["left", "right", "zero", "mid"])
def test_vmin_vmax_clipping(df_pos, df_neg, df_mix, values, vmin, vmax, nullify, align):
# test that clipping occurs if any vmin > data_values or vmax < data_values
if align == "mid": # mid acts as left or right in each case
if values == "positive":
align = "left"
elif values == "negative":
align = "right"
df = {"positive": df_pos, "negative": df_neg, "mixed": df_mix}[values]
vmin = None if nullify == "vmin" else vmin
vmax = None if nullify == "vmax" else vmax
clip_df = df.where(df <= (vmax if vmax else 999), other=vmax)
clip_df = clip_df.where(clip_df >= (vmin if vmin else -999), other=vmin)
result = (
df.style.bar(align=align, vmin=vmin, vmax=vmax, color=["red", "green"])
._compute()
.ctx
)
expected = clip_df.style.bar(align=align, color=["red", "green"])._compute().ctx
assert result == expected
@pytest.mark.parametrize(
"values, vmin, vmax",
[
("positive", 0.5, 4.5),
("negative", -4.5, -0.5),
("mixed", -4.5, 4.5),
],
)
@pytest.mark.parametrize("nullify", [None, "vmin", "vmax"]) # test min/max separately
@pytest.mark.parametrize("align", ["left", "right", "zero", "mid"])
def test_vmin_vmax_widening(df_pos, df_neg, df_mix, values, vmin, vmax, nullify, align):
# test that widening occurs if any vmax > data_values or vmin < data_values
if align == "mid": # mid acts as left or right in each case
if values == "positive":
align = "left"
elif values == "negative":
align = "right"
df = {"positive": df_pos, "negative": df_neg, "mixed": df_mix}[values]
vmin = None if nullify == "vmin" else vmin
vmax = None if nullify == "vmax" else vmax
expand_df = df.copy()
expand_df.loc[3, :], expand_df.loc[4, :] = vmin, vmax
result = (
df.style.bar(align=align, vmin=vmin, vmax=vmax, color=["red", "green"])
._compute()
.ctx
)
expected = expand_df.style.bar(align=align, color=["red", "green"])._compute().ctx
assert result.items() <= expected.items()
def test_numerics():
# test data is pre-selected for numeric values
data = DataFrame([[1, "a"], [2, "b"]])
result = data.style.bar()._compute().ctx
assert (0, 1) not in result
assert (1, 1) not in result
@pytest.mark.parametrize(
"align, exp",
[
("left", [no_bar(), bar_to(100, "green")]),
("right", [bar_to(100, "red"), no_bar()]),
("mid", [bar_to(25, "red"), bar_from_to(25, 100, "green")]),
("zero", [bar_from_to(33.33, 50, "red"), bar_from_to(50, 100, "green")]),
],
)
def test_colors_mixed(align, exp):
data = DataFrame([[-1], [3]])
result = data.style.bar(align=align, color=["red", "green"])._compute().ctx
assert result == {(0, 0): exp[0], (1, 0): exp[1]}
def test_bar_align_height():
    # test that when the height keyword is used, 'no-repeat center' and 'background-size' are present
data = DataFrame([[1], [2]])
result = data.style.bar(align="left", height=50)._compute().ctx
bg_s = "linear-gradient(90deg, #d65f5f 100.0%, transparent 100.0%) no-repeat center"
expected = {
(0, 0): [("width", "10em")],
(1, 0): [
("width", "10em"),
("background", bg_s),
("background-size", "100% 50.0%"),
],
}
assert result == expected
def test_bar_value_error_raises():
df = DataFrame({"A": [-100, -60, -30, -20]})
msg = "`align` should be in {'left', 'right', 'mid', 'mean', 'zero'} or"
with pytest.raises(ValueError, match=msg):
df.style.bar(align="poorly", color=["#d65f5f", "#5fba7d"]).to_html()
msg = r"`width` must be a value in \[0, 100\]"
with pytest.raises(ValueError, match=msg):
df.style.bar(width=200).to_html()
msg = r"`height` must be a value in \[0, 100\]"
with pytest.raises(ValueError, match=msg):
df.style.bar(height=200).to_html()
def test_bar_color_and_cmap_error_raises():
df = DataFrame({"A": [1, 2, 3, 4]})
msg = "`color` and `cmap` cannot both be given"
# Test that providing both color and cmap raises a ValueError
with pytest.raises(ValueError, match=msg):
df.style.bar(color="#d65f5f", cmap="viridis").to_html()
def test_bar_invalid_color_type_error_raises():
df = DataFrame({"A": [1, 2, 3, 4]})
msg = (
r"`color` must be string or list or tuple of 2 strings,"
r"\(eg: color=\['#d65f5f', '#5fba7d'\]\)"
)
# Test that providing an invalid color type raises a ValueError
with pytest.raises(ValueError, match=msg):
df.style.bar(color=123).to_html()
# Test that providing a color list with more than two elements raises a ValueError
with pytest.raises(ValueError, match=msg):
df.style.bar(color=["#d65f5f", "#5fba7d", "#abcdef"]).to_html()
def test_styler_bar_with_NA_values():
df1 = DataFrame({"A": [1, 2, NA, 4]})
df2 = DataFrame([[NA, NA], [NA, NA]])
expected_substring = "style type="
html_output1 = df1.style.bar(subset="A").to_html()
html_output2 = df2.style.bar(align="left", axis=None).to_html()
assert expected_substring in html_output1
assert expected_substring in html_output2
def test_style_bar_with_pyarrow_NA_values():
pytest.importorskip("pyarrow")
data = """name,age,test1,test2,teacher
Adam,15,95.0,80,Ashby
Bob,16,81.0,82,Ashby
Dave,16,89.0,84,Jones
Fred,15,,88,Jones"""
df = read_csv(io.StringIO(data), dtype_backend="pyarrow")
expected_substring = "style type="
html_output = df.style.bar(subset="test1").to_html()
assert expected_substring in html_output


@@ -0,0 +1,44 @@
import pytest
jinja2 = pytest.importorskip("jinja2")
from pandas import (
DataFrame,
MultiIndex,
)
from pandas.io.formats.style import Styler
@pytest.fixture
def df():
return DataFrame(
data=[[0, -0.609], [1, -1.228]],
columns=["A", "B"],
index=["x", "y"],
)
@pytest.fixture
def styler(df):
return Styler(df, uuid_len=0)
def test_concat_bad_columns(styler):
msg = "`other.data` must have same columns as `Styler.data"
with pytest.raises(ValueError, match=msg):
styler.concat(DataFrame([[1, 2]]).style)
def test_concat_bad_type(styler):
msg = "`other` must be of type `Styler`"
with pytest.raises(TypeError, match=msg):
styler.concat(DataFrame([[1, 2]]))
def test_concat_bad_index_levels(styler, df):
df = df.copy()
df.index = MultiIndex.from_tuples([(0, 0), (1, 1)])
msg = "number of index levels must be same in `other`"
with pytest.raises(ValueError, match=msg):
styler.concat(df.style)
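For contrast with the error cases above, a minimal sketch (not part of the diff) of a successful Styler.concat call, using illustrative data with matching columns and index levels:

from pandas import DataFrame

df = DataFrame([[0, -0.609], [1, -1.228]], columns=["A", "B"], index=["x", "y"])
totals = DataFrame([[1, -1.837]], columns=["A", "B"], index=["total"])

# `other` must be a Styler whose data has the same columns and index nlevels
combined = df.style.concat(totals.style)
html = combined.to_html()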


@@ -0,0 +1,562 @@
import numpy as np
import pytest
from pandas import (
NA,
DataFrame,
IndexSlice,
MultiIndex,
NaT,
Timestamp,
option_context,
)
pytest.importorskip("jinja2")
from pandas.io.formats.style import Styler
from pandas.io.formats.style_render import _str_escape
@pytest.fixture
def df():
return DataFrame(
data=[[0, -0.609], [1, -1.228]],
columns=["A", "B"],
index=["x", "y"],
)
@pytest.fixture
def styler(df):
return Styler(df, uuid_len=0)
@pytest.fixture
def df_multi():
return DataFrame(
data=np.arange(16).reshape(4, 4),
columns=MultiIndex.from_product([["A", "B"], ["a", "b"]]),
index=MultiIndex.from_product([["X", "Y"], ["x", "y"]]),
)
@pytest.fixture
def styler_multi(df_multi):
return Styler(df_multi, uuid_len=0)
def test_display_format(styler):
ctx = styler.format("{:0.1f}")._translate(True, True)
assert all(["display_value" in c for c in row] for row in ctx["body"])
assert all([len(c["display_value"]) <= 3 for c in row[1:]] for row in ctx["body"])
assert len(ctx["body"][0][1]["display_value"].lstrip("-")) <= 3
@pytest.mark.parametrize("index", [True, False])
@pytest.mark.parametrize("columns", [True, False])
def test_display_format_index(styler, index, columns):
exp_index = ["x", "y"]
if index:
styler.format_index(lambda v: v.upper(), axis=0) # test callable
exp_index = ["X", "Y"]
exp_columns = ["A", "B"]
if columns:
styler.format_index("*{}*", axis=1) # test string
exp_columns = ["*A*", "*B*"]
ctx = styler._translate(True, True)
for r, row in enumerate(ctx["body"]):
assert row[0]["display_value"] == exp_index[r]
for c, col in enumerate(ctx["head"][1:]):
assert col["display_value"] == exp_columns[c]
def test_format_dict(styler):
ctx = styler.format({"A": "{:0.1f}", "B": "{0:.2%}"})._translate(True, True)
assert ctx["body"][0][1]["display_value"] == "0.0"
assert ctx["body"][0][2]["display_value"] == "-60.90%"
def test_format_index_dict(styler):
ctx = styler.format_index({0: lambda v: v.upper()})._translate(True, True)
for i, val in enumerate(["X", "Y"]):
assert ctx["body"][i][0]["display_value"] == val
def test_format_string(styler):
ctx = styler.format("{:.2f}")._translate(True, True)
assert ctx["body"][0][1]["display_value"] == "0.00"
assert ctx["body"][0][2]["display_value"] == "-0.61"
assert ctx["body"][1][1]["display_value"] == "1.00"
assert ctx["body"][1][2]["display_value"] == "-1.23"
def test_format_callable(styler):
ctx = styler.format(lambda v: "neg" if v < 0 else "pos")._translate(True, True)
assert ctx["body"][0][1]["display_value"] == "pos"
assert ctx["body"][0][2]["display_value"] == "neg"
assert ctx["body"][1][1]["display_value"] == "pos"
assert ctx["body"][1][2]["display_value"] == "neg"
def test_format_with_na_rep():
# GH 21527 28358
df = DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"])
ctx = df.style.format(None, na_rep="-")._translate(True, True)
assert ctx["body"][0][1]["display_value"] == "-"
assert ctx["body"][0][2]["display_value"] == "-"
ctx = df.style.format("{:.2%}", na_rep="-")._translate(True, True)
assert ctx["body"][0][1]["display_value"] == "-"
assert ctx["body"][0][2]["display_value"] == "-"
assert ctx["body"][1][1]["display_value"] == "110.00%"
assert ctx["body"][1][2]["display_value"] == "120.00%"
ctx = df.style.format("{:.2%}", na_rep="-", subset=["B"])._translate(True, True)
assert ctx["body"][0][2]["display_value"] == "-"
assert ctx["body"][1][2]["display_value"] == "120.00%"
def test_format_index_with_na_rep():
df = DataFrame([[1, 2, 3, 4, 5]], columns=["A", None, np.nan, NaT, NA])
ctx = df.style.format_index(None, na_rep="--", axis=1)._translate(True, True)
assert ctx["head"][0][1]["display_value"] == "A"
for i in [2, 3, 4, 5]:
assert ctx["head"][0][i]["display_value"] == "--"
def test_format_non_numeric_na():
# GH 21527 28358
df = DataFrame(
{
"object": [None, np.nan, "foo"],
"datetime": [None, NaT, Timestamp("20120101")],
}
)
ctx = df.style.format(None, na_rep="-")._translate(True, True)
assert ctx["body"][0][1]["display_value"] == "-"
assert ctx["body"][0][2]["display_value"] == "-"
assert ctx["body"][1][1]["display_value"] == "-"
assert ctx["body"][1][2]["display_value"] == "-"
@pytest.mark.parametrize(
"func, attr, kwargs",
[
("format", "_display_funcs", {}),
("format_index", "_display_funcs_index", {"axis": 0}),
("format_index", "_display_funcs_columns", {"axis": 1}),
],
)
def test_format_clear(styler, func, attr, kwargs):
assert (0, 0) not in getattr(styler, attr) # using default
getattr(styler, func)("{:.2f}", **kwargs)
assert (0, 0) in getattr(styler, attr) # formatter is specified
getattr(styler, func)(**kwargs)
assert (0, 0) not in getattr(styler, attr) # formatter cleared to default
@pytest.mark.parametrize(
"escape, exp",
[
("html", "&lt;&gt;&amp;&#34;%$#_{}~^\\~ ^ \\ "),
(
"latex",
'<>\\&"\\%\\$\\#\\_\\{\\}\\textasciitilde \\textasciicircum '
"\\textbackslash \\textasciitilde \\space \\textasciicircum \\space "
"\\textbackslash \\space ",
),
],
)
def test_format_escape_html(escape, exp):
chars = '<>&"%$#_{}~^\\~ ^ \\ '
df = DataFrame([[chars]])
s = Styler(df, uuid_len=0).format("&{0}&", escape=None)
expected = f'<td id="T__row0_col0" class="data row0 col0" >&{chars}&</td>'
assert expected in s.to_html()
# only the value should be escaped before passing to the formatter
s = Styler(df, uuid_len=0).format("&{0}&", escape=escape)
expected = f'<td id="T__row0_col0" class="data row0 col0" >&{exp}&</td>'
assert expected in s.to_html()
# also test format_index()
styler = Styler(DataFrame(columns=[chars]), uuid_len=0)
styler.format_index("&{0}&", escape=None, axis=1)
assert styler._translate(True, True)["head"][0][1]["display_value"] == f"&{chars}&"
styler.format_index("&{0}&", escape=escape, axis=1)
assert styler._translate(True, True)["head"][0][1]["display_value"] == f"&{exp}&"
@pytest.mark.parametrize(
"chars, expected",
[
(
r"$ \$&%#_{}~^\ $ &%#_{}~^\ $",
"".join(
[
r"$ \$&%#_{}~^\ $ ",
r"\&\%\#\_\{\}\textasciitilde \textasciicircum ",
r"\textbackslash \space \$",
]
),
),
(
r"\( &%#_{}~^\ \) &%#_{}~^\ \(",
"".join(
[
r"\( &%#_{}~^\ \) ",
r"\&\%\#\_\{\}\textasciitilde \textasciicircum ",
r"\textbackslash \space \textbackslash (",
]
),
),
(
r"$\&%#_{}^\$",
r"\$\textbackslash \&\%\#\_\{\}\textasciicircum \textbackslash \$",
),
(
r"$ \frac{1}{2} $ \( \frac{1}{2} \)",
"".join(
[
r"$ \frac{1}{2} $",
r" \textbackslash ( \textbackslash frac\{1\}\{2\} \textbackslash )",
]
),
),
],
)
def test_format_escape_latex_math(chars, expected):
# GH 51903
# latex-math escape works for each DataFrame cell separately. If we have
# a combination of dollar signs and brackets, the dollar sign would apply.
df = DataFrame([[chars]])
s = df.style.format("{0}", escape="latex-math")
assert s._translate(True, True)["body"][0][1]["display_value"] == expected
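# Worked illustration (not part of the test module), summarizing the cases above:
# with escape="latex-math", text inside $...$ or \( ... \) spans is passed through
# unchanged, while everything outside those spans is escaped as escape="latex" would.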
def test_format_escape_na_rep():
# tests the na_rep is not escaped
df = DataFrame([['<>&"', None]])
s = Styler(df, uuid_len=0).format("X&{0}>X", escape="html", na_rep="&")
ex = '<td id="T__row0_col0" class="data row0 col0" >X&&lt;&gt;&amp;&#34;>X</td>'
expected2 = '<td id="T__row0_col1" class="data row0 col1" >&</td>'
assert ex in s.to_html()
assert expected2 in s.to_html()
# also test for format_index()
df = DataFrame(columns=['<>&"', None])
styler = Styler(df, uuid_len=0)
styler.format_index("X&{0}>X", escape="html", na_rep="&", axis=1)
ctx = styler._translate(True, True)
assert ctx["head"][0][1]["display_value"] == "X&&lt;&gt;&amp;&#34;>X"
assert ctx["head"][0][2]["display_value"] == "&"
def test_format_escape_floats(styler):
# test given formatter for number format is not impacted by escape
s = styler.format("{:.1f}", escape="html")
for expected in [">0.0<", ">1.0<", ">-1.2<", ">-0.6<"]:
assert expected in s.to_html()
# tests precision of floats is not impacted by escape
s = styler.format(precision=1, escape="html")
for expected in [">0<", ">1<", ">-1.2<", ">-0.6<"]:
assert expected in s.to_html()
@pytest.mark.parametrize("formatter", [5, True, [2.0]])
@pytest.mark.parametrize("func", ["format", "format_index"])
def test_format_raises(styler, formatter, func):
with pytest.raises(TypeError, match="expected str or callable"):
getattr(styler, func)(formatter)
@pytest.mark.parametrize(
"precision, expected",
[
(1, ["1.0", "2.0", "3.2", "4.6"]),
(2, ["1.00", "2.01", "3.21", "4.57"]),
(3, ["1.000", "2.009", "3.212", "4.566"]),
],
)
def test_format_with_precision(precision, expected):
# Issue #13257
df = DataFrame([[1.0, 2.0090, 3.2121, 4.566]], columns=[1.0, 2.0090, 3.2121, 4.566])
styler = Styler(df)
styler.format(precision=precision)
styler.format_index(precision=precision, axis=1)
ctx = styler._translate(True, True)
for col, exp in enumerate(expected):
assert ctx["body"][0][col + 1]["display_value"] == exp # format test
assert ctx["head"][0][col + 1]["display_value"] == exp # format_index test
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize(
"level, expected",
[
(0, ["X", "X", "_", "_"]), # level int
("zero", ["X", "X", "_", "_"]), # level name
(1, ["_", "_", "X", "X"]), # other level int
("one", ["_", "_", "X", "X"]), # other level name
([0, 1], ["X", "X", "X", "X"]), # both levels
([0, "zero"], ["X", "X", "_", "_"]), # level int and name simultaneous
([0, "one"], ["X", "X", "X", "X"]), # both levels as int and name
(["one", "zero"], ["X", "X", "X", "X"]), # both level names, reversed
],
)
def test_format_index_level(axis, level, expected):
midx = MultiIndex.from_arrays([["_", "_"], ["_", "_"]], names=["zero", "one"])
df = DataFrame([[1, 2], [3, 4]])
if axis == 0:
df.index = midx
else:
df.columns = midx
styler = df.style.format_index(lambda v: "X", level=level, axis=axis)
ctx = styler._translate(True, True)
if axis == 0: # compare index
result = [ctx["body"][s][0]["display_value"] for s in range(2)]
result += [ctx["body"][s][1]["display_value"] for s in range(2)]
else: # compare columns
result = [ctx["head"][0][s + 1]["display_value"] for s in range(2)]
result += [ctx["head"][1][s + 1]["display_value"] for s in range(2)]
assert expected == result
def test_format_subset():
df = DataFrame([[0.1234, 0.1234], [1.1234, 1.1234]], columns=["a", "b"])
ctx = df.style.format(
{"a": "{:0.1f}", "b": "{0:.2%}"}, subset=IndexSlice[0, :]
)._translate(True, True)
expected = "0.1"
raw_11 = "1.123400"
assert ctx["body"][0][1]["display_value"] == expected
assert ctx["body"][1][1]["display_value"] == raw_11
assert ctx["body"][0][2]["display_value"] == "12.34%"
ctx = df.style.format("{:0.1f}", subset=IndexSlice[0, :])._translate(True, True)
assert ctx["body"][0][1]["display_value"] == expected
assert ctx["body"][1][1]["display_value"] == raw_11
ctx = df.style.format("{:0.1f}", subset=IndexSlice["a"])._translate(True, True)
assert ctx["body"][0][1]["display_value"] == expected
assert ctx["body"][0][2]["display_value"] == "0.123400"
ctx = df.style.format("{:0.1f}", subset=IndexSlice[0, "a"])._translate(True, True)
assert ctx["body"][0][1]["display_value"] == expected
assert ctx["body"][1][1]["display_value"] == raw_11
ctx = df.style.format("{:0.1f}", subset=IndexSlice[[0, 1], ["a"]])._translate(
True, True
)
assert ctx["body"][0][1]["display_value"] == expected
assert ctx["body"][1][1]["display_value"] == "1.1"
assert ctx["body"][0][2]["display_value"] == "0.123400"
assert ctx["body"][1][2]["display_value"] == raw_11
@pytest.mark.parametrize("formatter", [None, "{:,.1f}"])
@pytest.mark.parametrize("decimal", [".", "*"])
@pytest.mark.parametrize("precision", [None, 2])
@pytest.mark.parametrize("func, col", [("format", 1), ("format_index", 0)])
def test_format_thousands(formatter, decimal, precision, func, col):
styler = DataFrame([[1000000.123456789]], index=[1000000.123456789]).style
result = getattr(styler, func)( # testing float
thousands="_", formatter=formatter, decimal=decimal, precision=precision
)._translate(True, True)
assert "1_000_000" in result["body"][0][col]["display_value"]
styler = DataFrame([[1000000]], index=[1000000]).style
result = getattr(styler, func)( # testing int
thousands="_", formatter=formatter, decimal=decimal, precision=precision
)._translate(True, True)
assert "1_000_000" in result["body"][0][col]["display_value"]
styler = DataFrame([[1 + 1000000.123456789j]], index=[1 + 1000000.123456789j]).style
result = getattr(styler, func)( # testing complex
thousands="_", formatter=formatter, decimal=decimal, precision=precision
)._translate(True, True)
assert "1_000_000" in result["body"][0][col]["display_value"]
@pytest.mark.parametrize("formatter", [None, "{:,.4f}"])
@pytest.mark.parametrize("thousands", [None, ",", "*"])
@pytest.mark.parametrize("precision", [None, 4])
@pytest.mark.parametrize("func, col", [("format", 1), ("format_index", 0)])
def test_format_decimal(formatter, thousands, precision, func, col):
styler = DataFrame([[1000000.123456789]], index=[1000000.123456789]).style
result = getattr(styler, func)( # testing float
decimal="_", formatter=formatter, thousands=thousands, precision=precision
)._translate(True, True)
assert "000_123" in result["body"][0][col]["display_value"]
styler = DataFrame([[1 + 1000000.123456789j]], index=[1 + 1000000.123456789j]).style
result = getattr(styler, func)( # testing complex
decimal="_", formatter=formatter, thousands=thousands, precision=precision
)._translate(True, True)
assert "000_123" in result["body"][0][col]["display_value"]
def test_str_escape_error():
msg = "`escape` only permitted in {'html', 'latex', 'latex-math'}, got "
with pytest.raises(ValueError, match=msg):
_str_escape("text", "bad_escape")
with pytest.raises(ValueError, match=msg):
_str_escape("text", [])
_str_escape(2.00, "bad_escape") # OK since dtype is float
def test_long_int_formatting():
df = DataFrame(data=[[1234567890123456789]], columns=["test"])
styler = df.style
ctx = styler._translate(True, True)
assert ctx["body"][0][1]["display_value"] == "1234567890123456789"
styler = df.style.format(thousands="_")
ctx = styler._translate(True, True)
assert ctx["body"][0][1]["display_value"] == "1_234_567_890_123_456_789"
def test_format_options():
df = DataFrame({"int": [2000, 1], "float": [1.009, None], "str": ["&<", "&~"]})
ctx = df.style._translate(True, True)
# test option: na_rep
assert ctx["body"][1][2]["display_value"] == "nan"
with option_context("styler.format.na_rep", "MISSING"):
ctx_with_op = df.style._translate(True, True)
assert ctx_with_op["body"][1][2]["display_value"] == "MISSING"
# test option: decimal and precision
assert ctx["body"][0][2]["display_value"] == "1.009000"
with option_context("styler.format.decimal", "_"):
ctx_with_op = df.style._translate(True, True)
assert ctx_with_op["body"][0][2]["display_value"] == "1_009000"
with option_context("styler.format.precision", 2):
ctx_with_op = df.style._translate(True, True)
assert ctx_with_op["body"][0][2]["display_value"] == "1.01"
# test option: thousands
assert ctx["body"][0][1]["display_value"] == "2000"
with option_context("styler.format.thousands", "_"):
ctx_with_op = df.style._translate(True, True)
assert ctx_with_op["body"][0][1]["display_value"] == "2_000"
# test option: escape
assert ctx["body"][0][3]["display_value"] == "&<"
assert ctx["body"][1][3]["display_value"] == "&~"
with option_context("styler.format.escape", "html"):
ctx_with_op = df.style._translate(True, True)
assert ctx_with_op["body"][0][3]["display_value"] == "&amp;&lt;"
with option_context("styler.format.escape", "latex"):
ctx_with_op = df.style._translate(True, True)
assert ctx_with_op["body"][1][3]["display_value"] == "\\&\\textasciitilde "
with option_context("styler.format.escape", "latex-math"):
ctx_with_op = df.style._translate(True, True)
assert ctx_with_op["body"][1][3]["display_value"] == "\\&\\textasciitilde "
# test option: formatter
with option_context("styler.format.formatter", {"int": "{:,.2f}"}):
ctx_with_op = df.style._translate(True, True)
assert ctx_with_op["body"][0][1]["display_value"] == "2,000.00"
def test_precision_zero(df):
styler = Styler(df, precision=0)
ctx = styler._translate(True, True)
assert ctx["body"][0][2]["display_value"] == "-1"
assert ctx["body"][1][2]["display_value"] == "-1"
@pytest.mark.parametrize(
"formatter, exp",
[
(lambda x: f"{x:.3f}", "9.000"),
("{:.2f}", "9.00"),
({0: "{:.1f}"}, "9.0"),
(None, "9"),
],
)
def test_formatter_options_validator(formatter, exp):
df = DataFrame([[9]])
with option_context("styler.format.formatter", formatter):
assert f" {exp} " in df.style.to_latex()
def test_formatter_options_raises():
msg = "Value must be an instance of"
with pytest.raises(ValueError, match=msg):
with option_context("styler.format.formatter", ["bad", "type"]):
DataFrame().style.to_latex()
def test_1level_multiindex():
# GH 43383
midx = MultiIndex.from_product([[1, 2]], names=[""])
df = DataFrame(-1, index=midx, columns=[0, 1])
ctx = df.style._translate(True, True)
assert ctx["body"][0][0]["display_value"] == "1"
assert ctx["body"][0][0]["is_visible"] is True
assert ctx["body"][1][0]["display_value"] == "2"
assert ctx["body"][1][0]["is_visible"] is True
def test_boolean_format():
# gh 46384: booleans do not collapse to integer representation on display
df = DataFrame([[True, False]])
ctx = df.style._translate(True, True)
assert ctx["body"][0][1]["display_value"] is True
assert ctx["body"][0][2]["display_value"] is False
@pytest.mark.parametrize(
"hide, labels",
[
(False, [1, 2]),
(True, [1, 2, 3, 4]),
],
)
def test_relabel_raise_length(styler_multi, hide, labels):
if hide:
styler_multi.hide(axis=0, subset=[("X", "x"), ("Y", "y")])
with pytest.raises(ValueError, match="``labels`` must be of length equal"):
styler_multi.relabel_index(labels=labels)
def test_relabel_index(styler_multi):
labels = [(1, 2), (3, 4)]
styler_multi.hide(axis=0, subset=[("X", "x"), ("Y", "y")])
styler_multi.relabel_index(labels=labels)
ctx = styler_multi._translate(True, True)
assert {"value": "X", "display_value": 1}.items() <= ctx["body"][0][0].items()
assert {"value": "y", "display_value": 2}.items() <= ctx["body"][0][1].items()
assert {"value": "Y", "display_value": 3}.items() <= ctx["body"][1][0].items()
assert {"value": "x", "display_value": 4}.items() <= ctx["body"][1][1].items()
def test_relabel_columns(styler_multi):
labels = [(1, 2), (3, 4)]
styler_multi.hide(axis=1, subset=[("A", "a"), ("B", "b")])
styler_multi.relabel_index(axis=1, labels=labels)
ctx = styler_multi._translate(True, True)
assert {"value": "A", "display_value": 1}.items() <= ctx["head"][0][3].items()
assert {"value": "B", "display_value": 3}.items() <= ctx["head"][0][4].items()
assert {"value": "b", "display_value": 2}.items() <= ctx["head"][1][3].items()
assert {"value": "a", "display_value": 4}.items() <= ctx["head"][1][4].items()
def test_relabel_roundtrip(styler):
styler.relabel_index(["{}", "{}"])
ctx = styler._translate(True, True)
assert {"value": "x", "display_value": "x"}.items() <= ctx["body"][0][0].items()
assert {"value": "y", "display_value": "y"}.items() <= ctx["body"][1][0].items()

View File

@ -0,0 +1,218 @@
import numpy as np
import pytest
from pandas import (
NA,
DataFrame,
IndexSlice,
)
pytest.importorskip("jinja2")
from pandas.io.formats.style import Styler
@pytest.fixture(params=[(None, "float64"), (NA, "Int64")])
def df(request):
# GH 45804
return DataFrame(
{"A": [0, np.nan, 10], "B": [1, request.param[0], 2]}, dtype=request.param[1]
)
@pytest.fixture
def styler(df):
return Styler(df, uuid_len=0)
def test_highlight_null(styler):
result = styler.highlight_null()._compute().ctx
expected = {
(1, 0): [("background-color", "red")],
(1, 1): [("background-color", "red")],
}
assert result == expected
def test_highlight_null_subset(styler):
# GH 31345
result = (
styler.highlight_null(color="red", subset=["A"])
.highlight_null(color="green", subset=["B"])
._compute()
.ctx
)
expected = {
(1, 0): [("background-color", "red")],
(1, 1): [("background-color", "green")],
}
assert result == expected
@pytest.mark.parametrize("f", ["highlight_min", "highlight_max"])
def test_highlight_minmax_basic(df, f):
expected = {
(0, 1): [("background-color", "red")],
# ignores NaN row,
(2, 0): [("background-color", "red")],
}
if f == "highlight_min":
df = -df
result = getattr(df.style, f)(axis=1, color="red")._compute().ctx
assert result == expected
@pytest.mark.parametrize("f", ["highlight_min", "highlight_max"])
@pytest.mark.parametrize(
"kwargs",
[
{"axis": None, "color": "red"}, # test axis
{"axis": 0, "subset": ["A"], "color": "red"}, # test subset and ignores NaN
{"axis": None, "props": "background-color: red"}, # test props
],
)
def test_highlight_minmax_ext(df, f, kwargs):
expected = {(2, 0): [("background-color", "red")]}
if f == "highlight_min":
df = -df
result = getattr(df.style, f)(**kwargs)._compute().ctx
assert result == expected
@pytest.mark.parametrize("f", ["highlight_min", "highlight_max"])
@pytest.mark.parametrize("axis", [None, 0, 1])
def test_highlight_minmax_nulls(f, axis):
# GH 42750
expected = {
(1, 0): [("background-color", "yellow")],
(1, 1): [("background-color", "yellow")],
}
if axis == 1:
expected.update({(2, 1): [("background-color", "yellow")]})
if f == "highlight_max":
df = DataFrame({"a": [NA, 1, None], "b": [np.nan, 1, -1]})
else:
df = DataFrame({"a": [NA, -1, None], "b": [np.nan, -1, 1]})
result = getattr(df.style, f)(axis=axis)._compute().ctx
assert result == expected
@pytest.mark.parametrize(
"kwargs",
[
{"left": 0, "right": 1}, # test basic range
{"left": 0, "right": 1, "props": "background-color: yellow"}, # test props
{"left": -100, "right": 100, "subset": IndexSlice[[0, 1], :]}, # test subset
{"left": 0, "subset": IndexSlice[[0, 1], :]}, # test no right
{"right": 1}, # test no left
{"left": [0, 0, 11], "axis": 0}, # test left as sequence
{"left": DataFrame({"A": [0, 0, 11], "B": [1, 1, 11]}), "axis": None}, # axis
{"left": 0, "right": [0, 1], "axis": 1}, # test sequence right
],
)
def test_highlight_between(styler, kwargs):
expected = {
(0, 0): [("background-color", "yellow")],
(0, 1): [("background-color", "yellow")],
}
result = styler.highlight_between(**kwargs)._compute().ctx
assert result == expected
@pytest.mark.parametrize(
"arg, map, axis",
[
("left", [1, 2], 0), # 0 axis has 3 elements not 2
("left", [1, 2, 3], 1), # 1 axis has 2 elements not 3
("left", np.array([[1, 2], [1, 2]]), None), # df is (2,3) not (2,2)
("right", [1, 2], 0), # same tests as above for 'right' not 'left'
("right", [1, 2, 3], 1), # ..
("right", np.array([[1, 2], [1, 2]]), None), # ..
],
)
def test_highlight_between_raises(arg, styler, map, axis):
msg = f"supplied '{arg}' is not correct shape"
with pytest.raises(ValueError, match=msg):
styler.highlight_between(**{arg: map, "axis": axis})._compute()
def test_highlight_between_raises2(styler):
msg = "values can be 'both', 'left', 'right', or 'neither'"
with pytest.raises(ValueError, match=msg):
styler.highlight_between(inclusive="badstring")._compute()
with pytest.raises(ValueError, match=msg):
styler.highlight_between(inclusive=1)._compute()
@pytest.mark.parametrize(
"inclusive, expected",
[
(
"both",
{
(0, 0): [("background-color", "yellow")],
(0, 1): [("background-color", "yellow")],
},
),
("neither", {}),
("left", {(0, 0): [("background-color", "yellow")]}),
("right", {(0, 1): [("background-color", "yellow")]}),
],
)
def test_highlight_between_inclusive(styler, inclusive, expected):
kwargs = {"left": 0, "right": 1, "subset": IndexSlice[[0, 1], :]}
result = styler.highlight_between(**kwargs, inclusive=inclusive)._compute()
assert result.ctx == expected
@pytest.mark.parametrize(
"kwargs",
[
{"q_left": 0.5, "q_right": 1, "axis": 0}, # base case
{"q_left": 0.5, "q_right": 1, "axis": None}, # test axis
{"q_left": 0, "q_right": 1, "subset": IndexSlice[2, :]}, # test subset
{"q_left": 0.5, "axis": 0}, # test no high
{"q_right": 1, "subset": IndexSlice[2, :], "axis": 1}, # test no low
{"q_left": 0.5, "axis": 0, "props": "background-color: yellow"}, # tst prop
],
)
def test_highlight_quantile(styler, kwargs):
expected = {
(2, 0): [("background-color", "yellow")],
(2, 1): [("background-color", "yellow")],
}
result = styler.highlight_quantile(**kwargs)._compute().ctx
assert result == expected
@pytest.mark.parametrize(
"f,kwargs",
[
("highlight_min", {"axis": 1, "subset": IndexSlice[1, :]}),
("highlight_max", {"axis": 0, "subset": [0]}),
("highlight_quantile", {"axis": None, "q_left": 0.6, "q_right": 0.8}),
("highlight_between", {"subset": [0]}),
],
)
@pytest.mark.parametrize(
"df",
[
DataFrame([[0, 10], [20, 30]], dtype=int),
DataFrame([[0, 10], [20, 30]], dtype=float),
DataFrame([[0, 10], [20, 30]], dtype="datetime64[ns]"),
DataFrame([[0, 10], [20, 30]], dtype=str),
DataFrame([[0, 10], [20, 30]], dtype="timedelta64[ns]"),
],
)
def test_all_highlight_dtypes(f, kwargs, df):
if f == "highlight_quantile" and isinstance(df.iloc[0, 0], (str)):
return None # quantile incompatible with str
if f == "highlight_between":
kwargs["left"] = df.iloc[1, 0] # set the range low for testing
expected = {(1, 0): [("background-color", "yellow")]}
result = getattr(df.style, f)(**kwargs)._compute().ctx
assert result == expected
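# Hedged sketch (helper name ours, colors and bounds arbitrary) chaining the
# builtin highlighters exercised above on a small frame.
def _example_chained_highlights():
    from pandas import DataFrame

    df = DataFrame({"a": [1.0, None, 3.0], "b": [4.0, 5.0, 6.0]})
    return (
        df.style.highlight_null(color="red")
        .highlight_max(axis=0, color="yellow")
        .highlight_between(left=2, right=5, props="font-weight: bold;")
    )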

File diff suppressed because it is too large

View File

@ -0,0 +1,335 @@
import gc
import numpy as np
import pytest
from pandas import (
DataFrame,
IndexSlice,
Series,
)
pytest.importorskip("matplotlib")
pytest.importorskip("jinja2")
import matplotlib as mpl
from pandas.io.formats.style import Styler
@pytest.fixture(autouse=True)
def mpl_cleanup():
# matplotlib/testing/decorators.py#L24
# 1) Resets units registry
# 2) Resets rc_context
# 3) Closes all figures
mpl = pytest.importorskip("matplotlib")
mpl_units = pytest.importorskip("matplotlib.units")
plt = pytest.importorskip("matplotlib.pyplot")
orig_units_registry = mpl_units.registry.copy()
with mpl.rc_context():
mpl.use("template")
yield
mpl_units.registry.clear()
mpl_units.registry.update(orig_units_registry)
plt.close("all")
# https://matplotlib.org/stable/users/prev_whats_new/whats_new_3.6.0.html#garbage-collection-is-no-longer-run-on-figure-close # noqa: E501
gc.collect(1)
@pytest.fixture
def df():
return DataFrame([[1, 2], [2, 4]], columns=["A", "B"])
@pytest.fixture
def styler(df):
return Styler(df, uuid_len=0)
@pytest.fixture
def df_blank():
return DataFrame([[0, 0], [0, 0]], columns=["A", "B"], index=["X", "Y"])
@pytest.fixture
def styler_blank(df_blank):
return Styler(df_blank, uuid_len=0)
@pytest.mark.parametrize("f", ["background_gradient", "text_gradient"])
def test_function_gradient(styler, f):
for c_map in [None, "YlOrRd"]:
result = getattr(styler, f)(cmap=c_map)._compute().ctx
assert all("#" in x[0][1] for x in result.values())
assert result[(0, 0)] == result[(0, 1)]
assert result[(1, 0)] == result[(1, 1)]
@pytest.mark.parametrize("f", ["background_gradient", "text_gradient"])
def test_background_gradient_color(styler, f):
result = getattr(styler, f)(subset=IndexSlice[1, "A"])._compute().ctx
if f == "background_gradient":
assert result[(1, 0)] == [("background-color", "#fff7fb"), ("color", "#000000")]
elif f == "text_gradient":
assert result[(1, 0)] == [("color", "#fff7fb")]
@pytest.mark.parametrize(
"axis, expected",
[
(0, ["low", "low", "high", "high"]),
(1, ["low", "high", "low", "high"]),
(None, ["low", "mid", "mid", "high"]),
],
)
@pytest.mark.parametrize("f", ["background_gradient", "text_gradient"])
def test_background_gradient_axis(styler, axis, expected, f):
if f == "background_gradient":
colors = {
"low": [("background-color", "#f7fbff"), ("color", "#000000")],
"mid": [("background-color", "#abd0e6"), ("color", "#000000")],
"high": [("background-color", "#08306b"), ("color", "#f1f1f1")],
}
elif f == "text_gradient":
colors = {
"low": [("color", "#f7fbff")],
"mid": [("color", "#abd0e6")],
"high": [("color", "#08306b")],
}
result = getattr(styler, f)(cmap="Blues", axis=axis)._compute().ctx
for i, cell in enumerate([(0, 0), (0, 1), (1, 0), (1, 1)]):
assert result[cell] == colors[expected[i]]
@pytest.mark.parametrize(
"cmap, expected",
[
(
"PuBu",
{
(4, 5): [("background-color", "#86b0d3"), ("color", "#000000")],
(4, 6): [("background-color", "#83afd3"), ("color", "#f1f1f1")],
},
),
(
"YlOrRd",
{
(4, 8): [("background-color", "#fd913e"), ("color", "#000000")],
(4, 9): [("background-color", "#fd8f3d"), ("color", "#f1f1f1")],
},
),
(
None,
{
(7, 0): [("background-color", "#48c16e"), ("color", "#f1f1f1")],
(7, 1): [("background-color", "#4cc26c"), ("color", "#000000")],
},
),
],
)
def test_text_color_threshold(cmap, expected):
# GH 39888
df = DataFrame(np.arange(100).reshape(10, 10))
result = df.style.background_gradient(cmap=cmap, axis=None)._compute().ctx
for k in expected.keys():
assert result[k] == expected[k]
def test_background_gradient_vmin_vmax():
# GH 12145
df = DataFrame(range(5))
ctx = df.style.background_gradient(vmin=1, vmax=3)._compute().ctx
assert ctx[(0, 0)] == ctx[(1, 0)]
assert ctx[(4, 0)] == ctx[(3, 0)]
def test_background_gradient_int64():
# GH 28869
df1 = Series(range(3)).to_frame()
df2 = Series(range(3), dtype="Int64").to_frame()
ctx1 = df1.style.background_gradient()._compute().ctx
ctx2 = df2.style.background_gradient()._compute().ctx
assert ctx2[(0, 0)] == ctx1[(0, 0)]
assert ctx2[(1, 0)] == ctx1[(1, 0)]
assert ctx2[(2, 0)] == ctx1[(2, 0)]
@pytest.mark.parametrize(
"axis, gmap, expected",
[
(
0,
[1, 2],
{
(0, 0): [("background-color", "#fff7fb"), ("color", "#000000")],
(1, 0): [("background-color", "#023858"), ("color", "#f1f1f1")],
(0, 1): [("background-color", "#fff7fb"), ("color", "#000000")],
(1, 1): [("background-color", "#023858"), ("color", "#f1f1f1")],
},
),
(
1,
[1, 2],
{
(0, 0): [("background-color", "#fff7fb"), ("color", "#000000")],
(1, 0): [("background-color", "#fff7fb"), ("color", "#000000")],
(0, 1): [("background-color", "#023858"), ("color", "#f1f1f1")],
(1, 1): [("background-color", "#023858"), ("color", "#f1f1f1")],
},
),
(
None,
np.array([[2, 1], [1, 2]]),
{
(0, 0): [("background-color", "#023858"), ("color", "#f1f1f1")],
(1, 0): [("background-color", "#fff7fb"), ("color", "#000000")],
(0, 1): [("background-color", "#fff7fb"), ("color", "#000000")],
(1, 1): [("background-color", "#023858"), ("color", "#f1f1f1")],
},
),
],
)
def test_background_gradient_gmap_array(styler_blank, axis, gmap, expected):
# tests when gmap is given as a sequence and converted to ndarray
result = styler_blank.background_gradient(axis=axis, gmap=gmap)._compute().ctx
assert result == expected
@pytest.mark.parametrize(
"gmap, axis", [([1, 2, 3], 0), ([1, 2], 1), (np.array([[1, 2], [1, 2]]), None)]
)
def test_background_gradient_gmap_array_raises(gmap, axis):
# test when gmap as converted ndarray is bad shape
df = DataFrame([[0, 0, 0], [0, 0, 0]])
msg = "supplied 'gmap' is not correct shape"
with pytest.raises(ValueError, match=msg):
df.style.background_gradient(gmap=gmap, axis=axis)._compute()
@pytest.mark.parametrize(
"gmap",
[
DataFrame( # reverse the columns
[[2, 1], [1, 2]], columns=["B", "A"], index=["X", "Y"]
),
DataFrame( # reverse the index
[[2, 1], [1, 2]], columns=["A", "B"], index=["Y", "X"]
),
DataFrame( # reverse the index and columns
[[1, 2], [2, 1]], columns=["B", "A"], index=["Y", "X"]
),
DataFrame( # add unnecessary columns
[[1, 2, 3], [2, 1, 3]], columns=["A", "B", "C"], index=["X", "Y"]
),
DataFrame( # add unnecessary index
[[1, 2], [2, 1], [3, 3]], columns=["A", "B"], index=["X", "Y", "Z"]
),
],
)
@pytest.mark.parametrize(
"subset, exp_gmap", # exp_gmap is underlying map DataFrame should conform to
[
(None, [[1, 2], [2, 1]]),
(["A"], [[1], [2]]), # slice only column "A" in data and gmap
(["B", "A"], [[2, 1], [1, 2]]), # reverse the columns in data
(IndexSlice["X", :], [[1, 2]]), # slice only index "X" in data and gmap
(IndexSlice[["Y", "X"], :], [[2, 1], [1, 2]]), # reverse the index in data
],
)
def test_background_gradient_gmap_dataframe_align(styler_blank, gmap, subset, exp_gmap):
# test gmap given as DataFrame that it aligns to the data including subset
expected = styler_blank.background_gradient(axis=None, gmap=exp_gmap, subset=subset)
result = styler_blank.background_gradient(axis=None, gmap=gmap, subset=subset)
assert expected._compute().ctx == result._compute().ctx
@pytest.mark.parametrize(
"gmap, axis, exp_gmap",
[
(Series([2, 1], index=["Y", "X"]), 0, [[1, 1], [2, 2]]), # revrse the index
(Series([2, 1], index=["B", "A"]), 1, [[1, 2], [1, 2]]), # revrse the cols
(Series([1, 2, 3], index=["X", "Y", "Z"]), 0, [[1, 1], [2, 2]]), # add idx
(Series([1, 2, 3], index=["A", "B", "C"]), 1, [[1, 2], [1, 2]]), # add col
],
)
def test_background_gradient_gmap_series_align(styler_blank, gmap, axis, exp_gmap):
# test gmap given as Series that it aligns to the data including subset
expected = styler_blank.background_gradient(axis=None, gmap=exp_gmap)._compute()
result = styler_blank.background_gradient(axis=axis, gmap=gmap)._compute()
assert expected.ctx == result.ctx
@pytest.mark.parametrize(
"gmap, axis",
[
(DataFrame([[1, 2], [2, 1]], columns=["A", "B"], index=["X", "Y"]), 1),
(DataFrame([[1, 2], [2, 1]], columns=["A", "B"], index=["X", "Y"]), 0),
],
)
def test_background_gradient_gmap_wrong_dataframe(styler_blank, gmap, axis):
# test giving a gmap in DataFrame but with wrong axis
msg = "'gmap' is a DataFrame but underlying data for operations is a Series"
with pytest.raises(ValueError, match=msg):
styler_blank.background_gradient(gmap=gmap, axis=axis)._compute()
def test_background_gradient_gmap_wrong_series(styler_blank):
# test giving a gmap in Series form but with wrong axis
msg = "'gmap' is a Series but underlying data for operations is a DataFrame"
gmap = Series([1, 2], index=["X", "Y"])
with pytest.raises(ValueError, match=msg):
styler_blank.background_gradient(gmap=gmap, axis=None)._compute()
def test_background_gradient_nullable_dtypes():
# GH 50712
df1 = DataFrame([[1], [0], [np.nan]], dtype=float)
df2 = DataFrame([[1], [0], [None]], dtype="Int64")
ctx1 = df1.style.background_gradient()._compute().ctx
ctx2 = df2.style.background_gradient()._compute().ctx
assert ctx1 == ctx2
@pytest.mark.parametrize(
"cmap",
["PuBu", mpl.colormaps["PuBu"]],
)
def test_bar_colormap(cmap):
data = DataFrame([[1, 2], [3, 4]])
ctx = data.style.bar(cmap=cmap, axis=None)._compute().ctx
pubu_colors = {
(0, 0): "#d0d1e6",
(1, 0): "#056faf",
(0, 1): "#73a9cf",
(1, 1): "#023858",
}
for k, v in pubu_colors.items():
assert v in ctx[k][1][1]
def test_bar_color_raises(df):
msg = "`color` must be string or list or tuple of 2 strings"
with pytest.raises(ValueError, match=msg):
df.style.bar(color={"a", "b"}).to_html()
with pytest.raises(ValueError, match=msg):
df.style.bar(color=["a", "b", "c"]).to_html()
msg = "`color` and `cmap` cannot both be given"
with pytest.raises(ValueError, match=msg):
df.style.bar(color="something", cmap="something else").to_html()
@pytest.mark.parametrize(
"plot_method",
["scatter", "hexbin"],
)
def test_pass_colormap_instance(df, plot_method):
# https://github.com/pandas-dev/pandas/issues/49374
cmap = mpl.colors.ListedColormap([[1, 1, 1], [0, 0, 0]])
df["c"] = df.A + df.B
kwargs = {"x": "A", "y": "B", "c": "c", "colormap": cmap}
if plot_method == "hexbin":
kwargs["C"] = kwargs.pop("c")
getattr(df.plot, plot_method)(**kwargs)
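# Sketch (helper name ours, values arbitrary): gmap supplies the values used
# for shading while the displayed data is untouched, matching the alignment
# tests above.
def _example_gradient_with_gmap():
    from pandas import DataFrame

    df = DataFrame({"A": [0, 0], "B": [0, 0]}, index=["X", "Y"])
    gmap = DataFrame([[1, 2], [2, 1]], columns=["A", "B"], index=["X", "Y"])
    return df.style.background_gradient(axis=None, gmap=gmap, cmap="PuBu")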

View File

@ -0,0 +1,140 @@
from textwrap import dedent
import pytest
from pandas import (
DataFrame,
IndexSlice,
)
pytest.importorskip("jinja2")
from pandas.io.formats.style import Styler
@pytest.fixture
def df():
return DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
index=["i", "j", "j"],
columns=["c", "d", "d"],
dtype=float,
)
@pytest.fixture
def styler(df):
return Styler(df, uuid_len=0)
def test_format_non_unique(df):
# GH 41269
# test dict
html = df.style.format({"d": "{:.1f}"}).to_html()
for val in ["1.000000<", "4.000000<", "7.000000<"]:
assert val in html
for val in ["2.0<", "3.0<", "5.0<", "6.0<", "8.0<", "9.0<"]:
assert val in html
# test subset
html = df.style.format(precision=1, subset=IndexSlice["j", "d"]).to_html()
for val in ["1.000000<", "4.000000<", "7.000000<", "2.000000<", "3.000000<"]:
assert val in html
for val in ["5.0<", "6.0<", "8.0<", "9.0<"]:
assert val in html
@pytest.mark.parametrize("func", ["apply", "map"])
def test_apply_map_non_unique_raises(df, func):
# GH 41269
if func == "apply":
op = lambda s: ["color: red;"] * len(s)
else:
op = lambda v: "color: red;"
with pytest.raises(KeyError, match="`Styler.apply` and `.map` are not"):
getattr(df.style, func)(op)._compute()
def test_table_styles_dict_non_unique_index(styler):
styles = styler.set_table_styles(
{"j": [{"selector": "td", "props": "a: v;"}]}, axis=1
).table_styles
assert styles == [
{"selector": "td.row1", "props": [("a", "v")]},
{"selector": "td.row2", "props": [("a", "v")]},
]
def test_table_styles_dict_non_unique_columns(styler):
styles = styler.set_table_styles(
{"d": [{"selector": "td", "props": "a: v;"}]}, axis=0
).table_styles
assert styles == [
{"selector": "td.col1", "props": [("a", "v")]},
{"selector": "td.col2", "props": [("a", "v")]},
]
def test_tooltips_non_unique_raises(styler):
# ttips has unique keys
ttips = DataFrame([["1", "2"], ["3", "4"]], columns=["c", "d"], index=["a", "b"])
styler.set_tooltips(ttips=ttips) # OK
# ttips has non-unique columns
ttips = DataFrame([["1", "2"], ["3", "4"]], columns=["c", "c"], index=["a", "b"])
with pytest.raises(KeyError, match="Tooltips render only if `ttips` has unique"):
styler.set_tooltips(ttips=ttips)
# ttips has non-unique index
ttips = DataFrame([["1", "2"], ["3", "4"]], columns=["c", "d"], index=["a", "a"])
with pytest.raises(KeyError, match="Tooltips render only if `ttips` has unique"):
styler.set_tooltips(ttips=ttips)
def test_set_td_classes_non_unique_raises(styler):
# classes has unique keys
classes = DataFrame([["1", "2"], ["3", "4"]], columns=["c", "d"], index=["a", "b"])
styler.set_td_classes(classes=classes) # OK
# classes has non-unique columns
classes = DataFrame([["1", "2"], ["3", "4"]], columns=["c", "c"], index=["a", "b"])
with pytest.raises(KeyError, match="Classes render only if `classes` has unique"):
styler.set_td_classes(classes=classes)
# classes has non-unique index
classes = DataFrame([["1", "2"], ["3", "4"]], columns=["c", "d"], index=["a", "a"])
with pytest.raises(KeyError, match="Classes render only if `classes` has unique"):
styler.set_td_classes(classes=classes)
def test_hide_columns_non_unique(styler):
ctx = styler.hide(["d"], axis="columns")._translate(True, True)
assert ctx["head"][0][1]["display_value"] == "c"
assert ctx["head"][0][1]["is_visible"] is True
assert ctx["head"][0][2]["display_value"] == "d"
assert ctx["head"][0][2]["is_visible"] is False
assert ctx["head"][0][3]["display_value"] == "d"
assert ctx["head"][0][3]["is_visible"] is False
assert ctx["body"][0][1]["is_visible"] is True
assert ctx["body"][0][2]["is_visible"] is False
assert ctx["body"][0][3]["is_visible"] is False
def test_latex_non_unique(styler):
result = styler.to_latex()
assert result == dedent(
"""\
\\begin{tabular}{lrrr}
& c & d & d \\\\
i & 1.000000 & 2.000000 & 3.000000 \\\\
j & 4.000000 & 5.000000 & 6.000000 \\\\
j & 7.000000 & 8.000000 & 9.000000 \\\\
\\end{tabular}
"""
)

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,96 @@
from textwrap import dedent
import pytest
from pandas import (
DataFrame,
Series,
)
pytest.importorskip("jinja2")
from pandas.io.formats.style import Styler
@pytest.fixture
def df():
return DataFrame(
{"A": [0, 1], "B": [-0.61, -1.22], "C": Series(["ab", "cd"], dtype=object)}
)
@pytest.fixture
def styler(df):
return Styler(df, uuid_len=0, precision=2)
def test_basic_string(styler):
result = styler.to_string()
expected = dedent(
"""\
A B C
0 0 -0.61 ab
1 1 -1.22 cd
"""
)
assert result == expected
def test_string_delimiter(styler):
result = styler.to_string(delimiter=";")
expected = dedent(
"""\
;A;B;C
0;0;-0.61;ab
1;1;-1.22;cd
"""
)
assert result == expected
def test_concat(styler):
result = styler.concat(styler.data.agg(["sum"]).style).to_string()
expected = dedent(
"""\
A B C
0 0 -0.61 ab
1 1 -1.22 cd
sum 1 -1.830000 abcd
"""
)
assert result == expected
def test_concat_recursion(styler):
df = styler.data
styler1 = styler
styler2 = Styler(df.agg(["sum"]), uuid_len=0, precision=3)
styler3 = Styler(df.agg(["sum"]), uuid_len=0, precision=4)
result = styler1.concat(styler2.concat(styler3)).to_string()
expected = dedent(
"""\
A B C
0 0 -0.61 ab
1 1 -1.22 cd
sum 1 -1.830 abcd
sum 1 -1.8300 abcd
"""
)
assert result == expected
def test_concat_chain(styler):
df = styler.data
styler1 = styler
styler2 = Styler(df.agg(["sum"]), uuid_len=0, precision=3)
styler3 = Styler(df.agg(["sum"]), uuid_len=0, precision=4)
result = styler1.concat(styler2).concat(styler3).to_string()
expected = dedent(
"""\
A B C
0 0 -0.61 ab
1 1 -1.22 cd
sum 1 -1.830 abcd
sum 1 -1.8300 abcd
"""
)
assert result == expected
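# Sketch of the pattern relied on above (helper name ours): append an
# aggregated row by concatenating a second Styler built from df.agg(...).
def _example_concat_total_row():
    from pandas import DataFrame
    from pandas.io.formats.style import Styler

    df = DataFrame({"A": [0, 1], "B": [-0.61, -1.22]})
    return Styler(df, precision=2).concat(df.agg(["sum"]).style).to_string()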

View File

@ -0,0 +1,85 @@
import numpy as np
import pytest
from pandas import DataFrame
pytest.importorskip("jinja2")
from pandas.io.formats.style import Styler
@pytest.fixture
def df():
return DataFrame(
data=[[0, 1, 2], [3, 4, 5], [6, 7, 8]],
columns=["A", "B", "C"],
index=["x", "y", "z"],
)
@pytest.fixture
def styler(df):
return Styler(df, uuid_len=0)
@pytest.mark.parametrize(
"ttips",
[
DataFrame( # Test basic reindex and ignoring blank
data=[["Min", "Max"], [np.nan, ""]],
columns=["A", "C"],
index=["x", "y"],
),
DataFrame( # Test non-referenced columns, reversed col names, short index
data=[["Max", "Min", "Bad-Col"]], columns=["C", "A", "D"], index=["x"]
),
],
)
def test_tooltip_render(ttips, styler):
# GH 21266
result = styler.set_tooltips(ttips).to_html()
# test tooltip table level class
assert "#T_ .pd-t {\n visibility: hidden;\n" in result
# test 'Min' tooltip added
assert "#T_ #T__row0_col0:hover .pd-t {\n visibility: visible;\n}" in result
assert '#T_ #T__row0_col0 .pd-t::after {\n content: "Min";\n}' in result
assert 'class="data row0 col0" >0<span class="pd-t"></span></td>' in result
# test 'Max' tooltip added
assert "#T_ #T__row0_col2:hover .pd-t {\n visibility: visible;\n}" in result
assert '#T_ #T__row0_col2 .pd-t::after {\n content: "Max";\n}' in result
assert 'class="data row0 col2" >2<span class="pd-t"></span></td>' in result
# test Nan, empty string and bad column ignored
assert "#T_ #T__row1_col0:hover .pd-t {\n visibility: visible;\n}" not in result
assert "#T_ #T__row1_col1:hover .pd-t {\n visibility: visible;\n}" not in result
assert "#T_ #T__row0_col1:hover .pd-t {\n visibility: visible;\n}" not in result
assert "#T_ #T__row1_col2:hover .pd-t {\n visibility: visible;\n}" not in result
assert "Bad-Col" not in result
def test_tooltip_ignored(styler):
# GH 21266
result = styler.to_html() # no set_tooltips() creates no <span>
assert '<style type="text/css">\n</style>' in result
assert '<span class="pd-t"></span>' not in result
def test_tooltip_css_class(styler):
# GH 21266
result = styler.set_tooltips(
DataFrame([["tooltip"]], index=["x"], columns=["A"]),
css_class="other-class",
props=[("color", "green")],
).to_html()
assert "#T_ .other-class {\n color: green;\n" in result
assert '#T_ #T__row0_col0 .other-class::after {\n content: "tooltip";\n' in result
# GH 39563
result = styler.set_tooltips( # set_tooltips overwrites previous
DataFrame([["tooltip"]], index=["x"], columns=["A"]),
css_class="another-class",
props="color:green;color:red;",
).to_html()
assert "#T_ .another-class {\n color: green;\n color: red;\n}" in result

View File

@ -0,0 +1,72 @@
import locale
import pytest
from pandas._config import detect_console_encoding
class MockEncoding:
"""
Used to add a side effect when accessing the 'encoding' property. If the
side effect is a str, that value is returned. Otherwise, the side effect
should be an exception, which is raised.
"""
def __init__(self, encoding) -> None:
super().__init__()
self.val = encoding
@property
def encoding(self):
return self.raise_or_return(self.val)
@staticmethod
def raise_or_return(val):
if isinstance(val, str):
return val
else:
raise val
@pytest.mark.parametrize("empty,filled", [["stdin", "stdout"], ["stdout", "stdin"]])
def test_detect_console_encoding_from_stdout_stdin(monkeypatch, empty, filled):
# Ensures that sys.stdout.encoding or sys.stdin.encoding is used when
# they have values filled.
# GH 21552
with monkeypatch.context() as context:
context.setattr(f"sys.{empty}", MockEncoding(""))
context.setattr(f"sys.{filled}", MockEncoding(filled))
assert detect_console_encoding() == filled
@pytest.mark.parametrize("encoding", [AttributeError, OSError, "ascii"])
def test_detect_console_encoding_fallback_to_locale(monkeypatch, encoding):
# GH 21552
with monkeypatch.context() as context:
context.setattr("locale.getpreferredencoding", lambda: "foo")
context.setattr("sys.stdout", MockEncoding(encoding))
assert detect_console_encoding() == "foo"
@pytest.mark.parametrize(
"std,locale",
[
["ascii", "ascii"],
["ascii", locale.Error],
[AttributeError, "ascii"],
[AttributeError, locale.Error],
[OSError, "ascii"],
[OSError, locale.Error],
],
)
def test_detect_console_encoding_fallback_to_default(monkeypatch, std, locale):
# When both the stdout/stdin encoding and locale preferred encoding checks
# fail (or return 'ascii'), we should default to the sys default encoding.
# GH 21552
with monkeypatch.context() as context:
context.setattr(
"locale.getpreferredencoding", lambda: MockEncoding.raise_or_return(locale)
)
context.setattr("sys.stdout", MockEncoding(std))
context.setattr("sys.getdefaultencoding", lambda: "sysDefaultEncoding")
assert detect_console_encoding() == "sysDefaultEncoding"
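# Rough sketch (ours, not the pandas implementation) of the fallback chain the
# tests above pin down: stdout/stdin encoding first, then
# locale.getpreferredencoding(), then sys.getdefaultencoding().
def _example_encoding_fallback(stream):
    import locale
    import sys

    try:
        encoding = stream.encoding
    except (AttributeError, OSError):
        encoding = None
    if not encoding or encoding.lower() == "ascii":
        try:
            encoding = locale.getpreferredencoding()
        except locale.Error:
            encoding = None
    if not encoding or encoding.lower() == "ascii":
        encoding = sys.getdefaultencoding()
    return encoding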

View File

@ -0,0 +1,289 @@
import pytest
from pandas.errors import CSSWarning
import pandas._testing as tm
from pandas.io.formats.css import CSSResolver
def assert_resolves(css, props, inherited=None):
resolve = CSSResolver()
actual = resolve(css, inherited=inherited)
assert props == actual
def assert_same_resolution(css1, css2, inherited=None):
resolve = CSSResolver()
resolved1 = resolve(css1, inherited=inherited)
resolved2 = resolve(css2, inherited=inherited)
assert resolved1 == resolved2
@pytest.mark.parametrize(
"name,norm,abnorm",
[
(
"whitespace",
"hello: world; foo: bar",
" \t hello \t :\n world \n ; \n foo: \tbar\n\n",
),
("case", "hello: world; foo: bar", "Hello: WORLD; foO: bar"),
("empty-decl", "hello: world; foo: bar", "; hello: world;; foo: bar;\n; ;"),
("empty-list", "", ";"),
],
)
def test_css_parse_normalisation(name, norm, abnorm):
assert_same_resolution(norm, abnorm)
@pytest.mark.parametrize(
"invalid_css,remainder",
[
# No colon
("hello-world", ""),
("border-style: solid; hello-world", "border-style: solid"),
(
"border-style: solid; hello-world; font-weight: bold",
"border-style: solid; font-weight: bold",
),
# Unclosed string fail
# Invalid size
("font-size: blah", "font-size: 1em"),
("font-size: 1a2b", "font-size: 1em"),
("font-size: 1e5pt", "font-size: 1em"),
("font-size: 1+6pt", "font-size: 1em"),
("font-size: 1unknownunit", "font-size: 1em"),
("font-size: 10", "font-size: 1em"),
("font-size: 10 pt", "font-size: 1em"),
# Too many args
("border-top: 1pt solid red green", "border-top: 1pt solid green"),
],
)
def test_css_parse_invalid(invalid_css, remainder):
with tm.assert_produces_warning(CSSWarning):
assert_same_resolution(invalid_css, remainder)
@pytest.mark.parametrize(
"shorthand,expansions",
[
("margin", ["margin-top", "margin-right", "margin-bottom", "margin-left"]),
("padding", ["padding-top", "padding-right", "padding-bottom", "padding-left"]),
(
"border-width",
[
"border-top-width",
"border-right-width",
"border-bottom-width",
"border-left-width",
],
),
(
"border-color",
[
"border-top-color",
"border-right-color",
"border-bottom-color",
"border-left-color",
],
),
(
"border-style",
[
"border-top-style",
"border-right-style",
"border-bottom-style",
"border-left-style",
],
),
],
)
def test_css_side_shorthands(shorthand, expansions):
top, right, bottom, left = expansions
assert_resolves(
f"{shorthand}: 1pt", {top: "1pt", right: "1pt", bottom: "1pt", left: "1pt"}
)
assert_resolves(
f"{shorthand}: 1pt 4pt", {top: "1pt", right: "4pt", bottom: "1pt", left: "4pt"}
)
assert_resolves(
f"{shorthand}: 1pt 4pt 2pt",
{top: "1pt", right: "4pt", bottom: "2pt", left: "4pt"},
)
assert_resolves(
f"{shorthand}: 1pt 4pt 2pt 0pt",
{top: "1pt", right: "4pt", bottom: "2pt", left: "0pt"},
)
with tm.assert_produces_warning(CSSWarning):
assert_resolves(f"{shorthand}: 1pt 1pt 1pt 1pt 1pt", {})
@pytest.mark.parametrize(
"shorthand,sides",
[
("border-top", ["top"]),
("border-right", ["right"]),
("border-bottom", ["bottom"]),
("border-left", ["left"]),
("border", ["top", "right", "bottom", "left"]),
],
)
def test_css_border_shorthand_sides(shorthand, sides):
def create_border_dict(sides, color=None, style=None, width=None):
resolved = {}
for side in sides:
if color:
resolved[f"border-{side}-color"] = color
if style:
resolved[f"border-{side}-style"] = style
if width:
resolved[f"border-{side}-width"] = width
return resolved
assert_resolves(
f"{shorthand}: 1pt red solid", create_border_dict(sides, "red", "solid", "1pt")
)
@pytest.mark.parametrize(
"prop, expected",
[
("1pt red solid", ("red", "solid", "1pt")),
("red 1pt solid", ("red", "solid", "1pt")),
("red solid 1pt", ("red", "solid", "1pt")),
("solid 1pt red", ("red", "solid", "1pt")),
("red solid", ("red", "solid", "1.500000pt")),
# Note: color=black is not CSS conforming
# (See https://drafts.csswg.org/css-backgrounds/#border-shorthands)
("1pt solid", ("black", "solid", "1pt")),
("1pt red", ("red", "none", "1pt")),
("red", ("red", "none", "1.500000pt")),
("1pt", ("black", "none", "1pt")),
("solid", ("black", "solid", "1.500000pt")),
# Sizes
("1em", ("black", "none", "12pt")),
],
)
def test_css_border_shorthands(prop, expected):
color, style, width = expected
assert_resolves(
f"border-left: {prop}",
{
"border-left-color": color,
"border-left-style": style,
"border-left-width": width,
},
)
@pytest.mark.parametrize(
"style,inherited,equiv",
[
("margin: 1px; margin: 2px", "", "margin: 2px"),
("margin: 1px", "margin: 2px", "margin: 1px"),
("margin: 1px; margin: inherit", "margin: 2px", "margin: 2px"),
(
"margin: 1px; margin-top: 2px",
"",
"margin-left: 1px; margin-right: 1px; "
"margin-bottom: 1px; margin-top: 2px",
),
("margin-top: 2px", "margin: 1px", "margin: 1px; margin-top: 2px"),
("margin: 1px", "margin-top: 2px", "margin: 1px"),
(
"margin: 1px; margin-top: inherit",
"margin: 2px",
"margin: 1px; margin-top: 2px",
),
],
)
def test_css_precedence(style, inherited, equiv):
resolve = CSSResolver()
inherited_props = resolve(inherited)
style_props = resolve(style, inherited=inherited_props)
equiv_props = resolve(equiv)
assert style_props == equiv_props
@pytest.mark.parametrize(
"style,equiv",
[
(
"margin: 1px; margin-top: inherit",
"margin-bottom: 1px; margin-right: 1px; margin-left: 1px",
),
("margin-top: inherit", ""),
("margin-top: initial", ""),
],
)
def test_css_none_absent(style, equiv):
assert_same_resolution(style, equiv)
@pytest.mark.parametrize(
"size,resolved",
[
("xx-small", "6pt"),
("x-small", f"{7.5:f}pt"),
("small", f"{9.6:f}pt"),
("medium", "12pt"),
("large", f"{13.5:f}pt"),
("x-large", "18pt"),
("xx-large", "24pt"),
("8px", "6pt"),
("1.25pc", "15pt"),
(".25in", "18pt"),
("02.54cm", "72pt"),
("25.4mm", "72pt"),
("101.6q", "72pt"),
("101.6q", "72pt"),
],
)
@pytest.mark.parametrize("relative_to", [None, "16pt"]) # invariant to inherited size
def test_css_absolute_font_size(size, relative_to, resolved):
if relative_to is None:
inherited = None
else:
inherited = {"font-size": relative_to}
assert_resolves(f"font-size: {size}", {"font-size": resolved}, inherited=inherited)
@pytest.mark.parametrize(
"size,relative_to,resolved",
[
("1em", None, "12pt"),
("1.0em", None, "12pt"),
("1.25em", None, "15pt"),
("1em", "16pt", "16pt"),
("1.0em", "16pt", "16pt"),
("1.25em", "16pt", "20pt"),
("1rem", "16pt", "12pt"),
("1.0rem", "16pt", "12pt"),
("1.25rem", "16pt", "15pt"),
("100%", None, "12pt"),
("125%", None, "15pt"),
("100%", "16pt", "16pt"),
("125%", "16pt", "20pt"),
("2ex", None, "12pt"),
("2.0ex", None, "12pt"),
("2.50ex", None, "15pt"),
("inherit", "16pt", "16pt"),
("smaller", None, "10pt"),
("smaller", "18pt", "15pt"),
("larger", None, f"{14.4:f}pt"),
("larger", "15pt", "18pt"),
],
)
def test_css_relative_font_size(size, relative_to, resolved):
if relative_to is None:
inherited = None
else:
inherited = {"font-size": relative_to}
assert_resolves(f"font-size: {size}", {"font-size": resolved}, inherited=inherited)

View File

@ -0,0 +1,254 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
reset_option,
set_eng_float_format,
)
from pandas.io.formats.format import EngFormatter
@pytest.fixture(autouse=True)
def reset_float_format():
yield
reset_option("display.float_format")
class TestEngFormatter:
def test_eng_float_formatter2(self, float_frame):
df = float_frame
df.loc[5] = 0
set_eng_float_format()
repr(df)
set_eng_float_format(use_eng_prefix=True)
repr(df)
set_eng_float_format(accuracy=0)
repr(df)
def test_eng_float_formatter(self):
df = DataFrame({"A": [1.41, 141.0, 14100, 1410000.0]})
set_eng_float_format()
result = df.to_string()
expected = (
" A\n"
"0 1.410E+00\n"
"1 141.000E+00\n"
"2 14.100E+03\n"
"3 1.410E+06"
)
assert result == expected
set_eng_float_format(use_eng_prefix=True)
result = df.to_string()
expected = " A\n0 1.410\n1 141.000\n2 14.100k\n3 1.410M"
assert result == expected
set_eng_float_format(accuracy=0)
result = df.to_string()
expected = " A\n0 1E+00\n1 141E+00\n2 14E+03\n3 1E+06"
assert result == expected
def compare(self, formatter, input, output):
formatted_input = formatter(input)
assert formatted_input == output
def compare_all(self, formatter, in_out):
"""
Parameters
----------
formatter : EngFormatter
    Formatter under test.
in_out : list of tuples
    Each tuple = (number, expected_formatting). It is tested that
    'formatter(number) == expected_formatting'. *number* should be >= 0
    because 'formatter(-number) == "-" + expected_formatting[1:]' is also
    tested.
"""
for input, output in in_out:
self.compare(formatter, input, output)
self.compare(formatter, -input, "-" + output[1:])
def test_exponents_with_eng_prefix(self):
formatter = EngFormatter(accuracy=3, use_eng_prefix=True)
f = np.sqrt(2)
in_out = [
(f * 10**-24, " 1.414y"),
(f * 10**-23, " 14.142y"),
(f * 10**-22, " 141.421y"),
(f * 10**-21, " 1.414z"),
(f * 10**-20, " 14.142z"),
(f * 10**-19, " 141.421z"),
(f * 10**-18, " 1.414a"),
(f * 10**-17, " 14.142a"),
(f * 10**-16, " 141.421a"),
(f * 10**-15, " 1.414f"),
(f * 10**-14, " 14.142f"),
(f * 10**-13, " 141.421f"),
(f * 10**-12, " 1.414p"),
(f * 10**-11, " 14.142p"),
(f * 10**-10, " 141.421p"),
(f * 10**-9, " 1.414n"),
(f * 10**-8, " 14.142n"),
(f * 10**-7, " 141.421n"),
(f * 10**-6, " 1.414u"),
(f * 10**-5, " 14.142u"),
(f * 10**-4, " 141.421u"),
(f * 10**-3, " 1.414m"),
(f * 10**-2, " 14.142m"),
(f * 10**-1, " 141.421m"),
(f * 10**0, " 1.414"),
(f * 10**1, " 14.142"),
(f * 10**2, " 141.421"),
(f * 10**3, " 1.414k"),
(f * 10**4, " 14.142k"),
(f * 10**5, " 141.421k"),
(f * 10**6, " 1.414M"),
(f * 10**7, " 14.142M"),
(f * 10**8, " 141.421M"),
(f * 10**9, " 1.414G"),
(f * 10**10, " 14.142G"),
(f * 10**11, " 141.421G"),
(f * 10**12, " 1.414T"),
(f * 10**13, " 14.142T"),
(f * 10**14, " 141.421T"),
(f * 10**15, " 1.414P"),
(f * 10**16, " 14.142P"),
(f * 10**17, " 141.421P"),
(f * 10**18, " 1.414E"),
(f * 10**19, " 14.142E"),
(f * 10**20, " 141.421E"),
(f * 10**21, " 1.414Z"),
(f * 10**22, " 14.142Z"),
(f * 10**23, " 141.421Z"),
(f * 10**24, " 1.414Y"),
(f * 10**25, " 14.142Y"),
(f * 10**26, " 141.421Y"),
]
self.compare_all(formatter, in_out)
def test_exponents_without_eng_prefix(self):
formatter = EngFormatter(accuracy=4, use_eng_prefix=False)
f = np.pi
in_out = [
(f * 10**-24, " 3.1416E-24"),
(f * 10**-23, " 31.4159E-24"),
(f * 10**-22, " 314.1593E-24"),
(f * 10**-21, " 3.1416E-21"),
(f * 10**-20, " 31.4159E-21"),
(f * 10**-19, " 314.1593E-21"),
(f * 10**-18, " 3.1416E-18"),
(f * 10**-17, " 31.4159E-18"),
(f * 10**-16, " 314.1593E-18"),
(f * 10**-15, " 3.1416E-15"),
(f * 10**-14, " 31.4159E-15"),
(f * 10**-13, " 314.1593E-15"),
(f * 10**-12, " 3.1416E-12"),
(f * 10**-11, " 31.4159E-12"),
(f * 10**-10, " 314.1593E-12"),
(f * 10**-9, " 3.1416E-09"),
(f * 10**-8, " 31.4159E-09"),
(f * 10**-7, " 314.1593E-09"),
(f * 10**-6, " 3.1416E-06"),
(f * 10**-5, " 31.4159E-06"),
(f * 10**-4, " 314.1593E-06"),
(f * 10**-3, " 3.1416E-03"),
(f * 10**-2, " 31.4159E-03"),
(f * 10**-1, " 314.1593E-03"),
(f * 10**0, " 3.1416E+00"),
(f * 10**1, " 31.4159E+00"),
(f * 10**2, " 314.1593E+00"),
(f * 10**3, " 3.1416E+03"),
(f * 10**4, " 31.4159E+03"),
(f * 10**5, " 314.1593E+03"),
(f * 10**6, " 3.1416E+06"),
(f * 10**7, " 31.4159E+06"),
(f * 10**8, " 314.1593E+06"),
(f * 10**9, " 3.1416E+09"),
(f * 10**10, " 31.4159E+09"),
(f * 10**11, " 314.1593E+09"),
(f * 10**12, " 3.1416E+12"),
(f * 10**13, " 31.4159E+12"),
(f * 10**14, " 314.1593E+12"),
(f * 10**15, " 3.1416E+15"),
(f * 10**16, " 31.4159E+15"),
(f * 10**17, " 314.1593E+15"),
(f * 10**18, " 3.1416E+18"),
(f * 10**19, " 31.4159E+18"),
(f * 10**20, " 314.1593E+18"),
(f * 10**21, " 3.1416E+21"),
(f * 10**22, " 31.4159E+21"),
(f * 10**23, " 314.1593E+21"),
(f * 10**24, " 3.1416E+24"),
(f * 10**25, " 31.4159E+24"),
(f * 10**26, " 314.1593E+24"),
]
self.compare_all(formatter, in_out)
def test_rounding(self):
formatter = EngFormatter(accuracy=3, use_eng_prefix=True)
in_out = [
(5.55555, " 5.556"),
(55.5555, " 55.556"),
(555.555, " 555.555"),
(5555.55, " 5.556k"),
(55555.5, " 55.556k"),
(555555, " 555.555k"),
]
self.compare_all(formatter, in_out)
formatter = EngFormatter(accuracy=1, use_eng_prefix=True)
in_out = [
(5.55555, " 5.6"),
(55.5555, " 55.6"),
(555.555, " 555.6"),
(5555.55, " 5.6k"),
(55555.5, " 55.6k"),
(555555, " 555.6k"),
]
self.compare_all(formatter, in_out)
formatter = EngFormatter(accuracy=0, use_eng_prefix=True)
in_out = [
(5.55555, " 6"),
(55.5555, " 56"),
(555.555, " 556"),
(5555.55, " 6k"),
(55555.5, " 56k"),
(555555, " 556k"),
]
self.compare_all(formatter, in_out)
formatter = EngFormatter(accuracy=3, use_eng_prefix=True)
result = formatter(0)
assert result == " 0.000"
def test_nan(self):
# Issue #11981
formatter = EngFormatter(accuracy=1, use_eng_prefix=True)
result = formatter(np.nan)
assert result == "NaN"
df = DataFrame(
{
"a": [1.5, 10.3, 20.5],
"b": [50.3, 60.67, 70.12],
"c": [100.2, 101.33, 120.33],
}
)
pt = df.pivot_table(values="a", index="b", columns="c")
set_eng_float_format(accuracy=1)
result = pt.to_string()
assert "NaN" in result
def test_inf(self):
# Issue #11981
formatter = EngFormatter(accuracy=1, use_eng_prefix=True)
result = formatter(np.inf)
assert result == "inf"

File diff suppressed because it is too large

View File

@ -0,0 +1,90 @@
import numpy as np
import pandas._config.config as cf
from pandas import (
DataFrame,
MultiIndex,
)
class TestTableSchemaRepr:
def test_publishes(self, ip):
ipython = ip.instance(config=ip.config)
df = DataFrame({"A": [1, 2]})
objects = [df["A"], df] # dataframe / series
expected_keys = [
{"text/plain", "application/vnd.dataresource+json"},
{"text/plain", "text/html", "application/vnd.dataresource+json"},
]
opt = cf.option_context("display.html.table_schema", True)
last_obj = None
for obj, expected in zip(objects, expected_keys):
last_obj = obj
with opt:
formatted = ipython.display_formatter.format(obj)
assert set(formatted[0].keys()) == expected
with_latex = cf.option_context("styler.render.repr", "latex")
with opt, with_latex:
formatted = ipython.display_formatter.format(last_obj)
expected = {
"text/plain",
"text/html",
"text/latex",
"application/vnd.dataresource+json",
}
assert set(formatted[0].keys()) == expected
def test_publishes_not_implemented(self, ip):
# column MultiIndex
# GH#15996
midx = MultiIndex.from_product([["A", "B"], ["a", "b", "c"]])
df = DataFrame(
np.random.default_rng(2).standard_normal((5, len(midx))), columns=midx
)
opt = cf.option_context("display.html.table_schema", True)
with opt:
formatted = ip.instance(config=ip.config).display_formatter.format(df)
expected = {"text/plain", "text/html"}
assert set(formatted[0].keys()) == expected
def test_config_on(self):
df = DataFrame({"A": [1, 2]})
with cf.option_context("display.html.table_schema", True):
result = df._repr_data_resource_()
assert result is not None
def test_config_default_off(self):
df = DataFrame({"A": [1, 2]})
with cf.option_context("display.html.table_schema", False):
result = df._repr_data_resource_()
assert result is None
def test_enable_data_resource_formatter(self, ip):
# GH#10491
formatters = ip.instance(config=ip.config).display_formatter.formatters
mimetype = "application/vnd.dataresource+json"
with cf.option_context("display.html.table_schema", True):
assert "application/vnd.dataresource+json" in formatters
assert formatters[mimetype].enabled
# still there, just disabled
assert "application/vnd.dataresource+json" in formatters
assert not formatters[mimetype].enabled
# able to re-set
with cf.option_context("display.html.table_schema", True):
assert "application/vnd.dataresource+json" in formatters
assert formatters[mimetype].enabled
# smoke test that it works
ip.instance(config=ip.config).display_formatter.format(cf)

View File

@ -0,0 +1,129 @@
# Note! This file is aimed specifically at pandas.io.formats.printing utility
# functions, not the general printing of pandas objects.
import string
import pandas._config.config as cf
from pandas.io.formats import printing
def test_adjoin():
data = [["a", "b", "c"], ["dd", "ee", "ff"], ["ggg", "hhh", "iii"]]
expected = "a dd ggg\nb ee hhh\nc ff iii"
adjoined = printing.adjoin(2, *data)
assert adjoined == expected
class TestPPrintThing:
def test_repr_binary_type(self):
letters = string.ascii_letters
try:
raw = bytes(letters, encoding=cf.get_option("display.encoding"))
except TypeError:
raw = bytes(letters)
b = str(raw.decode("utf-8"))
res = printing.pprint_thing(b, quote_strings=True)
assert res == repr(b)
res = printing.pprint_thing(b, quote_strings=False)
assert res == b
def test_repr_obeys_max_seq_limit(self):
with cf.option_context("display.max_seq_items", 2000):
assert len(printing.pprint_thing(list(range(1000)))) > 1000
with cf.option_context("display.max_seq_items", 5):
assert len(printing.pprint_thing(list(range(1000)))) < 100
with cf.option_context("display.max_seq_items", 1):
assert len(printing.pprint_thing(list(range(1000)))) < 9
def test_repr_set(self):
assert printing.pprint_thing({1}) == "{1}"
class TestFormatBase:
def test_adjoin(self):
data = [["a", "b", "c"], ["dd", "ee", "ff"], ["ggg", "hhh", "iii"]]
expected = "a dd ggg\nb ee hhh\nc ff iii"
adjoined = printing.adjoin(2, *data)
assert adjoined == expected
def test_adjoin_unicode(self):
data = [["", "b", "c"], ["dd", "ええ", "ff"], ["ggg", "hhh", "いいい"]]
expected = "あ dd ggg\nb ええ hhh\nc ff いいい"
adjoined = printing.adjoin(2, *data)
assert adjoined == expected
adj = printing._EastAsianTextAdjustment()
expected = """あ dd ggg
b ええ hhh
c ff いいい"""
adjoined = adj.adjoin(2, *data)
assert adjoined == expected
cols = adjoined.split("\n")
assert adj.len(cols[0]) == 13
assert adj.len(cols[1]) == 13
assert adj.len(cols[2]) == 16
expected = """あ dd ggg
b ええ hhh
c ff いいい"""
adjoined = adj.adjoin(7, *data)
assert adjoined == expected
cols = adjoined.split("\n")
assert adj.len(cols[0]) == 23
assert adj.len(cols[1]) == 23
assert adj.len(cols[2]) == 26
def test_justify(self):
adj = printing._EastAsianTextAdjustment()
def just(x, *args, **kwargs):
# wrapper to test single str
return adj.justify([x], *args, **kwargs)[0]
assert just("abc", 5, mode="left") == "abc "
assert just("abc", 5, mode="center") == " abc "
assert just("abc", 5, mode="right") == " abc"
assert just("abc", 5, mode="left") == "abc "
assert just("abc", 5, mode="center") == " abc "
assert just("abc", 5, mode="right") == " abc"
assert just("パンダ", 5, mode="left") == "パンダ"
assert just("パンダ", 5, mode="center") == "パンダ"
assert just("パンダ", 5, mode="right") == "パンダ"
assert just("パンダ", 10, mode="left") == "パンダ "
assert just("パンダ", 10, mode="center") == " パンダ "
assert just("パンダ", 10, mode="right") == " パンダ"
def test_east_asian_len(self):
adj = printing._EastAsianTextAdjustment()
assert adj.len("abc") == 3
assert adj.len("abc") == 3
assert adj.len("パンダ") == 6
assert adj.len("パンダ") == 5
assert adj.len("パンダpanda") == 11
assert adj.len("パンダpanda") == 10
def test_ambiguous_width(self):
adj = printing._EastAsianTextAdjustment()
assert adj.len("¡¡ab") == 4
with cf.option_context("display.unicode.ambiguous_as_wide", True):
adj = printing._EastAsianTextAdjustment()
assert adj.len("¡¡ab") == 6
data = [["", "b", "c"], ["dd", "ええ", "ff"], ["ggg", "¡¡ab", "いいい"]]
expected = "あ dd ggg \nb ええ ¡¡ab\nc ff いいい"
adjoined = adj.adjoin(2, *data)
assert adjoined == expected
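# The width-2 accounting asserted above follows unicodedata's East Asian width
# classes ("W"/"F" count as two terminal columns); a minimal sketch (ours):
def _example_east_asian_width():
    import unicodedata

    assert unicodedata.east_asian_width("パ") == "W"  # wide: 2 columns
    assert unicodedata.east_asian_width("ﾊ") == "H"  # half-width: 1 column
    assert unicodedata.east_asian_width("a") == "Na"  # narrow: 1 column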

View File

@ -0,0 +1,758 @@
import io
import os
import sys
from zipfile import ZipFile
from _csv import Error
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
compat,
)
import pandas._testing as tm
class TestToCSV:
def test_to_csv_with_single_column(self):
# see gh-18676, https://bugs.python.org/issue32255
#
# Python's CSV library adds an extraneous '""'
# before the newline when the NaN-value is in
# the first row. Otherwise, only the newline
# character is added. This behavior is inconsistent
# and was patched in https://bugs.python.org/pull_request4672.
df1 = DataFrame([None, 1])
expected1 = """\
""
1.0
"""
with tm.ensure_clean("test.csv") as path:
df1.to_csv(path, header=None, index=None)
with open(path, encoding="utf-8") as f:
assert f.read() == expected1
df2 = DataFrame([1, None])
expected2 = """\
1.0
""
"""
with tm.ensure_clean("test.csv") as path:
df2.to_csv(path, header=None, index=None)
with open(path, encoding="utf-8") as f:
assert f.read() == expected2
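    # Illustrative aside (not a test): the stdlib quirk referenced above is
    # visible with the csv module directly:
    #
    #   import csv, io
    #   buf = io.StringIO()
    #   csv.writer(buf).writerow([""])
    #   buf.getvalue()  # '""\r\n' -- a lone empty field is quoted so the
    #                   # row is distinguishable from a blank line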
def test_to_csv_default_encoding(self):
# GH17097
df = DataFrame({"col": ["AAAAA", "ÄÄÄÄÄ", "ßßßßß", "聞聞聞聞聞"]})
with tm.ensure_clean("test.csv") as path:
# the default to_csv encoding is utf-8.
df.to_csv(path)
tm.assert_frame_equal(pd.read_csv(path, index_col=0), df)
def test_to_csv_quotechar(self):
df = DataFrame({"col": [1, 2]})
expected = """\
"","col"
"0","1"
"1","2"
"""
with tm.ensure_clean("test.csv") as path:
df.to_csv(path, quoting=1) # 1=QUOTE_ALL
with open(path, encoding="utf-8") as f:
assert f.read() == expected
expected = """\
$$,$col$
$0$,$1$
$1$,$2$
"""
with tm.ensure_clean("test.csv") as path:
df.to_csv(path, quoting=1, quotechar="$")
with open(path, encoding="utf-8") as f:
assert f.read() == expected
with tm.ensure_clean("test.csv") as path:
with pytest.raises(TypeError, match="quotechar"):
df.to_csv(path, quoting=1, quotechar=None)
def test_to_csv_doublequote(self):
df = DataFrame({"col": ['a"a', '"bb"']})
expected = '''\
"","col"
"0","a""a"
"1","""bb"""
'''
with tm.ensure_clean("test.csv") as path:
df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL
with open(path, encoding="utf-8") as f:
assert f.read() == expected
with tm.ensure_clean("test.csv") as path:
with pytest.raises(Error, match="escapechar"):
df.to_csv(path, doublequote=False) # no escapechar set
def test_to_csv_escapechar(self):
df = DataFrame({"col": ['a"a', '"bb"']})
expected = """\
"","col"
"0","a\\"a"
"1","\\"bb\\""
"""
with tm.ensure_clean("test.csv") as path: # QUOTE_ALL
df.to_csv(path, quoting=1, doublequote=False, escapechar="\\")
with open(path, encoding="utf-8") as f:
assert f.read() == expected
df = DataFrame({"col": ["a,a", ",bb,"]})
expected = """\
,col
0,a\\,a
1,\\,bb\\,
"""
with tm.ensure_clean("test.csv") as path:
df.to_csv(path, quoting=3, escapechar="\\") # QUOTE_NONE
with open(path, encoding="utf-8") as f:
assert f.read() == expected
def test_csv_to_string(self):
df = DataFrame({"col": [1, 2]})
expected_rows = [",col", "0,1", "1,2"]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert df.to_csv() == expected
def test_to_csv_decimal(self):
# see gh-781
df = DataFrame({"col1": [1], "col2": ["a"], "col3": [10.1]})
expected_rows = [",col1,col2,col3", "0,1,a,10.1"]
expected_default = tm.convert_rows_list_to_csv_str(expected_rows)
assert df.to_csv() == expected_default
expected_rows = [";col1;col2;col3", "0;1;a;10,1"]
expected_european_excel = tm.convert_rows_list_to_csv_str(expected_rows)
assert df.to_csv(decimal=",", sep=";") == expected_european_excel
expected_rows = [",col1,col2,col3", "0,1,a,10.10"]
expected_float_format_default = tm.convert_rows_list_to_csv_str(expected_rows)
assert df.to_csv(float_format="%.2f") == expected_float_format_default
expected_rows = [";col1;col2;col3", "0;1;a;10,10"]
expected_float_format = tm.convert_rows_list_to_csv_str(expected_rows)
assert (
df.to_csv(decimal=",", sep=";", float_format="%.2f")
== expected_float_format
)
# see gh-11553: testing if decimal is taken into account for '0.0'
df = DataFrame({"a": [0, 1.1], "b": [2.2, 3.3], "c": 1})
expected_rows = ["a,b,c", "0^0,2^2,1", "1^1,3^3,1"]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert df.to_csv(index=False, decimal="^") == expected
# same but for an index
assert df.set_index("a").to_csv(decimal="^") == expected
# same for a multi-index
assert df.set_index(["a", "b"]).to_csv(decimal="^") == expected
def test_to_csv_float_format(self):
# testing if float_format is taken into account for the index
# GH 11553
df = DataFrame({"a": [0, 1], "b": [2.2, 3.3], "c": 1})
expected_rows = ["a,b,c", "0,2.20,1", "1,3.30,1"]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert df.set_index("a").to_csv(float_format="%.2f") == expected
# same for a multi-index
assert df.set_index(["a", "b"]).to_csv(float_format="%.2f") == expected
def test_to_csv_na_rep(self):
# see gh-11553
#
# Testing if NaN values are correctly represented in the index.
df = DataFrame({"a": [0, np.nan], "b": [0, 1], "c": [2, 3]})
expected_rows = ["a,b,c", "0.0,0,2", "_,1,3"]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert df.set_index("a").to_csv(na_rep="_") == expected
assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected
# now with an index containing only NaNs
df = DataFrame({"a": np.nan, "b": [0, 1], "c": [2, 3]})
expected_rows = ["a,b,c", "_,0,2", "_,1,3"]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert df.set_index("a").to_csv(na_rep="_") == expected
assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected
# check that the na_rep parameter does not break anything when there are no NaNs
df = DataFrame({"a": 0, "b": [0, 1], "c": [2, 3]})
expected_rows = ["a,b,c", "0,0,2", "0,1,3"]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert df.set_index("a").to_csv(na_rep="_") == expected
assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected
csv = pd.Series(["a", pd.NA, "c"]).to_csv(na_rep="ZZZZZ")
expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"])
assert expected == csv
def test_to_csv_na_rep_nullable_string(self, nullable_string_dtype):
# GH 29975
# Make sure full na_rep shows up when a dtype is provided
expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"])
csv = pd.Series(["a", pd.NA, "c"], dtype=nullable_string_dtype).to_csv(
na_rep="ZZZZZ"
)
assert expected == csv
def test_to_csv_date_format(self):
# GH 10209
df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")})
df_day = DataFrame({"A": pd.date_range("20130101", periods=5, freq="d")})
expected_rows = [
",A",
"0,2013-01-01 00:00:00",
"1,2013-01-01 00:00:01",
"2,2013-01-01 00:00:02",
"3,2013-01-01 00:00:03",
"4,2013-01-01 00:00:04",
]
expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows)
assert df_sec.to_csv() == expected_default_sec
expected_rows = [
",A",
"0,2013-01-01 00:00:00",
"1,2013-01-02 00:00:00",
"2,2013-01-03 00:00:00",
"3,2013-01-04 00:00:00",
"4,2013-01-05 00:00:00",
]
expected_ymdhms_day = tm.convert_rows_list_to_csv_str(expected_rows)
assert df_day.to_csv(date_format="%Y-%m-%d %H:%M:%S") == expected_ymdhms_day
expected_rows = [
",A",
"0,2013-01-01",
"1,2013-01-01",
"2,2013-01-01",
"3,2013-01-01",
"4,2013-01-01",
]
expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)
assert df_sec.to_csv(date_format="%Y-%m-%d") == expected_ymd_sec
expected_rows = [
",A",
"0,2013-01-01",
"1,2013-01-02",
"2,2013-01-03",
"3,2013-01-04",
"4,2013-01-05",
]
expected_default_day = tm.convert_rows_list_to_csv_str(expected_rows)
assert df_day.to_csv() == expected_default_day
assert df_day.to_csv(date_format="%Y-%m-%d") == expected_default_day
# see gh-7791
#
# Testing if date_format parameter is taken into account
# for multi-indexed DataFrames.
df_sec["B"] = 0
df_sec["C"] = 1
expected_rows = ["A,B,C", "2013-01-01,0,1.0"]
expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)
df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"])
assert df_sec_grouped.mean().to_csv(date_format="%Y-%m-%d") == expected_ymd_sec
def test_to_csv_different_datetime_formats(self):
# GH#21734
df = DataFrame(
{
"date": pd.to_datetime("1970-01-01"),
"datetime": pd.date_range("1970-01-01", periods=2, freq="h"),
}
)
expected_rows = [
"date,datetime",
"1970-01-01,1970-01-01 00:00:00",
"1970-01-01,1970-01-01 01:00:00",
]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert df.to_csv(index=False) == expected
def test_to_csv_date_format_in_categorical(self):
# GH#40754
ser = pd.Series(pd.to_datetime(["2021-03-27", pd.NaT], format="%Y-%m-%d"))
ser = ser.astype("category")
expected = tm.convert_rows_list_to_csv_str(["0", "2021-03-27", '""'])
assert ser.to_csv(index=False) == expected
ser = pd.Series(
pd.date_range(
start="2021-03-27", freq="D", periods=1, tz="Europe/Berlin"
).append(pd.DatetimeIndex([pd.NaT]))
)
ser = ser.astype("category")
assert ser.to_csv(index=False, date_format="%Y-%m-%d") == expected
def test_to_csv_float_ea_float_format(self):
# GH#45991
df = DataFrame({"a": [1.1, 2.02, pd.NA, 6.000006], "b": "c"})
df["a"] = df["a"].astype("Float64")
result = df.to_csv(index=False, float_format="%.5f")
expected = tm.convert_rows_list_to_csv_str(
["a,b", "1.10000,c", "2.02000,c", ",c", "6.00001,c"]
)
assert result == expected
def test_to_csv_float_ea_no_float_format(self):
# GH#45991
df = DataFrame({"a": [1.1, 2.02, pd.NA, 6.000006], "b": "c"})
df["a"] = df["a"].astype("Float64")
result = df.to_csv(index=False)
expected = tm.convert_rows_list_to_csv_str(
["a,b", "1.1,c", "2.02,c", ",c", "6.000006,c"]
)
assert result == expected
def test_to_csv_multi_index(self):
# see gh-6618
df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]))
exp_rows = [",1", ",2", "0,1"]
exp = tm.convert_rows_list_to_csv_str(exp_rows)
assert df.to_csv() == exp
exp_rows = ["1", "2", "1"]
exp = tm.convert_rows_list_to_csv_str(exp_rows)
assert df.to_csv(index=False) == exp
df = DataFrame(
[1],
columns=pd.MultiIndex.from_arrays([[1], [2]]),
index=pd.MultiIndex.from_arrays([[1], [2]]),
)
exp_rows = [",,1", ",,2", "1,2,1"]
exp = tm.convert_rows_list_to_csv_str(exp_rows)
assert df.to_csv() == exp
exp_rows = ["1", "2", "1"]
exp = tm.convert_rows_list_to_csv_str(exp_rows)
assert df.to_csv(index=False) == exp
df = DataFrame([1], columns=pd.MultiIndex.from_arrays([["foo"], ["bar"]]))
exp_rows = [",foo", ",bar", "0,1"]
exp = tm.convert_rows_list_to_csv_str(exp_rows)
assert df.to_csv() == exp
exp_rows = ["foo", "bar", "1"]
exp = tm.convert_rows_list_to_csv_str(exp_rows)
assert df.to_csv(index=False) == exp
@pytest.mark.parametrize(
"ind,expected",
[
(
pd.MultiIndex(levels=[[1.0]], codes=[[0]], names=["x"]),
"x,data\n1.0,1\n",
),
(
pd.MultiIndex(
levels=[[1.0], [2.0]], codes=[[0], [0]], names=["x", "y"]
),
"x,y,data\n1.0,2.0,1\n",
),
],
)
def test_to_csv_single_level_multi_index(self, ind, expected, frame_or_series):
# see gh-19589
obj = frame_or_series(pd.Series([1], ind, name="data"))
result = obj.to_csv(lineterminator="\n", header=True)
assert result == expected
def test_to_csv_string_array_ascii(self):
# GH 10813
str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}]
df = DataFrame(str_array)
expected_ascii = """\
,names
0,"['foo', 'bar']"
1,"['baz', 'qux']"
"""
with tm.ensure_clean("str_test.csv") as path:
df.to_csv(path, encoding="ascii")
with open(path, encoding="utf-8") as f:
assert f.read() == expected_ascii
def test_to_csv_string_array_utf8(self):
# GH 10813
str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}]
df = DataFrame(str_array)
expected_utf8 = """\
,names
0,"['foo', 'bar']"
1,"['baz', 'qux']"
"""
with tm.ensure_clean("unicode_test.csv") as path:
df.to_csv(path, encoding="utf-8")
with open(path, encoding="utf-8") as f:
assert f.read() == expected_utf8
def test_to_csv_string_with_lf(self):
# GH 20353
data = {"int": [1, 2, 3], "str_lf": ["abc", "d\nef", "g\nh\n\ni"]}
df = DataFrame(data)
with tm.ensure_clean("lf_test.csv") as path:
# case 1: The default line terminator(=os.linesep)(PR 21406)
os_linesep = os.linesep.encode("utf-8")
expected_noarg = (
b"int,str_lf"
+ os_linesep
+ b"1,abc"
+ os_linesep
+ b'2,"d\nef"'
+ os_linesep
+ b'3,"g\nh\n\ni"'
+ os_linesep
)
df.to_csv(path, index=False)
with open(path, "rb") as f:
assert f.read() == expected_noarg
with tm.ensure_clean("lf_test.csv") as path:
# case 2: LF as line terminator
expected_lf = b'int,str_lf\n1,abc\n2,"d\nef"\n3,"g\nh\n\ni"\n'
df.to_csv(path, lineterminator="\n", index=False)
with open(path, "rb") as f:
assert f.read() == expected_lf
with tm.ensure_clean("lf_test.csv") as path:
# case 3: CRLF as line terminator
# 'lineterminator' should not change inner element
expected_crlf = b'int,str_lf\r\n1,abc\r\n2,"d\nef"\r\n3,"g\nh\n\ni"\r\n'
df.to_csv(path, lineterminator="\r\n", index=False)
with open(path, "rb") as f:
assert f.read() == expected_crlf
def test_to_csv_string_with_crlf(self):
# GH 20353
data = {"int": [1, 2, 3], "str_crlf": ["abc", "d\r\nef", "g\r\nh\r\n\r\ni"]}
df = DataFrame(data)
with tm.ensure_clean("crlf_test.csv") as path:
# case 1: The default line terminator(=os.linesep)(PR 21406)
os_linesep = os.linesep.encode("utf-8")
expected_noarg = (
b"int,str_crlf"
+ os_linesep
+ b"1,abc"
+ os_linesep
+ b'2,"d\r\nef"'
+ os_linesep
+ b'3,"g\r\nh\r\n\r\ni"'
+ os_linesep
)
df.to_csv(path, index=False)
with open(path, "rb") as f:
assert f.read() == expected_noarg
with tm.ensure_clean("crlf_test.csv") as path:
# case 2: LF as line terminator
expected_lf = b'int,str_crlf\n1,abc\n2,"d\r\nef"\n3,"g\r\nh\r\n\r\ni"\n'
df.to_csv(path, lineterminator="\n", index=False)
with open(path, "rb") as f:
assert f.read() == expected_lf
with tm.ensure_clean("crlf_test.csv") as path:
# case 3: CRLF as line terminator
# 'lineterminator' should not change inner element
expected_crlf = (
b"int,str_crlf\r\n"
b"1,abc\r\n"
b'2,"d\r\nef"\r\n'
b'3,"g\r\nh\r\n\r\ni"\r\n'
)
df.to_csv(path, lineterminator="\r\n", index=False)
with open(path, "rb") as f:
assert f.read() == expected_crlf
def test_to_csv_stdout_file(self, capsys):
# GH 21561
df = DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["name_1", "name_2"])
expected_rows = [",name_1,name_2", "0,foo,bar", "1,baz,qux"]
expected_ascii = tm.convert_rows_list_to_csv_str(expected_rows)
df.to_csv(sys.stdout, encoding="ascii")
captured = capsys.readouterr()
assert captured.out == expected_ascii
assert not sys.stdout.closed
@pytest.mark.xfail(
compat.is_platform_windows(),
reason=(
"Especially in Windows, file stream should not be passed"
"to csv writer without newline='' option."
"(https://docs.python.org/3/library/csv.html#csv.writer)"
),
)
def test_to_csv_write_to_open_file(self):
# GH 21696
df = DataFrame({"a": ["x", "y", "z"]})
expected = """\
manual header
x
y
z
"""
with tm.ensure_clean("test.txt") as path:
with open(path, "w", encoding="utf-8") as f:
f.write("manual header\n")
df.to_csv(f, header=None, index=None)
with open(path, encoding="utf-8") as f:
assert f.read() == expected
def test_to_csv_write_to_open_file_with_newline_py3(self):
# see gh-21696
# see gh-20353
df = DataFrame({"a": ["x", "y", "z"]})
expected_rows = ["x", "y", "z"]
expected = "manual header\n" + tm.convert_rows_list_to_csv_str(expected_rows)
with tm.ensure_clean("test.txt") as path:
with open(path, "w", newline="", encoding="utf-8") as f:
f.write("manual header\n")
df.to_csv(f, header=None, index=None)
with open(path, "rb") as f:
assert f.read() == bytes(expected, "utf-8")
@pytest.mark.parametrize("to_infer", [True, False])
@pytest.mark.parametrize("read_infer", [True, False])
def test_to_csv_compression(
self, compression_only, read_infer, to_infer, compression_to_extension
):
# see gh-15008
compression = compression_only
# We'll complete file extension subsequently.
filename = "test."
filename += compression_to_extension[compression]
df = DataFrame({"A": [1]})
to_compression = "infer" if to_infer else compression
read_compression = "infer" if read_infer else compression
with tm.ensure_clean(filename) as path:
df.to_csv(path, compression=to_compression)
result = pd.read_csv(path, index_col=0, compression=read_compression)
tm.assert_frame_equal(result, df)
def test_to_csv_compression_dict(self, compression_only):
# GH 26023
method = compression_only
df = DataFrame({"ABC": [1]})
filename = "to_csv_compress_as_dict."
extension = {
"gzip": "gz",
"zstd": "zst",
}.get(method, method)
filename += extension
with tm.ensure_clean(filename) as path:
df.to_csv(path, compression={"method": method})
read_df = pd.read_csv(path, index_col=0)
tm.assert_frame_equal(read_df, df)
def test_to_csv_compression_dict_no_method_raises(self):
# GH 26023
df = DataFrame({"ABC": [1]})
compression = {"some_option": True}
msg = "must have key 'method'"
with tm.ensure_clean("out.zip") as path:
with pytest.raises(ValueError, match=msg):
df.to_csv(path, compression=compression)
@pytest.mark.parametrize("compression", ["zip", "infer"])
@pytest.mark.parametrize("archive_name", ["test_to_csv.csv", "test_to_csv.zip"])
def test_to_csv_zip_arguments(self, compression, archive_name):
# GH 26023
df = DataFrame({"ABC": [1]})
with tm.ensure_clean("to_csv_archive_name.zip") as path:
df.to_csv(
path, compression={"method": compression, "archive_name": archive_name}
)
with ZipFile(path) as zp:
assert len(zp.filelist) == 1
archived_file = zp.filelist[0].filename
assert archived_file == archive_name
@pytest.mark.parametrize(
"filename,expected_arcname",
[
("archive.csv", "archive.csv"),
("archive.tsv", "archive.tsv"),
("archive.csv.zip", "archive.csv"),
("archive.tsv.zip", "archive.tsv"),
("archive.zip", "archive"),
],
)
def test_to_csv_zip_infer_name(self, tmp_path, filename, expected_arcname):
# GH 39465
df = DataFrame({"ABC": [1]})
path = tmp_path / filename
df.to_csv(path, compression="zip")
with ZipFile(path) as zp:
assert len(zp.filelist) == 1
archived_file = zp.filelist[0].filename
assert archived_file == expected_arcname
@pytest.mark.parametrize("df_new_type", ["Int64"])
def test_to_csv_na_rep_long_string(self, df_new_type):
# see gh-25099
df = DataFrame({"c": [float("nan")] * 3})
df = df.astype(df_new_type)
expected_rows = ["c", "mynull", "mynull", "mynull"]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
result = df.to_csv(index=False, na_rep="mynull", encoding="ascii")
assert expected == result
def test_to_csv_timedelta_precision(self):
# GH 6783
s = pd.Series([1, 1]).astype("timedelta64[ns]")
buf = io.StringIO()
s.to_csv(buf)
result = buf.getvalue()
expected_rows = [
",0",
"0,0 days 00:00:00.000000001",
"1,0 days 00:00:00.000000001",
]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert result == expected
def test_na_rep_truncated(self):
# https://github.com/pandas-dev/pandas/issues/31447
result = pd.Series(range(8, 12)).to_csv(na_rep="-")
expected = tm.convert_rows_list_to_csv_str([",0", "0,8", "1,9", "2,10", "3,11"])
assert result == expected
result = pd.Series([True, False]).to_csv(na_rep="nan")
expected = tm.convert_rows_list_to_csv_str([",0", "0,True", "1,False"])
assert result == expected
result = pd.Series([1.1, 2.2]).to_csv(na_rep=".")
expected = tm.convert_rows_list_to_csv_str([",0", "0,1.1", "1,2.2"])
assert result == expected
@pytest.mark.parametrize("errors", ["surrogatepass", "ignore", "replace"])
def test_to_csv_errors(self, errors):
# GH 22610
data = ["\ud800foo"]
ser = pd.Series(data, index=Index(data, dtype=object), dtype=object)
with tm.ensure_clean("test.csv") as path:
ser.to_csv(path, errors=errors)
# No use in reading back the data as it is not the same anymore
# due to the error handling
@pytest.mark.parametrize("mode", ["wb", "w"])
def test_to_csv_binary_handle(self, mode):
"""
Binary file objects should work (if 'mode' contains a 'b') or even without
it in most cases.
GH 35058 and GH 19827
"""
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)
with tm.ensure_clean() as path:
with open(path, mode="w+b") as handle:
df.to_csv(handle, mode=mode)
tm.assert_frame_equal(df, pd.read_csv(path, index_col=0))
@pytest.mark.parametrize("mode", ["wb", "w"])
def test_to_csv_encoding_binary_handle(self, mode):
"""
Binary file objects should honor a specified encoding.
GH 23854 and GH 13068 with binary handles
"""
# example from GH 23854
content = "a, b, 🐟".encode("utf-8-sig")
buffer = io.BytesIO(content)
df = pd.read_csv(buffer, encoding="utf-8-sig")
buffer = io.BytesIO()
df.to_csv(buffer, mode=mode, encoding="utf-8-sig", index=False)
buffer.seek(0) # tests whether file handle wasn't closed
assert buffer.getvalue().startswith(content)
# example from GH 13068
with tm.ensure_clean() as path:
with open(path, "w+b") as handle:
DataFrame().to_csv(handle, mode=mode, encoding="utf-8-sig")
handle.seek(0)
assert handle.read().startswith(b'\xef\xbb\xbf""')
def test_to_csv_iterative_compression_name(compression):
# GH 38714
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)
with tm.ensure_clean() as path:
df.to_csv(path, compression=compression, chunksize=1)
tm.assert_frame_equal(
pd.read_csv(path, compression=compression, index_col=0), df
)
def test_to_csv_iterative_compression_buffer(compression):
# GH 38714
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)
with io.BytesIO() as buffer:
df.to_csv(buffer, compression=compression, chunksize=1)
buffer.seek(0)
tm.assert_frame_equal(
pd.read_csv(buffer, compression=compression, index_col=0), df
)
assert not buffer.closed
def test_to_csv_pos_args_deprecation():
# GH-54229
df = DataFrame({"a": [1, 2, 3]})
msg = (
r"Starting with pandas version 3.0 all arguments of to_csv except for the "
r"argument 'path_or_buf' will be keyword-only."
)
with tm.assert_produces_warning(FutureWarning, match=msg):
buffer = io.BytesIO()
df.to_csv(buffer, ";")
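# Illustrative sketch (not part of the committed test file): the keyword form is
# what the warning above steers callers towards and stays valid in pandas 3.0.
sketch_buffer = io.BytesIO()
df.to_csv(sketch_buffer, sep=";")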

View File

@ -0,0 +1,429 @@
"""Tests formatting as writer-agnostic ExcelCells
ExcelFormatter is tested implicitly in pandas/tests/io/excel
"""
import string
import pytest
from pandas.errors import CSSWarning
import pandas._testing as tm
from pandas.io.formats.excel import (
CssExcelCell,
CSSToExcelConverter,
)
@pytest.mark.parametrize(
"css,expected",
[
# FONT
# - name
("font-family: foo,bar", {"font": {"name": "foo"}}),
('font-family: "foo bar",baz', {"font": {"name": "foo bar"}}),
("font-family: foo,\nbar", {"font": {"name": "foo"}}),
("font-family: foo, bar, baz", {"font": {"name": "foo"}}),
("font-family: bar, foo", {"font": {"name": "bar"}}),
("font-family: 'foo bar', baz", {"font": {"name": "foo bar"}}),
("font-family: 'foo \\'bar', baz", {"font": {"name": "foo 'bar"}}),
('font-family: "foo \\"bar", baz', {"font": {"name": 'foo "bar'}}),
('font-family: "foo ,bar", baz', {"font": {"name": "foo ,bar"}}),
# - family
("font-family: serif", {"font": {"name": "serif", "family": 1}}),
("font-family: Serif", {"font": {"name": "serif", "family": 1}}),
("font-family: roman, serif", {"font": {"name": "roman", "family": 1}}),
("font-family: roman, sans-serif", {"font": {"name": "roman", "family": 2}}),
("font-family: roman, sans serif", {"font": {"name": "roman"}}),
("font-family: roman, sansserif", {"font": {"name": "roman"}}),
("font-family: roman, cursive", {"font": {"name": "roman", "family": 4}}),
("font-family: roman, fantasy", {"font": {"name": "roman", "family": 5}}),
# - size
("font-size: 1em", {"font": {"size": 12}}),
("font-size: xx-small", {"font": {"size": 6}}),
("font-size: x-small", {"font": {"size": 7.5}}),
("font-size: small", {"font": {"size": 9.6}}),
("font-size: medium", {"font": {"size": 12}}),
("font-size: large", {"font": {"size": 13.5}}),
("font-size: x-large", {"font": {"size": 18}}),
("font-size: xx-large", {"font": {"size": 24}}),
("font-size: 50%", {"font": {"size": 6}}),
# - bold
("font-weight: 100", {"font": {"bold": False}}),
("font-weight: 200", {"font": {"bold": False}}),
("font-weight: 300", {"font": {"bold": False}}),
("font-weight: 400", {"font": {"bold": False}}),
("font-weight: normal", {"font": {"bold": False}}),
("font-weight: lighter", {"font": {"bold": False}}),
("font-weight: bold", {"font": {"bold": True}}),
("font-weight: bolder", {"font": {"bold": True}}),
("font-weight: 700", {"font": {"bold": True}}),
("font-weight: 800", {"font": {"bold": True}}),
("font-weight: 900", {"font": {"bold": True}}),
# - italic
("font-style: italic", {"font": {"italic": True}}),
("font-style: oblique", {"font": {"italic": True}}),
# - underline
("text-decoration: underline", {"font": {"underline": "single"}}),
("text-decoration: overline", {}),
("text-decoration: none", {}),
# - strike
("text-decoration: line-through", {"font": {"strike": True}}),
(
"text-decoration: underline line-through",
{"font": {"strike": True, "underline": "single"}},
),
(
"text-decoration: underline; text-decoration: line-through",
{"font": {"strike": True}},
),
# - color
("color: red", {"font": {"color": "FF0000"}}),
("color: #ff0000", {"font": {"color": "FF0000"}}),
("color: #f0a", {"font": {"color": "FF00AA"}}),
# - shadow
("text-shadow: none", {"font": {"shadow": False}}),
("text-shadow: 0px -0em 0px #CCC", {"font": {"shadow": False}}),
("text-shadow: 0px -0em 0px #999", {"font": {"shadow": False}}),
("text-shadow: 0px -0em 0px", {"font": {"shadow": False}}),
("text-shadow: 2px -0em 0px #CCC", {"font": {"shadow": True}}),
("text-shadow: 0px -2em 0px #CCC", {"font": {"shadow": True}}),
("text-shadow: 0px -0em 2px #CCC", {"font": {"shadow": True}}),
("text-shadow: 0px -0em 2px", {"font": {"shadow": True}}),
("text-shadow: 0px -2em", {"font": {"shadow": True}}),
# FILL
# - color, fillType
(
"background-color: red",
{"fill": {"fgColor": "FF0000", "patternType": "solid"}},
),
(
"background-color: #ff0000",
{"fill": {"fgColor": "FF0000", "patternType": "solid"}},
),
(
"background-color: #f0a",
{"fill": {"fgColor": "FF00AA", "patternType": "solid"}},
),
# BORDER
# - style
(
"border-style: solid",
{
"border": {
"top": {"style": "medium"},
"bottom": {"style": "medium"},
"left": {"style": "medium"},
"right": {"style": "medium"},
}
},
),
(
"border-style: solid; border-width: thin",
{
"border": {
"top": {"style": "thin"},
"bottom": {"style": "thin"},
"left": {"style": "thin"},
"right": {"style": "thin"},
}
},
),
(
"border-top-style: solid; border-top-width: thin",
{"border": {"top": {"style": "thin"}}},
),
(
"border-top-style: solid; border-top-width: 1pt",
{"border": {"top": {"style": "thin"}}},
),
("border-top-style: solid", {"border": {"top": {"style": "medium"}}}),
(
"border-top-style: solid; border-top-width: medium",
{"border": {"top": {"style": "medium"}}},
),
(
"border-top-style: solid; border-top-width: 2pt",
{"border": {"top": {"style": "medium"}}},
),
(
"border-top-style: solid; border-top-width: thick",
{"border": {"top": {"style": "thick"}}},
),
(
"border-top-style: solid; border-top-width: 4pt",
{"border": {"top": {"style": "thick"}}},
),
(
"border-top-style: dotted",
{"border": {"top": {"style": "mediumDashDotDot"}}},
),
(
"border-top-style: dotted; border-top-width: thin",
{"border": {"top": {"style": "dotted"}}},
),
("border-top-style: dashed", {"border": {"top": {"style": "mediumDashed"}}}),
(
"border-top-style: dashed; border-top-width: thin",
{"border": {"top": {"style": "dashed"}}},
),
("border-top-style: double", {"border": {"top": {"style": "double"}}}),
# - color
(
"border-style: solid; border-color: #0000ff",
{
"border": {
"top": {"style": "medium", "color": "0000FF"},
"right": {"style": "medium", "color": "0000FF"},
"bottom": {"style": "medium", "color": "0000FF"},
"left": {"style": "medium", "color": "0000FF"},
}
},
),
(
"border-top-style: double; border-top-color: blue",
{"border": {"top": {"style": "double", "color": "0000FF"}}},
),
(
"border-top-style: solid; border-top-color: #06c",
{"border": {"top": {"style": "medium", "color": "0066CC"}}},
),
(
"border-top-color: blue",
{"border": {"top": {"color": "0000FF", "style": "none"}}},
),
# ALIGNMENT
# - horizontal
("text-align: center", {"alignment": {"horizontal": "center"}}),
("text-align: left", {"alignment": {"horizontal": "left"}}),
("text-align: right", {"alignment": {"horizontal": "right"}}),
("text-align: justify", {"alignment": {"horizontal": "justify"}}),
# - vertical
("vertical-align: top", {"alignment": {"vertical": "top"}}),
("vertical-align: text-top", {"alignment": {"vertical": "top"}}),
("vertical-align: middle", {"alignment": {"vertical": "center"}}),
("vertical-align: bottom", {"alignment": {"vertical": "bottom"}}),
("vertical-align: text-bottom", {"alignment": {"vertical": "bottom"}}),
# - wrap_text
("white-space: nowrap", {"alignment": {"wrap_text": False}}),
("white-space: pre", {"alignment": {"wrap_text": False}}),
("white-space: pre-line", {"alignment": {"wrap_text": False}}),
("white-space: normal", {"alignment": {"wrap_text": True}}),
# NUMBER FORMAT
("number-format: 0%", {"number_format": {"format_code": "0%"}}),
(
"number-format: 0§[Red](0)§-§@;",
{"number_format": {"format_code": "0;[red](0);-;@"}}, # GH 46152
),
],
)
def test_css_to_excel(css, expected):
convert = CSSToExcelConverter()
assert expected == convert(css)
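# Illustrative sketch (not part of the committed test file): declarations from a
# single compound string are merged into one style dict, reusing mappings that
# the parametrized cases above already cover.
sketch_convert = CSSToExcelConverter()
assert sketch_convert("font-weight: bold; color: #f0a") == {
    "font": {"bold": True, "color": "FF00AA"}
}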
def test_css_to_excel_multiple():
convert = CSSToExcelConverter()
actual = convert(
"""
font-weight: bold;
text-decoration: underline;
color: red;
border-width: thin;
text-align: center;
vertical-align: top;
unused: something;
"""
)
assert {
"font": {"bold": True, "underline": "single", "color": "FF0000"},
"border": {
"top": {"style": "thin"},
"right": {"style": "thin"},
"bottom": {"style": "thin"},
"left": {"style": "thin"},
},
"alignment": {"horizontal": "center", "vertical": "top"},
} == actual
@pytest.mark.parametrize(
"css,inherited,expected",
[
("font-weight: bold", "", {"font": {"bold": True}}),
("", "font-weight: bold", {"font": {"bold": True}}),
(
"font-weight: bold",
"font-style: italic",
{"font": {"bold": True, "italic": True}},
),
("font-style: normal", "font-style: italic", {"font": {"italic": False}}),
("font-style: inherit", "", {}),
(
"font-style: normal; font-style: inherit",
"font-style: italic",
{"font": {"italic": True}},
),
],
)
def test_css_to_excel_inherited(css, inherited, expected):
convert = CSSToExcelConverter(inherited)
assert expected == convert(css)
@pytest.mark.parametrize(
"input_color,output_color",
(
list(CSSToExcelConverter.NAMED_COLORS.items())
+ [("#" + rgb, rgb) for rgb in CSSToExcelConverter.NAMED_COLORS.values()]
+ [("#F0F", "FF00FF"), ("#ABC", "AABBCC")]
),
)
def test_css_to_excel_good_colors(input_color, output_color):
# see gh-18392
css = (
f"border-top-color: {input_color}; "
f"border-right-color: {input_color}; "
f"border-bottom-color: {input_color}; "
f"border-left-color: {input_color}; "
f"background-color: {input_color}; "
f"color: {input_color}"
)
expected = {}
expected["fill"] = {"patternType": "solid", "fgColor": output_color}
expected["font"] = {"color": output_color}
expected["border"] = {
k: {"color": output_color, "style": "none"}
for k in ("top", "right", "bottom", "left")
}
with tm.assert_produces_warning(None):
convert = CSSToExcelConverter()
assert expected == convert(css)
@pytest.mark.parametrize("input_color", [None, "not-a-color"])
def test_css_to_excel_bad_colors(input_color):
# see gh-18392
css = (
f"border-top-color: {input_color}; "
f"border-right-color: {input_color}; "
f"border-bottom-color: {input_color}; "
f"border-left-color: {input_color}; "
f"background-color: {input_color}; "
f"color: {input_color}"
)
expected = {}
if input_color is not None:
expected["fill"] = {"patternType": "solid"}
with tm.assert_produces_warning(CSSWarning):
convert = CSSToExcelConverter()
assert expected == convert(css)
def tests_css_named_colors_valid():
upper_hexs = set(map(str.upper, string.hexdigits))
for color in CSSToExcelConverter.NAMED_COLORS.values():
assert len(color) == 6 and all(c in upper_hexs for c in color)
def test_css_named_colors_from_mpl_present():
mpl_colors = pytest.importorskip("matplotlib.colors")
pd_colors = CSSToExcelConverter.NAMED_COLORS
for name, color in mpl_colors.CSS4_COLORS.items():
assert name in pd_colors and pd_colors[name] == color[1:]
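# Illustrative sketch (not part of the committed test file): NAMED_COLORS maps
# CSS color names to bare six-digit hex strings, matching the "color: red" case
# near the top of this file.
assert CSSToExcelConverter.NAMED_COLORS["red"] == "FF0000"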
@pytest.mark.parametrize(
"styles,expected",
[
([("color", "green"), ("color", "red")], "color: red;"),
([("font-weight", "bold"), ("font-weight", "normal")], "font-weight: normal;"),
([("text-align", "center"), ("TEXT-ALIGN", "right")], "text-align: right;"),
],
)
def test_css_excel_cell_precedence(styles, expected):
"""It applies favors latter declarations over former declarations"""
# See GH 47371
converter = CSSToExcelConverter()
converter._call_cached.cache_clear()
css_styles = {(0, 0): styles}
cell = CssExcelCell(
row=0,
col=0,
val="",
style=None,
css_styles=css_styles,
css_row=0,
css_col=0,
css_converter=converter,
)
converter._call_cached.cache_clear()
assert cell.style == converter(expected)
@pytest.mark.parametrize(
"styles,cache_hits,cache_misses",
[
([[("color", "green"), ("color", "red"), ("color", "green")]], 0, 1),
(
[
[("font-weight", "bold")],
[("font-weight", "normal"), ("font-weight", "bold")],
],
1,
1,
),
([[("text-align", "center")], [("TEXT-ALIGN", "center")]], 1, 1),
(
[
[("font-weight", "bold"), ("text-align", "center")],
[("font-weight", "bold"), ("text-align", "left")],
],
0,
2,
),
(
[
[("font-weight", "bold"), ("text-align", "center")],
[("font-weight", "bold"), ("text-align", "left")],
[("font-weight", "bold"), ("text-align", "center")],
],
1,
2,
),
],
)
def test_css_excel_cell_cache(styles, cache_hits, cache_misses):
"""It caches unique cell styles"""
# See GH 47371
converter = CSSToExcelConverter()
converter._call_cached.cache_clear()
css_styles = {(0, i): _style for i, _style in enumerate(styles)}
for css_row, css_col in css_styles:
CssExcelCell(
row=0,
col=0,
val="",
style=None,
css_styles=css_styles,
css_row=css_row,
css_col=css_col,
css_converter=converter,
)
cache_info = converter._call_cached.cache_info()
converter._call_cached.cache_clear()
assert cache_info.hits == cache_hits
assert cache_info.misses == cache_misses

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,106 @@
from io import (
BytesIO,
StringIO,
)
import pytest
import pandas as pd
import pandas._testing as tm
pytest.importorskip("tabulate")
def test_simple():
buf = StringIO()
df = pd.DataFrame([1, 2, 3])
df.to_markdown(buf=buf)
result = buf.getvalue()
assert (
result == "| | 0 |\n|---:|----:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |"
)
def test_empty_frame():
buf = StringIO()
df = pd.DataFrame({"id": [], "first_name": [], "last_name": []}).set_index("id")
df.to_markdown(buf=buf)
result = buf.getvalue()
assert result == (
"| id | first_name | last_name |\n"
"|------|--------------|-------------|"
)
def test_other_tablefmt():
buf = StringIO()
df = pd.DataFrame([1, 2, 3])
df.to_markdown(buf=buf, tablefmt="jira")
result = buf.getvalue()
assert result == "|| || 0 ||\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |"
def test_other_headers():
buf = StringIO()
df = pd.DataFrame([1, 2, 3])
df.to_markdown(buf=buf, headers=["foo", "bar"])
result = buf.getvalue()
assert result == (
"| foo | bar |\n|------:|------:|\n| 0 "
"| 1 |\n| 1 | 2 |\n| 2 | 3 |"
)
def test_series():
buf = StringIO()
s = pd.Series([1, 2, 3], name="foo")
s.to_markdown(buf=buf)
result = buf.getvalue()
assert result == (
"| | foo |\n|---:|------:|\n| 0 | 1 "
"|\n| 1 | 2 |\n| 2 | 3 |"
)
def test_no_buf():
df = pd.DataFrame([1, 2, 3])
result = df.to_markdown()
assert (
result == "| | 0 |\n|---:|----:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |"
)
@pytest.mark.parametrize("index", [True, False])
def test_index(index):
# GH 32667
df = pd.DataFrame([1, 2, 3])
result = df.to_markdown(index=index)
if index:
expected = (
"| | 0 |\n|---:|----:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |"
)
else:
expected = "| 0 |\n|----:|\n| 1 |\n| 2 |\n| 3 |"
assert result == expected
def test_showindex_disallowed_in_kwargs():
# GH 32667; disallowing showindex in kwargs enforced in 2.0
df = pd.DataFrame([1, 2, 3])
with pytest.raises(ValueError, match="Pass 'index' instead of 'showindex"):
df.to_markdown(index=True, showindex=True)
def test_markdown_pos_args_deprecation():
# GH-54229
df = pd.DataFrame({"a": [1, 2, 3]})
msg = (
r"Starting with pandas version 3.0 all arguments of to_markdown except for the "
r"argument 'buf' will be keyword-only."
)
with tm.assert_produces_warning(FutureWarning, match=msg):
buffer = BytesIO()
df.to_markdown(buffer, "grid")

File diff suppressed because it is too large

View File

@ -0,0 +1,350 @@
"""
Self-contained script to write legacy storage pickle files.
To use this script, create an environment where you want to
generate pickles, say it's for 0.20.3, with your pandas clone
in ~/pandas
. activate pandas_0.20.3
cd ~/pandas/pandas
$ python -m tests.io.generate_legacy_storage_files \
tests/io/data/legacy_pickle/0.20.3/ pickle
This script generates a storage file for the current arch, system,
and python version
pandas version: 0.20.3
output dir : pandas/pandas/tests/io/data/legacy_pickle/0.20.3/
storage format: pickle
created pickle file: 0.20.3_x86_64_darwin_3.5.2.pickle
The idea here is you are using the *current* version of the
generate_legacy_storage_files with an *older* version of pandas to
generate a pickle file. We will then check this file into a current
branch, and test using test_pickle.py. This will load the *older*
pickles and test versus the current data that is generated
(with main). These are then compared.
If we have cases where we changed the signature (e.g. we renamed
offset -> freq in Timestamp), then we have to conditionally execute
in generate_legacy_storage_files.py to make it
run under the older AND the newer version.
"""
from datetime import timedelta
import os
import pickle
import platform as pl
import sys
# Remove script directory from path, otherwise Python will try to
# import the JSON test directory as the json module
sys.path.pop(0)
import numpy as np
import pandas
from pandas import (
Categorical,
DataFrame,
Index,
MultiIndex,
NaT,
Period,
RangeIndex,
Series,
Timestamp,
bdate_range,
date_range,
interval_range,
period_range,
timedelta_range,
)
from pandas.arrays import SparseArray
from pandas.tseries.offsets import (
FY5253,
BusinessDay,
BusinessHour,
CustomBusinessDay,
DateOffset,
Day,
Easter,
Hour,
LastWeekOfMonth,
Minute,
MonthBegin,
MonthEnd,
QuarterBegin,
QuarterEnd,
SemiMonthBegin,
SemiMonthEnd,
Week,
WeekOfMonth,
YearBegin,
YearEnd,
)
def _create_sp_series():
nan = np.nan
# nan-based
arr = np.arange(15, dtype=np.float64)
arr[7:12] = nan
arr[-1:] = nan
bseries = Series(SparseArray(arr, kind="block"))
bseries.name = "bseries"
return bseries
def _create_sp_tsseries():
nan = np.nan
# nan-based
arr = np.arange(15, dtype=np.float64)
arr[7:12] = nan
arr[-1:] = nan
date_index = bdate_range("1/1/2011", periods=len(arr))
bseries = Series(SparseArray(arr, kind="block"), index=date_index)
bseries.name = "btsseries"
return bseries
def _create_sp_frame():
nan = np.nan
data = {
"A": [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
"B": [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
"C": np.arange(10).astype(np.int64),
"D": [0, 1, 2, 3, 4, 5, nan, nan, nan, nan],
}
dates = bdate_range("1/1/2011", periods=10)
return DataFrame(data, index=dates).apply(SparseArray)
def create_pickle_data():
"""create the pickle data"""
data = {
"A": [0.0, 1.0, 2.0, 3.0, np.nan],
"B": [0, 1, 0, 1, 0],
"C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
"D": date_range("1/1/2009", periods=5),
"E": [0.0, 1, Timestamp("20100101"), "foo", 2.0],
}
scalars = {"timestamp": Timestamp("20130101"), "period": Period("2012", "M")}
index = {
"int": Index(np.arange(10)),
"date": date_range("20130101", periods=10),
"period": period_range("2013-01-01", freq="M", periods=10),
"float": Index(np.arange(10, dtype=np.float64)),
"uint": Index(np.arange(10, dtype=np.uint64)),
"timedelta": timedelta_range("00:00:00", freq="30min", periods=10),
"string": Index(["foo", "bar", "baz", "qux", "quux"], dtype="string"),
}
index["range"] = RangeIndex(10)
index["interval"] = interval_range(0, periods=10)
mi = {
"reg2": MultiIndex.from_tuples(
tuple(
zip(
*[
["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
["one", "two", "one", "two", "one", "two", "one", "two"],
]
)
),
names=["first", "second"],
)
}
series = {
"float": Series(data["A"]),
"int": Series(data["B"]),
"mixed": Series(data["E"]),
"ts": Series(
np.arange(10).astype(np.int64), index=date_range("20130101", periods=10)
),
"mi": Series(
np.arange(5).astype(np.float64),
index=MultiIndex.from_tuples(
tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), names=["one", "two"]
),
),
"dup": Series(np.arange(5).astype(np.float64), index=["A", "B", "C", "D", "A"]),
"cat": Series(Categorical(["foo", "bar", "baz"])),
"dt": Series(date_range("20130101", periods=5)),
"dt_tz": Series(date_range("20130101", periods=5, tz="US/Eastern")),
"period": Series([Period("2000Q1")] * 5),
"string": Series(["foo", "bar", "baz", "qux", "quux"], dtype="string"),
}
mixed_dup_df = DataFrame(data)
mixed_dup_df.columns = list("ABCDA")
frame = {
"float": DataFrame({"A": series["float"], "B": series["float"] + 1}),
"int": DataFrame({"A": series["int"], "B": series["int"] + 1}),
"mixed": DataFrame({k: data[k] for k in ["A", "B", "C", "D"]}),
"mi": DataFrame(
{"A": np.arange(5).astype(np.float64), "B": np.arange(5).astype(np.int64)},
index=MultiIndex.from_tuples(
tuple(
zip(
*[
["bar", "bar", "baz", "baz", "baz"],
["one", "two", "one", "two", "three"],
]
)
),
names=["first", "second"],
),
),
"dup": DataFrame(
np.arange(15).reshape(5, 3).astype(np.float64), columns=["A", "B", "A"]
),
"cat_onecol": DataFrame({"A": Categorical(["foo", "bar"])}),
"cat_and_float": DataFrame(
{
"A": Categorical(["foo", "bar", "baz"]),
"B": np.arange(3).astype(np.int64),
}
),
"mixed_dup": mixed_dup_df,
"dt_mixed_tzs": DataFrame(
{
"A": Timestamp("20130102", tz="US/Eastern"),
"B": Timestamp("20130603", tz="CET"),
},
index=range(5),
),
"dt_mixed2_tzs": DataFrame(
{
"A": Timestamp("20130102", tz="US/Eastern"),
"B": Timestamp("20130603", tz="CET"),
"C": Timestamp("20130603", tz="UTC"),
},
index=range(5),
),
"string": DataFrame(
{
"A": Series(["foo", "bar", "baz", "qux", "quux"], dtype="string"),
"B": Series(["one", "two", "one", "two", "three"], dtype="string"),
}
),
}
cat = {
"int8": Categorical(list("abcdefg")),
"int16": Categorical(np.arange(1000)),
"int32": Categorical(np.arange(10000)),
}
timestamp = {
"normal": Timestamp("2011-01-01"),
"nat": NaT,
"tz": Timestamp("2011-01-01", tz="US/Eastern"),
}
off = {
"DateOffset": DateOffset(years=1),
"DateOffset_h_ns": DateOffset(hour=6, nanoseconds=5824),
"BusinessDay": BusinessDay(offset=timedelta(seconds=9)),
"BusinessHour": BusinessHour(normalize=True, n=6, end="15:14"),
"CustomBusinessDay": CustomBusinessDay(weekmask="Mon Fri"),
"SemiMonthBegin": SemiMonthBegin(day_of_month=9),
"SemiMonthEnd": SemiMonthEnd(day_of_month=24),
"MonthBegin": MonthBegin(1),
"MonthEnd": MonthEnd(1),
"QuarterBegin": QuarterBegin(1),
"QuarterEnd": QuarterEnd(1),
"Day": Day(1),
"YearBegin": YearBegin(1),
"YearEnd": YearEnd(1),
"Week": Week(1),
"Week_Tues": Week(2, normalize=False, weekday=1),
"WeekOfMonth": WeekOfMonth(week=3, weekday=4),
"LastWeekOfMonth": LastWeekOfMonth(n=1, weekday=3),
"FY5253": FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
"Easter": Easter(),
"Hour": Hour(1),
"Minute": Minute(1),
}
return {
"series": series,
"frame": frame,
"index": index,
"scalars": scalars,
"mi": mi,
"sp_series": {"float": _create_sp_series(), "ts": _create_sp_tsseries()},
"sp_frame": {"float": _create_sp_frame()},
"cat": cat,
"timestamp": timestamp,
"offsets": off,
}
def platform_name():
return "_".join(
[
str(pandas.__version__),
str(pl.machine()),
str(pl.system().lower()),
str(pl.python_version()),
]
)
def write_legacy_pickles(output_dir):
version = pandas.__version__
print(
"This script generates a storage file for the current arch, system, "
"and python version"
)
print(f" pandas version: {version}")
print(f" output dir : {output_dir}")
print(" storage format: pickle")
pth = f"{platform_name()}.pickle"
with open(os.path.join(output_dir, pth), "wb") as fh:
pickle.dump(create_pickle_data(), fh, pickle.DEFAULT_PROTOCOL)
print(f"created pickle file: {pth}")
def write_legacy_file():
# force our cwd to be the first searched
sys.path.insert(0, "")
if not 3 <= len(sys.argv) <= 4:
sys.exit(
"Specify output directory and storage type: generate_legacy_"
"storage_files.py <output_dir> <storage_type> "
)
output_dir = str(sys.argv[1])
storage_type = str(sys.argv[2])
if not os.path.exists(output_dir):
os.mkdir(output_dir)
if storage_type == "pickle":
write_legacy_pickles(output_dir=output_dir)
else:
sys.exit("storage_type must be one of {'pickle'}")
if __name__ == "__main__":
write_legacy_file()
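# Illustrative sketch (not part of the committed script): loading a generated
# file back under a current pandas, with a path from a hypothetical earlier run.
def read_legacy_pickle(path):
    # pandas.read_pickle falls back to the stdlib pickle machinery, so the dict
    # returned by create_pickle_data() round-trips as a plain dict of objects.
    return pandas.read_pickle(path)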

View File

@ -0,0 +1,9 @@
import pytest
@pytest.fixture(params=["split", "records", "index", "columns", "values"])
def orient(request):
"""
Fixture for orients excluding the table format.
"""
return request.param

View File

@ -0,0 +1,130 @@
from io import (
BytesIO,
StringIO,
)
import pytest
import pandas.util._test_decorators as td
import pandas as pd
import pandas._testing as tm
def test_compression_roundtrip(compression):
df = pd.DataFrame(
[[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
index=["A", "B"],
columns=["X", "Y", "Z"],
)
with tm.ensure_clean() as path:
df.to_json(path, compression=compression)
tm.assert_frame_equal(df, pd.read_json(path, compression=compression))
# explicitly ensure file was compressed.
with tm.decompress_file(path, compression) as fh:
result = fh.read().decode("utf8")
data = StringIO(result)
tm.assert_frame_equal(df, pd.read_json(data))
def test_read_zipped_json(datapath):
uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json")
uncompressed_df = pd.read_json(uncompressed_path)
compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip")
compressed_df = pd.read_json(compressed_path, compression="zip")
tm.assert_frame_equal(uncompressed_df, compressed_df)
@td.skip_if_not_us_locale
@pytest.mark.single_cpu
def test_with_s3_url(compression, s3_public_bucket, s3so):
# Bucket created in tests/io/conftest.py
df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
with tm.ensure_clean() as path:
df.to_json(path, compression=compression)
with open(path, "rb") as f:
s3_public_bucket.put_object(Key="test-1", Body=f)
roundtripped_df = pd.read_json(
f"s3://{s3_public_bucket.name}/test-1",
compression=compression,
storage_options=s3so,
)
tm.assert_frame_equal(df, roundtripped_df)
def test_lines_with_compression(compression):
with tm.ensure_clean() as path:
df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
df.to_json(path, orient="records", lines=True, compression=compression)
roundtripped_df = pd.read_json(path, lines=True, compression=compression)
tm.assert_frame_equal(df, roundtripped_df)
def test_chunksize_with_compression(compression):
with tm.ensure_clean() as path:
df = pd.read_json(StringIO('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}'))
df.to_json(path, orient="records", lines=True, compression=compression)
with pd.read_json(
path, lines=True, chunksize=1, compression=compression
) as res:
roundtripped_df = pd.concat(res)
tm.assert_frame_equal(df, roundtripped_df)
def test_write_unsupported_compression_type():
df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
with tm.ensure_clean() as path:
msg = "Unrecognized compression type: unsupported"
with pytest.raises(ValueError, match=msg):
df.to_json(path, compression="unsupported")
def test_read_unsupported_compression_type():
with tm.ensure_clean() as path:
msg = "Unrecognized compression type: unsupported"
with pytest.raises(ValueError, match=msg):
pd.read_json(path, compression="unsupported")
@pytest.mark.parametrize(
"infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
)
@pytest.mark.parametrize("to_infer", [True, False])
@pytest.mark.parametrize("read_infer", [True, False])
def test_to_json_compression(
compression_only, read_infer, to_infer, compression_to_extension, infer_string
):
with pd.option_context("future.infer_string", infer_string):
# see gh-15008
compression = compression_only
# We'll complete file extension subsequently.
filename = "test."
filename += compression_to_extension[compression]
df = pd.DataFrame({"A": [1]})
to_compression = "infer" if to_infer else compression
read_compression = "infer" if read_infer else compression
with tm.ensure_clean(filename) as path:
df.to_json(path, compression=to_compression)
result = pd.read_json(path, compression=read_compression)
tm.assert_frame_equal(result, df)
def test_to_json_compression_mode(compression):
# GH 39985 (read_json does not support user-provided binary files)
expected = pd.DataFrame({"A": [1]})
with BytesIO() as buffer:
expected.to_json(buffer, compression=compression)
# df = pd.read_json(buffer, compression=compression)
# tm.assert_frame_equal(expected, df)
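# Illustrative sketch (not part of the committed test file): because read_json
# cannot consume the binary buffer directly, a round-trip check would go through
# a real path instead, letting read_json undo the compression itself.
def roundtrip_via_path(frame, compression):
    with tm.ensure_clean() as path:
        frame.to_json(path, compression=compression)
        return pd.read_json(path, compression=compression)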

View File

@ -0,0 +1,21 @@
"""
Tests for the deprecated keyword arguments for `read_json`.
"""
from io import StringIO
import pandas as pd
import pandas._testing as tm
from pandas.io.json import read_json
def test_good_kwargs():
df = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2])
with tm.assert_produces_warning(None):
data1 = StringIO(df.to_json(orient="split"))
tm.assert_frame_equal(df, read_json(data1, orient="split"))
data2 = StringIO(df.to_json(orient="columns"))
tm.assert_frame_equal(df, read_json(data2, orient="columns"))
data3 = StringIO(df.to_json(orient="index"))
tm.assert_frame_equal(df, read_json(data3, orient="index"))

View File

@ -0,0 +1,873 @@
"""Tests for Table Schema integration."""
from collections import OrderedDict
from io import StringIO
import json
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import (
CategoricalDtype,
DatetimeTZDtype,
PeriodDtype,
)
import pandas as pd
from pandas import DataFrame
import pandas._testing as tm
from pandas.io.json._table_schema import (
as_json_table_type,
build_table_schema,
convert_json_field_to_pandas_type,
convert_pandas_type_to_json_field,
set_default_names,
)
@pytest.fixture
def df_schema():
return DataFrame(
{
"A": [1, 2, 3, 4],
"B": ["a", "b", "c", "c"],
"C": pd.date_range("2016-01-01", freq="d", periods=4),
"D": pd.timedelta_range("1h", periods=4, freq="min"),
},
index=pd.Index(range(4), name="idx"),
)
@pytest.fixture
def df_table():
return DataFrame(
{
"A": [1, 2, 3, 4],
"B": ["a", "b", "c", "c"],
"C": pd.date_range("2016-01-01", freq="d", periods=4),
"D": pd.timedelta_range("1h", periods=4, freq="min"),
"E": pd.Series(pd.Categorical(["a", "b", "c", "c"])),
"F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)),
"G": [1.0, 2.0, 3, 4.0],
"H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"),
},
index=pd.Index(range(4), name="idx"),
)
class TestBuildSchema:
def test_build_table_schema(self, df_schema, using_infer_string):
result = build_table_schema(df_schema, version=False)
expected = {
"fields": [
{"name": "idx", "type": "integer"},
{"name": "A", "type": "integer"},
{"name": "B", "type": "string"},
{"name": "C", "type": "datetime"},
{"name": "D", "type": "duration"},
],
"primaryKey": ["idx"],
}
if using_infer_string:
expected["fields"][2] = {"name": "B", "type": "string", "extDtype": "str"}
assert result == expected
result = build_table_schema(df_schema)
assert "pandas_version" in result
def test_series(self):
s = pd.Series([1, 2, 3], name="foo")
result = build_table_schema(s, version=False)
expected = {
"fields": [
{"name": "index", "type": "integer"},
{"name": "foo", "type": "integer"},
],
"primaryKey": ["index"],
}
assert result == expected
result = build_table_schema(s)
assert "pandas_version" in result
def test_series_unnamed(self):
result = build_table_schema(pd.Series([1, 2, 3]), version=False)
expected = {
"fields": [
{"name": "index", "type": "integer"},
{"name": "values", "type": "integer"},
],
"primaryKey": ["index"],
}
assert result == expected
def test_multiindex(self, df_schema, using_infer_string):
df = df_schema
idx = pd.MultiIndex.from_product([("a", "b"), (1, 2)])
df.index = idx
result = build_table_schema(df, version=False)
expected = {
"fields": [
{"name": "level_0", "type": "string"},
{"name": "level_1", "type": "integer"},
{"name": "A", "type": "integer"},
{"name": "B", "type": "string"},
{"name": "C", "type": "datetime"},
{"name": "D", "type": "duration"},
],
"primaryKey": ["level_0", "level_1"],
}
if using_infer_string:
expected["fields"][0] = {
"name": "level_0",
"type": "string",
"extDtype": "str",
}
expected["fields"][3] = {"name": "B", "type": "string", "extDtype": "str"}
assert result == expected
df.index.names = ["idx0", None]
expected["fields"][0]["name"] = "idx0"
expected["primaryKey"] = ["idx0", "level_1"]
result = build_table_schema(df, version=False)
assert result == expected
class TestTableSchemaType:
@pytest.mark.parametrize("int_type", [int, np.int16, np.int32, np.int64])
def test_as_json_table_type_int_data(self, int_type):
int_data = [1, 2, 3]
assert as_json_table_type(np.array(int_data, dtype=int_type).dtype) == "integer"
@pytest.mark.parametrize("float_type", [float, np.float16, np.float32, np.float64])
def test_as_json_table_type_float_data(self, float_type):
float_data = [1.0, 2.0, 3.0]
assert (
as_json_table_type(np.array(float_data, dtype=float_type).dtype) == "number"
)
@pytest.mark.parametrize("bool_type", [bool, np.bool_])
def test_as_json_table_type_bool_data(self, bool_type):
bool_data = [True, False]
assert (
as_json_table_type(np.array(bool_data, dtype=bool_type).dtype) == "boolean"
)
@pytest.mark.parametrize(
"date_data",
[
pd.to_datetime(["2016"]),
pd.to_datetime(["2016"], utc=True),
pd.Series(pd.to_datetime(["2016"])),
pd.Series(pd.to_datetime(["2016"], utc=True)),
pd.period_range("2016", freq="Y", periods=3),
],
)
def test_as_json_table_type_date_data(self, date_data):
assert as_json_table_type(date_data.dtype) == "datetime"
@pytest.mark.parametrize(
"str_data",
[pd.Series(["a", "b"], dtype=object), pd.Index(["a", "b"], dtype=object)],
)
def test_as_json_table_type_string_data(self, str_data):
assert as_json_table_type(str_data.dtype) == "string"
@pytest.mark.parametrize(
"cat_data",
[
pd.Categorical(["a"]),
pd.Categorical([1]),
pd.Series(pd.Categorical([1])),
pd.CategoricalIndex([1]),
pd.Categorical([1]),
],
)
def test_as_json_table_type_categorical_data(self, cat_data):
assert as_json_table_type(cat_data.dtype) == "any"
# ------
# dtypes
# ------
@pytest.mark.parametrize("int_dtype", [int, np.int16, np.int32, np.int64])
def test_as_json_table_type_int_dtypes(self, int_dtype):
assert as_json_table_type(int_dtype) == "integer"
@pytest.mark.parametrize("float_dtype", [float, np.float16, np.float32, np.float64])
def test_as_json_table_type_float_dtypes(self, float_dtype):
assert as_json_table_type(float_dtype) == "number"
@pytest.mark.parametrize("bool_dtype", [bool, np.bool_])
def test_as_json_table_type_bool_dtypes(self, bool_dtype):
assert as_json_table_type(bool_dtype) == "boolean"
@pytest.mark.parametrize(
"date_dtype",
[
np.dtype("<M8[ns]"),
PeriodDtype("D"),
DatetimeTZDtype("ns", "US/Central"),
],
)
def test_as_json_table_type_date_dtypes(self, date_dtype):
# TODO: datetime.date? datetime.time?
assert as_json_table_type(date_dtype) == "datetime"
@pytest.mark.parametrize("td_dtype", [np.dtype("<m8[ns]")])
def test_as_json_table_type_timedelta_dtypes(self, td_dtype):
assert as_json_table_type(td_dtype) == "duration"
@pytest.mark.parametrize("str_dtype", [object]) # TODO(GH#14904) flesh out dtypes?
def test_as_json_table_type_string_dtypes(self, str_dtype):
assert as_json_table_type(str_dtype) == "string"
def test_as_json_table_type_categorical_dtypes(self):
assert as_json_table_type(pd.Categorical(["a"]).dtype) == "any"
assert as_json_table_type(CategoricalDtype()) == "any"
class TestTableOrient:
def test_build_series(self):
s = pd.Series([1, 2], name="a")
s.index.name = "id"
result = s.to_json(orient="table", date_format="iso")
result = json.loads(result, object_pairs_hook=OrderedDict)
assert "pandas_version" in result["schema"]
result["schema"].pop("pandas_version")
fields = [{"name": "id", "type": "integer"}, {"name": "a", "type": "integer"}]
schema = {"fields": fields, "primaryKey": ["id"]}
expected = OrderedDict(
[
("schema", schema),
(
"data",
[
OrderedDict([("id", 0), ("a", 1)]),
OrderedDict([("id", 1), ("a", 2)]),
],
),
]
)
assert result == expected
def test_read_json_from_to_json_results(self):
# GH32383
df = DataFrame(
{
"_id": {"row_0": 0},
"category": {"row_0": "Goods"},
"recommender_id": {"row_0": 3},
"recommender_name_jp": {"row_0": "浦田"},
"recommender_name_en": {"row_0": "Urata"},
"name_jp": {"row_0": "博多人形(松尾吉将まつお よしまさ)"},
"name_en": {"row_0": "Hakata Dolls Matsuo"},
}
)
result1 = pd.read_json(StringIO(df.to_json()))
result2 = DataFrame.from_dict(json.loads(df.to_json()))
tm.assert_frame_equal(result1, df)
tm.assert_frame_equal(result2, df)
def test_to_json(self, df_table, using_infer_string):
df = df_table
df.index.name = "idx"
result = df.to_json(orient="table", date_format="iso")
result = json.loads(result, object_pairs_hook=OrderedDict)
assert "pandas_version" in result["schema"]
result["schema"].pop("pandas_version")
fields = [
{"name": "idx", "type": "integer"},
{"name": "A", "type": "integer"},
{"name": "B", "type": "string"},
{"name": "C", "type": "datetime"},
{"name": "D", "type": "duration"},
{
"constraints": {"enum": ["a", "b", "c"]},
"name": "E",
"ordered": False,
"type": "any",
},
{
"constraints": {"enum": ["a", "b", "c"]},
"name": "F",
"ordered": True,
"type": "any",
},
{"name": "G", "type": "number"},
{"name": "H", "type": "datetime", "tz": "US/Central"},
]
if using_infer_string:
fields[2] = {"name": "B", "type": "string", "extDtype": "str"}
schema = {"fields": fields, "primaryKey": ["idx"]}
data = [
OrderedDict(
[
("idx", 0),
("A", 1),
("B", "a"),
("C", "2016-01-01T00:00:00.000"),
("D", "P0DT1H0M0S"),
("E", "a"),
("F", "a"),
("G", 1.0),
("H", "2016-01-01T06:00:00.000Z"),
]
),
OrderedDict(
[
("idx", 1),
("A", 2),
("B", "b"),
("C", "2016-01-02T00:00:00.000"),
("D", "P0DT1H1M0S"),
("E", "b"),
("F", "b"),
("G", 2.0),
("H", "2016-01-02T06:00:00.000Z"),
]
),
OrderedDict(
[
("idx", 2),
("A", 3),
("B", "c"),
("C", "2016-01-03T00:00:00.000"),
("D", "P0DT1H2M0S"),
("E", "c"),
("F", "c"),
("G", 3.0),
("H", "2016-01-03T06:00:00.000Z"),
]
),
OrderedDict(
[
("idx", 3),
("A", 4),
("B", "c"),
("C", "2016-01-04T00:00:00.000"),
("D", "P0DT1H3M0S"),
("E", "c"),
("F", "c"),
("G", 4.0),
("H", "2016-01-04T06:00:00.000Z"),
]
),
]
expected = OrderedDict([("schema", schema), ("data", data)])
assert result == expected
def test_to_json_float_index(self):
data = pd.Series(1, index=[1.0, 2.0])
result = data.to_json(orient="table", date_format="iso")
result = json.loads(result, object_pairs_hook=OrderedDict)
result["schema"].pop("pandas_version")
expected = OrderedDict(
[
(
"schema",
{
"fields": [
{"name": "index", "type": "number"},
{"name": "values", "type": "integer"},
],
"primaryKey": ["index"],
},
),
(
"data",
[
OrderedDict([("index", 1.0), ("values", 1)]),
OrderedDict([("index", 2.0), ("values", 1)]),
],
),
]
)
assert result == expected
def test_to_json_period_index(self):
idx = pd.period_range("2016", freq="Q-JAN", periods=2)
data = pd.Series(1, idx)
result = data.to_json(orient="table", date_format="iso")
result = json.loads(result, object_pairs_hook=OrderedDict)
result["schema"].pop("pandas_version")
fields = [
{"freq": "QE-JAN", "name": "index", "type": "datetime"},
{"name": "values", "type": "integer"},
]
schema = {"fields": fields, "primaryKey": ["index"]}
data = [
OrderedDict([("index", "2015-11-01T00:00:00.000"), ("values", 1)]),
OrderedDict([("index", "2016-02-01T00:00:00.000"), ("values", 1)]),
]
expected = OrderedDict([("schema", schema), ("data", data)])
assert result == expected
def test_to_json_categorical_index(self):
data = pd.Series(1, pd.CategoricalIndex(["a", "b"]))
result = data.to_json(orient="table", date_format="iso")
result = json.loads(result, object_pairs_hook=OrderedDict)
result["schema"].pop("pandas_version")
expected = OrderedDict(
[
(
"schema",
{
"fields": [
{
"name": "index",
"type": "any",
"constraints": {"enum": ["a", "b"]},
"ordered": False,
},
{"name": "values", "type": "integer"},
],
"primaryKey": ["index"],
},
),
(
"data",
[
OrderedDict([("index", "a"), ("values", 1)]),
OrderedDict([("index", "b"), ("values", 1)]),
],
),
]
)
assert result == expected
def test_date_format_raises(self, df_table):
msg = (
"Trying to write with `orient='table'` and `date_format='epoch'`. Table "
"Schema requires dates to be formatted with `date_format='iso'`"
)
with pytest.raises(ValueError, match=msg):
df_table.to_json(orient="table", date_format="epoch")
# others work
df_table.to_json(orient="table", date_format="iso")
df_table.to_json(orient="table")
def test_convert_pandas_type_to_json_field_int(self, index_or_series):
kind = index_or_series
data = [1, 2, 3]
result = convert_pandas_type_to_json_field(kind(data, name="name"))
expected = {"name": "name", "type": "integer"}
assert result == expected
def test_convert_pandas_type_to_json_field_float(self, index_or_series):
kind = index_or_series
data = [1.0, 2.0, 3.0]
result = convert_pandas_type_to_json_field(kind(data, name="name"))
expected = {"name": "name", "type": "number"}
assert result == expected
@pytest.mark.parametrize(
"dt_args,extra_exp", [({}, {}), ({"utc": True}, {"tz": "UTC"})]
)
@pytest.mark.parametrize("wrapper", [None, pd.Series])
def test_convert_pandas_type_to_json_field_datetime(
self, dt_args, extra_exp, wrapper
):
data = [1.0, 2.0, 3.0]
data = pd.to_datetime(data, **dt_args)
if wrapper is pd.Series:
data = pd.Series(data, name="values")
result = convert_pandas_type_to_json_field(data)
expected = {"name": "values", "type": "datetime"}
expected.update(extra_exp)
assert result == expected
def test_convert_pandas_type_to_json_period_range(self):
arr = pd.period_range("2016", freq="Y-DEC", periods=4)
result = convert_pandas_type_to_json_field(arr)
expected = {"name": "values", "type": "datetime", "freq": "YE-DEC"}
assert result == expected
@pytest.mark.parametrize("kind", [pd.Categorical, pd.CategoricalIndex])
@pytest.mark.parametrize("ordered", [True, False])
def test_convert_pandas_type_to_json_field_categorical(self, kind, ordered):
data = ["a", "b", "c"]
if kind is pd.Categorical:
arr = pd.Series(kind(data, ordered=ordered), name="cats")
elif kind is pd.CategoricalIndex:
arr = kind(data, ordered=ordered, name="cats")
result = convert_pandas_type_to_json_field(arr)
expected = {
"name": "cats",
"type": "any",
"constraints": {"enum": data},
"ordered": ordered,
}
assert result == expected
@pytest.mark.parametrize(
"inp,exp",
[
({"type": "integer"}, "int64"),
({"type": "number"}, "float64"),
({"type": "boolean"}, "bool"),
({"type": "duration"}, "timedelta64"),
({"type": "datetime"}, "datetime64[ns]"),
({"type": "datetime", "tz": "US/Hawaii"}, "datetime64[ns, US/Hawaii]"),
({"type": "any"}, "object"),
(
{
"type": "any",
"constraints": {"enum": ["a", "b", "c"]},
"ordered": False,
},
CategoricalDtype(categories=["a", "b", "c"], ordered=False),
),
(
{
"type": "any",
"constraints": {"enum": ["a", "b", "c"]},
"ordered": True,
},
CategoricalDtype(categories=["a", "b", "c"], ordered=True),
),
({"type": "string"}, None),
],
)
def test_convert_json_field_to_pandas_type(self, inp, exp):
field = {"name": "foo"}
field.update(inp)
assert convert_json_field_to_pandas_type(field) == exp
@pytest.mark.parametrize("inp", ["geopoint", "geojson", "fake_type"])
def test_convert_json_field_to_pandas_type_raises(self, inp):
field = {"type": inp}
with pytest.raises(
ValueError, match=f"Unsupported or invalid field type: {inp}"
):
convert_json_field_to_pandas_type(field)
def test_categorical(self):
s = pd.Series(pd.Categorical(["a", "b", "a"]))
s.index.name = "idx"
result = s.to_json(orient="table", date_format="iso")
result = json.loads(result, object_pairs_hook=OrderedDict)
result["schema"].pop("pandas_version")
fields = [
{"name": "idx", "type": "integer"},
{
"constraints": {"enum": ["a", "b"]},
"name": "values",
"ordered": False,
"type": "any",
},
]
expected = OrderedDict(
[
("schema", {"fields": fields, "primaryKey": ["idx"]}),
(
"data",
[
OrderedDict([("idx", 0), ("values", "a")]),
OrderedDict([("idx", 1), ("values", "b")]),
OrderedDict([("idx", 2), ("values", "a")]),
],
),
]
)
assert result == expected
@pytest.mark.parametrize(
"idx,nm,prop",
[
(pd.Index([1]), "index", "name"),
(pd.Index([1], name="myname"), "myname", "name"),
(
pd.MultiIndex.from_product([("a", "b"), ("c", "d")]),
["level_0", "level_1"],
"names",
),
(
pd.MultiIndex.from_product(
[("a", "b"), ("c", "d")], names=["n1", "n2"]
),
["n1", "n2"],
"names",
),
(
pd.MultiIndex.from_product(
[("a", "b"), ("c", "d")], names=["n1", None]
),
["n1", "level_1"],
"names",
),
],
)
def test_set_names_unset(self, idx, nm, prop):
data = pd.Series(1, idx)
result = set_default_names(data)
assert getattr(result.index, prop) == nm
@pytest.mark.parametrize(
"idx",
[
pd.Index([], name="index"),
pd.MultiIndex.from_arrays([["foo"], ["bar"]], names=("level_0", "level_1")),
pd.MultiIndex.from_arrays([["foo"], ["bar"]], names=("foo", "level_1")),
],
)
def test_warns_non_roundtrippable_names(self, idx):
# GH 19130
df = DataFrame(index=idx)
df.index.name = "index"
with tm.assert_produces_warning():
set_default_names(df)
def test_timestamp_in_columns(self):
df = DataFrame(
[[1, 2]], columns=[pd.Timestamp("2016"), pd.Timedelta(10, unit="s")]
)
result = df.to_json(orient="table")
js = json.loads(result)
assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000"
assert js["schema"]["fields"][2]["name"] == "P0DT0H0M10S"
@pytest.mark.parametrize(
"case",
[
pd.Series([1], index=pd.Index([1], name="a"), name="a"),
DataFrame({"A": [1]}, index=pd.Index([1], name="A")),
DataFrame(
{"A": [1]},
index=pd.MultiIndex.from_arrays([["a"], [1]], names=["A", "a"]),
),
],
)
def test_overlapping_names(self, case):
with pytest.raises(ValueError, match="Overlapping"):
case.to_json(orient="table")
def test_mi_falsey_name(self):
# GH 16203
df = DataFrame(
np.random.default_rng(2).standard_normal((4, 4)),
index=pd.MultiIndex.from_product([("A", "B"), ("a", "b")]),
)
result = [x["name"] for x in build_table_schema(df)["fields"]]
assert result == ["level_0", "level_1", 0, 1, 2, 3]
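# Round-trip tests: output written with orient="table" should be read back
# unchanged by pd.read_json(..., orient="table").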
class TestTableOrientReader:
@pytest.mark.parametrize(
"index_nm",
[None, "idx", pytest.param("index", marks=pytest.mark.xfail), "level_0"],
)
@pytest.mark.parametrize(
"vals",
[
{"ints": [1, 2, 3, 4]},
{"objects": ["a", "b", "c", "d"]},
{"objects": ["1", "2", "3", "4"]},
{"date_ranges": pd.date_range("2016-01-01", freq="d", periods=4)},
{"categoricals": pd.Series(pd.Categorical(["a", "b", "c", "c"]))},
{
"ordered_cats": pd.Series(
pd.Categorical(["a", "b", "c", "c"], ordered=True)
)
},
{"floats": [1.0, 2.0, 3.0, 4.0]},
{"floats": [1.1, 2.2, 3.3, 4.4]},
{"bools": [True, False, False, True]},
{
"timezones": pd.date_range(
"2016-01-01", freq="d", periods=4, tz="US/Central"
) # added in GH 35973
},
],
)
def test_read_json_table_orient(self, index_nm, vals, recwarn):
df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
out = df.to_json(orient="table")
result = pd.read_json(out, orient="table")
tm.assert_frame_equal(df, result)
@pytest.mark.parametrize("index_nm", [None, "idx", "index"])
@pytest.mark.parametrize(
"vals",
[{"timedeltas": pd.timedelta_range("1h", periods=4, freq="min")}],
)
def test_read_json_table_orient_raises(self, index_nm, vals, recwarn):
df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
out = df.to_json(orient="table")
with pytest.raises(NotImplementedError, match="can not yet read "):
pd.read_json(out, orient="table")
@pytest.mark.parametrize(
"index_nm",
[None, "idx", pytest.param("index", marks=pytest.mark.xfail), "level_0"],
)
@pytest.mark.parametrize(
"vals",
[
{"ints": [1, 2, 3, 4]},
{"objects": ["a", "b", "c", "d"]},
{"objects": ["1", "2", "3", "4"]},
{"date_ranges": pd.date_range("2016-01-01", freq="d", periods=4)},
{"categoricals": pd.Series(pd.Categorical(["a", "b", "c", "c"]))},
{
"ordered_cats": pd.Series(
pd.Categorical(["a", "b", "c", "c"], ordered=True)
)
},
{"floats": [1.0, 2.0, 3.0, 4.0]},
{"floats": [1.1, 2.2, 3.3, 4.4]},
{"bools": [True, False, False, True]},
{
"timezones": pd.date_range(
"2016-01-01", freq="d", periods=4, tz="US/Central"
) # added in GH 35973
},
],
)
def test_read_json_table_period_orient(self, index_nm, vals, recwarn):
df = DataFrame(
vals,
index=pd.Index(
(pd.Period(f"2022Q{q}") for q in range(1, 5)), name=index_nm
),
)
out = df.to_json(orient="table")
result = pd.read_json(out, orient="table")
tm.assert_frame_equal(df, result)
@pytest.mark.parametrize(
"idx",
[
pd.Index(range(4)),
pd.date_range(
"2020-08-30",
freq="d",
periods=4,
)._with_freq(None),
pd.date_range(
"2020-08-30", freq="d", periods=4, tz="US/Central"
)._with_freq(None),
pd.MultiIndex.from_product(
[
pd.date_range("2020-08-30", freq="d", periods=2, tz="US/Central"),
["x", "y"],
],
),
],
)
@pytest.mark.parametrize(
"vals",
[
{"floats": [1.1, 2.2, 3.3, 4.4]},
{"dates": pd.date_range("2020-08-30", freq="d", periods=4)},
{
"timezones": pd.date_range(
"2020-08-30", freq="d", periods=4, tz="Europe/London"
)
},
],
)
def test_read_json_table_timezones_orient(self, idx, vals, recwarn):
# GH 35973
df = DataFrame(vals, index=idx)
out = df.to_json(orient="table")
result = pd.read_json(out, orient="table")
tm.assert_frame_equal(df, result)
def test_comprehensive(self):
df = DataFrame(
{
"A": [1, 2, 3, 4],
"B": ["a", "b", "c", "c"],
"C": pd.date_range("2016-01-01", freq="d", periods=4),
# 'D': pd.timedelta_range('1h', periods=4, freq='min'),
"E": pd.Series(pd.Categorical(["a", "b", "c", "c"])),
"F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)),
"G": [1.1, 2.2, 3.3, 4.4],
"H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"),
"I": [True, False, False, True],
},
index=pd.Index(range(4), name="idx"),
)
out = StringIO(df.to_json(orient="table"))
result = pd.read_json(out, orient="table")
tm.assert_frame_equal(df, result)
@pytest.mark.parametrize(
"index_names",
[[None, None], ["foo", "bar"], ["foo", None], [None, "foo"], ["index", "foo"]],
)
def test_multiindex(self, index_names):
# GH 18912
df = DataFrame(
[["Arr", "alpha", [1, 2, 3, 4]], ["Bee", "Beta", [10, 20, 30, 40]]],
index=[["A", "B"], ["Null", "Eins"]],
columns=["Aussprache", "Griechisch", "Args"],
)
df.index.names = index_names
out = StringIO(df.to_json(orient="table"))
result = pd.read_json(out, orient="table")
tm.assert_frame_equal(df, result)
def test_empty_frame_roundtrip(self):
# GH 21287
df = DataFrame(columns=["a", "b", "c"])
expected = df.copy()
out = StringIO(df.to_json(orient="table"))
result = pd.read_json(out, orient="table")
tm.assert_frame_equal(expected, result)
def test_read_json_orient_table_old_schema_version(self):
df_json = """
{
"schema":{
"fields":[
{"name":"index","type":"integer"},
{"name":"a","type":"string"}
],
"primaryKey":["index"],
"pandas_version":"0.20.0"
},
"data":[
{"index":0,"a":1},
{"index":1,"a":2.0},
{"index":2,"a":"s"}
]
}
"""
expected = DataFrame({"a": [1, 2.0, "s"]})
result = pd.read_json(StringIO(df_json), orient="table")
tm.assert_frame_equal(expected, result)
@pytest.mark.parametrize("freq", ["M", "2M", "Q", "2Q", "Y", "2Y"])
def test_read_json_table_orient_period_depr_freq(self, freq, recwarn):
# GH#9586
df = DataFrame(
{"ints": [1, 2]},
index=pd.PeriodIndex(["2020-01", "2021-06"], freq=freq),
)
out = df.to_json(orient="table")
result = pd.read_json(out, orient="table")
tm.assert_frame_equal(df, result)

View File

@ -0,0 +1,317 @@
"""Tests for ExtensionDtype Table Schema integration."""
from collections import OrderedDict
import datetime as dt
import decimal
from io import StringIO
import json
import pytest
from pandas import (
NA,
DataFrame,
Index,
array,
read_json,
)
import pandas._testing as tm
from pandas.core.arrays.integer import Int64Dtype
from pandas.core.arrays.string_ import StringDtype
from pandas.core.series import Series
from pandas.tests.extension.date import (
DateArray,
DateDtype,
)
from pandas.tests.extension.decimal.array import (
DecimalArray,
DecimalDtype,
)
from pandas.io.json._table_schema import (
as_json_table_type,
build_table_schema,
)
class TestBuildSchema:
def test_build_table_schema(self):
df = DataFrame(
{
"A": DateArray([dt.date(2021, 10, 10)]),
"B": DecimalArray([decimal.Decimal(10)]),
"C": array(["pandas"], dtype="string"),
"D": array([10], dtype="Int64"),
}
)
result = build_table_schema(df, version=False)
expected = {
"fields": [
{"name": "index", "type": "integer"},
{"name": "A", "type": "any", "extDtype": "DateDtype"},
{"name": "B", "type": "number", "extDtype": "decimal"},
{"name": "C", "type": "string", "extDtype": "string"},
{"name": "D", "type": "integer", "extDtype": "Int64"},
],
"primaryKey": ["index"],
}
assert result == expected
result = build_table_schema(df)
assert "pandas_version" in result
class TestTableSchemaType:
@pytest.mark.parametrize(
"date_data",
[
DateArray([dt.date(2021, 10, 10)]),
DateArray(dt.date(2021, 10, 10)),
Series(DateArray(dt.date(2021, 10, 10))),
],
)
def test_as_json_table_type_ext_date_array_dtype(self, date_data):
assert as_json_table_type(date_data.dtype) == "any"
def test_as_json_table_type_ext_date_dtype(self):
assert as_json_table_type(DateDtype()) == "any"
@pytest.mark.parametrize(
"decimal_data",
[
DecimalArray([decimal.Decimal(10)]),
Series(DecimalArray([decimal.Decimal(10)])),
],
)
def test_as_json_table_type_ext_decimal_array_dtype(self, decimal_data):
assert as_json_table_type(decimal_data.dtype) == "number"
def test_as_json_table_type_ext_decimal_dtype(self):
assert as_json_table_type(DecimalDtype()) == "number"
@pytest.mark.parametrize(
"string_data",
[
array(["pandas"], dtype="string"),
Series(array(["pandas"], dtype="string")),
],
)
def test_as_json_table_type_ext_string_array_dtype(self, string_data):
assert as_json_table_type(string_data.dtype) == "string"
def test_as_json_table_type_ext_string_dtype(self):
assert as_json_table_type(StringDtype()) == "string"
@pytest.mark.parametrize(
"integer_data",
[
array([10], dtype="Int64"),
Series(array([10], dtype="Int64")),
],
)
def test_as_json_table_type_ext_integer_array_dtype(self, integer_data):
assert as_json_table_type(integer_data.dtype) == "integer"
def test_as_json_table_type_ext_integer_dtype(self):
assert as_json_table_type(Int64Dtype()) == "integer"
class TestTableOrient:
@pytest.fixture
def da(self):
return DateArray([dt.date(2021, 10, 10)])
@pytest.fixture
def dc(self):
return DecimalArray([decimal.Decimal(10)])
@pytest.fixture
def sa(self):
return array(["pandas"], dtype="string")
@pytest.fixture
def ia(self):
return array([10], dtype="Int64")
@pytest.fixture
def df(self, da, dc, sa, ia):
return DataFrame(
{
"A": da,
"B": dc,
"C": sa,
"D": ia,
}
)
def test_build_date_series(self, da):
s = Series(da, name="a")
s.index.name = "id"
result = s.to_json(orient="table", date_format="iso")
result = json.loads(result, object_pairs_hook=OrderedDict)
assert "pandas_version" in result["schema"]
result["schema"].pop("pandas_version")
fields = [
{"name": "id", "type": "integer"},
{"name": "a", "type": "any", "extDtype": "DateDtype"},
]
schema = {"fields": fields, "primaryKey": ["id"]}
expected = OrderedDict(
[
("schema", schema),
("data", [OrderedDict([("id", 0), ("a", "2021-10-10T00:00:00.000")])]),
]
)
assert result == expected
def test_build_decimal_series(self, dc):
s = Series(dc, name="a")
s.index.name = "id"
result = s.to_json(orient="table", date_format="iso")
result = json.loads(result, object_pairs_hook=OrderedDict)
assert "pandas_version" in result["schema"]
result["schema"].pop("pandas_version")
fields = [
{"name": "id", "type": "integer"},
{"name": "a", "type": "number", "extDtype": "decimal"},
]
schema = {"fields": fields, "primaryKey": ["id"]}
expected = OrderedDict(
[
("schema", schema),
("data", [OrderedDict([("id", 0), ("a", 10.0)])]),
]
)
assert result == expected
def test_build_string_series(self, sa):
s = Series(sa, name="a")
s.index.name = "id"
result = s.to_json(orient="table", date_format="iso")
result = json.loads(result, object_pairs_hook=OrderedDict)
assert "pandas_version" in result["schema"]
result["schema"].pop("pandas_version")
fields = [
{"name": "id", "type": "integer"},
{"name": "a", "type": "string", "extDtype": "string"},
]
schema = {"fields": fields, "primaryKey": ["id"]}
expected = OrderedDict(
[
("schema", schema),
("data", [OrderedDict([("id", 0), ("a", "pandas")])]),
]
)
assert result == expected
def test_build_int64_series(self, ia):
s = Series(ia, name="a")
s.index.name = "id"
result = s.to_json(orient="table", date_format="iso")
result = json.loads(result, object_pairs_hook=OrderedDict)
assert "pandas_version" in result["schema"]
result["schema"].pop("pandas_version")
fields = [
{"name": "id", "type": "integer"},
{"name": "a", "type": "integer", "extDtype": "Int64"},
]
schema = {"fields": fields, "primaryKey": ["id"]}
expected = OrderedDict(
[
("schema", schema),
("data", [OrderedDict([("id", 0), ("a", 10)])]),
]
)
assert result == expected
def test_to_json(self, df):
df = df.copy()
df.index.name = "idx"
result = df.to_json(orient="table", date_format="iso")
result = json.loads(result, object_pairs_hook=OrderedDict)
assert "pandas_version" in result["schema"]
result["schema"].pop("pandas_version")
fields = [
OrderedDict({"name": "idx", "type": "integer"}),
OrderedDict({"name": "A", "type": "any", "extDtype": "DateDtype"}),
OrderedDict({"name": "B", "type": "number", "extDtype": "decimal"}),
OrderedDict({"name": "C", "type": "string", "extDtype": "string"}),
OrderedDict({"name": "D", "type": "integer", "extDtype": "Int64"}),
]
schema = OrderedDict({"fields": fields, "primaryKey": ["idx"]})
data = [
OrderedDict(
[
("idx", 0),
("A", "2021-10-10T00:00:00.000"),
("B", 10.0),
("C", "pandas"),
("D", 10),
]
)
]
expected = OrderedDict([("schema", schema), ("data", data)])
assert result == expected
def test_json_ext_dtype_reading_roundtrip(self):
# GH#40255
df = DataFrame(
{
"a": Series([2, NA], dtype="Int64"),
"b": Series([1.5, NA], dtype="Float64"),
"c": Series([True, NA], dtype="boolean"),
},
index=Index([1, NA], dtype="Int64"),
)
expected = df.copy()
data_json = df.to_json(orient="table", indent=4)
result = read_json(StringIO(data_json), orient="table")
tm.assert_frame_equal(result, expected)
def test_json_ext_dtype_reading(self):
# GH#40255
data_json = """{
"schema":{
"fields":[
{
"name":"a",
"type":"integer",
"extDtype":"Int64"
}
],
},
"data":[
{
"a":2
},
{
"a":null
}
]
}"""
result = read_json(StringIO(data_json), orient="table")
expected = DataFrame({"a": Series([2, NA], dtype="Int64")})
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,907 @@
import json
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
Series,
json_normalize,
)
import pandas._testing as tm
from pandas.io.json._normalize import nested_to_record
@pytest.fixture
def deep_nested():
# deeply nested data
return [
{
"country": "USA",
"states": [
{
"name": "California",
"cities": [
{"name": "San Francisco", "pop": 12345},
{"name": "Los Angeles", "pop": 12346},
],
},
{
"name": "Ohio",
"cities": [
{"name": "Columbus", "pop": 1234},
{"name": "Cleveland", "pop": 1236},
],
},
],
},
{
"country": "Germany",
"states": [
{"name": "Bayern", "cities": [{"name": "Munich", "pop": 12347}]},
{
"name": "Nordrhein-Westfalen",
"cities": [
{"name": "Duesseldorf", "pop": 1238},
{"name": "Koeln", "pop": 1239},
],
},
],
},
]
@pytest.fixture
def state_data():
return [
{
"counties": [
{"name": "Dade", "population": 12345},
{"name": "Broward", "population": 40000},
{"name": "Palm Beach", "population": 60000},
],
"info": {"governor": "Rick Scott"},
"shortname": "FL",
"state": "Florida",
},
{
"counties": [
{"name": "Summit", "population": 1234},
{"name": "Cuyahoga", "population": 1337},
],
"info": {"governor": "John Kasich"},
"shortname": "OH",
"state": "Ohio",
},
]
@pytest.fixture
def author_missing_data():
return [
{"info": None},
{
"info": {"created_at": "11/08/1993", "last_updated": "26/05/2012"},
"author_name": {"first": "Jane", "last_name": "Doe"},
},
]
@pytest.fixture
def missing_metadata():
return [
{
"name": "Alice",
"addresses": [
{
"number": 9562,
"street": "Morris St.",
"city": "Massillon",
"state": "OH",
"zip": 44646,
}
],
"previous_residences": {"cities": [{"city_name": "Foo York City"}]},
},
{
"addresses": [
{
"number": 8449,
"street": "Spring St.",
"city": "Elizabethton",
"state": "TN",
"zip": 37643,
}
],
"previous_residences": {"cities": [{"city_name": "Barmingham"}]},
},
]
@pytest.fixture
def max_level_test_input_data():
"""
input data to test json_normalize with max_level param
"""
return [
{
"CreatedBy": {"Name": "User001"},
"Lookup": {
"TextField": "Some text",
"UserField": {"Id": "ID001", "Name": "Name001"},
},
"Image": {"a": "b"},
}
]
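# json_normalize flattens nested JSON-like records into columns; record_path
# selects the list of records to expand and meta carries sibling fields along.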
class TestJSONNormalize:
def test_simple_records(self):
recs = [
{"a": 1, "b": 2, "c": 3},
{"a": 4, "b": 5, "c": 6},
{"a": 7, "b": 8, "c": 9},
{"a": 10, "b": 11, "c": 12},
]
result = json_normalize(recs)
expected = DataFrame(recs)
tm.assert_frame_equal(result, expected)
def test_simple_normalize(self, state_data):
result = json_normalize(state_data[0], "counties")
expected = DataFrame(state_data[0]["counties"])
tm.assert_frame_equal(result, expected)
result = json_normalize(state_data, "counties")
expected = []
for rec in state_data:
expected.extend(rec["counties"])
expected = DataFrame(expected)
tm.assert_frame_equal(result, expected)
result = json_normalize(state_data, "counties", meta="state")
expected["state"] = np.array(["Florida", "Ohio"]).repeat([3, 2])
tm.assert_frame_equal(result, expected)
def test_fields_list_type_normalize(self):
parse_metadata_fields_list_type = [
{"values": [1, 2, 3], "metadata": {"listdata": [1, 2]}}
]
result = json_normalize(
parse_metadata_fields_list_type,
record_path=["values"],
meta=[["metadata", "listdata"]],
)
expected = DataFrame(
{0: [1, 2, 3], "metadata.listdata": [[1, 2], [1, 2], [1, 2]]}
)
tm.assert_frame_equal(result, expected)
def test_empty_array(self):
result = json_normalize([])
expected = DataFrame()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data, record_path, exception_type",
[
([{"a": 0}, {"a": 1}], None, None),
({"a": [{"a": 0}, {"a": 1}]}, "a", None),
('{"a": [{"a": 0}, {"a": 1}]}', None, NotImplementedError),
(None, None, NotImplementedError),
],
)
def test_accepted_input(self, data, record_path, exception_type):
if exception_type is not None:
with pytest.raises(exception_type, match=""):
json_normalize(data, record_path=record_path)
else:
result = json_normalize(data, record_path=record_path)
expected = DataFrame([0, 1], columns=["a"])
tm.assert_frame_equal(result, expected)
def test_simple_normalize_with_separator(self, deep_nested):
# GH 14883
result = json_normalize({"A": {"A": 1, "B": 2}})
expected = DataFrame([[1, 2]], columns=["A.A", "A.B"])
tm.assert_frame_equal(result.reindex_like(expected), expected)
result = json_normalize({"A": {"A": 1, "B": 2}}, sep="_")
expected = DataFrame([[1, 2]], columns=["A_A", "A_B"])
tm.assert_frame_equal(result.reindex_like(expected), expected)
result = json_normalize({"A": {"A": 1, "B": 2}}, sep="\u03c3")
expected = DataFrame([[1, 2]], columns=["A\u03c3A", "A\u03c3B"])
tm.assert_frame_equal(result.reindex_like(expected), expected)
result = json_normalize(
deep_nested,
["states", "cities"],
meta=["country", ["states", "name"]],
sep="_",
)
expected = Index(["name", "pop", "country", "states_name"]).sort_values()
assert result.columns.sort_values().equals(expected)
def test_normalize_with_multichar_separator(self):
# GH #43831
data = {"a": [1, 2], "b": {"b_1": 2, "b_2": (3, 4)}}
result = json_normalize(data, sep="__")
expected = DataFrame([[[1, 2], 2, (3, 4)]], columns=["a", "b__b_1", "b__b_2"])
tm.assert_frame_equal(result, expected)
def test_value_array_record_prefix(self):
# GH 21536
result = json_normalize({"A": [1, 2]}, "A", record_prefix="Prefix.")
expected = DataFrame([[1], [2]], columns=["Prefix.0"])
tm.assert_frame_equal(result, expected)
def test_nested_object_record_path(self):
# GH 22706
data = {
"state": "Florida",
"info": {
"governor": "Rick Scott",
"counties": [
{"name": "Dade", "population": 12345},
{"name": "Broward", "population": 40000},
{"name": "Palm Beach", "population": 60000},
],
},
}
result = json_normalize(data, record_path=["info", "counties"])
expected = DataFrame(
[["Dade", 12345], ["Broward", 40000], ["Palm Beach", 60000]],
columns=["name", "population"],
)
tm.assert_frame_equal(result, expected)
def test_more_deeply_nested(self, deep_nested):
result = json_normalize(
deep_nested, ["states", "cities"], meta=["country", ["states", "name"]]
)
ex_data = {
"country": ["USA"] * 4 + ["Germany"] * 3,
"states.name": [
"California",
"California",
"Ohio",
"Ohio",
"Bayern",
"Nordrhein-Westfalen",
"Nordrhein-Westfalen",
],
"name": [
"San Francisco",
"Los Angeles",
"Columbus",
"Cleveland",
"Munich",
"Duesseldorf",
"Koeln",
],
"pop": [12345, 12346, 1234, 1236, 12347, 1238, 1239],
}
expected = DataFrame(ex_data, columns=result.columns)
tm.assert_frame_equal(result, expected)
def test_shallow_nested(self):
data = [
{
"state": "Florida",
"shortname": "FL",
"info": {"governor": "Rick Scott"},
"counties": [
{"name": "Dade", "population": 12345},
{"name": "Broward", "population": 40000},
{"name": "Palm Beach", "population": 60000},
],
},
{
"state": "Ohio",
"shortname": "OH",
"info": {"governor": "John Kasich"},
"counties": [
{"name": "Summit", "population": 1234},
{"name": "Cuyahoga", "population": 1337},
],
},
]
result = json_normalize(
data, "counties", ["state", "shortname", ["info", "governor"]]
)
ex_data = {
"name": ["Dade", "Broward", "Palm Beach", "Summit", "Cuyahoga"],
"state": ["Florida"] * 3 + ["Ohio"] * 2,
"shortname": ["FL", "FL", "FL", "OH", "OH"],
"info.governor": ["Rick Scott"] * 3 + ["John Kasich"] * 2,
"population": [12345, 40000, 60000, 1234, 1337],
}
expected = DataFrame(ex_data, columns=result.columns)
tm.assert_frame_equal(result, expected)
def test_nested_meta_path_with_nested_record_path(self, state_data):
# GH 27220
result = json_normalize(
data=state_data,
record_path=["counties"],
meta=["state", "shortname", ["info", "governor"]],
errors="ignore",
)
ex_data = {
"name": ["Dade", "Broward", "Palm Beach", "Summit", "Cuyahoga"],
"population": [12345, 40000, 60000, 1234, 1337],
"state": ["Florida"] * 3 + ["Ohio"] * 2,
"shortname": ["FL"] * 3 + ["OH"] * 2,
"info.governor": ["Rick Scott"] * 3 + ["John Kasich"] * 2,
}
expected = DataFrame(ex_data)
tm.assert_frame_equal(result, expected)
def test_meta_name_conflict(self):
data = [
{
"foo": "hello",
"bar": "there",
"data": [
{"foo": "something", "bar": "else"},
{"foo": "something2", "bar": "else2"},
],
}
]
msg = r"Conflicting metadata name (foo|bar), need distinguishing prefix"
with pytest.raises(ValueError, match=msg):
json_normalize(data, "data", meta=["foo", "bar"])
result = json_normalize(data, "data", meta=["foo", "bar"], meta_prefix="meta")
for val in ["metafoo", "metabar", "foo", "bar"]:
assert val in result
def test_meta_parameter_not_modified(self):
# GH 18610
data = [
{
"foo": "hello",
"bar": "there",
"data": [
{"foo": "something", "bar": "else"},
{"foo": "something2", "bar": "else2"},
],
}
]
COLUMNS = ["foo", "bar"]
result = json_normalize(data, "data", meta=COLUMNS, meta_prefix="meta")
assert COLUMNS == ["foo", "bar"]
for val in ["metafoo", "metabar", "foo", "bar"]:
assert val in result
def test_record_prefix(self, state_data):
result = json_normalize(state_data[0], "counties")
expected = DataFrame(state_data[0]["counties"])
tm.assert_frame_equal(result, expected)
result = json_normalize(
state_data, "counties", meta="state", record_prefix="county_"
)
expected = []
for rec in state_data:
expected.extend(rec["counties"])
expected = DataFrame(expected)
expected = expected.rename(columns=lambda x: "county_" + x)
expected["state"] = np.array(["Florida", "Ohio"]).repeat([3, 2])
tm.assert_frame_equal(result, expected)
def test_non_ascii_key(self):
testjson = (
b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},'
b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]'
).decode("utf8")
testdata = {
b"\xc3\x9cnic\xc3\xb8de".decode("utf8"): [0, 1],
"sub.A": [1, 3],
"sub.B": [2, 4],
}
expected = DataFrame(testdata)
result = json_normalize(json.loads(testjson))
tm.assert_frame_equal(result, expected)
def test_missing_field(self, author_missing_data):
# GH20030:
result = json_normalize(author_missing_data)
ex_data = [
{
"info": np.nan,
"info.created_at": np.nan,
"info.last_updated": np.nan,
"author_name.first": np.nan,
"author_name.last_name": np.nan,
},
{
"info": None,
"info.created_at": "11/08/1993",
"info.last_updated": "26/05/2012",
"author_name.first": "Jane",
"author_name.last_name": "Doe",
},
]
expected = DataFrame(ex_data)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"max_level,expected",
[
(
0,
[
{
"TextField": "Some text",
"UserField": {"Id": "ID001", "Name": "Name001"},
"CreatedBy": {"Name": "User001"},
"Image": {"a": "b"},
},
{
"TextField": "Some text",
"UserField": {"Id": "ID001", "Name": "Name001"},
"CreatedBy": {"Name": "User001"},
"Image": {"a": "b"},
},
],
),
(
1,
[
{
"TextField": "Some text",
"UserField.Id": "ID001",
"UserField.Name": "Name001",
"CreatedBy": {"Name": "User001"},
"Image": {"a": "b"},
},
{
"TextField": "Some text",
"UserField.Id": "ID001",
"UserField.Name": "Name001",
"CreatedBy": {"Name": "User001"},
"Image": {"a": "b"},
},
],
),
],
)
def test_max_level_with_records_path(self, max_level, expected):
# GH23843: Enhanced JSON normalize
test_input = [
{
"CreatedBy": {"Name": "User001"},
"Lookup": [
{
"TextField": "Some text",
"UserField": {"Id": "ID001", "Name": "Name001"},
},
{
"TextField": "Some text",
"UserField": {"Id": "ID001", "Name": "Name001"},
},
],
"Image": {"a": "b"},
"tags": [
{"foo": "something", "bar": "else"},
{"foo": "something2", "bar": "else2"},
],
}
]
result = json_normalize(
test_input,
record_path=["Lookup"],
meta=[["CreatedBy"], ["Image"]],
max_level=max_level,
)
expected_df = DataFrame(data=expected, columns=result.columns.values)
tm.assert_equal(expected_df, result)
def test_nested_flattening_consistent(self):
# see gh-21537
df1 = json_normalize([{"A": {"B": 1}}])
df2 = json_normalize({"dummy": [{"A": {"B": 1}}]}, "dummy")
# They should be the same.
tm.assert_frame_equal(df1, df2)
def test_nonetype_record_path(self, nulls_fixture):
# see gh-30148
# should not raise TypeError
result = json_normalize(
[
{"state": "Texas", "info": nulls_fixture},
{"state": "Florida", "info": [{"i": 2}]},
],
record_path=["info"],
)
expected = DataFrame({"i": 2}, index=[0])
tm.assert_equal(result, expected)
@pytest.mark.parametrize("value", ["false", "true", "{}", "1", '"text"'])
def test_non_list_record_path_errors(self, value):
# see gh-30148, GH 26284
parsed_value = json.loads(value)
test_input = {"state": "Texas", "info": parsed_value}
test_path = "info"
msg = (
f"{test_input} has non list value {parsed_value} for path {test_path}. "
"Must be list or null."
)
with pytest.raises(TypeError, match=msg):
json_normalize([test_input], record_path=[test_path])
def test_meta_non_iterable(self):
# GH 31507
data = """[{"id": 99, "data": [{"one": 1, "two": 2}]}]"""
result = json_normalize(json.loads(data), record_path=["data"], meta=["id"])
expected = DataFrame(
{"one": [1], "two": [2], "id": np.array([99], dtype=object)}
)
tm.assert_frame_equal(result, expected)
def test_generator(self, state_data):
# GH35923 Fix pd.json_normalize to not skip the first element of a
# generator input
def generator_data():
yield from state_data[0]["counties"]
result = json_normalize(generator_data())
expected = DataFrame(state_data[0]["counties"])
tm.assert_frame_equal(result, expected)
def test_top_column_with_leading_underscore(self):
# GH 49861
data = {"_id": {"a1": 10, "l2": {"l3": 0}}, "gg": 4}
result = json_normalize(data, sep="_")
expected = DataFrame([[4, 10, 0]], columns=["gg", "_id_a1", "_id_l2_l3"])
tm.assert_frame_equal(result, expected)
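# nested_to_record is the low-level helper used by json_normalize: it flattens
# nested dicts into a single level with dot-separated keys, up to max_level.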
class TestNestedToRecord:
def test_flat_stays_flat(self):
recs = [{"flat1": 1, "flat2": 2}, {"flat3": 3, "flat2": 4}]
result = nested_to_record(recs)
expected = recs
assert result == expected
def test_one_level_deep_flattens(self):
data = {"flat1": 1, "dict1": {"c": 1, "d": 2}}
result = nested_to_record(data)
expected = {"dict1.c": 1, "dict1.d": 2, "flat1": 1}
assert result == expected
def test_nested_flattens(self):
data = {
"flat1": 1,
"dict1": {"c": 1, "d": 2},
"nested": {"e": {"c": 1, "d": 2}, "d": 2},
}
result = nested_to_record(data)
expected = {
"dict1.c": 1,
"dict1.d": 2,
"flat1": 1,
"nested.d": 2,
"nested.e.c": 1,
"nested.e.d": 2,
}
assert result == expected
def test_json_normalize_errors(self, missing_metadata):
# GH14583:
# If meta keys are not always present a new option to set
# errors='ignore' has been implemented
msg = (
"Key 'name' not found. To replace missing values of "
"'name' with np.nan, pass in errors='ignore'"
)
with pytest.raises(KeyError, match=msg):
json_normalize(
data=missing_metadata,
record_path="addresses",
meta="name",
errors="raise",
)
def test_missing_meta(self, missing_metadata):
# GH25468
# If metadata is nullable with errors set to ignore, the null values
# should be numpy.nan values
result = json_normalize(
data=missing_metadata, record_path="addresses", meta="name", errors="ignore"
)
ex_data = [
[9562, "Morris St.", "Massillon", "OH", 44646, "Alice"],
[8449, "Spring St.", "Elizabethton", "TN", 37643, np.nan],
]
columns = ["number", "street", "city", "state", "zip", "name"]
expected = DataFrame(ex_data, columns=columns)
tm.assert_frame_equal(result, expected)
def test_missing_nested_meta(self):
# GH44312
# If errors="ignore" and nested metadata is null, we should return nan
data = {"meta": "foo", "nested_meta": None, "value": [{"rec": 1}, {"rec": 2}]}
result = json_normalize(
data,
record_path="value",
meta=["meta", ["nested_meta", "leaf"]],
errors="ignore",
)
ex_data = [[1, "foo", np.nan], [2, "foo", np.nan]]
columns = ["rec", "meta", "nested_meta.leaf"]
expected = DataFrame(ex_data, columns=columns).astype(
{"nested_meta.leaf": object}
)
tm.assert_frame_equal(result, expected)
# If errors="raise" and nested metadata is null, we should raise with the
# key of the first missing level
with pytest.raises(KeyError, match="'leaf' not found"):
json_normalize(
data,
record_path="value",
meta=["meta", ["nested_meta", "leaf"]],
errors="raise",
)
def test_missing_meta_multilevel_record_path_errors_raise(self, missing_metadata):
# GH41876
# Ensure errors='raise' works as intended even when a record_path of length
# greater than one is passed in
msg = (
"Key 'name' not found. To replace missing values of "
"'name' with np.nan, pass in errors='ignore'"
)
with pytest.raises(KeyError, match=msg):
json_normalize(
data=missing_metadata,
record_path=["previous_residences", "cities"],
meta="name",
errors="raise",
)
def test_missing_meta_multilevel_record_path_errors_ignore(self, missing_metadata):
# GH41876
# Ensure errors='ignore' works as intended even when a record_path of length
# greater than one is passed in
result = json_normalize(
data=missing_metadata,
record_path=["previous_residences", "cities"],
meta="name",
errors="ignore",
)
ex_data = [
["Foo York City", "Alice"],
["Barmingham", np.nan],
]
columns = ["city_name", "name"]
expected = DataFrame(ex_data, columns=columns)
tm.assert_frame_equal(result, expected)
def test_donot_drop_nonevalues(self):
# GH21356
data = [
{"info": None, "author_name": {"first": "Smith", "last_name": "Appleseed"}},
{
"info": {"created_at": "11/08/1993", "last_updated": "26/05/2012"},
"author_name": {"first": "Jane", "last_name": "Doe"},
},
]
result = nested_to_record(data)
expected = [
{
"info": None,
"author_name.first": "Smith",
"author_name.last_name": "Appleseed",
},
{
"author_name.first": "Jane",
"author_name.last_name": "Doe",
"info.created_at": "11/08/1993",
"info.last_updated": "26/05/2012",
},
]
assert result == expected
def test_nonetype_top_level_bottom_level(self):
# GH21158: If inner level json has a key with a null value
# make sure it does not do a new_d.pop twice and except
data = {
"id": None,
"location": {
"country": {
"state": {
"id": None,
"town.info": {
"id": None,
"region": None,
"x": 49.151580810546875,
"y": -33.148521423339844,
"z": 27.572303771972656,
},
}
}
},
}
result = nested_to_record(data)
expected = {
"id": None,
"location.country.state.id": None,
"location.country.state.town.info.id": None,
"location.country.state.town.info.region": None,
"location.country.state.town.info.x": 49.151580810546875,
"location.country.state.town.info.y": -33.148521423339844,
"location.country.state.town.info.z": 27.572303771972656,
}
assert result == expected
def test_nonetype_multiple_levels(self):
# GH21158: If inner level json has a key with a null value
# make sure it does not do a new_d.pop twice and except
data = {
"id": None,
"location": {
"id": None,
"country": {
"id": None,
"state": {
"id": None,
"town.info": {
"region": None,
"x": 49.151580810546875,
"y": -33.148521423339844,
"z": 27.572303771972656,
},
},
},
},
}
result = nested_to_record(data)
expected = {
"id": None,
"location.id": None,
"location.country.id": None,
"location.country.state.id": None,
"location.country.state.town.info.region": None,
"location.country.state.town.info.x": 49.151580810546875,
"location.country.state.town.info.y": -33.148521423339844,
"location.country.state.town.info.z": 27.572303771972656,
}
assert result == expected
@pytest.mark.parametrize(
"max_level, expected",
[
(
None,
[
{
"CreatedBy.Name": "User001",
"Lookup.TextField": "Some text",
"Lookup.UserField.Id": "ID001",
"Lookup.UserField.Name": "Name001",
"Image.a": "b",
}
],
),
(
0,
[
{
"CreatedBy": {"Name": "User001"},
"Lookup": {
"TextField": "Some text",
"UserField": {"Id": "ID001", "Name": "Name001"},
},
"Image": {"a": "b"},
}
],
),
(
1,
[
{
"CreatedBy.Name": "User001",
"Lookup.TextField": "Some text",
"Lookup.UserField": {"Id": "ID001", "Name": "Name001"},
"Image.a": "b",
}
],
),
],
)
def test_with_max_level(self, max_level, expected, max_level_test_input_data):
# GH23843: Enhanced JSON normalize
output = nested_to_record(max_level_test_input_data, max_level=max_level)
assert output == expected
def test_with_large_max_level(self):
# GH23843: Enhanced JSON normalize
max_level = 100
input_data = [
{
"CreatedBy": {
"user": {
"name": {"firstname": "Leo", "LastName": "Thomson"},
"family_tree": {
"father": {
"name": "Father001",
"father": {
"Name": "Father002",
"father": {
"name": "Father003",
"father": {"Name": "Father004"},
},
},
}
},
}
}
}
]
expected = [
{
"CreatedBy.user.name.firstname": "Leo",
"CreatedBy.user.name.LastName": "Thomson",
"CreatedBy.user.family_tree.father.name": "Father001",
"CreatedBy.user.family_tree.father.father.Name": "Father002",
"CreatedBy.user.family_tree.father.father.father.name": "Father003",
"CreatedBy.user.family_tree.father.father.father.father.Name": "Father004", # noqa: E501
}
]
output = nested_to_record(input_data, max_level=max_level)
assert output == expected
def test_series_non_zero_index(self):
# GH 19020
data = {
0: {"id": 1, "name": "Foo", "elements": {"a": 1}},
1: {"id": 2, "name": "Bar", "elements": {"b": 2}},
2: {"id": 3, "name": "Baz", "elements": {"c": 3}},
}
s = Series(data)
s.index = [1, 2, 3]
result = json_normalize(s)
expected = DataFrame(
{
"id": [1, 2, 3],
"name": ["Foo", "Bar", "Baz"],
"elements.a": [1.0, np.nan, np.nan],
"elements.b": [np.nan, 2.0, np.nan],
"elements.c": [np.nan, np.nan, 3.0],
}
)
tm.assert_frame_equal(result, expected)

File diff suppressed because it is too large

View File

@ -0,0 +1,543 @@
from collections.abc import Iterator
from io import StringIO
from pathlib import Path
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
read_json,
)
import pandas._testing as tm
from pandas.io.json._json import JsonReader
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@pytest.fixture
def lines_json_df():
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
return df.to_json(lines=True, orient="records")
@pytest.fixture(params=["ujson", "pyarrow"])
def engine(request):
if request.param == "pyarrow":
pytest.importorskip("pyarrow.json")
return request.param
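# Line-delimited (JSON Lines) round-trips, run against both the ujson and
# pyarrow engines; pyarrow only accepts a file path and does not support
# chunksize/nrows, so those combinations are xfailed below.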
def test_read_jsonl():
# GH9180
result = read_json(StringIO('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n'), lines=True)
expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
def test_read_jsonl_engine_pyarrow(datapath, engine):
result = read_json(
datapath("io", "json", "data", "line_delimited.json"),
lines=True,
engine=engine,
)
expected = DataFrame({"a": [1, 3, 5], "b": [2, 4, 6]})
tm.assert_frame_equal(result, expected)
def test_read_datetime(request, engine):
# GH33787
if engine == "pyarrow":
# GH 48893
reason = "Pyarrow only supports a file path as an input and line delimited json"
request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
df = DataFrame(
[([1, 2], ["2020-03-05", "2020-04-08T09:58:49+00:00"], "hector")],
columns=["accounts", "date", "name"],
)
json_line = df.to_json(lines=True, orient="records")
result = read_json(StringIO(json_line), engine=engine)
expected = DataFrame(
[[1, "2020-03-05", "hector"], [2, "2020-04-08T09:58:49+00:00", "hector"]],
columns=["accounts", "date", "name"],
)
tm.assert_frame_equal(result, expected)
def test_read_jsonl_unicode_chars():
# GH15132: non-ascii unicode characters
# \u201d == RIGHT DOUBLE QUOTATION MARK
# simulate file handle
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
json = StringIO(json)
result = read_json(json, lines=True)
expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
# simulate string
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
result = read_json(StringIO(json), lines=True)
expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
def test_to_jsonl():
# GH9180
df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
result = df.to_json(orient="records", lines=True)
expected = '{"a":1,"b":2}\n{"a":1,"b":2}\n'
assert result == expected
df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=["a", "b"])
result = df.to_json(orient="records", lines=True)
expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n'
assert result == expected
tm.assert_frame_equal(read_json(StringIO(result), lines=True), df)
# GH15096: escaped characters in columns and data
df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"])
result = df.to_json(orient="records", lines=True)
expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n'
assert result == expected
tm.assert_frame_equal(read_json(StringIO(result), lines=True), df)
def test_to_jsonl_count_new_lines():
# GH36888
df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
actual_new_lines_count = df.to_json(orient="records", lines=True).count("\n")
expected_new_lines_count = 2
assert actual_new_lines_count == expected_new_lines_count
@pytest.mark.parametrize("chunksize", [1, 1.0])
def test_readjson_chunks(request, lines_json_df, chunksize, engine):
# Basic test that read_json(chunks=True) gives the same result as
# read_json(chunks=False)
# GH17048: memory usage when lines=True
if engine == "pyarrow":
# GH 48893
reason = (
"Pyarrow only supports a file path as an input and line delimited json"
"and doesn't support chunksize parameter."
)
request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
unchunked = read_json(StringIO(lines_json_df), lines=True)
with read_json(
StringIO(lines_json_df), lines=True, chunksize=chunksize, engine=engine
) as reader:
chunked = pd.concat(reader)
tm.assert_frame_equal(chunked, unchunked)
def test_readjson_chunksize_requires_lines(lines_json_df, engine):
msg = "chunksize can only be passed if lines=True"
with pytest.raises(ValueError, match=msg):
with read_json(
StringIO(lines_json_df), lines=False, chunksize=2, engine=engine
) as _:
pass
def test_readjson_chunks_series(request, engine):
if engine == "pyarrow":
# GH 48893
reason = (
"Pyarrow only supports a file path as an input and line delimited json"
"and doesn't support chunksize parameter."
)
request.applymarker(pytest.mark.xfail(reason=reason))
# Test reading line-format JSON to Series with chunksize param
s = pd.Series({"A": 1, "B": 2})
strio = StringIO(s.to_json(lines=True, orient="records"))
unchunked = read_json(strio, lines=True, typ="Series", engine=engine)
strio = StringIO(s.to_json(lines=True, orient="records"))
with read_json(
strio, lines=True, typ="Series", chunksize=1, engine=engine
) as reader:
chunked = pd.concat(reader)
tm.assert_series_equal(chunked, unchunked)
def test_readjson_each_chunk(request, lines_json_df, engine):
if engine == "pyarrow":
# GH 48893
reason = (
"Pyarrow only supports a file path as an input and line delimited json"
"and doesn't support chunksize parameter."
)
request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
# Other tests check that the final result of read_json(chunksize=True)
# is correct. This checks the intermediate chunks.
with read_json(
StringIO(lines_json_df), lines=True, chunksize=2, engine=engine
) as reader:
chunks = list(reader)
assert chunks[0].shape == (2, 2)
assert chunks[1].shape == (1, 2)
def test_readjson_chunks_from_file(request, engine):
if engine == "pyarrow":
# GH 48893
reason = (
"Pyarrow only supports a file path as an input and line delimited json"
"and doesn't support chunksize parameter."
)
request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
with tm.ensure_clean("test.json") as path:
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
df.to_json(path, lines=True, orient="records")
with read_json(path, lines=True, chunksize=1, engine=engine) as reader:
chunked = pd.concat(reader)
unchunked = read_json(path, lines=True, engine=engine)
tm.assert_frame_equal(unchunked, chunked)
@pytest.mark.parametrize("chunksize", [None, 1])
def test_readjson_chunks_closes(chunksize):
with tm.ensure_clean("test.json") as path:
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
df.to_json(path, lines=True, orient="records")
reader = JsonReader(
path,
orient=None,
typ="frame",
dtype=True,
convert_axes=True,
convert_dates=True,
keep_default_dates=True,
precise_float=False,
date_unit=None,
encoding=None,
lines=True,
chunksize=chunksize,
compression=None,
nrows=None,
)
with reader:
reader.read()
assert (
reader.handles.handle.closed
), f"didn't close stream with chunksize = {chunksize}"
@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
def test_readjson_invalid_chunksize(lines_json_df, chunksize, engine):
msg = r"'chunksize' must be an integer >=1"
with pytest.raises(ValueError, match=msg):
with read_json(
StringIO(lines_json_df), lines=True, chunksize=chunksize, engine=engine
) as _:
pass
@pytest.mark.parametrize("chunksize", [None, 1, 2])
def test_readjson_chunks_multiple_empty_lines(chunksize):
j = """
{"A":1,"B":4}
{"A":2,"B":5}
{"A":3,"B":6}
"""
orig = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
test = read_json(StringIO(j), lines=True, chunksize=chunksize)
if chunksize is not None:
with test:
test = pd.concat(test)
tm.assert_frame_equal(orig, test, obj=f"chunksize: {chunksize}")
def test_readjson_unicode(request, monkeypatch, engine):
if engine == "pyarrow":
# GH 48893
reason = (
"Pyarrow only supports a file path as an input and line delimited json"
"and doesn't support chunksize parameter."
)
request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
with tm.ensure_clean("test.json") as path:
monkeypatch.setattr("locale.getpreferredencoding", lambda do_setlocale: "cp949")
with open(path, "w", encoding="utf-8") as f:
f.write('{"£©µÀÆÖÞßéöÿ":["АБВГДабвгд가"]}')
result = read_json(path, engine=engine)
expected = DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("nrows", [1, 2])
def test_readjson_nrows(nrows, engine):
# GH 33916
# Test reading line-format JSON with the nrows param
jsonl = """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}"""
result = read_json(StringIO(jsonl), lines=True, nrows=nrows)
expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)])
def test_readjson_nrows_chunks(request, nrows, chunksize, engine):
# GH 33916
# Test reading line-format JSON with the nrows and chunksize params
if engine == "pyarrow":
# GH 48893
reason = (
"Pyarrow only supports a file path as an input and line delimited json"
"and doesn't support chunksize parameter."
)
request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
jsonl = """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}"""
if engine != "pyarrow":
with read_json(
StringIO(jsonl), lines=True, nrows=nrows, chunksize=chunksize, engine=engine
) as reader:
chunked = pd.concat(reader)
else:
with read_json(
jsonl, lines=True, nrows=nrows, chunksize=chunksize, engine=engine
) as reader:
chunked = pd.concat(reader)
expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
tm.assert_frame_equal(chunked, expected)
def test_readjson_nrows_requires_lines(engine):
# GH 33916
# Test ValueError raised if nrows is set without setting lines in read_json
jsonl = """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}"""
msg = "nrows can only be passed if lines=True"
with pytest.raises(ValueError, match=msg):
read_json(jsonl, lines=False, nrows=2, engine=engine)
def test_readjson_lines_chunks_fileurl(request, datapath, engine):
# GH 27135
# Test reading line-format JSON from file url
if engine == "pyarrow":
# GH 48893
reason = (
"Pyarrow only supports a file path as an input and line delimited json"
"and doesn't support chunksize parameter."
)
request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
df_list_expected = [
DataFrame([[1, 2]], columns=["a", "b"], index=[0]),
DataFrame([[3, 4]], columns=["a", "b"], index=[1]),
DataFrame([[5, 6]], columns=["a", "b"], index=[2]),
]
os_path = datapath("io", "json", "data", "line_delimited.json")
file_url = Path(os_path).as_uri()
with read_json(file_url, lines=True, chunksize=1, engine=engine) as url_reader:
for index, chunk in enumerate(url_reader):
tm.assert_frame_equal(chunk, df_list_expected[index])
def test_chunksize_is_incremental():
# See https://github.com/pandas-dev/pandas/issues/34548
jsonl = (
"""{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}\n"""
* 1000
)
class MyReader:
def __init__(self, contents) -> None:
self.read_count = 0
self.stringio = StringIO(contents)
def read(self, *args):
self.read_count += 1
return self.stringio.read(*args)
def __iter__(self) -> Iterator:
self.read_count += 1
return iter(self.stringio)
reader = MyReader(jsonl)
assert len(list(read_json(reader, lines=True, chunksize=100))) > 1
assert reader.read_count > 10
@pytest.mark.parametrize("orient_", ["split", "index", "table"])
def test_to_json_append_orient(orient_):
# GH 35849
# Test ValueError when orient is not 'records'
df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
msg = (
r"mode='a' \(append\) is only supported when "
"lines is True and orient is 'records'"
)
with pytest.raises(ValueError, match=msg):
df.to_json(mode="a", orient=orient_)
def test_to_json_append_lines():
# GH 35849
# Test ValueError when lines is not True
df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
msg = (
r"mode='a' \(append\) is only supported when "
"lines is True and orient is 'records'"
)
with pytest.raises(ValueError, match=msg):
df.to_json(mode="a", lines=False, orient="records")
@pytest.mark.parametrize("mode_", ["r", "x"])
def test_to_json_append_mode(mode_):
# GH 35849
# Test ValueError when mode is not supported option
df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
msg = (
f"mode={mode_} is not a valid option."
"Only 'w' and 'a' are currently supported."
)
with pytest.raises(ValueError, match=msg):
df.to_json(mode=mode_, lines=False, orient="records")
def test_to_json_append_output_consistent_columns():
# GH 35849
# Testing that resulting output reads in as expected.
# Testing same columns, new rows
df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
expected = DataFrame({"col1": [1, 2, 3, 4], "col2": ["a", "b", "c", "d"]})
with tm.ensure_clean("test.json") as path:
# Save dataframes to the same file
df1.to_json(path, lines=True, orient="records")
df2.to_json(path, mode="a", lines=True, orient="records")
# Read path file
result = read_json(path, lines=True)
tm.assert_frame_equal(result, expected)
def test_to_json_append_output_inconsistent_columns():
# GH 35849
# Testing that resulting output reads in as expected.
# Testing one new column, one old column, new rows
df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
expected = DataFrame(
{
"col1": [1, 2, None, None],
"col2": ["a", "b", "e", "f"],
"col3": [np.nan, np.nan, "!", "#"],
}
)
with tm.ensure_clean("test.json") as path:
# Save dataframes to the same file
df1.to_json(path, mode="a", lines=True, orient="records")
df3.to_json(path, mode="a", lines=True, orient="records")
# Read path file
result = read_json(path, lines=True)
tm.assert_frame_equal(result, expected)
def test_to_json_append_output_different_columns():
# GH 35849
# Testing that resulting output reads in as expected.
# Testing same, differing and new columns
df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
df4 = DataFrame({"col4": [True, False]})
expected = DataFrame(
{
"col1": [1, 2, 3, 4, None, None, None, None],
"col2": ["a", "b", "c", "d", "e", "f", np.nan, np.nan],
"col3": [np.nan, np.nan, np.nan, np.nan, "!", "#", np.nan, np.nan],
"col4": [None, None, None, None, None, None, True, False],
}
).astype({"col4": "float"})
with tm.ensure_clean("test.json") as path:
# Save dataframes to the same file
df1.to_json(path, mode="a", lines=True, orient="records")
df2.to_json(path, mode="a", lines=True, orient="records")
df3.to_json(path, mode="a", lines=True, orient="records")
df4.to_json(path, mode="a", lines=True, orient="records")
# Read path file
result = read_json(path, lines=True)
tm.assert_frame_equal(result, expected)
def test_to_json_append_output_different_columns_reordered():
# GH 35849
# Testing that resulting output reads in as expected.
# Testing specific result column order.
df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
df4 = DataFrame({"col4": [True, False]})
# df4, df3, df2, df1 (in that order)
expected = DataFrame(
{
"col4": [True, False, None, None, None, None, None, None],
"col2": [np.nan, np.nan, "e", "f", "c", "d", "a", "b"],
"col3": [np.nan, np.nan, "!", "#", np.nan, np.nan, np.nan, np.nan],
"col1": [None, None, None, None, 3, 4, 1, 2],
}
).astype({"col4": "float"})
with tm.ensure_clean("test.json") as path:
# Save dataframes to the same file
df4.to_json(path, mode="a", lines=True, orient="records")
df3.to_json(path, mode="a", lines=True, orient="records")
df2.to_json(path, mode="a", lines=True, orient="records")
df1.to_json(path, mode="a", lines=True, orient="records")
# Read path file
result = read_json(path, lines=True)
tm.assert_frame_equal(result, expected)
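
A minimal sketch of the append workflow these GH 35849 tests exercise; the file name is hypothetical and this is not part of the test module:

import pandas as pd

df1 = pd.DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
df2 = pd.DataFrame({"col1": [3, 4], "col2": ["c", "d"]})

# mode="a" is only valid together with lines=True and orient="records"
df1.to_json("events.json", lines=True, orient="records")
df2.to_json("events.json", mode="a", lines=True, orient="records")

combined = pd.read_json("events.json", lines=True)  # four rows, columns col1/col2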

File diff suppressed because it is too large


@@ -0,0 +1,382 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO
import numpy as np
import pytest
from pandas._libs import parsers as libparsers
from pandas.errors import DtypeWarning
from pandas import (
DataFrame,
concat,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@pytest.mark.parametrize("index_col", [0, "index"])
def test_read_chunksize_with_index(all_parsers, index_col):
parser = all_parsers
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
expected = DataFrame(
[
["foo", 2, 3, 4, 5],
["bar", 7, 8, 9, 10],
["baz", 12, 13, 14, 15],
["qux", 12, 13, 14, 15],
["foo2", 12, 13, 14, 15],
["bar2", 12, 13, 14, 15],
],
columns=["index", "A", "B", "C", "D"],
)
expected = expected.set_index("index")
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
list(reader)
return
with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
chunks = list(reader)
tm.assert_frame_equal(chunks[0], expected[:2])
tm.assert_frame_equal(chunks[1], expected[2:4])
tm.assert_frame_equal(chunks[2], expected[4:])
@pytest.mark.parametrize("chunksize", [1.3, "foo", 0])
def test_read_chunksize_bad(all_parsers, chunksize):
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
parser = all_parsers
msg = r"'chunksize' must be an integer >=1"
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(StringIO(data), chunksize=chunksize) as _:
pass
@pytest.mark.parametrize("chunksize", [2, 8])
def test_read_chunksize_and_nrows(all_parsers, chunksize):
# see gh-15755
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
parser = all_parsers
kwargs = {"index_col": 0, "nrows": 5}
if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
return
expected = parser.read_csv(StringIO(data), **kwargs)
with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader:
tm.assert_frame_equal(concat(reader), expected)
def test_read_chunksize_and_nrows_changing_size(all_parsers):
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
parser = all_parsers
kwargs = {"index_col": 0, "nrows": 5}
if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
return
expected = parser.read_csv(StringIO(data), **kwargs)
with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader:
tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2])
tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5])
with pytest.raises(StopIteration, match=""):
reader.get_chunk(size=3)
def test_get_chunk_passed_chunksize(all_parsers):
parser = all_parsers
data = """A,B,C
1,2,3
4,5,6
7,8,9
1,2,3"""
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(StringIO(data), chunksize=2) as reader:
reader.get_chunk()
return
with parser.read_csv(StringIO(data), chunksize=2) as reader:
result = reader.get_chunk()
expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}])
def test_read_chunksize_compat(all_parsers, kwargs):
# see gh-12185
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
parser = all_parsers
result = parser.read_csv(StringIO(data), **kwargs)
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
concat(reader)
return
with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
via_reader = concat(reader)
tm.assert_frame_equal(via_reader, result)
def test_read_chunksize_jagged_names(all_parsers):
# see gh-23509
parser = all_parsers
data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])
expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10])
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(
StringIO(data), names=range(10), chunksize=4
) as reader:
concat(reader)
return
with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader:
result = concat(reader)
tm.assert_frame_equal(result, expected)
def test_chunk_begins_with_newline_whitespace(all_parsers):
# see gh-10022
parser = all_parsers
data = "\n hello\nworld\n"
result = parser.read_csv(StringIO(data), header=None)
expected = DataFrame([" hello", "world"])
tm.assert_frame_equal(result, expected)
@pytest.mark.slow
def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
# mainly an issue with the C parser
heuristic = 2**3
parser = all_parsers
integers = [str(i) for i in range(heuristic - 1)]
data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)
# Coercions should work without warnings.
with monkeypatch.context() as m:
m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic)
result = parser.read_csv(StringIO(data))
assert type(result.a[0]) is np.float64
assert result.a.dtype == float
def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string):
warning_type = None
parser = all_parsers
size = 10000
# see gh-3866: if chunks are different types and can't
# be coerced using numerical types, then issue warning.
if parser.engine == "c" and parser.low_memory:
warning_type = DtypeWarning
# Use larger size to hit warning path
size = 499999
integers = [str(i) for i in range(size)]
data = "a\n" + "\n".join(integers + ["a", "b"] + integers)
buf = StringIO(data)
if parser.engine == "pyarrow":
df = parser.read_csv(
buf,
)
else:
df = parser.read_csv_check_warnings(
warning_type,
r"Columns \(0\) have mixed types. "
"Specify dtype option on import or set low_memory=False.",
buf,
)
if parser.engine == "c" and parser.low_memory:
assert df.a.dtype == object
elif using_infer_string:
assert df.a.dtype == "str"
else:
assert df.a.dtype == object
@pytest.mark.parametrize("iterator", [True, False])
def test_empty_with_nrows_chunksize(all_parsers, iterator):
# see gh-9535
parser = all_parsers
expected = DataFrame(columns=["foo", "bar"])
nrows = 10
data = StringIO("foo,bar\n")
if parser.engine == "pyarrow":
msg = (
"The '(nrows|chunksize)' option is not supported with the 'pyarrow' engine"
)
with pytest.raises(ValueError, match=msg):
if iterator:
with parser.read_csv(data, chunksize=nrows) as reader:
next(iter(reader))
else:
parser.read_csv(data, nrows=nrows)
return
if iterator:
with parser.read_csv(data, chunksize=nrows) as reader:
result = next(iter(reader))
else:
result = parser.read_csv(data, nrows=nrows)
tm.assert_frame_equal(result, expected)
def test_read_csv_memory_growth_chunksize(all_parsers):
# see gh-24805
#
# Let's just make sure that we don't crash
# as we iteratively process all chunks.
parser = all_parsers
with tm.ensure_clean() as path:
with open(path, "w", encoding="utf-8") as f:
for i in range(1000):
f.write(str(i) + "\n")
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(path, chunksize=20) as result:
for _ in result:
pass
return
with parser.read_csv(path, chunksize=20) as result:
for _ in result:
pass
def test_chunksize_with_usecols_second_block_shorter(all_parsers):
# GH#21211
parser = all_parsers
data = """1,2,3,4
5,6,7,8
9,10,11
"""
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data),
names=["a", "b"],
chunksize=2,
usecols=[0, 1],
header=None,
)
return
result_chunks = parser.read_csv(
StringIO(data),
names=["a", "b"],
chunksize=2,
usecols=[0, 1],
header=None,
)
expected_frames = [
DataFrame({"a": [1, 5], "b": [2, 6]}),
DataFrame({"a": [9], "b": [10]}, index=[2]),
]
for i, result in enumerate(result_chunks):
tm.assert_frame_equal(result, expected_frames[i])
def test_chunksize_second_block_shorter(all_parsers):
# GH#21211
parser = all_parsers
data = """a,b,c,d
1,2,3,4
5,6,7,8
9,10,11
"""
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), chunksize=2)
return
result_chunks = parser.read_csv(StringIO(data), chunksize=2)
expected_frames = [
DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}),
DataFrame({"a": [9], "b": [10], "c": [11], "d": [np.nan]}, index=[2]),
]
for i, result in enumerate(result_chunks):
tm.assert_frame_equal(result, expected_frames[i])
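
A short sketch of the chunked-reading API covered above, using inline data; not part of the test file:

from io import StringIO
import pandas as pd

data = "a,b\n1,2\n3,4\n5,6\n7,8\n"

# The reader returned for chunksize is an iterator and a context manager.
with pd.read_csv(StringIO(data), chunksize=2) as reader:
    chunks = list(reader)  # two DataFrames of two rows each

# get_chunk() overrides the configured chunk size per call.
with pd.read_csv(StringIO(data), chunksize=2) as reader:
    first_three = reader.get_chunk(3)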


@@ -0,0 +1,983 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from datetime import datetime
from inspect import signature
from io import StringIO
import os
from pathlib import Path
import sys
import numpy as np
import pytest
from pandas._config import using_string_dtype
from pandas.compat import HAS_PYARROW
from pandas.errors import (
EmptyDataError,
ParserError,
ParserWarning,
)
from pandas import (
DataFrame,
Index,
Timestamp,
compat,
)
import pandas._testing as tm
from pandas.io.parsers import TextFileReader
from pandas.io.parsers.c_parser_wrapper import CParserWrapper
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
def test_override_set_noconvert_columns():
# see gh-17351
#
# Usecols needs to be sorted in _set_noconvert_columns based
# on the test_usecols_with_parse_dates test from test_usecols.py
class MyTextFileReader(TextFileReader):
def __init__(self) -> None:
self._currow = 0
self.squeeze = False
class MyCParserWrapper(CParserWrapper):
def _set_noconvert_columns(self):
if self.usecols_dtype == "integer":
# self.usecols is a set, which is documented as unordered
# but in practice, a CPython set of integers is sorted.
# In other implementations this assumption does not hold.
# The following code simulates a different order, which
# before GH 17351 would cause the wrong columns to be
# converted via the parse_dates parameter
self.usecols = list(self.usecols)
self.usecols.reverse()
return CParserWrapper._set_noconvert_columns(self)
data = """a,b,c,d,e
0,1,2014-01-01,09:00,4
0,1,2014-01-02,10:00,4"""
parse_dates = [[1, 2]]
cols = {
"a": [0, 0],
"c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
}
expected = DataFrame(cols, columns=["c_d", "a"])
parser = MyTextFileReader()
parser.options = {
"usecols": [0, 2, 3],
"parse_dates": parse_dates,
"delimiter": ",",
}
parser.engine = "c"
parser._engine = MyCParserWrapper(StringIO(data), **parser.options)
result = parser.read()
tm.assert_frame_equal(result, expected)
def test_read_csv_local(all_parsers, csv1):
prefix = "file:///" if compat.is_platform_windows() else "file://"
parser = all_parsers
fname = prefix + str(os.path.abspath(csv1))
result = parser.read_csv(fname, index_col=0, parse_dates=True)
# TODO: make unit check more specific
if parser.engine == "pyarrow":
result.index = result.index.as_unit("ns")
expected = DataFrame(
[
[0.980269, 3.685731, -0.364216805298, -1.159738],
[1.047916, -0.041232, -0.16181208307, 0.212549],
[0.498581, 0.731168, -0.537677223318, 1.346270],
[1.120202, 1.567621, 0.00364077397681, 0.675253],
[-0.487094, 0.571455, -1.6116394093, 0.103469],
[0.836649, 0.246462, 0.588542635376, 1.062782],
[-0.157161, 1.340307, 1.1957779562, -1.097007],
],
columns=["A", "B", "C", "D"],
index=Index(
[
datetime(2000, 1, 3),
datetime(2000, 1, 4),
datetime(2000, 1, 5),
datetime(2000, 1, 6),
datetime(2000, 1, 7),
datetime(2000, 1, 10),
datetime(2000, 1, 11),
],
name="index",
),
)
tm.assert_frame_equal(result, expected)
def test_1000_sep(all_parsers):
parser = all_parsers
data = """A|B|C
1|2,334|5
10|13|10.
"""
expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]})
if parser.engine == "pyarrow":
msg = "The 'thousands' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), sep="|", thousands=",")
return
result = parser.read_csv(StringIO(data), sep="|", thousands=",")
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: Found non-unique column index
def test_unnamed_columns(all_parsers):
data = """A,B,C,,
1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
parser = all_parsers
expected = DataFrame(
[[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]],
dtype=np.int64,
columns=["A", "B", "C", "Unnamed: 3", "Unnamed: 4"],
)
result = parser.read_csv(StringIO(data))
tm.assert_frame_equal(result, expected)
def test_csv_mixed_type(all_parsers):
data = """A,B,C
a,1,2
b,3,4
c,4,5
"""
parser = all_parsers
expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]})
result = parser.read_csv(StringIO(data))
tm.assert_frame_equal(result, expected)
def test_read_csv_low_memory_no_rows_with_index(all_parsers):
# see gh-21141
parser = all_parsers
if not parser.low_memory:
pytest.skip("This is a low-memory specific test")
data = """A,B,C
1,1,1,2
2,2,3,4
3,3,4,5
"""
if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0)
return
result = parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0)
expected = DataFrame(columns=["A", "B", "C"])
tm.assert_frame_equal(result, expected)
def test_read_csv_dataframe(all_parsers, csv1):
parser = all_parsers
result = parser.read_csv(csv1, index_col=0, parse_dates=True)
# TODO: make unit check more specific
if parser.engine == "pyarrow":
result.index = result.index.as_unit("ns")
expected = DataFrame(
[
[0.980269, 3.685731, -0.364216805298, -1.159738],
[1.047916, -0.041232, -0.16181208307, 0.212549],
[0.498581, 0.731168, -0.537677223318, 1.346270],
[1.120202, 1.567621, 0.00364077397681, 0.675253],
[-0.487094, 0.571455, -1.6116394093, 0.103469],
[0.836649, 0.246462, 0.588542635376, 1.062782],
[-0.157161, 1.340307, 1.1957779562, -1.097007],
],
columns=["A", "B", "C", "D"],
index=Index(
[
datetime(2000, 1, 3),
datetime(2000, 1, 4),
datetime(2000, 1, 5),
datetime(2000, 1, 6),
datetime(2000, 1, 7),
datetime(2000, 1, 10),
datetime(2000, 1, 11),
],
name="index",
),
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("nrows", [3, 3.0])
def test_read_nrows(all_parsers, nrows):
# see gh-10476
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
expected = DataFrame(
[["foo", 2, 3, 4, 5], ["bar", 7, 8, 9, 10], ["baz", 12, 13, 14, 15]],
columns=["index", "A", "B", "C", "D"],
)
parser = all_parsers
if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), nrows=nrows)
return
result = parser.read_csv(StringIO(data), nrows=nrows)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("nrows", [1.2, "foo", -1])
def test_read_nrows_bad(all_parsers, nrows):
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
msg = r"'nrows' must be an integer >=0"
parser = all_parsers
if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), nrows=nrows)
def test_nrows_skipfooter_errors(all_parsers):
msg = "'skipfooter' not supported with 'nrows'"
data = "a\n1\n2\n3\n4\n5\n6"
parser = all_parsers
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), skipfooter=1, nrows=5)
@skip_pyarrow
def test_missing_trailing_delimiters(all_parsers):
parser = all_parsers
data = """A,B,C,D
1,2,3,4
1,3,3,
1,4,5"""
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[[1, 2, 3, 4], [1, 3, 3, np.nan], [1, 4, 5, np.nan]],
columns=["A", "B", "C", "D"],
)
tm.assert_frame_equal(result, expected)
def test_skip_initial_space(all_parsers):
data = (
'"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, '
"1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, "
"314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, "
"70.06056, 344.98370, 1, 1, -0.689265, -0.692787, "
"0.212036, 14.7674, 41.605, -9999.0, -9999.0, "
"-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128"
)
parser = all_parsers
if parser.engine == "pyarrow":
msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data),
names=list(range(33)),
header=None,
na_values=["-9999.0"],
skipinitialspace=True,
)
return
result = parser.read_csv(
StringIO(data),
names=list(range(33)),
header=None,
na_values=["-9999.0"],
skipinitialspace=True,
)
expected = DataFrame(
[
[
"09-Apr-2012",
"01:10:18.300",
2456026.548822908,
12849,
1.00361,
1.12551,
330.65659,
355626618.16711,
73.48821,
314.11625,
1917.09447,
179.71425,
80.0,
240.0,
-350,
70.06056,
344.9837,
1,
1,
-0.689265,
-0.692787,
0.212036,
14.7674,
41.605,
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
np.nan,
0,
12,
128,
]
]
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_trailing_delimiters(all_parsers):
# see gh-2442
data = """A,B,C
1,2,3,
4,5,6,
7,8,9,"""
parser = all_parsers
result = parser.read_csv(StringIO(data), index_col=False)
expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]})
tm.assert_frame_equal(result, expected)
def test_escapechar(all_parsers):
# https://stackoverflow.com/questions/13824840/feature-request-for-
# pandas-read-csv
data = '''SEARCH_TERM,ACTUAL_URL
"bra tv board","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals series","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"'''
parser = all_parsers
result = parser.read_csv(
StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8"
)
assert result["SEARCH_TERM"][2] == 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals series'
tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"]))
def test_ignore_leading_whitespace(all_parsers):
# see gh-3374, gh-6607
parser = all_parsers
data = " a b c\n 1 2 3\n 4 5 6\n 7 8 9"
if parser.engine == "pyarrow":
msg = "the 'pyarrow' engine does not support regex separators"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), sep=r"\s+")
return
result = parser.read_csv(StringIO(data), sep=r"\s+")
expected = DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]})
tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]])
def test_uneven_lines_with_usecols(all_parsers, usecols):
# see gh-12203
parser = all_parsers
data = r"""a,b,c
0,1,2
3,4,5,6,7
8,9,10"""
if usecols is None:
# Make sure that an error is still raised
# when the "usecols" parameter is not provided.
msg = r"Expected \d+ fields in line \d+, saw \d+"
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data))
else:
expected = DataFrame({"a": [0, 3, 8], "b": [1, 4, 9]})
result = parser.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize(
"data,kwargs,expected",
[
# First, check to see that the response of parser when faced with no
# provided columns raises the correct error, with or without usecols.
("", {}, None),
("", {"usecols": ["X"]}, None),
(
",,",
{"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]},
DataFrame(columns=["X"], index=[0], dtype=np.float64),
),
(
"",
{"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]},
DataFrame(columns=["X"]),
),
],
)
def test_read_empty_with_usecols(all_parsers, data, kwargs, expected):
# see gh-12493
parser = all_parsers
if expected is None:
msg = "No columns to parse from file"
with pytest.raises(EmptyDataError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
else:
result = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"kwargs,expected",
[
# gh-8661, gh-8679: this should ignore six lines, including
# lines with trailing whitespace and blank lines.
(
{
"header": None,
"delim_whitespace": True,
"skiprows": [0, 1, 2, 3, 5, 6],
"skip_blank_lines": True,
},
DataFrame([[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]]),
),
# gh-8983: test skipping set of rows after a row with trailing spaces.
(
{
"delim_whitespace": True,
"skiprows": [1, 2, 3, 5, 6],
"skip_blank_lines": True,
},
DataFrame({"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}),
),
],
)
def test_trailing_spaces(all_parsers, kwargs, expected):
data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa: E501
parser = all_parsers
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
if parser.engine == "pyarrow":
msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
return
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
tm.assert_frame_equal(result, expected)
def test_raise_on_sep_with_delim_whitespace(all_parsers):
# see gh-6607
data = "a b c\n1 2 3"
parser = all_parsers
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with pytest.raises(ValueError, match="you can only specify one"):
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True)
def test_read_filepath_or_buffer(all_parsers):
# see gh-43366
parser = all_parsers
with pytest.raises(TypeError, match="Expected file path name or file-like"):
parser.read_csv(filepath_or_buffer=b"input")
@pytest.mark.parametrize("delim_whitespace", [True, False])
def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
# see gh-9710
parser = all_parsers
data = """\
MyColumn
a
b
a
b\n"""
expected = DataFrame({"MyColumn": list("abab")})
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
if parser.engine == "pyarrow":
msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(
StringIO(data),
skipinitialspace=True,
delim_whitespace=delim_whitespace,
)
return
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(
StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"sep,skip_blank_lines,exp_data",
[
(",", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]),
(r"\s+", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]),
(
",",
False,
[
[1.0, 2.0, 4.0],
[np.nan, np.nan, np.nan],
[np.nan, np.nan, np.nan],
[5.0, np.nan, 10.0],
[np.nan, np.nan, np.nan],
[-70.0, 0.4, 1.0],
],
),
],
)
def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data, request):
parser = all_parsers
data = """\
A,B,C
1,2.,4.
5.,NaN,10.0
-70,.4,1
"""
if sep == r"\s+":
data = data.replace(",", " ")
if parser.engine == "pyarrow":
msg = "the 'pyarrow' engine does not support regex separators"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines
)
return
result = parser.read_csv(StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines)
expected = DataFrame(exp_data, columns=["A", "B", "C"])
tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_whitespace_lines(all_parsers):
parser = all_parsers
data = """
\t \t\t
\t
A,B,C
\t 1,2.,4.
5.,NaN,10.0
"""
expected = DataFrame([[1, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"])
result = parser.read_csv(StringIO(data))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data,expected",
[
(
""" A B C D
a 1 2 3 4
b 1 2 3 4
c 1 2 3 4
""",
DataFrame(
[[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
columns=["A", "B", "C", "D"],
index=["a", "b", "c"],
),
),
(
" a b c\n1 2 3 \n4 5 6\n 7 8 9",
DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]),
),
],
)
def test_whitespace_regex_separator(all_parsers, data, expected):
# see gh-6607
parser = all_parsers
if parser.engine == "pyarrow":
msg = "the 'pyarrow' engine does not support regex separators"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), sep=r"\s+")
return
result = parser.read_csv(StringIO(data), sep=r"\s+")
tm.assert_frame_equal(result, expected)
def test_sub_character(all_parsers, csv_dir_path):
# see gh-16893
filename = os.path.join(csv_dir_path, "sub_char.csv")
expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"])
parser = all_parsers
result = parser.read_csv(filename)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv", "中文文件名.csv"])
def test_filename_with_special_chars(all_parsers, filename):
# see gh-15086.
parser = all_parsers
df = DataFrame({"a": [1, 2, 3]})
with tm.ensure_clean(filename) as path:
df.to_csv(path, index=False)
result = parser.read_csv(path)
tm.assert_frame_equal(result, df)
def test_read_table_same_signature_as_read_csv(all_parsers):
# GH-34976
parser = all_parsers
table_sign = signature(parser.read_table)
csv_sign = signature(parser.read_csv)
assert table_sign.parameters.keys() == csv_sign.parameters.keys()
assert table_sign.return_annotation == csv_sign.return_annotation
for key, csv_param in csv_sign.parameters.items():
table_param = table_sign.parameters[key]
if key == "sep":
assert csv_param.default == ","
assert table_param.default == "\t"
assert table_param.annotation == csv_param.annotation
assert table_param.kind == csv_param.kind
continue
assert table_param == csv_param
def test_read_table_equivalency_to_read_csv(all_parsers):
# see gh-21948
# As of 0.25.0, read_table is undeprecated
parser = all_parsers
data = "a\tb\n1\t2\n3\t4"
expected = parser.read_csv(StringIO(data), sep="\t")
result = parser.read_table(StringIO(data))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("read_func", ["read_csv", "read_table"])
def test_read_csv_and_table_sys_setprofile(all_parsers, read_func):
# GH#41069
parser = all_parsers
data = "a b\n0 1"
sys.setprofile(lambda *a, **k: None)
result = getattr(parser, read_func)(StringIO(data))
sys.setprofile(None)
expected = DataFrame({"a b": ["0 1"]})
tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_first_row_bom(all_parsers):
# see gh-26545
parser = all_parsers
data = '''\ufeff"Head1"\t"Head2"\t"Head3"'''
result = parser.read_csv(StringIO(data), delimiter="\t")
expected = DataFrame(columns=["Head1", "Head2", "Head3"])
tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_first_row_bom_unquoted(all_parsers):
# see gh-36343
parser = all_parsers
data = """\ufeffHead1\tHead2\tHead3"""
result = parser.read_csv(StringIO(data), delimiter="\t")
expected = DataFrame(columns=["Head1", "Head2", "Head3"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("nrows", range(1, 6))
def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
# GH 28071
ref = DataFrame(
[[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]],
columns=list("ab"),
)
csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"
parser = all_parsers
if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False
)
return
df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False)
tm.assert_frame_equal(df, ref[:nrows])
@skip_pyarrow
def test_no_header_two_extra_columns(all_parsers):
# GH 26218
column_names = ["one", "two", "three"]
ref = DataFrame([["foo", "bar", "baz"]], columns=column_names)
stream = StringIO("foo,bar,baz,bam,blah")
parser = all_parsers
df = parser.read_csv_check_warnings(
ParserWarning,
"Length of header or names does not match length of data. "
"This leads to a loss of data with index_col=False.",
stream,
header=None,
names=column_names,
index_col=False,
)
tm.assert_frame_equal(df, ref)
def test_read_csv_names_not_accepting_sets(all_parsers):
# GH 34946
data = """\
1,2,3
4,5,6\n"""
parser = all_parsers
with pytest.raises(ValueError, match="Names should be an ordered collection."):
parser.read_csv(StringIO(data), names=set("QAZ"))
def test_read_table_delim_whitespace_default_sep(all_parsers):
# GH: 35958
f = StringIO("a b c\n1 -2 -3\n4 5 6")
parser = all_parsers
depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"
if parser.engine == "pyarrow":
msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_table(f, delim_whitespace=True)
return
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_table(f, delim_whitespace=True)
expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("delimiter", [",", "\t"])
def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter):
# GH: 35958
f = StringIO("a b c\n1 -2 -3\n4 5 6")
parser = all_parsers
msg = (
"Specified a delimiter with both sep and "
"delim_whitespace=True; you can only specify one."
)
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
with pytest.raises(ValueError, match=msg):
parser.read_csv(f, delim_whitespace=True, sep=delimiter)
with pytest.raises(ValueError, match=msg):
parser.read_csv(f, delim_whitespace=True, delimiter=delimiter)
def test_read_csv_delimiter_and_sep_no_default(all_parsers):
# GH#39823
f = StringIO("a,b\n1,2")
parser = all_parsers
msg = "Specified a sep and a delimiter; you can only specify one."
with pytest.raises(ValueError, match=msg):
parser.read_csv(f, sep=" ", delimiter=".")
@pytest.mark.parametrize("kwargs", [{"delimiter": "\n"}, {"sep": "\n"}])
def test_read_csv_line_break_as_separator(kwargs, all_parsers):
# GH#43528
parser = all_parsers
data = """a,b,c
1,2,3
"""
msg = (
r"Specified \\n as separator or delimiter. This forces the python engine "
r"which does not accept a line terminator. Hence it is not allowed to use "
r"the line terminator as separator."
)
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
@pytest.mark.parametrize("delimiter", [",", "\t"])
def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter):
# GH: 35958
f = StringIO("a b c\n1 -2 -3\n4 5 6")
parser = all_parsers
msg = (
"Specified a delimiter with both sep and "
"delim_whitespace=True; you can only specify one."
)
depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, sep=delimiter)
with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, delimiter=delimiter)
@skip_pyarrow
def test_dict_keys_as_names(all_parsers):
# GH: 36928
data = "1,2"
keys = {"a": int, "b": int}.keys()
parser = all_parsers
result = parser.read_csv(StringIO(data), names=keys)
expected = DataFrame({"a": [1], "b": [2]})
tm.assert_frame_equal(result, expected)
@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
@xfail_pyarrow # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0
def test_encoding_surrogatepass(all_parsers):
# GH39017
parser = all_parsers
content = b"\xed\xbd\xbf"
decoded = content.decode("utf-8", errors="surrogatepass")
expected = DataFrame({decoded: [decoded]}, index=[decoded * 2])
expected.index.name = decoded * 2
with tm.ensure_clean() as path:
Path(path).write_bytes(
content * 2 + b"," + content + b"\n" + content * 2 + b"," + content
)
df = parser.read_csv(path, encoding_errors="surrogatepass", index_col=0)
tm.assert_frame_equal(df, expected)
with pytest.raises(UnicodeDecodeError, match="'utf-8' codec can't decode byte"):
parser.read_csv(path)
def test_malformed_second_line(all_parsers):
# see GH14782
parser = all_parsers
data = "\na\nb\n"
result = parser.read_csv(StringIO(data), skip_blank_lines=False, header=1)
expected = DataFrame({"a": ["b"]})
tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_short_single_line(all_parsers):
# GH 47566
parser = all_parsers
columns = ["a", "b", "c"]
data = "1,2"
result = parser.read_csv(StringIO(data), header=None, names=columns)
expected = DataFrame({"a": [1], "b": [2], "c": [np.nan]})
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: Length mismatch: Expected axis has 2 elements
def test_short_multi_line(all_parsers):
# GH 47566
parser = all_parsers
columns = ["a", "b", "c"]
data = "1,2\n1,2"
result = parser.read_csv(StringIO(data), header=None, names=columns)
expected = DataFrame({"a": [1, 1], "b": [2, 2], "c": [np.nan, np.nan]})
tm.assert_frame_equal(result, expected)
def test_read_seek(all_parsers):
# GH48646
parser = all_parsers
prefix = "### DATA\n"
content = "nkey,value\ntables,rectangular\n"
with tm.ensure_clean() as path:
Path(path).write_text(prefix + content, encoding="utf-8")
with open(path, encoding="utf-8") as file:
file.readline()
actual = parser.read_csv(file)
expected = parser.read_csv(StringIO(content))
tm.assert_frame_equal(actual, expected)
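
Two of the options exercised in the file above, shown on small hypothetical inputs (thousands separators and backslash-escaped quotes):

from io import StringIO
import pandas as pd

# thousands="," turns "2,334" into the integer 2334 (cf. test_1000_sep)
df = pd.read_csv(StringIO("A|B\n1|2,334\n10|13\n"), sep="|", thousands=",")

# escapechar="\\" lets \" appear inside a quoted field (cf. test_escapechar)
data = 'term,url\n"SLAGBORD, \\"Bergslagen\\"","http://example.com"\n'
df2 = pd.read_csv(StringIO(data), escapechar="\\", quotechar='"')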


@@ -0,0 +1,91 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
import csv
from io import StringIO
import pytest
from pandas import DataFrame
import pandas._testing as tm
from pandas.io.parsers import TextParser
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
@xfail_pyarrow
def test_read_data_list(all_parsers):
parser = all_parsers
kwargs = {"index_col": 0}
data = "A,B,C\nfoo,1,2,3\nbar,4,5,6"
data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]]
expected = parser.read_csv(StringIO(data), **kwargs)
with TextParser(data_list, chunksize=2, **kwargs) as parser:
result = parser.read()
tm.assert_frame_equal(result, expected)
def test_reader_list(all_parsers):
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
parser = all_parsers
kwargs = {"index_col": 0}
lines = list(csv.reader(StringIO(data)))
with TextParser(lines, chunksize=2, **kwargs) as reader:
chunks = list(reader)
expected = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(chunks[0], expected[:2])
tm.assert_frame_equal(chunks[1], expected[2:4])
tm.assert_frame_equal(chunks[2], expected[4:])
def test_reader_list_skiprows(all_parsers):
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
parser = all_parsers
kwargs = {"index_col": 0}
lines = list(csv.reader(StringIO(data)))
with TextParser(lines, chunksize=2, skiprows=[1], **kwargs) as reader:
chunks = list(reader)
expected = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(chunks[0], expected[1:3])
def test_read_csv_parse_simple_list(all_parsers):
parser = all_parsers
data = """foo
bar baz
qux foo
foo
bar"""
result = parser.read_csv(StringIO(data), header=None)
expected = DataFrame(["foo", "bar baz", "qux foo", "foo", "bar"])
tm.assert_frame_equal(result, expected)
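
A sketch of feeding pre-tokenized rows to TextParser as the tests above do; TextParser is an internal helper, so this mirrors the test usage rather than a public API guarantee:

from pandas.io.parsers import TextParser

rows = [["A", "B"], ["1", "2"], ["3", "4"]]  # first row becomes the header
with TextParser(rows, chunksize=2) as reader:
    frames = list(reader)  # list of DataFrame chunks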


@@ -0,0 +1,72 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO
import pytest
from pandas import DataFrame
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@pytest.mark.parametrize(
"data,thousands,decimal",
[
(
"""A|B|C
1|2,334.01|5
10|13|10.
""",
",",
".",
),
(
"""A|B|C
1|2.334,01|5
10|13|10,
""",
".",
",",
),
],
)
def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal):
parser = all_parsers
expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})
if parser.engine == "pyarrow":
msg = "The 'thousands' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data), sep="|", thousands=thousands, decimal=decimal
)
return
result = parser.read_csv(
StringIO(data), sep="|", thousands=thousands, decimal=decimal
)
tm.assert_frame_equal(result, expected)
def test_euro_decimal_format(all_parsers):
parser = all_parsers
data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""
result = parser.read_csv(StringIO(data), sep=";", decimal=",")
expected = DataFrame(
[
[1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819],
[2, 121.12, 14897.76, "DEF", "uyt", 0.377320872],
[3, 878.158, 108013.434, "GHI", "rez", 2.735694704],
],
columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
)
tm.assert_frame_equal(result, expected)
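
A compact sketch of the European number format handled above, assuming inline data:

from io import StringIO
import pandas as pd

data = "Id;Amount\n1;1.234,56\n2;7.890,12\n"
df = pd.read_csv(StringIO(data), sep=";", thousands=".", decimal=",")
# df["Amount"] is parsed as [1234.56, 7890.12]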


@@ -0,0 +1,478 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import (
BytesIO,
StringIO,
)
import os
import platform
from urllib.error import URLError
import uuid
import numpy as np
import pytest
from pandas.errors import (
EmptyDataError,
ParserError,
)
import pandas.util._test_decorators as td
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@pytest.mark.network
@pytest.mark.single_cpu
def test_url(all_parsers, csv_dir_path, httpserver):
parser = all_parsers
kwargs = {"sep": "\t"}
local_path = os.path.join(csv_dir_path, "salaries.csv")
with open(local_path, encoding="utf-8") as f:
httpserver.serve_content(content=f.read())
url_result = parser.read_csv(httpserver.url, **kwargs)
local_result = parser.read_csv(local_path, **kwargs)
tm.assert_frame_equal(url_result, local_result)
@pytest.mark.slow
def test_local_file(all_parsers, csv_dir_path):
parser = all_parsers
kwargs = {"sep": "\t"}
local_path = os.path.join(csv_dir_path, "salaries.csv")
local_result = parser.read_csv(local_path, **kwargs)
url = "file://localhost/" + local_path
try:
url_result = parser.read_csv(url, **kwargs)
tm.assert_frame_equal(url_result, local_result)
except URLError:
# Fails on some systems.
pytest.skip("Failing on: " + " ".join(platform.uname()))
@xfail_pyarrow # AssertionError: DataFrame.index are different
def test_path_path_lib(all_parsers):
parser = all_parsers
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)
result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0))
tm.assert_frame_equal(df, result)
@xfail_pyarrow # AssertionError: DataFrame.index are different
def test_path_local_path(all_parsers):
parser = all_parsers
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)
result = tm.round_trip_localpath(
df.to_csv, lambda p: parser.read_csv(p, index_col=0)
)
tm.assert_frame_equal(df, result)
def test_nonexistent_path(all_parsers):
# gh-2428: pls no segfault
# gh-14086: raise more helpful FileNotFoundError
# GH#29233 "File foo" instead of "File b'foo'"
parser = all_parsers
path = f"{uuid.uuid4()}.csv"
msg = r"\[Errno 2\]"
with pytest.raises(FileNotFoundError, match=msg) as e:
parser.read_csv(path)
assert path == e.value.filename
@td.skip_if_windows # os.chmod does not work in windows
def test_no_permission(all_parsers):
# GH 23784
parser = all_parsers
msg = r"\[Errno 13\]"
with tm.ensure_clean() as path:
os.chmod(path, 0) # make file unreadable
# verify that this process cannot open the file (not running as sudo)
try:
with open(path, encoding="utf-8"):
pass
pytest.skip("Running as sudo.")
except PermissionError:
pass
with pytest.raises(PermissionError, match=msg) as e:
parser.read_csv(path)
assert path == e.value.filename
@pytest.mark.parametrize(
"data,kwargs,expected,msg",
[
# gh-10728: WHITESPACE_LINE
(
"a,b,c\n4,5,6\n ",
{},
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
None,
),
# gh-10548: EAT_LINE_COMMENT
(
"a,b,c\n4,5,6\n#comment",
{"comment": "#"},
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
None,
),
# EAT_CRNL_NOP
(
"a,b,c\n4,5,6\n\r",
{},
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
None,
),
# EAT_COMMENT
(
"a,b,c\n4,5,6#comment",
{"comment": "#"},
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
None,
),
# SKIP_LINE
(
"a,b,c\n4,5,6\nskipme",
{"skiprows": [2]},
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
None,
),
# EAT_LINE_COMMENT
(
"a,b,c\n4,5,6\n#comment",
{"comment": "#", "skip_blank_lines": False},
DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
None,
),
# IN_FIELD
(
"a,b,c\n4,5,6\n ",
{"skip_blank_lines": False},
DataFrame([["4", 5, 6], [" ", None, None]], columns=["a", "b", "c"]),
None,
),
# EAT_CRNL
(
"a,b,c\n4,5,6\n\r",
{"skip_blank_lines": False},
DataFrame([[4, 5, 6], [None, None, None]], columns=["a", "b", "c"]),
None,
),
# ESCAPED_CHAR
(
"a,b,c\n4,5,6\n\\",
{"escapechar": "\\"},
None,
"(EOF following escape character)|(unexpected end of data)",
),
# ESCAPE_IN_QUOTED_FIELD
(
'a,b,c\n4,5,6\n"\\',
{"escapechar": "\\"},
None,
"(EOF inside string starting at row 2)|(unexpected end of data)",
),
# IN_QUOTED_FIELD
(
'a,b,c\n4,5,6\n"',
{"escapechar": "\\"},
None,
"(EOF inside string starting at row 2)|(unexpected end of data)",
),
],
ids=[
"whitespace-line",
"eat-line-comment",
"eat-crnl-nop",
"eat-comment",
"skip-line",
"eat-line-comment",
"in-field",
"eat-crnl",
"escaped-char",
"escape-in-quoted-field",
"in-quoted-field",
],
)
def test_eof_states(all_parsers, data, kwargs, expected, msg, request):
# see gh-10728, gh-10548
parser = all_parsers
if parser.engine == "pyarrow" and "comment" in kwargs:
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
return
if parser.engine == "pyarrow" and "\r" not in data:
# pandas.errors.ParserError: CSV parse error: Expected 3 columns, got 1:
# ValueError: skiprows argument must be an integer when using engine='pyarrow'
# AssertionError: Regex pattern did not match.
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
if expected is None:
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
else:
result = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(result, expected)
def test_temporary_file(all_parsers):
# see gh-13398
parser = all_parsers
data = "0 0"
with tm.ensure_clean(mode="w+", return_filelike=True) as new_file:
new_file.write(data)
new_file.flush()
new_file.seek(0)
if parser.engine == "pyarrow":
msg = "the 'pyarrow' engine does not support regex separators"
with pytest.raises(ValueError, match=msg):
parser.read_csv(new_file, sep=r"\s+", header=None)
return
result = parser.read_csv(new_file, sep=r"\s+", header=None)
expected = DataFrame([[0, 0]])
tm.assert_frame_equal(result, expected)
def test_internal_eof_byte(all_parsers):
# see gh-5500
parser = all_parsers
data = "a,b\n1\x1a,2"
expected = DataFrame([["1\x1a", 2]], columns=["a", "b"])
result = parser.read_csv(StringIO(data))
tm.assert_frame_equal(result, expected)
def test_internal_eof_byte_to_file(all_parsers):
# see gh-16559
parser = all_parsers
data = b'c1,c2\r\n"test \x1a test", test\r\n'
expected = DataFrame([["test \x1a test", " test"]], columns=["c1", "c2"])
path = f"__{uuid.uuid4()}__.csv"
with tm.ensure_clean(path) as path:
with open(path, "wb") as f:
f.write(data)
result = parser.read_csv(path)
tm.assert_frame_equal(result, expected)
def test_file_handle_string_io(all_parsers):
# gh-14418
#
# Don't close user provided file handles.
parser = all_parsers
data = "a,b\n1,2"
fh = StringIO(data)
parser.read_csv(fh)
assert not fh.closed
def test_file_handles_with_open(all_parsers, csv1):
# gh-14418
#
# Don't close user provided file handles.
parser = all_parsers
for mode in ["r", "rb"]:
with open(csv1, mode, encoding="utf-8" if mode == "r" else None) as f:
parser.read_csv(f)
assert not f.closed
def test_invalid_file_buffer_class(all_parsers):
# see gh-15337
class InvalidBuffer:
pass
parser = all_parsers
msg = "Invalid file path or buffer object type"
with pytest.raises(ValueError, match=msg):
parser.read_csv(InvalidBuffer())
def test_invalid_file_buffer_mock(all_parsers):
# see gh-15337
parser = all_parsers
msg = "Invalid file path or buffer object type"
class Foo:
pass
with pytest.raises(ValueError, match=msg):
parser.read_csv(Foo())
def test_valid_file_buffer_seems_invalid(all_parsers):
# gh-16135: we want to ensure that "tell" and "seek"
# aren't actually being used when we call `read_csv`
#
# Thus, while the object may look "invalid" (these
# methods are attributes of the `StringIO` class),
# it is still a valid file-object for our purposes.
class NoSeekTellBuffer(StringIO):
def tell(self):
raise AttributeError("No tell method")
def seek(self, pos, whence=0):
raise AttributeError("No seek method")
data = "a\n1"
parser = all_parsers
expected = DataFrame({"a": [1]})
result = parser.read_csv(NoSeekTellBuffer(data))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("io_class", [StringIO, BytesIO])
@pytest.mark.parametrize("encoding", [None, "utf-8"])
def test_read_csv_file_handle(all_parsers, io_class, encoding):
"""
Test whether read_csv does not close user-provided file handles.
GH 36980
"""
parser = all_parsers
expected = DataFrame({"a": [1], "b": [2]})
content = "a,b\n1,2"
handle = io_class(content.encode("utf-8") if io_class == BytesIO else content)
tm.assert_frame_equal(parser.read_csv(handle, encoding=encoding), expected)
assert not handle.closed
def test_memory_map_compression(all_parsers, compression):
"""
Support memory map for compressed files.
GH 37621
"""
parser = all_parsers
expected = DataFrame({"a": [1], "b": [2]})
with tm.ensure_clean() as path:
expected.to_csv(path, index=False, compression=compression)
if parser.engine == "pyarrow":
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(path, memory_map=True, compression=compression)
return
result = parser.read_csv(path, memory_map=True, compression=compression)
tm.assert_frame_equal(
result,
expected,
)
def test_context_manager(all_parsers, datapath):
# make sure that opened files are closed
parser = all_parsers
path = datapath("io", "data", "csv", "iris.csv")
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(path, chunksize=1)
return
reader = parser.read_csv(path, chunksize=1)
assert not reader.handles.handle.closed
try:
with reader:
next(reader)
assert False
except AssertionError:
assert reader.handles.handle.closed
def test_context_manageri_user_provided(all_parsers, datapath):
# make sure that user-provided handles are not closed
parser = all_parsers
with open(datapath("io", "data", "csv", "iris.csv"), encoding="utf-8") as path:
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(path, chunksize=1)
return
reader = parser.read_csv(path, chunksize=1)
assert not reader.handles.handle.closed
try:
with reader:
next(reader)
assert False
except AssertionError:
assert not reader.handles.handle.closed
@skip_pyarrow # ParserError: Empty CSV file
def test_file_descriptor_leak(all_parsers, using_copy_on_write):
# GH 31488
parser = all_parsers
with tm.ensure_clean() as path:
with pytest.raises(EmptyDataError, match="No columns to parse from file"):
parser.read_csv(path)
def test_memory_map(all_parsers, csv_dir_path):
mmap_file = os.path.join(csv_dir_path, "test_mmap.csv")
parser = all_parsers
expected = DataFrame(
{"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]}
)
if parser.engine == "pyarrow":
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(mmap_file, memory_map=True)
return
result = parser.read_csv(mmap_file, memory_map=True)
tm.assert_frame_equal(result, expected)
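
A sketch of the handle-ownership behaviour tested above: read_csv leaves user-provided buffers open, and memory_map applies to paths it opens itself (the path below is hypothetical):

from io import StringIO
import pandas as pd

buf = StringIO("a,b\n1,2\n")
df = pd.read_csv(buf)
assert not buf.closed  # pandas does not close handles it did not open

# For an on-disk file, memory mapping can replace buffered reads:
# pd.read_csv("data.csv", memory_map=True)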


@@ -0,0 +1,79 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO
import numpy as np
import pytest
from pandas.compat import is_platform_linux
from pandas import DataFrame
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@skip_pyarrow # ParserError: CSV parse error: Empty CSV file or block
def test_float_parser(all_parsers):
# see gh-9565
parser = all_parsers
data = "45e-1,4.5,45.,inf,-inf"
result = parser.read_csv(StringIO(data), header=None)
expected = DataFrame([[float(s) for s in data.split(",")]])
tm.assert_frame_equal(result, expected)
def test_scientific_no_exponent(all_parsers_all_precisions):
# see gh-12215
df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]})
data = df.to_csv(index=False)
parser, precision = all_parsers_all_precisions
df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision)
tm.assert_frame_equal(df_roundtrip, df)
@pytest.mark.parametrize(
"neg_exp",
[
-617,
-100000,
pytest.param(-99999999999999999, marks=pytest.mark.skip_ubsan),
],
)
def test_very_negative_exponent(all_parsers_all_precisions, neg_exp):
# GH#38753
parser, precision = all_parsers_all_precisions
data = f"data\n10E{neg_exp}"
result = parser.read_csv(StringIO(data), float_precision=precision)
expected = DataFrame({"data": [0.0]})
tm.assert_frame_equal(result, expected)
@pytest.mark.skip_ubsan
@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999])
def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
# GH#38753
parser, precision = all_parsers_all_precisions
data = f"data\n10E{exp}"
result = parser.read_csv(StringIO(data), float_precision=precision)
if precision == "round_trip":
if exp == 999999999999999999 and is_platform_linux():
mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result")
request.applymarker(mark)
value = np.inf if exp > 0 else 0.0
expected = DataFrame({"data": [value]})
else:
expected = DataFrame({"data": [f"10E{exp}"]})
tm.assert_frame_equal(result, expected)
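
A minimal sketch of the float_precision option exercised above, with inline data:

from io import StringIO
import pandas as pd

data = "x\n0.123456789012345678\n"
fast = pd.read_csv(StringIO(data))  # default C-engine float parser
exact = pd.read_csv(StringIO(data), float_precision="round_trip")
# "round_trip" selects the slower converter whose result round-trips to the input text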


@@ -0,0 +1,304 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from datetime import datetime
from io import StringIO
import os
import pytest
from pandas import (
DataFrame,
Index,
MultiIndex,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@pytest.mark.parametrize(
"data,kwargs,expected",
[
(
"""foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
""",
{"index_col": 0, "names": ["index", "A", "B", "C", "D"]},
DataFrame(
[
[2, 3, 4, 5],
[7, 8, 9, 10],
[12, 13, 14, 15],
[12, 13, 14, 15],
[12, 13, 14, 15],
[12, 13, 14, 15],
],
index=Index(["foo", "bar", "baz", "qux", "foo2", "bar2"], name="index"),
columns=["A", "B", "C", "D"],
),
),
(
"""foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
""",
{"index_col": [0, 1], "names": ["index1", "index2", "A", "B", "C", "D"]},
DataFrame(
[
[2, 3, 4, 5],
[7, 8, 9, 10],
[12, 13, 14, 15],
[12, 13, 14, 15],
[12, 13, 14, 15],
],
index=MultiIndex.from_tuples(
[
("foo", "one"),
("foo", "two"),
("foo", "three"),
("bar", "one"),
("bar", "two"),
],
names=["index1", "index2"],
),
columns=["A", "B", "C", "D"],
),
),
],
)
def test_pass_names_with_index(all_parsers, data, kwargs, expected):
parser = all_parsers
result = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
def test_multi_index_no_level_names(
request, all_parsers, index_col, using_infer_string
):
data = """index1,index2,A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""
headless_data = "\n".join(data.split("\n")[1:])
names = ["A", "B", "C", "D"]
parser = all_parsers
result = parser.read_csv(
StringIO(headless_data), index_col=index_col, header=None, names=names
)
expected = parser.read_csv(StringIO(data), index_col=index_col)
# No index names in headless data.
expected.index.names = [None] * 2
tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_multi_index_no_level_names_implicit(all_parsers):
parser = all_parsers
data = """A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[
[2, 3, 4, 5],
[7, 8, 9, 10],
[12, 13, 14, 15],
[12, 13, 14, 15],
[12, 13, 14, 15],
],
columns=["A", "B", "C", "D"],
index=MultiIndex.from_tuples(
[
("foo", "one"),
("foo", "two"),
("foo", "three"),
("bar", "one"),
("bar", "two"),
]
),
)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
@pytest.mark.parametrize(
"data,expected,header",
[
("a,b", DataFrame(columns=["a", "b"]), [0]),
(
"a,b\nc,d",
DataFrame(columns=MultiIndex.from_tuples([("a", "c"), ("b", "d")])),
[0, 1],
),
],
)
@pytest.mark.parametrize("round_trip", [True, False])
def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip):
# see gh-14545
parser = all_parsers
data = expected.to_csv(index=False) if round_trip else data
result = parser.read_csv(StringIO(data), header=header)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # AssertionError: DataFrame.columns are different
def test_no_unnamed_index(all_parsers):
parser = all_parsers
data = """ id c0 c1 c2
0 1 0 a b
1 2 0 c d
2 2 2 e f
"""
result = parser.read_csv(StringIO(data), sep=" ")
expected = DataFrame(
[[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], [2, 2, 2, "e", "f"]],
columns=["Unnamed: 0", "id", "c0", "c1", "c2"],
)
tm.assert_frame_equal(result, expected)
def test_read_duplicate_index_explicit(all_parsers):
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo,12,13,14,15
bar,12,13,14,15
"""
parser = all_parsers
result = parser.read_csv(StringIO(data), index_col=0)
expected = DataFrame(
[
[2, 3, 4, 5],
[7, 8, 9, 10],
[12, 13, 14, 15],
[12, 13, 14, 15],
[12, 13, 14, 15],
[12, 13, 14, 15],
],
columns=["A", "B", "C", "D"],
index=Index(["foo", "bar", "baz", "qux", "foo", "bar"], name="index"),
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_read_duplicate_index_implicit(all_parsers):
data = """A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo,12,13,14,15
bar,12,13,14,15
"""
parser = all_parsers
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[
[2, 3, 4, 5],
[7, 8, 9, 10],
[12, 13, 14, 15],
[12, 13, 14, 15],
[12, 13, 14, 15],
[12, 13, 14, 15],
],
columns=["A", "B", "C", "D"],
index=Index(["foo", "bar", "baz", "qux", "foo", "bar"]),
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_read_csv_no_index_name(all_parsers, csv_dir_path):
parser = all_parsers
csv2 = os.path.join(csv_dir_path, "test2.csv")
result = parser.read_csv(csv2, index_col=0, parse_dates=True)
expected = DataFrame(
[
[0.980269, 3.685731, -0.364216805298, -1.159738, "foo"],
[1.047916, -0.041232, -0.16181208307, 0.212549, "bar"],
[0.498581, 0.731168, -0.537677223318, 1.346270, "baz"],
[1.120202, 1.567621, 0.00364077397681, 0.675253, "qux"],
[-0.487094, 0.571455, -1.6116394093, 0.103469, "foo2"],
],
columns=["A", "B", "C", "D", "E"],
index=Index(
[
datetime(2000, 1, 3),
datetime(2000, 1, 4),
datetime(2000, 1, 5),
datetime(2000, 1, 6),
datetime(2000, 1, 7),
]
),
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_empty_with_index(all_parsers):
# see gh-10184
data = "x,y"
parser = all_parsers
result = parser.read_csv(StringIO(data), index_col=0)
expected = DataFrame(columns=["y"], index=Index([], name="x"))
tm.assert_frame_equal(result, expected)
# CSV parse error: Empty CSV file or block: cannot infer number of columns
@skip_pyarrow
def test_empty_with_multi_index(all_parsers):
# see gh-10467
data = "x,y,z"
parser = all_parsers
result = parser.read_csv(StringIO(data), index_col=["x", "y"])
expected = DataFrame(
columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"])
)
tm.assert_frame_equal(result, expected)
# CSV parse error: Empty CSV file or block: cannot infer number of columns
@skip_pyarrow
def test_empty_with_reversed_multi_index(all_parsers):
data = "x,y,z"
parser = all_parsers
result = parser.read_csv(StringIO(data), index_col=[1, 0])
expected = DataFrame(
columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"])
)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,78 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO
import numpy as np
import pytest
from pandas import (
DataFrame,
option_context,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
@xfail_pyarrow # AssertionError: DataFrame.index are different
@pytest.mark.parametrize("na_filter", [True, False])
def test_inf_parsing(all_parsers, na_filter):
parser = all_parsers
data = """\
,A
a,inf
b,-inf
c,+Inf
d,-Inf
e,INF
f,-INF
g,+INf
h,-INf
i,inF
j,-inF"""
expected = DataFrame(
{"A": [float("inf"), float("-inf")] * 5},
index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
)
result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # AssertionError: DataFrame.index are different
@pytest.mark.parametrize("na_filter", [True, False])
def test_infinity_parsing(all_parsers, na_filter):
parser = all_parsers
data = """\
,A
a,Infinity
b,-Infinity
c,+Infinity
"""
expected = DataFrame(
{"A": [float("infinity"), float("-infinity"), float("+infinity")]},
index=["a", "b", "c"],
)
result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter)
tm.assert_frame_equal(result, expected)
def test_read_csv_with_use_inf_as_na(all_parsers):
# https://github.com/pandas-dev/pandas/issues/35493
parser = all_parsers
data = "1.0\nNaN\n3.0"
msg = "use_inf_as_na option is deprecated"
warn = FutureWarning
if parser.engine == "pyarrow":
warn = (FutureWarning, DeprecationWarning)
with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False):
with option_context("use_inf_as_na", True):
result = parser.read_csv(StringIO(data), header=None)
expected = DataFrame([1.0, np.nan, 3.0])
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,231 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO
import numpy as np
import pytest
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
def test_int_conversion(all_parsers):
data = """A,B
1.0,1
2.0,2
3.0,3
"""
parser = all_parsers
result = parser.read_csv(StringIO(data))
expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["A", "B"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data,kwargs,expected",
[
(
"A,B\nTrue,1\nFalse,2\nTrue,3",
{},
DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]),
),
(
"A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3",
{"true_values": ["yes", "Yes", "YES"], "false_values": ["no", "NO", "No"]},
DataFrame(
[[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]],
columns=["A", "B"],
),
),
(
"A,B\nTRUE,1\nFALSE,2\nTRUE,3",
{},
DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]),
),
(
"A,B\nfoo,bar\nbar,foo",
{"true_values": ["foo"], "false_values": ["bar"]},
DataFrame([[True, False], [False, True]], columns=["A", "B"]),
),
],
)
def test_parse_bool(all_parsers, data, kwargs, expected):
parser = all_parsers
result = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(result, expected)
def test_parse_integers_above_fp_precision(all_parsers):
data = """Numbers
17007000002000191
17007000002000191
17007000002000191
17007000002000191
17007000002000192
17007000002000192
17007000002000192
17007000002000192
17007000002000192
17007000002000194"""
parser = all_parsers
result = parser.read_csv(StringIO(data))
expected = DataFrame(
{
"Numbers": [
17007000002000191,
17007000002000191,
17007000002000191,
17007000002000191,
17007000002000192,
17007000002000192,
17007000002000192,
17007000002000192,
17007000002000192,
17007000002000194,
]
}
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("sep", [" ", r"\s+"])
def test_integer_overflow_bug(all_parsers, sep):
# see gh-2601
data = "65248E10 11\n55555E55 22\n"
parser = all_parsers
if parser.engine == "pyarrow" and sep != " ":
msg = "the 'pyarrow' engine does not support regex separators"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), header=None, sep=sep)
return
result = parser.read_csv(StringIO(data), header=None, sep=sep)
expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]])
tm.assert_frame_equal(result, expected)
def test_int64_min_issues(all_parsers):
# see gh-2599
parser = all_parsers
data = "A,B\n0,0\n0,"
result = parser.read_csv(StringIO(data))
expected = DataFrame({"A": [0, 0], "B": [0, np.nan]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("conv", [None, np.int64, np.uint64])
def test_int64_overflow(all_parsers, conv, request):
data = """ID
00013007854817840016671868
00013007854817840016749251
00013007854817840016754630
00013007854817840016781876
00013007854817840017028824
00013007854817840017963235
00013007854817840018860166"""
parser = all_parsers
if conv is None:
# 13007854817840016671868 > UINT64_MAX, so this
# will overflow and return object as the dtype.
if parser.engine == "pyarrow":
mark = pytest.mark.xfail(reason="parses to float64")
request.applymarker(mark)
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[
"00013007854817840016671868",
"00013007854817840016749251",
"00013007854817840016754630",
"00013007854817840016781876",
"00013007854817840017028824",
"00013007854817840017963235",
"00013007854817840018860166",
],
columns=["ID"],
)
tm.assert_frame_equal(result, expected)
else:
# 13007854817840016671868 > UINT64_MAX, so attempts
# to cast to either int64 or uint64 will result in
# an OverflowError being raised.
msg = "|".join(
[
"Python int too large to convert to C long",
"long too big to convert",
"int too big to convert",
]
)
err = OverflowError
if parser.engine == "pyarrow":
err = ValueError
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
with pytest.raises(err, match=msg):
parser.read_csv(StringIO(data), converters={"ID": conv})
@skip_pyarrow # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize(
"val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min]
)
def test_int64_uint64_range(all_parsers, val):
    # These numbers fall right at the int64/uint64 boundaries,
    # so they can still be parsed as integers rather than strings.
parser = all_parsers
result = parser.read_csv(StringIO(str(val)), header=None)
expected = DataFrame([val])
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize(
"val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]
)
def test_outside_int64_uint64_range(all_parsers, val):
# These numbers fall just outside the int64-uint64
# range, so they should be parsed as string.
parser = all_parsers
result = parser.read_csv(StringIO(str(val)), header=None)
expected = DataFrame([str(val)])
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # gets float64 dtype instead of object
@pytest.mark.parametrize("exp_data", [[str(-1), str(2**63)], [str(2**63), str(-1)]])
def test_numeric_range_too_wide(all_parsers, exp_data):
# No numerical dtype can hold both negative and uint64
# values, so they should be cast as string.
parser = all_parsers
data = "\n".join(exp_data)
expected = DataFrame(exp_data)
result = parser.read_csv(StringIO(data), header=None)
tm.assert_frame_equal(result, expected)
def test_integer_precision(all_parsers):
# Gh 7072
s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765
5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389"""
parser = all_parsers
result = parser.read_csv(StringIO(s), header=None)[4]
expected = Series([4321583677327450765, 4321113141090630389], name=4)
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,134 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO
import pytest
from pandas import (
DataFrame,
concat,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
def test_iterator(all_parsers):
# see gh-6607
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
parser = all_parsers
kwargs = {"index_col": 0}
expected = parser.read_csv(StringIO(data), **kwargs)
if parser.engine == "pyarrow":
msg = "The 'iterator' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), iterator=True, **kwargs)
return
with parser.read_csv(StringIO(data), iterator=True, **kwargs) as reader:
first_chunk = reader.read(3)
tm.assert_frame_equal(first_chunk, expected[:3])
last_chunk = reader.read(5)
tm.assert_frame_equal(last_chunk, expected[3:])
def test_iterator2(all_parsers):
parser = all_parsers
data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
if parser.engine == "pyarrow":
msg = "The 'iterator' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), iterator=True)
return
with parser.read_csv(StringIO(data), iterator=True) as reader:
result = list(reader)
expected = DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
index=["foo", "bar", "baz"],
columns=["A", "B", "C"],
)
tm.assert_frame_equal(result[0], expected)
def test_iterator_stop_on_chunksize(all_parsers):
# gh-3967: stopping iteration when chunksize is specified
parser = all_parsers
data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), chunksize=1)
return
with parser.read_csv(StringIO(data), chunksize=1) as reader:
result = list(reader)
assert len(result) == 3
expected = DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
index=["foo", "bar", "baz"],
columns=["A", "B", "C"],
)
tm.assert_frame_equal(concat(result), expected)
@pytest.mark.parametrize(
"kwargs", [{"iterator": True, "chunksize": 1}, {"iterator": True}, {"chunksize": 1}]
)
def test_iterator_skipfooter_errors(all_parsers, kwargs):
msg = "'skipfooter' not supported for iteration"
parser = all_parsers
data = "a\n1\n2"
if parser.engine == "pyarrow":
msg = (
"The '(chunksize|iterator)' option is not supported with the "
"'pyarrow' engine"
)
with pytest.raises(ValueError, match=msg):
with parser.read_csv(StringIO(data), skipfooter=1, **kwargs) as _:
pass
def test_iteration_open_handle(all_parsers):
parser = all_parsers
kwargs = {"header": None}
with tm.ensure_clean() as path:
with open(path, "w", encoding="utf-8") as f:
f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG")
with open(path, encoding="utf-8") as f:
for line in f:
if "CCC" in line:
break
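            # read_csv continues from the handle's current position, i.e. from
            # the line after "CCC".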
result = parser.read_csv(f, **kwargs)
expected = DataFrame({0: ["DDD", "EEE", "FFF", "GGG"]})
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,320 @@
"""
Tests that work on the Python, C and PyArrow engines but do not have a
specific classification into the other test modules.
"""
import codecs
import csv
from io import StringIO
import os
from pathlib import Path
import numpy as np
import pytest
from pandas.compat import PY311
from pandas.errors import (
EmptyDataError,
ParserError,
ParserWarning,
)
from pandas import DataFrame
import pandas._testing as tm
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
def test_empty_decimal_marker(all_parsers):
data = """A|B|C
1|2,334|5
10|13|10.
"""
# Parsers support only length-1 decimals
msg = "Only length-1 decimal markers supported"
parser = all_parsers
if parser.engine == "pyarrow":
msg = (
"only single character unicode strings can be "
"converted to Py_UCS4, got length 0"
)
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), decimal="")
def test_bad_stream_exception(all_parsers, csv_dir_path):
# see gh-13652
#
# This test validates that both the Python engine and C engine will
# raise UnicodeDecodeError instead of C engine raising ParserError
# and swallowing the exception that caused read to fail.
path = os.path.join(csv_dir_path, "sauron.SHIFT_JIS.csv")
codec = codecs.lookup("utf-8")
utf8 = codecs.lookup("utf-8")
parser = all_parsers
msg = "'utf-8' codec can't decode byte"
# Stream must be binary UTF8.
with open(path, "rb") as handle, codecs.StreamRecoder(
handle, utf8.encode, utf8.decode, codec.streamreader, codec.streamwriter
) as stream:
with pytest.raises(UnicodeDecodeError, match=msg):
parser.read_csv(stream)
def test_malformed(all_parsers):
# see gh-6607
parser = all_parsers
data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
"""
msg = "Expected 3 fields in line 4, saw 5"
err = ParserError
if parser.engine == "pyarrow":
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
err = ValueError
with pytest.raises(err, match=msg):
parser.read_csv(StringIO(data), header=1, comment="#")
@pytest.mark.parametrize("nrows", [5, 3, None])
def test_malformed_chunks(all_parsers, nrows):
data = """ignore
A,B,C
skip
1,2,3
3,5,10 # comment
1,2,3,4,5
2,3,4
"""
parser = all_parsers
if parser.engine == "pyarrow":
msg = "The 'iterator' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data),
header=1,
comment="#",
iterator=True,
chunksize=1,
skiprows=[2],
)
return
msg = "Expected 3 fields in line 6, saw 5"
with parser.read_csv(
StringIO(data), header=1, comment="#", iterator=True, chunksize=1, skiprows=[2]
) as reader:
with pytest.raises(ParserError, match=msg):
reader.read(nrows)
@xfail_pyarrow # does not raise
def test_catch_too_many_names(all_parsers):
# see gh-5156
data = """\
1,2,3
4,,6
7,8,9
10,11,12\n"""
parser = all_parsers
msg = (
"Too many columns specified: expected 4 and found 3"
if parser.engine == "c"
else "Number of passed names did not match "
"number of header fields in the file"
)
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"])
@skip_pyarrow # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5])
def test_raise_on_no_columns(all_parsers, nrows):
parser = all_parsers
data = "\n" * nrows
msg = "No columns to parse from file"
with pytest.raises(EmptyDataError, match=msg):
parser.read_csv(StringIO(data))
def test_unexpected_keyword_parameter_exception(all_parsers):
# GH-34976
parser = all_parsers
msg = "{}\\(\\) got an unexpected keyword argument 'foo'"
with pytest.raises(TypeError, match=msg.format("read_csv")):
parser.read_csv("foo.csv", foo=1)
with pytest.raises(TypeError, match=msg.format("read_table")):
parser.read_table("foo.tsv", foo=1)
def test_suppress_error_output(all_parsers):
# see gh-15925
parser = all_parsers
data = "a\n1\n1,2,3\n4\n5,6,7"
expected = DataFrame({"a": [1, 4]})
result = parser.read_csv(StringIO(data), on_bad_lines="skip")
tm.assert_frame_equal(result, expected)
def test_error_bad_lines(all_parsers):
# see gh-15925
parser = all_parsers
data = "a\n1\n1,2,3\n4\n5,6,7"
msg = "Expected 1 fields in line 3, saw 3"
if parser.engine == "pyarrow":
# "CSV parse error: Expected 1 columns, got 3: 1,2,3"
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), on_bad_lines="error")
def test_warn_bad_lines(all_parsers):
# see gh-15925
parser = all_parsers
data = "a\n1\n1,2,3\n4\n5,6,7"
expected = DataFrame({"a": [1, 4]})
match_msg = "Skipping line"
expected_warning = ParserWarning
if parser.engine == "pyarrow":
match_msg = "Expected 1 columns, but found 3: 1,2,3"
expected_warning = (ParserWarning, DeprecationWarning)
with tm.assert_produces_warning(
expected_warning, match=match_msg, check_stacklevel=False
):
result = parser.read_csv(StringIO(data), on_bad_lines="warn")
tm.assert_frame_equal(result, expected)
def test_read_csv_wrong_num_columns(all_parsers):
    # The header row declares fewer columns than some of the data rows contain.
data = """A,B,C,D,E,F
1,2,3,4,5,6
6,7,8,9,10,11,12
11,12,13,14,15,16
"""
parser = all_parsers
msg = "Expected 6 fields in line 3, saw 7"
if parser.engine == "pyarrow":
# Expected 6 columns, got 7: 6,7,8,9,10,11,12
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data))
def test_null_byte_char(request, all_parsers):
# see gh-2741
data = "\x00,foo"
names = ["a", "b"]
parser = all_parsers
if parser.engine == "c" or (parser.engine == "python" and PY311):
if parser.engine == "python" and PY311:
request.applymarker(
pytest.mark.xfail(
reason="In Python 3.11, this is read as an empty character not null"
)
)
expected = DataFrame([[np.nan, "foo"]], columns=names)
out = parser.read_csv(StringIO(data), names=names)
tm.assert_frame_equal(out, expected)
else:
if parser.engine == "pyarrow":
# CSV parse error: Empty CSV file or block: "
# cannot infer number of columns"
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
else:
msg = "NULL byte detected"
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), names=names)
@pytest.mark.filterwarnings("always::ResourceWarning")
def test_open_file(request, all_parsers):
# GH 39024
parser = all_parsers
msg = "Could not determine delimiter"
err = csv.Error
if parser.engine == "c":
msg = "the 'c' engine does not support sep=None with delim_whitespace=False"
err = ValueError
elif parser.engine == "pyarrow":
msg = (
"the 'pyarrow' engine does not support sep=None with delim_whitespace=False"
)
err = ValueError
with tm.ensure_clean() as path:
file = Path(path)
file.write_bytes(b"\xe4\na\n1")
with tm.assert_produces_warning(None):
# should not trigger a ResourceWarning
with pytest.raises(err, match=msg):
parser.read_csv(file, sep=None, encoding_errors="replace")
def test_invalid_on_bad_line(all_parsers):
parser = all_parsers
data = "a\n1\n1,2,3\n4\n5,6,7"
with pytest.raises(ValueError, match="Argument abc is invalid for on_bad_lines"):
parser.read_csv(StringIO(data), on_bad_lines="abc")
def test_bad_header_uniform_error(all_parsers):
parser = all_parsers
data = "+++123456789...\ncol1,col2,col3,col4\n1,2,3,4\n"
msg = "Expected 2 fields in line 2, saw 4"
if parser.engine == "c":
msg = (
"Could not construct index. Requested to use 1 "
"number of columns, but 3 left to parse."
)
elif parser.engine == "pyarrow":
# "CSV parse error: Expected 1 columns, got 4: col1,col2,col3,col4"
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error")
def test_on_bad_lines_warn_correct_formatting(all_parsers):
# see gh-15925
parser = all_parsers
data = """1,2
a,b
a,b,c
a,b,d
a,b
"""
expected = DataFrame({"1": "a", "2": ["b"] * 2})
match_msg = "Skipping line"
expected_warning = ParserWarning
if parser.engine == "pyarrow":
match_msg = "Expected 2 columns, but found 3: a,b,c"
expected_warning = (ParserWarning, DeprecationWarning)
with tm.assert_produces_warning(
expected_warning, match=match_msg, check_stacklevel=False
):
result = parser.read_csv(StringIO(data), on_bad_lines="warn")
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,81 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO
import pytest
import pandas._testing as tm
depr_msg = "The 'verbose' keyword in pd.read_csv is deprecated"
def test_verbose_read(all_parsers, capsys):
parser = all_parsers
data = """a,b,c,d
one,1,2,3
one,1,2,3
,1,2,3
one,1,2,3
,1,2,3
,1,2,3
one,1,2,3
two,1,2,3"""
if parser.engine == "pyarrow":
msg = "The 'verbose' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), verbose=True)
return
# Engines are verbose in different ways.
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), verbose=True)
captured = capsys.readouterr()
if parser.engine == "c":
assert "Tokenization took:" in captured.out
assert "Parser memory cleanup took:" in captured.out
else: # Python engine
assert captured.out == "Filled 3 NA values in column a\n"
def test_verbose_read2(all_parsers, capsys):
parser = all_parsers
data = """a,b,c,d
one,1,2,3
two,1,2,3
three,1,2,3
four,1,2,3
five,1,2,3
,1,2,3
seven,1,2,3
eight,1,2,3"""
if parser.engine == "pyarrow":
msg = "The 'verbose' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), verbose=True, index_col=0)
return
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), verbose=True, index_col=0)
captured = capsys.readouterr()
# Engines are verbose in different ways.
if parser.engine == "c":
assert "Tokenization took:" in captured.out
assert "Parser memory cleanup took:" in captured.out
else: # Python engine
assert captured.out == "Filled 1 NA values in column a\n"

View File

@ -0,0 +1,337 @@
from __future__ import annotations
import os
import pytest
from pandas.compat import HAS_PYARROW
from pandas.compat._optional import VERSIONS
from pandas import (
read_csv,
read_table,
)
import pandas._testing as tm
class BaseParser:
engine: str | None = None
low_memory = True
float_precision_choices: list[str | None] = []
def update_kwargs(self, kwargs):
kwargs = kwargs.copy()
kwargs.update({"engine": self.engine, "low_memory": self.low_memory})
return kwargs
def read_csv(self, *args, **kwargs):
kwargs = self.update_kwargs(kwargs)
return read_csv(*args, **kwargs)
def read_csv_check_warnings(
self,
warn_type: type[Warning],
warn_msg: str,
*args,
raise_on_extra_warnings=True,
check_stacklevel: bool = True,
**kwargs,
):
# We need to check the stacklevel here instead of in the tests
# since this is where read_csv is called and where the warning
# should point to.
kwargs = self.update_kwargs(kwargs)
with tm.assert_produces_warning(
warn_type,
match=warn_msg,
raise_on_extra_warnings=raise_on_extra_warnings,
check_stacklevel=check_stacklevel,
):
return read_csv(*args, **kwargs)
def read_table(self, *args, **kwargs):
kwargs = self.update_kwargs(kwargs)
return read_table(*args, **kwargs)
def read_table_check_warnings(
self,
warn_type: type[Warning],
warn_msg: str,
*args,
raise_on_extra_warnings=True,
**kwargs,
):
# We need to check the stacklevel here instead of in the tests
# since this is where read_table is called and where the warning
# should point to.
kwargs = self.update_kwargs(kwargs)
with tm.assert_produces_warning(
warn_type, match=warn_msg, raise_on_extra_warnings=raise_on_extra_warnings
):
return read_table(*args, **kwargs)
class CParser(BaseParser):
engine = "c"
float_precision_choices = [None, "high", "round_trip"]
class CParserHighMemory(CParser):
low_memory = False
class CParserLowMemory(CParser):
low_memory = True
class PythonParser(BaseParser):
engine = "python"
float_precision_choices = [None]
class PyArrowParser(BaseParser):
engine = "pyarrow"
float_precision_choices = [None]
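# For example (illustrative): CParserLowMemory().read_csv(buf, dtype=str) ends up
# calling read_csv(buf, dtype=str, engine="c", low_memory=True), because
# update_kwargs merges the class's pinned engine/low_memory into the keyword
# arguments supplied by the test.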
@pytest.fixture
def csv_dir_path(datapath):
"""
The directory path to the data files needed for parser tests.
"""
return datapath("io", "parser", "data")
@pytest.fixture
def csv1(datapath):
"""
The path to the data file "test1.csv" needed for parser tests.
"""
return os.path.join(datapath("io", "data", "csv"), "test1.csv")
_cParserHighMemory = CParserHighMemory
_cParserLowMemory = CParserLowMemory
_pythonParser = PythonParser
_pyarrowParser = PyArrowParser
_py_parsers_only = [_pythonParser]
_c_parsers_only = [_cParserHighMemory, _cParserLowMemory]
_pyarrow_parsers_only = [
pytest.param(
_pyarrowParser,
marks=[
pytest.mark.single_cpu,
pytest.mark.skipif(not HAS_PYARROW, reason="pyarrow is not installed"),
],
)
]
_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only]
_py_parser_ids = ["python"]
_c_parser_ids = ["c_high", "c_low"]
_pyarrow_parsers_ids = ["pyarrow"]
_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parsers_ids]
@pytest.fixture(params=_all_parsers, ids=_all_parser_ids)
def all_parsers(request):
"""
    Fixture for all of the CSV parsers.
"""
parser = request.param()
if parser.engine == "pyarrow":
pytest.importorskip("pyarrow", VERSIONS["pyarrow"])
        # Try finding a way to disable threads altogether
# for more stable CI runs
import pyarrow
pyarrow.set_cpu_count(1)
return parser
@pytest.fixture(params=_c_parsers_only, ids=_c_parser_ids)
def c_parser_only(request):
"""
    Fixture for all of the CSV parsers using the C engine.
"""
return request.param()
@pytest.fixture(params=_py_parsers_only, ids=_py_parser_ids)
def python_parser_only(request):
"""
    Fixture for all of the CSV parsers using the Python engine.
"""
return request.param()
@pytest.fixture(params=_pyarrow_parsers_only, ids=_pyarrow_parsers_ids)
def pyarrow_parser_only(request):
"""
    Fixture for all of the CSV parsers using the PyArrow engine.
"""
return request.param()
def _get_all_parser_float_precision_combinations():
"""
Return all allowable parser and float precision
combinations and corresponding ids.
"""
params = []
ids = []
for parser, parser_id in zip(_all_parsers, _all_parser_ids):
if hasattr(parser, "values"):
# Wrapped in pytest.param, get the actual parser back
parser = parser.values[0]
for precision in parser.float_precision_choices:
# Re-wrap in pytest.param for pyarrow
mark = (
[
pytest.mark.single_cpu,
pytest.mark.skipif(
not HAS_PYARROW, reason="pyarrow is not installed"
),
]
if parser.engine == "pyarrow"
else ()
)
param = pytest.param((parser(), precision), marks=mark)
params.append(param)
ids.append(f"{parser_id}-{precision}")
return {"params": params, "ids": ids}
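# With the parsers defined above this produces ids such as "c_high-None",
# "c_high-high", "c_high-round_trip", "c_low-None", ..., "python-None" and
# "pyarrow-None"; the pyarrow entry keeps its single_cpu/skipif marks through
# pytest.param.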
@pytest.fixture(
params=_get_all_parser_float_precision_combinations()["params"],
ids=_get_all_parser_float_precision_combinations()["ids"],
)
def all_parsers_all_precisions(request):
"""
Fixture for all allowable combinations of parser
and float precision
"""
return request.param
_utf_values = [8, 16, 32]
_encoding_seps = ["", "-", "_"]
_encoding_prefixes = ["utf", "UTF"]
_encoding_fmts = [
f"{prefix}{sep}{{0}}" for sep in _encoding_seps for prefix in _encoding_prefixes
]
@pytest.fixture(params=_utf_values)
def utf_value(request):
"""
Fixture for all possible integer values for a UTF encoding.
"""
return request.param
@pytest.fixture(params=_encoding_fmts)
def encoding_fmt(request):
"""
Fixture for all possible string formats of a UTF encoding.
"""
return request.param
@pytest.fixture(
params=[
("-1,0", -1.0),
("-1,2e0", -1.2),
("-1e0", -1.0),
("+1e0", 1.0),
("+1e+0", 1.0),
("+1e-1", 0.1),
("+,1e1", 1.0),
("+1,e0", 1.0),
("-,1e1", -1.0),
("-1,e0", -1.0),
("0,1", 0.1),
("1,", 1.0),
(",1", 0.1),
("-,1", -0.1),
("1_,", 1.0),
("1_234,56", 1234.56),
("1_234,56e0", 1234.56),
# negative cases; must not parse as float
("_", "_"),
("-_", "-_"),
("-_1", "-_1"),
("-_1e0", "-_1e0"),
("_1", "_1"),
("_1,", "_1,"),
("_1,_", "_1,_"),
("_1e0", "_1e0"),
("1,2e_1", "1,2e_1"),
("1,2e1_0", "1,2e1_0"),
("1,_2", "1,_2"),
(",1__2", ",1__2"),
(",1e", ",1e"),
("-,1e", "-,1e"),
("1_000,000_000", "1_000,000_000"),
("1,e1_2", "1,e1_2"),
("e11,2", "e11,2"),
("1e11,2", "1e11,2"),
("1,2,2", "1,2,2"),
("1,2_1", "1,2_1"),
("1,2e-10e1", "1,2e-10e1"),
("--1,2", "--1,2"),
("1a_2,1", "1a_2,1"),
("1,2E-1", 0.12),
("1,2E1", 12.0),
]
)
def numeric_decimal(request):
"""
    Fixture for numeric formats which should be recognized, using "," as the
    decimal separator and "_" as the thousands separator. The first entry of each
    tuple is the text to read; the second is the expected result (a string result
    marks text that must not parse as a float).
"""
return request.param
@pytest.fixture
def pyarrow_xfail(request):
"""
Fixture that xfails a test if the engine is pyarrow.
    Use if the failure is due to unsupported keywords or inconsistent results.
"""
if "all_parsers" in request.fixturenames:
parser = request.getfixturevalue("all_parsers")
elif "all_parsers_all_precisions" in request.fixturenames:
# Return value is tuple of (engine, precision)
parser = request.getfixturevalue("all_parsers_all_precisions")[0]
else:
return
if parser.engine == "pyarrow":
mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
request.applymarker(mark)
@pytest.fixture
def pyarrow_skip(request):
"""
Fixture that skips a test if the engine is pyarrow.
    Use if the failure is due to a parsing failure from pyarrow.csv.read_csv.
"""
if "all_parsers" in request.fixturenames:
parser = request.getfixturevalue("all_parsers")
elif "all_parsers_all_precisions" in request.fixturenames:
# Return value is tuple of (engine, precision)
parser = request.getfixturevalue("all_parsers_all_precisions")[0]
else:
return
if parser.engine == "pyarrow":
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

View File

@ -0,0 +1,334 @@
"""
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import os
import numpy as np
import pytest
from pandas._libs import parsers as libparsers
from pandas.core.dtypes.dtypes import CategoricalDtype
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Timestamp,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
@pytest.mark.parametrize(
"dtype",
[
"category",
CategoricalDtype(),
{"a": "category", "b": "category", "c": CategoricalDtype()},
],
)
def test_categorical_dtype(all_parsers, dtype):
# see gh-10153
parser = all_parsers
data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
expected = DataFrame(
{
"a": Categorical(["1", "1", "2"]),
"b": Categorical(["a", "a", "b"]),
"c": Categorical(["3.4", "3.4", "4.5"]),
}
)
actual = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(actual, expected)
@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}])
def test_categorical_dtype_single(all_parsers, dtype, request):
# see gh-10153
parser = all_parsers
data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
expected = DataFrame(
{"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]}
)
if parser.engine == "pyarrow":
mark = pytest.mark.xfail(
strict=False,
reason="Flaky test sometimes gives object dtype instead of Categorical",
)
request.applymarker(mark)
actual = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(actual, expected)
@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
def test_categorical_dtype_unsorted(all_parsers):
# see gh-10153
parser = all_parsers
data = """a,b,c
1,b,3.4
1,b,3.4
2,a,4.5"""
expected = DataFrame(
{
"a": Categorical(["1", "1", "2"]),
"b": Categorical(["b", "b", "a"]),
"c": Categorical(["3.4", "3.4", "4.5"]),
}
)
actual = parser.read_csv(StringIO(data), dtype="category")
tm.assert_frame_equal(actual, expected)
@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
def test_categorical_dtype_missing(all_parsers):
# see gh-10153
parser = all_parsers
data = """a,b,c
1,b,3.4
1,nan,3.4
2,a,4.5"""
expected = DataFrame(
{
"a": Categorical(["1", "1", "2"]),
"b": Categorical(["b", np.nan, "a"]),
"c": Categorical(["3.4", "3.4", "4.5"]),
}
)
actual = parser.read_csv(StringIO(data), dtype="category")
tm.assert_frame_equal(actual, expected)
@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
@pytest.mark.slow
def test_categorical_dtype_high_cardinality_numeric(all_parsers, monkeypatch):
# see gh-18186
# was an issue with C parser, due to DEFAULT_BUFFER_HEURISTIC
parser = all_parsers
heuristic = 2**5
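    # Shrink the C parser's DEFAULT_BUFFER_HEURISTIC (patched below) so that even
    # this small 33-row input exercises the chunked-read path behind gh-18186.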
data = np.sort([str(i) for i in range(heuristic + 1)])
expected = DataFrame({"a": Categorical(data, ordered=True)})
with monkeypatch.context() as m:
m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic)
actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category")
actual["a"] = actual["a"].cat.reorder_categories(
np.sort(actual.a.cat.categories), ordered=True
)
tm.assert_frame_equal(actual, expected)
def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
# see gh-10153
pth = os.path.join(csv_dir_path, "utf16_ex.txt")
parser = all_parsers
encoding = "utf-16"
sep = "\t"
expected = parser.read_csv(pth, sep=sep, encoding=encoding)
expected = expected.apply(Categorical)
actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category")
tm.assert_frame_equal(actual, expected)
def test_categorical_dtype_chunksize_infer_categories(all_parsers):
# see gh-10153
parser = all_parsers
data = """a,b
1,a
1,b
1,b
2,c"""
expecteds = [
DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}),
DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]),
]
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), dtype={"b": "category"}, chunksize=2)
return
with parser.read_csv(
StringIO(data), dtype={"b": "category"}, chunksize=2
) as actuals:
for actual, expected in zip(actuals, expecteds):
tm.assert_frame_equal(actual, expected)
def test_categorical_dtype_chunksize_explicit_categories(all_parsers):
# see gh-10153
parser = all_parsers
data = """a,b
1,a
1,b
1,b
2,c"""
cats = ["a", "b", "c"]
expecteds = [
DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}),
DataFrame(
{"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)},
index=[2, 3],
),
]
dtype = CategoricalDtype(cats)
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2)
return
with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals:
for actual, expected in zip(actuals, expecteds):
tm.assert_frame_equal(actual, expected)
def test_categorical_dtype_latin1(all_parsers, csv_dir_path):
# see gh-10153
pth = os.path.join(csv_dir_path, "unicode_series.csv")
parser = all_parsers
encoding = "latin-1"
expected = parser.read_csv(pth, header=None, encoding=encoding)
expected[1] = Categorical(expected[1])
actual = parser.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"})
tm.assert_frame_equal(actual, expected)
@pytest.mark.parametrize("ordered", [False, True])
@pytest.mark.parametrize(
"categories",
[["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]],
)
def test_categorical_category_dtype(all_parsers, categories, ordered):
parser = all_parsers
data = """a,b
1,a
1,b
1,b
2,c"""
expected = DataFrame(
{
"a": [1, 1, 1, 2],
"b": Categorical(
["a", "b", "b", "c"], categories=categories, ordered=ordered
),
}
)
dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)}
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
def test_categorical_category_dtype_unsorted(all_parsers):
parser = all_parsers
data = """a,b
1,a
1,b
1,b
2,c"""
dtype = CategoricalDtype(["c", "b", "a"])
expected = DataFrame(
{
"a": [1, 1, 1, 2],
"b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]),
}
)
result = parser.read_csv(StringIO(data), dtype={"b": dtype})
tm.assert_frame_equal(result, expected)
def test_categorical_coerces_numeric(all_parsers):
parser = all_parsers
dtype = {"b": CategoricalDtype([1, 2, 3])}
data = "b\n1\n1\n2\n3"
expected = DataFrame({"b": Categorical([1, 1, 2, 3])})
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
def test_categorical_coerces_datetime(all_parsers):
parser = all_parsers
dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None)
dtype = {"b": CategoricalDtype(dti)}
data = "b\n2017-01-01\n2018-01-01\n2019-01-01"
expected = DataFrame({"b": Categorical(dtype["b"].categories)})
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
def test_categorical_coerces_timestamp(all_parsers):
parser = all_parsers
dtype = {"b": CategoricalDtype([Timestamp("2014")])}
data = "b\n2014-01-01\n2014-01-01"
expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)})
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
def test_categorical_coerces_timedelta(all_parsers):
parser = all_parsers
dtype = {"b": CategoricalDtype(pd.to_timedelta(["1h", "2h", "3h"]))}
data = "b\n1h\n2h\n3h"
expected = DataFrame({"b": Categorical(dtype["b"].categories)})
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data",
[
"b\nTrue\nFalse\nNA\nFalse",
"b\ntrue\nfalse\nNA\nfalse",
"b\nTRUE\nFALSE\nNA\nFALSE",
"b\nTrue\nFalse\nNA\nFALSE",
],
)
def test_categorical_dtype_coerces_boolean(all_parsers, data):
# see gh-20498
parser = all_parsers
dtype = {"b": CategoricalDtype([False, True])}
expected = DataFrame({"b": Categorical([True, False, None, False])})
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
def test_categorical_unexpected_categories(all_parsers):
parser = all_parsers
dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])}
data = "b\nd\na\nc\nd" # Unexpected c
expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])})
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,644 @@
"""
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""
from collections import defaultdict
from io import StringIO
import numpy as np
import pytest
from pandas.errors import ParserWarning
import pandas as pd
from pandas import (
DataFrame,
Timestamp,
)
import pandas._testing as tm
from pandas.core.arrays import IntegerArray
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
@pytest.mark.parametrize("dtype", [str, object])
@pytest.mark.parametrize("check_orig", [True, False])
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string):
# see gh-3795, gh-6607
parser = all_parsers
df = DataFrame(
np.random.default_rng(2).random((5, 2)).round(4),
columns=list("AB"),
index=["1A", "1B", "1C", "1D", "1E"],
)
with tm.ensure_clean("__passing_str_as_dtype__.csv") as path:
df.to_csv(path)
result = parser.read_csv(path, dtype=dtype, index_col=0)
if check_orig:
expected = df.copy()
result = result.astype(float)
elif using_infer_string and dtype is str:
expected = df.astype(str)
else:
expected = df.astype(str).astype(object)
tm.assert_frame_equal(result, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_per_column(all_parsers):
parser = all_parsers
data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""
expected = DataFrame(
[[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"]
)
expected["one"] = expected["one"].astype(np.float64)
result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str})
tm.assert_frame_equal(result, expected)
def test_invalid_dtype_per_column(all_parsers):
parser = all_parsers
data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""
with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"):
parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"})
def test_raise_on_passed_int_dtype_with_nas(all_parsers):
# see gh-2631
parser = all_parsers
data = """YEAR, DOY, a
2001,106380451,10
2001,,11
2001,106380451,67"""
if parser.engine == "c":
msg = "Integer column has NA values"
elif parser.engine == "pyarrow":
msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
else:
msg = "Unable to convert column DOY"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True)
def test_dtype_with_converters(all_parsers):
parser = all_parsers
data = """a,b
1.1,2.2
1.2,2.3"""
if parser.engine == "pyarrow":
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)}
)
return
    # Dtype spec is ignored if a converter is specified.
result = parser.read_csv_check_warnings(
ParserWarning,
"Both a converter and dtype were specified for column a "
"- only the converter will be used.",
StringIO(data),
dtype={"a": "i8"},
converters={"a": lambda x: str(x)},
)
expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"])
)
def test_numeric_dtype(all_parsers, dtype):
data = "0\n1"
parser = all_parsers
expected = DataFrame([0, 1], dtype=dtype)
result = parser.read_csv(StringIO(data), header=None, dtype=dtype)
tm.assert_frame_equal(expected, result)
@pytest.mark.usefixtures("pyarrow_xfail")
def test_boolean_dtype(all_parsers):
parser = all_parsers
data = "\n".join(
[
"a",
"True",
"TRUE",
"true",
"1",
"1.0",
"False",
"FALSE",
"false",
"0",
"0.0",
"NaN",
"nan",
"NA",
"null",
"NULL",
]
)
result = parser.read_csv(StringIO(data), dtype="boolean")
expected = DataFrame(
{
"a": pd.array(
[
True,
True,
True,
True,
True,
False,
False,
False,
False,
False,
None,
None,
None,
None,
None,
],
dtype="boolean",
)
}
)
tm.assert_frame_equal(result, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
def test_delimiter_with_usecols_and_parse_dates(all_parsers):
# GH#35873
result = all_parsers.read_csv(
StringIO('"dump","-9,1","-9,1",20101010'),
engine="python",
names=["col", "col1", "col2", "col3"],
usecols=["col1", "col2", "col3"],
parse_dates=["col3"],
decimal=",",
)
expected = DataFrame(
{"col1": [-9.1], "col2": [-9.1], "col3": [Timestamp("2010-10-10")]}
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("thousands", ["_", None])
def test_decimal_and_exponential(
request, python_parser_only, numeric_decimal, thousands
):
# GH#31920
decimal_number_check(request, python_parser_only, numeric_decimal, thousands, None)
@pytest.mark.parametrize("thousands", ["_", None])
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
def test_1000_sep_decimal_float_precision(
request, c_parser_only, numeric_decimal, float_precision, thousands
):
    # Test decimal and thousands separator handling across the
    # 'float_precision' options of the C parser.
decimal_number_check(
request, c_parser_only, numeric_decimal, thousands, float_precision
)
text, value = numeric_decimal
text = " " + text + " "
if isinstance(value, str): # the negative cases (parse as text)
value = " " + value + " "
decimal_number_check(
request, c_parser_only, (text, value), thousands, float_precision
)
def decimal_number_check(request, parser, numeric_decimal, thousands, float_precision):
# GH#31920
value = numeric_decimal[0]
if thousands is None and value in ("1_,", "1_234,56", "1_234,56e0"):
request.applymarker(
pytest.mark.xfail(reason=f"thousands={thousands} and sep is in {value}")
)
df = parser.read_csv(
StringIO(value),
float_precision=float_precision,
sep="|",
thousands=thousands,
decimal=",",
header=None,
)
val = df.iloc[0, 0]
assert val == numeric_decimal[1]
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
def test_skip_whitespace(c_parser_only, float_precision):
DATA = """id\tnum\t
1\t1.2 \t
1\t 2.1\t
2\t 1\t
2\t 1.2 \t
"""
df = c_parser_only.read_csv(
StringIO(DATA),
float_precision=float_precision,
sep="\t",
header=0,
dtype={1: np.float64},
)
tm.assert_series_equal(df.iloc[:, 1], pd.Series([1.2, 2.1, 1.0, 1.2], name="num"))
@pytest.mark.usefixtures("pyarrow_xfail")
def test_true_values_cast_to_bool(all_parsers):
# GH#34655
text = """a,b
yes,xxx
no,yyy
1,zzz
0,aaa
"""
parser = all_parsers
result = parser.read_csv(
StringIO(text),
true_values=["yes"],
false_values=["no"],
dtype={"a": "boolean"},
)
expected = DataFrame(
{"a": [True, False, True, False], "b": ["xxx", "yyy", "zzz", "aaa"]}
)
expected["a"] = expected["a"].astype("boolean")
tm.assert_frame_equal(result, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
@pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)])
def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
# GH#35211
parser = all_parsers
data = """a,a\n1,1"""
dtype_dict = {"a": str, **dtypes}
# GH#42462
dtype_dict_copy = dtype_dict.copy()
result = parser.read_csv(StringIO(data), dtype=dtype_dict)
expected = DataFrame({"a": ["1"], "a.1": [exp_value]})
assert dtype_dict == dtype_dict_copy, "dtype dict changed"
tm.assert_frame_equal(result, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_mangle_dup_cols_single_dtype(all_parsers):
# GH#42022
parser = all_parsers
data = """a,a\n1,1"""
result = parser.read_csv(StringIO(data), dtype=str)
expected = DataFrame({"a": ["1"], "a.1": ["1"]})
tm.assert_frame_equal(result, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_multi_index(all_parsers):
# GH 42446
parser = all_parsers
data = "A,B,B\nX,Y,Z\n1,2,3"
result = parser.read_csv(
StringIO(data),
header=list(range(2)),
dtype={
("A", "X"): np.int32,
("B", "Y"): np.int32,
("B", "Z"): np.float32,
},
)
expected = DataFrame(
{
("A", "X"): np.int32([1]),
("B", "Y"): np.int32([2]),
("B", "Z"): np.float32([3]),
}
)
tm.assert_frame_equal(result, expected)
def test_nullable_int_dtype(all_parsers, any_int_ea_dtype):
# GH 25472
parser = all_parsers
dtype = any_int_ea_dtype
data = """a,b,c
,3,5
1,,6
2,4,"""
expected = DataFrame(
{
"a": pd.array([pd.NA, 1, 2], dtype=dtype),
"b": pd.array([3, pd.NA, 4], dtype=dtype),
"c": pd.array([5, 6, pd.NA], dtype=dtype),
}
)
actual = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(actual, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
@pytest.mark.parametrize("default", ["float", "float64"])
def test_dtypes_defaultdict(all_parsers, default):
# GH#41574
data = """a,b
1,2
"""
dtype = defaultdict(lambda: default, a="int64")
parser = all_parsers
result = parser.read_csv(StringIO(data), dtype=dtype)
expected = DataFrame({"a": [1], "b": 2.0})
tm.assert_frame_equal(result, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtypes_defaultdict_mangle_dup_cols(all_parsers):
# GH#41574
data = """a,b,a,b,b.1
1,2,3,4,5
"""
dtype = defaultdict(lambda: "float64", a="int64")
dtype["b.1"] = "int64"
parser = all_parsers
result = parser.read_csv(StringIO(data), dtype=dtype)
expected = DataFrame({"a": [1], "b": [2.0], "a.1": [3], "b.2": [4.0], "b.1": [5]})
tm.assert_frame_equal(result, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtypes_defaultdict_invalid(all_parsers):
# GH#41574
data = """a,b
1,2
"""
dtype = defaultdict(lambda: "invalid_dtype", a="int64")
parser = all_parsers
with pytest.raises(TypeError, match="not understood"):
parser.read_csv(StringIO(data), dtype=dtype)
def test_dtype_backend(all_parsers):
# GH#36712
parser = all_parsers
data = """a,b,c,d,e,f,g,h,i,j
1,2.5,True,a,,,,,12-31-2019,
3,4.5,False,b,6,7.5,True,a,12-31-2019,
"""
result = parser.read_csv(
StringIO(data), dtype_backend="numpy_nullable", parse_dates=["i"]
)
expected = DataFrame(
{
"a": pd.Series([1, 3], dtype="Int64"),
"b": pd.Series([2.5, 4.5], dtype="Float64"),
"c": pd.Series([True, False], dtype="boolean"),
"d": pd.Series(["a", "b"], dtype="string"),
"e": pd.Series([pd.NA, 6], dtype="Int64"),
"f": pd.Series([pd.NA, 7.5], dtype="Float64"),
"g": pd.Series([pd.NA, True], dtype="boolean"),
"h": pd.Series([pd.NA, "a"], dtype="string"),
"i": pd.Series([Timestamp("2019-12-31")] * 2),
"j": pd.Series([pd.NA, pd.NA], dtype="Int64"),
}
)
tm.assert_frame_equal(result, expected)
def test_dtype_backend_and_dtype(all_parsers):
# GH#36712
parser = all_parsers
data = """a,b
1,2.5
,
"""
result = parser.read_csv(
StringIO(data), dtype_backend="numpy_nullable", dtype="float64"
)
expected = DataFrame({"a": [1.0, np.nan], "b": [2.5, np.nan]})
tm.assert_frame_equal(result, expected)
def test_dtype_backend_string(all_parsers, string_storage):
# GH#36712
with pd.option_context("mode.string_storage", string_storage):
parser = all_parsers
data = """a,b
a,x
b,
"""
result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable")
expected = DataFrame(
{
"a": pd.array(["a", "b"], dtype=pd.StringDtype(string_storage)),
"b": pd.array(["x", pd.NA], dtype=pd.StringDtype(string_storage)),
},
)
tm.assert_frame_equal(result, expected)
def test_dtype_backend_ea_dtype_specified(all_parsers):
# GH#491496
data = """a,b
1,2
"""
parser = all_parsers
result = parser.read_csv(
StringIO(data), dtype="Int64", dtype_backend="numpy_nullable"
)
expected = DataFrame({"a": [1], "b": 2}, dtype="Int64")
tm.assert_frame_equal(result, expected)
def test_dtype_backend_pyarrow(all_parsers, request):
# GH#36712
pa = pytest.importorskip("pyarrow")
parser = all_parsers
data = """a,b,c,d,e,f,g,h,i,j
1,2.5,True,a,,,,,12-31-2019,
3,4.5,False,b,6,7.5,True,a,12-31-2019,
"""
result = parser.read_csv(StringIO(data), dtype_backend="pyarrow", parse_dates=["i"])
expected = DataFrame(
{
"a": pd.Series([1, 3], dtype="int64[pyarrow]"),
"b": pd.Series([2.5, 4.5], dtype="float64[pyarrow]"),
"c": pd.Series([True, False], dtype="bool[pyarrow]"),
"d": pd.Series(["a", "b"], dtype=pd.ArrowDtype(pa.string())),
"e": pd.Series([pd.NA, 6], dtype="int64[pyarrow]"),
"f": pd.Series([pd.NA, 7.5], dtype="float64[pyarrow]"),
"g": pd.Series([pd.NA, True], dtype="bool[pyarrow]"),
"h": pd.Series(
[pd.NA, "a"],
dtype=pd.ArrowDtype(pa.string()),
),
"i": pd.Series([Timestamp("2019-12-31")] * 2),
"j": pd.Series([pd.NA, pd.NA], dtype="null[pyarrow]"),
}
)
tm.assert_frame_equal(result, expected)
# pyarrow engine failing:
# https://github.com/pandas-dev/pandas/issues/56136
@pytest.mark.usefixtures("pyarrow_xfail")
def test_ea_int_avoid_overflow(all_parsers):
# GH#32134
parser = all_parsers
data = """a,b
1,1
,1
1582218195625938945,1
"""
result = parser.read_csv(StringIO(data), dtype={"a": "Int64"})
expected = DataFrame(
{
"a": IntegerArray(
np.array([1, 1, 1582218195625938945]), np.array([False, True, False])
),
"b": 1,
}
)
tm.assert_frame_equal(result, expected)
def test_string_inference(all_parsers):
# GH#54430
dtype = pd.StringDtype(na_value=np.nan)
data = """a,b
x,1
y,2
,3"""
parser = all_parsers
with pd.option_context("future.infer_string", True):
result = parser.read_csv(StringIO(data))
expected = DataFrame(
{"a": pd.Series(["x", "y", None], dtype=dtype), "b": [1, 2, 3]},
columns=pd.Index(["a", "b"], dtype=dtype),
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_])
def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string):
# GH#56047
data = """a,b
x,a
y,a
z,a"""
parser = all_parsers
with pd.option_context("future.infer_string", True):
result = parser.read_csv(StringIO(data), dtype=dtype)
expected_dtype = pd.StringDtype(na_value=np.nan) if dtype is str else object
expected = DataFrame(
{
"a": pd.Series(["x", "y", "z"], dtype=expected_dtype),
"b": pd.Series(["a", "a", "a"], dtype=expected_dtype),
},
columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
)
tm.assert_frame_equal(result, expected)
with pd.option_context("future.infer_string", True):
result = parser.read_csv(StringIO(data), dtype={"a": dtype})
expected = DataFrame(
{
"a": pd.Series(["x", "y", "z"], dtype=expected_dtype),
"b": pd.Series(["a", "a", "a"], dtype=pd.StringDtype(na_value=np.nan)),
},
columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_accurate_parsing_of_large_integers(all_parsers):
# GH#52505
data = """SYMBOL,MOMENT,ID,ID_DEAL
AAPL,20230301181139587,1925036343869802844,
AAPL,20230301181139587,2023552585717889863,2023552585717263358
NVDA,20230301181139587,2023552585717889863,2023552585717263359
AMC,20230301181139587,2023552585717889863,2023552585717263360
AMZN,20230301181139587,2023552585717889759,2023552585717263360
MSFT,20230301181139587,2023552585717889863,2023552585717263361
NVDA,20230301181139587,2023552585717889827,2023552585717263361"""
orders = all_parsers.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()})
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263361, "ID_DEAL"]) == 2
def test_dtypes_with_usecols(all_parsers):
# GH#54868
parser = all_parsers
data = """a,b,c
1,2,3
4,5,6"""
result = parser.read_csv(StringIO(data), usecols=["a", "c"], dtype={"a": object})
if parser.engine == "pyarrow":
values = [1, 4]
else:
values = ["1", "4"]
expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]})
tm.assert_frame_equal(result, expected)
def test_index_col_with_dtype_no_rangeindex(all_parsers):
data = StringIO("345.5,519.5,0\n519.5,726.5,1")
result = all_parsers.read_csv(
data,
header=None,
names=["start", "stop", "bin_id"],
dtype={"start": np.float32, "stop": np.float32, "bin_id": np.uint32},
index_col="bin_id",
).index
expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id")
tm.assert_index_equal(result, expected)

View File

@ -0,0 +1,181 @@
"""
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import numpy as np
import pytest
from pandas import (
Categorical,
DataFrame,
Index,
MultiIndex,
Series,
concat,
)
import pandas._testing as tm
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@skip_pyarrow # CSV parse error: Empty CSV file or block
def test_dtype_all_columns_empty(all_parsers):
# see gh-12048
parser = all_parsers
result = parser.read_csv(StringIO("A,B"), dtype=str)
expected = DataFrame({"A": [], "B": []}, dtype=str)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block
def test_empty_pass_dtype(all_parsers):
parser = all_parsers
data = "one,two"
result = parser.read_csv(StringIO(data), dtype={"one": "u1"})
expected = DataFrame(
{"one": np.empty(0, dtype="u1"), "two": np.empty(0, dtype=object)},
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block
def test_empty_with_index_pass_dtype(all_parsers):
parser = all_parsers
data = "one,two"
result = parser.read_csv(
StringIO(data), index_col=["one"], dtype={"one": "u1", 1: "f"}
)
expected = DataFrame(
{"two": np.empty(0, dtype="f")}, index=Index([], dtype="u1", name="one")
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block
def test_empty_with_multi_index_pass_dtype(all_parsers):
parser = all_parsers
data = "one,two,three"
result = parser.read_csv(
StringIO(data), index_col=["one", "two"], dtype={"one": "u1", 1: "f8"}
)
exp_idx = MultiIndex.from_arrays(
[np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)],
names=["one", "two"],
)
expected = DataFrame({"three": np.empty(0, dtype=object)}, index=exp_idx)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block
def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers):
parser = all_parsers
data = "one,one"
result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"})
expected = DataFrame(
{"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")},
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block
def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers):
parser = all_parsers
data = "one,one"
result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"})
expected = DataFrame(
{"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")},
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block
def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers):
# see gh-9424
parser = all_parsers
expected = concat(
[Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")],
axis=1,
)
data = "one,one"
result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"})
tm.assert_frame_equal(result, expected)
def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers):
# see gh-9424
parser = all_parsers
expected = concat(
[Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")],
axis=1,
)
expected.index = expected.index.astype(object)
with pytest.raises(ValueError, match="Duplicate names"):
data = ""
parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"})
@pytest.mark.parametrize(
"dtype,expected",
[
(np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)),
(
"category",
DataFrame({"a": Categorical([]), "b": Categorical([])}),
),
(
{"a": "category", "b": "category"},
DataFrame({"a": Categorical([]), "b": Categorical([])}),
),
("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")),
(
"timedelta64[ns]",
DataFrame(
{
"a": Series([], dtype="timedelta64[ns]"),
"b": Series([], dtype="timedelta64[ns]"),
},
),
),
(
{"a": np.int64, "b": np.int32},
DataFrame(
{"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
),
),
(
{0: np.int64, 1: np.int32},
DataFrame(
{"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
),
),
(
{"a": np.int64, 1: np.int32},
DataFrame(
{"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
),
),
],
)
@skip_pyarrow # CSV parse error: Empty CSV file or block
def test_empty_dtype(all_parsers, dtype, expected):
# see gh-14712
parser = all_parsers
data = "a,b"
result = parser.read_csv(StringIO(data), header=0, dtype=dtype)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,647 @@
"""
Tests that apply specifically to the CParser. Unless specifically stated
as a CParser-specific issue, the goal is to eventually move as many of
these tests as possible out of this module once the Python parser can accept
further arguments when parsing.
"""
from decimal import Decimal
from io import (
BytesIO,
StringIO,
TextIOWrapper,
)
import mmap
import os
import tarfile
import numpy as np
import pytest
from pandas.compat.numpy import np_version_gte1p24
from pandas.errors import (
ParserError,
ParserWarning,
)
import pandas.util._test_decorators as td
from pandas import (
DataFrame,
concat,
)
import pandas._testing as tm
@pytest.mark.parametrize(
"malformed",
["1\r1\r1\r 1\r 1\r", "1\r1\r1\r 1\r 1\r11\r", "1\r1\r1\r 1\r 1\r11\r1\r"],
ids=["words pointer", "stream pointer", "lines pointer"],
)
def test_buffer_overflow(c_parser_only, malformed):
# see gh-9205: test certain malformed input files that cause
# buffer overflows in tokenizer.c
msg = "Buffer overflow caught - possible malformed input file."
parser = c_parser_only
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(malformed))
def test_delim_whitespace_custom_terminator(c_parser_only):
# See gh-12912
data = "a b c~1 2 3~4 5 6~7 8 9"
parser = c_parser_only
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True)
expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"])
tm.assert_frame_equal(df, expected)
def test_dtype_and_names_error(c_parser_only):
# see gh-8833: passing both dtype and names
# resulting in an error reporting issue
parser = c_parser_only
data = """
1.0 1
2.0 2
3.0 3
"""
# base cases
result = parser.read_csv(StringIO(data), sep=r"\s+", header=None)
expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]])
tm.assert_frame_equal(result, expected)
result = parser.read_csv(StringIO(data), sep=r"\s+", header=None, names=["a", "b"])
expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
# fallback casting
result = parser.read_csv(
StringIO(data), sep=r"\s+", header=None, names=["a", "b"], dtype={"a": np.int32}
)
expected = DataFrame([[1, 1], [2, 2], [3, 3]], columns=["a", "b"])
expected["a"] = expected["a"].astype(np.int32)
tm.assert_frame_equal(result, expected)
data = """
1.0 1
nan 2
3.0 3
"""
# fallback casting, but not castable
warning = RuntimeWarning if np_version_gte1p24 else None
with pytest.raises(ValueError, match="cannot safely convert"):
with tm.assert_produces_warning(warning, check_stacklevel=False):
parser.read_csv(
StringIO(data),
sep=r"\s+",
header=None,
names=["a", "b"],
dtype={"a": np.int32},
)
@pytest.mark.parametrize(
"match,kwargs",
[
# For each of these cases, all of the dtypes are valid, just unsupported.
(
(
"the dtype datetime64 is not supported for parsing, "
"pass this column using parse_dates instead"
),
{"dtype": {"A": "datetime64", "B": "float64"}},
),
(
(
"the dtype datetime64 is not supported for parsing, "
"pass this column using parse_dates instead"
),
{"dtype": {"A": "datetime64", "B": "float64"}, "parse_dates": ["B"]},
),
(
"the dtype timedelta64 is not supported for parsing",
{"dtype": {"A": "timedelta64", "B": "float64"}},
),
(
f"the dtype {tm.ENDIAN}U8 is not supported for parsing",
{"dtype": {"A": "U8"}},
),
],
ids=["dt64-0", "dt64-1", "td64", f"{tm.ENDIAN}U8"],
)
def test_unsupported_dtype(c_parser_only, match, kwargs):
parser = c_parser_only
df = DataFrame(
np.random.default_rng(2).random((5, 2)),
columns=list("AB"),
index=["1A", "1B", "1C", "1D", "1E"],
)
with tm.ensure_clean("__unsupported_dtype__.csv") as path:
df.to_csv(path)
with pytest.raises(TypeError, match=match):
parser.read_csv(path, index_col=0, **kwargs)
@td.skip_if_32bit
@pytest.mark.slow
# test numbers between 1 and 2
@pytest.mark.parametrize("num", np.linspace(1.0, 2.0, num=21))
def test_precise_conversion(c_parser_only, num):
parser = c_parser_only
normal_errors = []
precise_errors = []
def error(val: float, actual_val: Decimal) -> Decimal:
return abs(Decimal(f"{val:.100}") - actual_val)
# 25 decimal digits of precision
text = f"a\n{num:.25}"
normal_val = float(
parser.read_csv(StringIO(text), float_precision="legacy")["a"][0]
)
precise_val = float(parser.read_csv(StringIO(text), float_precision="high")["a"][0])
roundtrip_val = float(
parser.read_csv(StringIO(text), float_precision="round_trip")["a"][0]
)
actual_val = Decimal(text[2:])
normal_errors.append(error(normal_val, actual_val))
precise_errors.append(error(precise_val, actual_val))
# round-trip should match float()
assert roundtrip_val == float(text[2:])
assert sum(precise_errors) <= sum(normal_errors)
assert max(precise_errors) <= max(normal_errors)
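# Standalone sketch (assumes the default C engine): how the float_precision
# options compare in practice. "round_trip" defers to Python's own float
# conversion, so it reproduces the literal exactly, while "legacy" uses the
# older, lower-precision converter and may differ in the last bits.
from io import StringIO
import pandas as pd

text = "a\n1.2345678901234567\n"
rt_val = pd.read_csv(StringIO(text), float_precision="round_trip")["a"][0]
assert rt_val == float("1.2345678901234567")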
def test_usecols_dtypes(c_parser_only, using_infer_string):
parser = c_parser_only
data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
result = parser.read_csv(
StringIO(data),
usecols=(0, 1, 2),
names=("a", "b", "c"),
header=None,
converters={"a": str},
dtype={"b": int, "c": float},
)
result2 = parser.read_csv(
StringIO(data),
usecols=(0, 2),
names=("a", "b", "c"),
header=None,
converters={"a": str},
dtype={"b": int, "c": float},
)
if using_infer_string:
assert (result.dtypes == ["string", int, float]).all()
assert (result2.dtypes == ["string", float]).all()
else:
assert (result.dtypes == [object, int, float]).all()
assert (result2.dtypes == [object, float]).all()
def test_disable_bool_parsing(c_parser_only):
# see gh-2090
parser = c_parser_only
data = """A,B,C
Yes,No,Yes
No,Yes,Yes
Yes,,Yes
No,No,No"""
result = parser.read_csv(StringIO(data), dtype=object)
assert (result.dtypes == object).all()
result = parser.read_csv(StringIO(data), dtype=object, na_filter=False)
assert result["B"][2] == ""
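# Standalone sketch of the na_filter behaviour exercised above: by default
# an empty field becomes NaN, while na_filter=False disables NA detection
# so the empty string survives unchanged.
from io import StringIO
import pandas as pd

data = "A,B\nYes,\n"
assert pd.isna(pd.read_csv(StringIO(data))["B"][0])
assert pd.read_csv(StringIO(data), na_filter=False)["B"][0] == ""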
def test_custom_lineterminator(c_parser_only):
parser = c_parser_only
data = "a,b,c~1,2,3~4,5,6"
result = parser.read_csv(StringIO(data), lineterminator="~")
expected = parser.read_csv(StringIO(data.replace("~", "\n")))
tm.assert_frame_equal(result, expected)
def test_parse_ragged_csv(c_parser_only):
parser = c_parser_only
data = """1,2,3
1,2,3,4
1,2,3,4,5
1,2
1,2,3,4"""
nice_data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
result = parser.read_csv(
StringIO(data), header=None, names=["a", "b", "c", "d", "e"]
)
expected = parser.read_csv(
StringIO(nice_data), header=None, names=["a", "b", "c", "d", "e"]
)
tm.assert_frame_equal(result, expected)
# too many columns; this would cause a segfault if not handled carefully
data = "1,2\n3,4,5"
result = parser.read_csv(StringIO(data), header=None, names=range(50))
expected = parser.read_csv(StringIO(data), header=None, names=range(3)).reindex(
columns=range(50)
)
tm.assert_frame_equal(result, expected)
def test_tokenize_CR_with_quoting(c_parser_only):
# see gh-3453
parser = c_parser_only
data = ' a,b,c\r"a,b","e,d","f,f"'
result = parser.read_csv(StringIO(data), header=None)
expected = parser.read_csv(StringIO(data.replace("\r", "\n")), header=None)
tm.assert_frame_equal(result, expected)
result = parser.read_csv(StringIO(data))
expected = parser.read_csv(StringIO(data.replace("\r", "\n")))
tm.assert_frame_equal(result, expected)
@pytest.mark.slow
@pytest.mark.parametrize("count", [3 * 2**n for n in range(6)])
def test_grow_boundary_at_cap(c_parser_only, count):
# See gh-12494
#
# Cause of error was that the C parser
# was not increasing the buffer size when
# the desired space would fill the buffer
# to capacity, which would later cause a
# buffer overflow error when checking the
# EOF terminator of the CSV stream.
# 3 * 2^n commas were observed to break the parser
parser = c_parser_only
with StringIO("," * count) as s:
expected = DataFrame(columns=[f"Unnamed: {i}" for i in range(count + 1)])
df = parser.read_csv(s)
tm.assert_frame_equal(df, expected)
@pytest.mark.slow
@pytest.mark.parametrize("encoding", [None, "utf-8"])
def test_parse_trim_buffers(c_parser_only, encoding):
# This test is part of a bugfix for gh-13703. It attempts to
# stress the system memory allocator, to cause it to move the
# stream buffer and either let the OS reclaim the region, or let
# other memory requests of the parser otherwise modify the contents
# of the memory space where it was formerly located.
# This test is designed to cause a `segfault` with unpatched
# `tokenizer.c`. Sometimes the test fails on `segfault`, other
# times it fails due to memory corruption, which causes the
# loaded DataFrame to differ from the expected one.
# Also force 'utf-8' encoding, so that `_string_convert` would take
# a different execution branch.
parser = c_parser_only
# Generate a large mixed-type CSV file on-the-fly (one record is
# approx 1.5KiB).
record_ = (
"""9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z"""
"""ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,"""
"""ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9"""
"""99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,"""
"""9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9."""
"""99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999."""
"""99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ"""
"""ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ"""
"""ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z"""
"""ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,"""
"""9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,"""
"""999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,"""
""",,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999"""
""",9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9."""
"""999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,"""
""",9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z"""
"""ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ"""
""",999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99"""
""",,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-"""
"""9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9"""
""".99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,"""
""",,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9."""
"""99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ"""
"""ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ"""
"""-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ"""
"""ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ"""
""",9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99"""
""",99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9"""
""".99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""
)
# Set the number of lines so that a call to `parser_trim_buffers`
# is triggered: after a couple of full chunks are consumed a
# relatively small 'residual' chunk would cause reallocation
# within the parser.
chunksize, n_lines = 128, 2 * 128 + 15
csv_data = "\n".join([record_] * n_lines) + "\n"
# We will use StringIO to load the CSV from this text buffer.
# pd.read_csv() will iterate over the file in chunks and will
# finally read a residual chunk of really small size.
# Generate the expected output: manually create the dataframe
# by splitting by comma and repeating the `n_lines` times.
row = tuple(val_ if val_ else np.nan for val_ in record_.split(","))
expected = DataFrame(
[row for _ in range(n_lines)], dtype=object, columns=None, index=None
)
# Iterate over the CSV file in chunks of `chunksize` lines
with parser.read_csv(
StringIO(csv_data),
header=None,
dtype=object,
chunksize=chunksize,
encoding=encoding,
) as chunks_:
result = concat(chunks_, axis=0, ignore_index=True)
# Check for data corruption if there was no segfault
tm.assert_frame_equal(result, expected)
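# Standalone sketch of the chunked-read pattern used above: passing
# chunksize makes read_csv return a TextFileReader that works as a context
# manager and yields DataFrame chunks, which concat re-assembles.
from io import StringIO
import pandas as pd

csv_text = "a,b\n" + "1,2\n" * 10
with pd.read_csv(StringIO(csv_text), chunksize=4) as reader:
    out = pd.concat(reader, ignore_index=True)
assert len(out) == 10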
def test_internal_null_byte(c_parser_only):
# see gh-14012
#
# The null byte ('\x00') should not be used as a
# true line terminator, escape character, or comment
# character, only as a placeholder to indicate that
# none was specified.
#
# This test should be moved to test_common.py ONLY when
# Python's csv class supports parsing '\x00'.
parser = c_parser_only
names = ["a", "b", "c"]
data = "1,2,3\n4,\x00,6\n7,8,9"
expected = DataFrame([[1, 2.0, 3], [4, np.nan, 6], [7, 8, 9]], columns=names)
result = parser.read_csv(StringIO(data), names=names)
tm.assert_frame_equal(result, expected)
def test_read_nrows_large(c_parser_only):
# gh-7626 - read in only nrows of data for large inputs (>262144 bytes)
parser = c_parser_only
header_narrow = "\t".join(["COL_HEADER_" + str(i) for i in range(10)]) + "\n"
data_narrow = "\t".join(["somedatasomedatasomedata1" for _ in range(10)]) + "\n"
header_wide = "\t".join(["COL_HEADER_" + str(i) for i in range(15)]) + "\n"
data_wide = "\t".join(["somedatasomedatasomedata2" for _ in range(15)]) + "\n"
test_input = header_narrow + data_narrow * 1050 + header_wide + data_wide * 2
df = parser.read_csv(StringIO(test_input), sep="\t", nrows=1010)
assert df.size == 1010 * 10
def test_float_precision_round_trip_with_text(c_parser_only):
# see gh-15140
parser = c_parser_only
df = parser.read_csv(StringIO("a"), header=None, float_precision="round_trip")
tm.assert_frame_equal(df, DataFrame({0: ["a"]}))
def test_large_difference_in_columns(c_parser_only):
# see gh-14125
parser = c_parser_only
count = 10000
large_row = ("X," * count)[:-1] + "\n"
normal_row = "XXXXXX XXXXXX,111111111111111\n"
test_input = (large_row + normal_row * 6)[:-1]
result = parser.read_csv(StringIO(test_input), header=None, usecols=[0])
rows = test_input.split("\n")
expected = DataFrame([row.split(",")[0] for row in rows])
tm.assert_frame_equal(result, expected)
def test_data_after_quote(c_parser_only):
# see gh-15910
parser = c_parser_only
data = 'a\n1\n"b"a'
result = parser.read_csv(StringIO(data))
expected = DataFrame({"a": ["1", "ba"]})
tm.assert_frame_equal(result, expected)
def test_comment_whitespace_delimited(c_parser_only):
parser = c_parser_only
test_input = """\
1 2
2 2 3
3 2 3 # 3 fields
4 2 3# 3 fields
5 2 # 2 fields
6 2# 2 fields
7 # 1 field, NaN
8# 1 field, NaN
9 2 3 # skipped line
# comment"""
with tm.assert_produces_warning(
ParserWarning, match="Skipping line", check_stacklevel=False
):
df = parser.read_csv(
StringIO(test_input),
comment="#",
header=None,
delimiter="\\s+",
skiprows=0,
on_bad_lines="warn",
)
expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]])
tm.assert_frame_equal(df, expected)
def test_file_like_no_next(c_parser_only):
# gh-16530: the file-like need not have a "next" or "__next__"
# attribute despite having an "__iter__" attribute.
#
# NOTE: This is only true for the C engine, not the Python engine.
class NoNextBuffer(StringIO):
def __next__(self):
raise AttributeError("No next method")
next = __next__
parser = c_parser_only
data = "a\n1"
expected = DataFrame({"a": [1]})
result = parser.read_csv(NoNextBuffer(data))
tm.assert_frame_equal(result, expected)
def test_buffer_rd_bytes_bad_unicode(c_parser_only):
# see gh-22748
t = BytesIO(b"\xB0")
t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape")
msg = "'utf-8' codec can't encode character"
with pytest.raises(UnicodeError, match=msg):
c_parser_only.read_csv(t, encoding="UTF-8")
@pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"])
def test_read_tarfile(c_parser_only, csv_dir_path, tar_suffix):
# see gh-16530
#
# Unfortunately, Python's CSV library can't handle
# tarfile objects (expects string, not bytes when
# iterating through a file-like).
parser = c_parser_only
tar_path = os.path.join(csv_dir_path, "tar_csv" + tar_suffix)
with tarfile.open(tar_path, "r") as tar:
data_file = tar.extractfile("tar_data.csv")
out = parser.read_csv(data_file)
expected = DataFrame({"a": [1]})
tm.assert_frame_equal(out, expected)
def test_chunk_whitespace_on_boundary(c_parser_only):
# see gh-9735: this issue is C parser-specific (bug when
# parsing whitespace and characters at chunk boundary)
#
# This test case has a field too large for the Python parser / CSV library.
parser = c_parser_only
chunk1 = "a" * (1024 * 256 - 2) + "\na"
chunk2 = "\n a"
result = parser.read_csv(StringIO(chunk1 + chunk2), header=None)
expected = DataFrame(["a" * (1024 * 256 - 2), "a", " a"])
tm.assert_frame_equal(result, expected)
def test_file_handles_mmap(c_parser_only, csv1):
# gh-14418
#
# Don't close user provided file handles.
parser = c_parser_only
with open(csv1, encoding="utf-8") as f:
with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
parser.read_csv(m)
assert not m.closed
def test_file_binary_mode(c_parser_only):
# see gh-23779
parser = c_parser_only
expected = DataFrame([[1, 2, 3], [4, 5, 6]])
with tm.ensure_clean() as path:
with open(path, "w", encoding="utf-8") as f:
f.write("1,2,3\n4,5,6")
with open(path, "rb") as f:
result = parser.read_csv(f, header=None)
tm.assert_frame_equal(result, expected)
def test_unix_style_breaks(c_parser_only):
# GH 11020
parser = c_parser_only
with tm.ensure_clean() as path:
with open(path, "w", newline="\n", encoding="utf-8") as f:
f.write("blah\n\ncol_1,col_2,col_3\n\n")
result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c")
expected = DataFrame(columns=["col_1", "col_2", "col_3"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
@pytest.mark.parametrize(
"data,thousands,decimal",
[
(
"""A|B|C
1|2,334.01|5
10|13|10.
""",
",",
".",
),
(
"""A|B|C
1|2.334,01|5
10|13|10,
""",
".",
",",
),
],
)
def test_1000_sep_with_decimal(
c_parser_only, data, thousands, decimal, float_precision
):
parser = c_parser_only
expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})
result = parser.read_csv(
StringIO(data),
sep="|",
thousands=thousands,
decimal=decimal,
float_precision=float_precision,
)
tm.assert_frame_equal(result, expected)
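# Standalone sketch of the European number format parsed above: with
# thousands="." and decimal="," the token "2.334,01" is read as 2334.01.
from io import StringIO
import pandas as pd

df = pd.read_csv(StringIO("A|B\n1|2.334,01\n"), sep="|", thousands=".", decimal=",")
assert abs(df["B"][0] - 2334.01) < 1e-6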
def test_float_precision_options(c_parser_only):
# GH 17154, 36228
parser = c_parser_only
s = "foo\n243.164\n"
df = parser.read_csv(StringIO(s))
df2 = parser.read_csv(StringIO(s), float_precision="high")
tm.assert_frame_equal(df, df2)
df3 = parser.read_csv(StringIO(s), float_precision="legacy")
assert not df.iloc[0, 0] == df3.iloc[0, 0]
msg = "Unrecognized float_precision option: junk"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(s), float_precision="junk")

View File

@ -0,0 +1,227 @@
"""
Tests that comments are properly handled during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import numpy as np
import pytest
from pandas import DataFrame
import pandas._testing as tm
@pytest.mark.parametrize("na_values", [None, ["NaN"]])
def test_comment(all_parsers, na_values):
parser = all_parsers
data = """A,B,C
1,2.,4.#hello world
5.,NaN,10.0
"""
expected = DataFrame(
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
)
if parser.engine == "pyarrow":
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), comment="#", na_values=na_values)
return
result = parser.read_csv(StringIO(data), comment="#", na_values=na_values)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"read_kwargs", [{}, {"lineterminator": "*"}, {"delim_whitespace": True}]
)
def test_line_comment(all_parsers, read_kwargs, request):
parser = all_parsers
data = """# empty
A,B,C
1,2.,4.#hello world
#ignore this line
5.,NaN,10.0
"""
warn = None
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
if read_kwargs.get("delim_whitespace"):
data = data.replace(",", " ")
warn = FutureWarning
elif read_kwargs.get("lineterminator"):
data = data.replace("\n", read_kwargs.get("lineterminator"))
read_kwargs["comment"] = "#"
if parser.engine == "pyarrow":
if "lineterminator" in read_kwargs:
msg = (
"The 'lineterminator' option is not supported with the 'pyarrow' engine"
)
else:
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
warn, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), **read_kwargs)
return
elif parser.engine == "python" and read_kwargs.get("lineterminator"):
msg = r"Custom line terminators not supported in python parser \(yet\)"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
warn, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), **read_kwargs)
return
with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False):
result = parser.read_csv(StringIO(data), **read_kwargs)
expected = DataFrame(
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
)
tm.assert_frame_equal(result, expected)
def test_comment_skiprows(all_parsers):
parser = all_parsers
data = """# empty
random line
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
# This should ignore the first four lines (including comments).
expected = DataFrame(
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
)
if parser.engine == "pyarrow":
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), comment="#", skiprows=4)
return
result = parser.read_csv(StringIO(data), comment="#", skiprows=4)
tm.assert_frame_equal(result, expected)
def test_comment_header(all_parsers):
parser = all_parsers
data = """# empty
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
# Header should begin at the second non-comment line.
expected = DataFrame(
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
)
if parser.engine == "pyarrow":
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), comment="#", header=1)
return
result = parser.read_csv(StringIO(data), comment="#", header=1)
tm.assert_frame_equal(result, expected)
def test_comment_skiprows_header(all_parsers):
parser = all_parsers
data = """# empty
# second empty line
# third empty line
X,Y,Z
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
# skiprows=4 skips the first 4 lines (including comments), and header=1
# then makes the second of the remaining lines ("A,B,C") the header,
# so the data rows start on the following line.
expected = DataFrame(
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
)
if parser.engine == "pyarrow":
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1)
return
result = parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("comment_char", ["#", "~", "&", "^", "*", "@"])
def test_custom_comment_char(all_parsers, comment_char):
parser = all_parsers
data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo"
if parser.engine == "pyarrow":
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data.replace("#", comment_char)), comment=comment_char
)
return
result = parser.read_csv(
StringIO(data.replace("#", comment_char)), comment=comment_char
)
expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("header", ["infer", None])
def test_comment_first_line(all_parsers, header):
# see gh-4623
parser = all_parsers
data = "# notes\na,b,c\n# more notes\n1,2,3"
if header is None:
expected = DataFrame({0: ["a", "1"], 1: ["b", "2"], 2: ["c", "3"]})
else:
expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
if parser.engine == "pyarrow":
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), comment="#", header=header)
return
result = parser.read_csv(StringIO(data), comment="#", header=header)
tm.assert_frame_equal(result, expected)
def test_comment_char_in_default_value(all_parsers, request):
# GH#34002
if all_parsers.engine == "c":
reason = "see gh-34002: works on the python engine but not the c engine"
# NA value containing comment char is interpreted as comment
request.applymarker(pytest.mark.xfail(reason=reason, raises=AssertionError))
parser = all_parsers
data = (
"# this is a comment\n"
"col1,col2,col3,col4\n"
"1,2,3,4#inline comment\n"
"4,5#,6,10\n"
"7,8,#N/A,11\n"
)
if parser.engine == "pyarrow":
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), comment="#", na_values="#N/A")
return
result = parser.read_csv(StringIO(data), comment="#", na_values="#N/A")
expected = DataFrame(
{
"col1": [1, 4, 7],
"col2": [2, 5, 8],
"col3": [3.0, np.nan, np.nan],
"col4": [4.0, np.nan, 11.0],
}
)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,211 @@
"""
Tests compressed data parsing functionality for all
of the parsers defined in parsers.py
"""
import os
from pathlib import Path
import tarfile
import zipfile
import pytest
from pandas import DataFrame
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@pytest.fixture(params=[True, False])
def buffer(request):
return request.param
@pytest.fixture
def parser_and_data(all_parsers, csv1):
parser = all_parsers
with open(csv1, "rb") as f:
data = f.read()
expected = parser.read_csv(csv1)
return parser, data, expected
@pytest.mark.parametrize("compression", ["zip", "infer", "zip2"])
def test_zip(parser_and_data, compression):
parser, data, expected = parser_and_data
with tm.ensure_clean("test_file.zip") as path:
with zipfile.ZipFile(path, mode="w") as tmp:
tmp.writestr("test_file", data)
if compression == "zip2":
with open(path, "rb") as f:
result = parser.read_csv(f, compression="zip")
else:
result = parser.read_csv(path, compression=compression)
tm.assert_frame_equal(result, expected)
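# Standalone sketch of the compression round trip behind these tests (the
# temporary file name is illustrative): to_csv infers gzip from the ".gz"
# suffix and read_csv(compression="infer") does the same when reading back.
import os
import tempfile

import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "example.csv.gz")
    df.to_csv(path, index=False)
    roundtrip = pd.read_csv(path, compression="infer")
pd.testing.assert_frame_equal(roundtrip, df)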
@pytest.mark.parametrize("compression", ["zip", "infer"])
def test_zip_error_multiple_files(parser_and_data, compression):
parser, data, expected = parser_and_data
with tm.ensure_clean("combined_zip.zip") as path:
inner_file_names = ["test_file", "second_file"]
with zipfile.ZipFile(path, mode="w") as tmp:
for file_name in inner_file_names:
tmp.writestr(file_name, data)
with pytest.raises(ValueError, match="Multiple files"):
parser.read_csv(path, compression=compression)
def test_zip_error_no_files(parser_and_data):
parser, _, _ = parser_and_data
with tm.ensure_clean() as path:
with zipfile.ZipFile(path, mode="w"):
pass
with pytest.raises(ValueError, match="Zero files"):
parser.read_csv(path, compression="zip")
def test_zip_error_invalid_zip(parser_and_data):
parser, _, _ = parser_and_data
with tm.ensure_clean() as path:
with open(path, "rb") as f:
with pytest.raises(zipfile.BadZipFile, match="File is not a zip file"):
parser.read_csv(f, compression="zip")
@pytest.mark.parametrize("filename", [None, "test.{ext}"])
def test_compression(
request,
parser_and_data,
compression_only,
buffer,
filename,
compression_to_extension,
):
parser, data, expected = parser_and_data
compress_type = compression_only
ext = compression_to_extension[compress_type]
filename = filename if filename is None else filename.format(ext=ext)
if filename and buffer:
request.applymarker(
pytest.mark.xfail(
reason="Cannot deduce compression from buffer of compressed data."
)
)
with tm.ensure_clean(filename=filename) as path:
tm.write_to_compressed(compress_type, path, data)
compression = "infer" if filename else compress_type
if buffer:
with open(path, "rb") as f:
result = parser.read_csv(f, compression=compression)
else:
result = parser.read_csv(path, compression=compression)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("ext", [None, "gz", "bz2"])
def test_infer_compression(all_parsers, csv1, buffer, ext):
# see gh-9770
parser = all_parsers
kwargs = {"index_col": 0, "parse_dates": True}
expected = parser.read_csv(csv1, **kwargs)
kwargs["compression"] = "infer"
if buffer:
with open(csv1, encoding="utf-8") as f:
result = parser.read_csv(f, **kwargs)
else:
ext = "." + ext if ext else ""
result = parser.read_csv(csv1 + ext, **kwargs)
tm.assert_frame_equal(result, expected)
def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt):
# see gh-18071, gh-24130
parser = all_parsers
encoding = encoding_fmt.format(utf_value)
path = os.path.join(csv_dir_path, f"utf{utf_value}_ex_small.zip")
result = parser.read_csv(path, encoding=encoding, compression="zip", sep="\t")
expected = DataFrame(
{
"Country": ["Venezuela", "Venezuela"],
"Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."],
}
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"])
def test_invalid_compression(all_parsers, invalid_compression):
parser = all_parsers
compress_kwargs = {"compression": invalid_compression}
msg = f"Unrecognized compression type: {invalid_compression}"
with pytest.raises(ValueError, match=msg):
parser.read_csv("test_file.zip", **compress_kwargs)
def test_compression_tar_archive(all_parsers, csv_dir_path):
parser = all_parsers
path = os.path.join(csv_dir_path, "tar_csv.tar.gz")
df = parser.read_csv(path)
assert list(df.columns) == ["a"]
def test_ignore_compression_extension(all_parsers):
parser = all_parsers
df = DataFrame({"a": [0, 1]})
with tm.ensure_clean("test.csv") as path_csv:
with tm.ensure_clean("test.csv.zip") as path_zip:
# make sure to create an uncompressed file with a zip extension
df.to_csv(path_csv, index=False)
Path(path_zip).write_text(
Path(path_csv).read_text(encoding="utf-8"), encoding="utf-8"
)
tm.assert_frame_equal(parser.read_csv(path_zip, compression=None), df)
def test_writes_tar_gz(all_parsers):
parser = all_parsers
data = DataFrame(
{
"Country": ["Venezuela", "Venezuela"],
"Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."],
}
)
with tm.ensure_clean("test.tar.gz") as tar_path:
data.to_csv(tar_path, index=False)
# test that read_csv infers .tar.gz as gzip:
tm.assert_frame_equal(parser.read_csv(tar_path), data)
# test that file is indeed gzipped:
with tarfile.open(tar_path, "r:gz") as tar:
result = parser.read_csv(
tar.extractfile(tar.getnames()[0]), compression="infer"
)
tm.assert_frame_equal(result, data)

View File

@ -0,0 +1,36 @@
import numpy as np
import pytest
from pandas.errors import DtypeWarning
import pandas._testing as tm
from pandas.core.arrays import ArrowExtensionArray
from pandas.io.parsers.c_parser_wrapper import _concatenate_chunks
def test_concatenate_chunks_pyarrow():
# GH#51876
pa = pytest.importorskip("pyarrow")
chunks = [
{0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
{0: ArrowExtensionArray(pa.array([1, 2]))},
]
result = _concatenate_chunks(chunks)
expected = ArrowExtensionArray(pa.array([1.5, 2.5, 1.0, 2.0]))
tm.assert_extension_array_equal(result[0], expected)
def test_concatenate_chunks_pyarrow_strings():
# GH#51876
pa = pytest.importorskip("pyarrow")
chunks = [
{0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
{0: ArrowExtensionArray(pa.array(["a", "b"]))},
]
with tm.assert_produces_warning(DtypeWarning, match="have mixed types"):
result = _concatenate_chunks(chunks)
expected = np.concatenate(
[np.array([1.5, 2.5], dtype=object), np.array(["a", "b"])]
)
tm.assert_numpy_array_equal(result[0], expected)

View File

@ -0,0 +1,263 @@
"""
Tests column conversion functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
from dateutil.parser import parse
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm
def test_converters_type_must_be_dict(all_parsers):
parser = all_parsers
data = """index,A,B,C,D
foo,2,3,4,5
"""
if parser.engine == "pyarrow":
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), converters=0)
return
with pytest.raises(TypeError, match="Type converters.+"):
parser.read_csv(StringIO(data), converters=0)
@pytest.mark.parametrize("column", [3, "D"])
@pytest.mark.parametrize(
"converter", [parse, lambda x: int(x.split("/")[2])] # Produce integer.
)
def test_converters(all_parsers, column, converter):
parser = all_parsers
data = """A,B,C,D
a,1,2,01/01/2009
b,3,4,01/02/2009
c,4,5,01/03/2009
"""
if parser.engine == "pyarrow":
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), converters={column: converter})
return
result = parser.read_csv(StringIO(data), converters={column: converter})
expected = parser.read_csv(StringIO(data))
expected["D"] = expected["D"].map(converter)
tm.assert_frame_equal(result, expected)
def test_converters_no_implicit_conv(all_parsers):
# see gh-2184
parser = all_parsers
data = """000102,1.2,A\n001245,2,B"""
converters = {0: lambda x: x.strip()}
if parser.engine == "pyarrow":
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), header=None, converters=converters)
return
result = parser.read_csv(StringIO(data), header=None, converters=converters)
# Column 0 should not be cast to numeric and should remain as object.
expected = DataFrame([["000102", 1.2, "A"], ["001245", 2, "B"]])
tm.assert_frame_equal(result, expected)
def test_converters_euro_decimal_format(all_parsers):
# see gh-583
converters = {}
parser = all_parsers
data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,7387
2;121,12;14897,76;DEF;uyt;0,3773
3;878,158;108013,434;GHI;rez;2,7356"""
converters["Number1"] = converters["Number2"] = converters[
"Number3"
] = lambda x: float(x.replace(",", "."))
if parser.engine == "pyarrow":
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), sep=";", converters=converters)
return
result = parser.read_csv(StringIO(data), sep=";", converters=converters)
expected = DataFrame(
[
[1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387],
[2, 121.12, 14897.76, "DEF", "uyt", 0.3773],
[3, 878.158, 108013.434, "GHI", "rez", 2.7356],
],
columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
)
tm.assert_frame_equal(result, expected)
def test_converters_corner_with_nans(all_parsers):
parser = all_parsers
data = """id,score,days
1,2,12
2,2-5,
3,,14+
4,6-12,2"""
# Example converters.
def convert_days(x):
x = x.strip()
if not x:
return np.nan
is_plus = x.endswith("+")
if is_plus:
x = int(x[:-1]) + 1
else:
x = int(x)
return x
def convert_days_sentinel(x):
x = x.strip()
if not x:
return np.nan
is_plus = x.endswith("+")
if is_plus:
x = int(x[:-1]) + 1
else:
x = int(x)
return x
def convert_score(x):
x = x.strip()
if not x:
return np.nan
if x.find("-") > 0:
val_min, val_max = map(int, x.split("-"))
val = 0.5 * (val_min + val_max)
else:
val = float(x)
return val
results = []
for day_converter in [convert_days, convert_days_sentinel]:
if parser.engine == "pyarrow":
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data),
converters={"score": convert_score, "days": day_converter},
na_values=["", None],
)
continue
result = parser.read_csv(
StringIO(data),
converters={"score": convert_score, "days": day_converter},
na_values=["", None],
)
assert pd.isna(result["days"][1])
results.append(result)
if parser.engine != "pyarrow":
tm.assert_frame_equal(results[0], results[1])
@pytest.mark.parametrize("conv_f", [lambda x: x, str])
def test_converter_index_col_bug(all_parsers, conv_f):
# see gh-1835 , GH#40589
parser = all_parsers
data = "A;B\n1;2\n3;4"
if parser.engine == "pyarrow":
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
)
return
rs = parser.read_csv(
StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
)
xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A"))
tm.assert_frame_equal(rs, xp)
def test_converter_identity_object(all_parsers):
# GH#40589
parser = all_parsers
data = "A,B\n1,2\n3,4"
if parser.engine == "pyarrow":
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), converters={"A": lambda x: x})
return
rs = parser.read_csv(StringIO(data), converters={"A": lambda x: x})
xp = DataFrame({"A": ["1", "3"], "B": [2, 4]})
tm.assert_frame_equal(rs, xp)
def test_converter_multi_index(all_parsers):
# GH 42446
parser = all_parsers
data = "A,B,B\nX,Y,Z\n1,2,3"
if parser.engine == "pyarrow":
msg = "The 'converters' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data),
header=list(range(2)),
converters={
("A", "X"): np.int32,
("B", "Y"): np.int32,
("B", "Z"): np.float32,
},
)
return
result = parser.read_csv(
StringIO(data),
header=list(range(2)),
converters={
("A", "X"): np.int32,
("B", "Y"): np.int32,
("B", "Z"): np.float32,
},
)
expected = DataFrame(
{
("A", "X"): np.int32([1]),
("B", "Y"): np.int32([2]),
("B", "Z"): np.float32([3]),
}
)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,195 @@
"""
Tests that dialects are properly handled during parsing
for all of the parsers defined in parsers.py
"""
import csv
from io import StringIO
import pytest
from pandas.errors import ParserWarning
from pandas import DataFrame
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@pytest.fixture
def custom_dialect():
dialect_name = "weird"
dialect_kwargs = {
"doublequote": False,
"escapechar": "~",
"delimiter": ":",
"skipinitialspace": False,
"quotechar": "`",
"quoting": 3,
}
return dialect_name, dialect_kwargs
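# Standalone sketch (the dialect name "colon" is illustrative): how a
# registered csv dialect reaches read_csv. Register it with the stdlib csv
# module, then pass its name via the dialect keyword.
import csv
from io import StringIO

import pandas as pd

csv.register_dialect("colon", delimiter=":")
try:
    df = pd.read_csv(StringIO("a:b\n1:2\n"), dialect="colon")
finally:
    csv.unregister_dialect("colon")
assert list(df.columns) == ["a", "b"]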
def test_dialect(all_parsers):
parser = all_parsers
data = """\
label1,label2,label3
index1,"a,c,e
index2,b,d,f
"""
dia = csv.excel()
dia.quoting = csv.QUOTE_NONE
if parser.engine == "pyarrow":
msg = "The 'dialect' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), dialect=dia)
return
df = parser.read_csv(StringIO(data), dialect=dia)
data = """\
label1,label2,label3
index1,a,c,e
index2,b,d,f
"""
exp = parser.read_csv(StringIO(data))
exp.replace("a", '"a', inplace=True)
tm.assert_frame_equal(df, exp)
def test_dialect_str(all_parsers):
dialect_name = "mydialect"
parser = all_parsers
data = """\
fruit:vegetable
apple:broccoli
pear:tomato
"""
exp = DataFrame({"fruit": ["apple", "pear"], "vegetable": ["broccoli", "tomato"]})
with tm.with_csv_dialect(dialect_name, delimiter=":"):
if parser.engine == "pyarrow":
msg = "The 'dialect' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), dialect=dialect_name)
return
df = parser.read_csv(StringIO(data), dialect=dialect_name)
tm.assert_frame_equal(df, exp)
def test_invalid_dialect(all_parsers):
class InvalidDialect:
pass
data = "a\n1"
parser = all_parsers
msg = "Invalid dialect"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), dialect=InvalidDialect)
@pytest.mark.parametrize(
"arg",
[None, "doublequote", "escapechar", "skipinitialspace", "quotechar", "quoting"],
)
@pytest.mark.parametrize("value", ["dialect", "default", "other"])
def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, arg, value):
# see gh-23761.
dialect_name, dialect_kwargs = custom_dialect
parser = all_parsers
expected = DataFrame({"a": [1], "b": [2]})
data = "a:b\n1:2"
warning_klass = None
kwds = {}
# arg=None tests when we pass in the dialect without any other arguments.
if arg is not None:
if value == "dialect": # No conflict --> no warning.
kwds[arg] = dialect_kwargs[arg]
elif value == "default": # Default --> no warning.
from pandas.io.parsers.base_parser import parser_defaults
kwds[arg] = parser_defaults[arg]
else: # Non-default + conflict with dialect --> warning.
warning_klass = ParserWarning
kwds[arg] = "blah"
with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
if parser.engine == "pyarrow":
msg = "The 'dialect' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv_check_warnings(
# No warning bc we raise
None,
"Conflicting values for",
StringIO(data),
dialect=dialect_name,
**kwds,
)
return
result = parser.read_csv_check_warnings(
warning_klass,
"Conflicting values for",
StringIO(data),
dialect=dialect_name,
**kwds,
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"kwargs,warning_klass",
[
({"sep": ","}, None), # sep is default --> sep_override=True
({"sep": "."}, ParserWarning), # sep isn't default --> sep_override=False
({"delimiter": ":"}, None), # No conflict
({"delimiter": None}, None), # Default arguments --> sep_override=True
({"delimiter": ","}, ParserWarning), # Conflict
({"delimiter": "."}, ParserWarning), # Conflict
],
ids=[
"sep-override-true",
"sep-override-false",
"delimiter-no-conflict",
"delimiter-default-arg",
"delimiter-conflict",
"delimiter-conflict2",
],
)
def test_dialect_conflict_delimiter(all_parsers, custom_dialect, kwargs, warning_klass):
# see gh-23761.
dialect_name, dialect_kwargs = custom_dialect
parser = all_parsers
expected = DataFrame({"a": [1], "b": [2]})
data = "a:b\n1:2"
with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
if parser.engine == "pyarrow":
msg = "The 'dialect' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv_check_warnings(
# no warning bc we raise
None,
"Conflicting values for 'delimiter'",
StringIO(data),
dialect=dialect_name,
**kwargs,
)
return
result = parser.read_csv_check_warnings(
warning_klass,
"Conflicting values for 'delimiter'",
StringIO(data),
dialect=dialect_name,
**kwargs,
)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,337 @@
"""
Tests encoding functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import (
BytesIO,
TextIOWrapper,
)
import os
import tempfile
import uuid
import numpy as np
import pytest
from pandas import (
DataFrame,
read_csv,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
def test_bytes_io_input(all_parsers):
encoding = "cp1255"
parser = all_parsers
data = BytesIO("שלום:1234\n562:123".encode(encoding))
result = parser.read_csv(data, sep=":", encoding=encoding)
expected = DataFrame([[562, 123]], columns=["שלום", "1234"])
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block
def test_read_csv_unicode(all_parsers):
parser = all_parsers
data = BytesIO("\u0141aski, Jan;1".encode())
result = parser.read_csv(data, sep=";", encoding="utf-8", header=None)
expected = DataFrame([["\u0141aski, Jan", 1]])
tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize("sep", [",", "\t"])
@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"])
def test_utf16_bom_skiprows(all_parsers, sep, encoding):
# see gh-2298
parser = all_parsers
data = """skip this
skip this too
A,B,C
1,2,3
4,5,6""".replace(
",", sep
)
path = f"__{uuid.uuid4()}__.csv"
kwargs = {"sep": sep, "skiprows": 2}
utf8 = "utf-8"
with tm.ensure_clean(path) as path:
bytes_data = data.encode(encoding)
with open(path, "wb") as f:
f.write(bytes_data)
with TextIOWrapper(BytesIO(data.encode(utf8)), encoding=utf8) as bytes_buffer:
result = parser.read_csv(path, encoding=encoding, **kwargs)
expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs)
tm.assert_frame_equal(result, expected)
def test_utf16_example(all_parsers, csv_dir_path):
path = os.path.join(csv_dir_path, "utf16_ex.txt")
parser = all_parsers
result = parser.read_csv(path, encoding="utf-16", sep="\t")
assert len(result) == 50
def test_unicode_encoding(all_parsers, csv_dir_path):
path = os.path.join(csv_dir_path, "unicode_series.csv")
parser = all_parsers
result = parser.read_csv(path, header=None, encoding="latin-1")
result = result.set_index(0)
got = result[1][1632]
expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)"
assert got == expected
@pytest.mark.parametrize(
"data,kwargs,expected",
[
# Basic test
("a\n1", {}, DataFrame({"a": [1]})),
# "Regular" quoting
('"a"\n1', {"quotechar": '"'}, DataFrame({"a": [1]})),
# Test in a data row instead of header
("b\n1", {"names": ["a"]}, DataFrame({"a": ["b", "1"]})),
# Test in empty data row with skipping
("\n1", {"names": ["a"], "skip_blank_lines": True}, DataFrame({"a": [1]})),
# Test in empty data row without skipping
(
"\n1",
{"names": ["a"], "skip_blank_lines": False},
DataFrame({"a": [np.nan, 1]}),
),
],
)
def test_utf8_bom(all_parsers, data, kwargs, expected, request):
# see gh-4793
parser = all_parsers
bom = "\ufeff"
utf8 = "utf-8"
def _encode_data_with_bom(_data):
bom_data = (bom + _data).encode(utf8)
return BytesIO(bom_data)
if (
parser.engine == "pyarrow"
and data == "\n1"
and kwargs.get("skip_blank_lines", True)
):
# CSV parse error: Empty CSV file or block: cannot infer number of columns
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs)
tm.assert_frame_equal(result, expected)
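# Standalone sketch of the BOM handling asserted above: a UTF-8 byte-order
# mark in front of the header is stripped, so the first column is "a"
# rather than "\ufeffa".
from io import BytesIO
import pandas as pd

df = pd.read_csv(BytesIO("\ufeffa\n1".encode("utf-8")), encoding="utf-8")
assert list(df.columns) == ["a"]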
def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt):
# see gh-13549
expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]})
parser = all_parsers
encoding = encoding_fmt.format(utf_value)
data = "mb_num,multibyte\n4.8,test".encode(encoding)
result = parser.read_csv(BytesIO(data), encoding=encoding)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"file_path,encoding",
[
(("io", "data", "csv", "test1.csv"), "utf-8"),
(("io", "parser", "data", "unicode_series.csv"), "latin-1"),
(("io", "parser", "data", "sauron.SHIFT_JIS.csv"), "shiftjis"),
],
)
def test_binary_mode_file_buffers(all_parsers, file_path, encoding, datapath):
# gh-23779: Python csv engine shouldn't error on files opened in binary.
# gh-31575: Python csv engine shouldn't error on files opened in raw binary.
parser = all_parsers
fpath = datapath(*file_path)
expected = parser.read_csv(fpath, encoding=encoding)
with open(fpath, encoding=encoding) as fa:
result = parser.read_csv(fa)
assert not fa.closed
tm.assert_frame_equal(expected, result)
with open(fpath, mode="rb") as fb:
result = parser.read_csv(fb, encoding=encoding)
assert not fb.closed
tm.assert_frame_equal(expected, result)
with open(fpath, mode="rb", buffering=0) as fb:
result = parser.read_csv(fb, encoding=encoding)
assert not fb.closed
tm.assert_frame_equal(expected, result)
@pytest.mark.parametrize("pass_encoding", [True, False])
def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding):
# see gh-24130
parser = all_parsers
encoding = encoding_fmt.format(utf_value)
if parser.engine == "pyarrow" and pass_encoding is True and utf_value in [16, 32]:
# FIXME: this is bad!
pytest.skip("These cases freeze")
expected = DataFrame({"foo": ["bar"]})
with tm.ensure_clean(mode="w+", encoding=encoding, return_filelike=True) as f:
f.write("foo\nbar")
f.seek(0)
result = parser.read_csv(f, encoding=encoding if pass_encoding else None)
tm.assert_frame_equal(result, expected)
def test_encoding_named_temp_file(all_parsers):
# see gh-31819
parser = all_parsers
encoding = "shift-jis"
title = "てすと"
data = "こむ"
expected = DataFrame({title: [data]})
with tempfile.NamedTemporaryFile() as f:
f.write(f"{title}\n{data}".encode(encoding))
f.seek(0)
result = parser.read_csv(f, encoding=encoding)
tm.assert_frame_equal(result, expected)
assert not f.closed
@pytest.mark.parametrize(
"encoding", ["utf-8", "utf-16", "utf-16-be", "utf-16-le", "utf-32"]
)
def test_parse_encoded_special_characters(encoding):
# GH16218 Verify parsing of data with encoded special characters
# Data contains a Unicode 'FULLWIDTH COLON' (U+FF1A) at position (0,"a")
data = "a\tb\n：foo\t0\nbar\t1\nbaz\t2"  # noqa: RUF001
encoded_data = BytesIO(data.encode(encoding))
result = read_csv(encoded_data, delimiter="\t", encoding=encoding)
expected = DataFrame(
data=[["：foo", 0], ["bar", 1], ["baz", 2]],  # noqa: RUF001
columns=["a", "b"],
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"])
def test_encoding_memory_map(all_parsers, encoding):
# GH40986
parser = all_parsers
expected = DataFrame(
{
"name": ["Raphael", "Donatello", "Miguel Angel", "Leonardo"],
"mask": ["red", "purple", "orange", "blue"],
"weapon": ["sai", "bo staff", "nunchunk", "katana"],
}
)
with tm.ensure_clean() as file:
expected.to_csv(file, index=False, encoding=encoding)
if parser.engine == "pyarrow":
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(file, encoding=encoding, memory_map=True)
return
df = parser.read_csv(file, encoding=encoding, memory_map=True)
tm.assert_frame_equal(df, expected)
def test_chunk_splits_multibyte_char(all_parsers):
"""
Chunk splits a multibyte character with memory_map=True
GH 43540
"""
parser = all_parsers
# DEFAULT_CHUNKSIZE = 262144, defined in parsers.pyx
df = DataFrame(data=["a" * 127] * 2048)
# Put the two-byte utf-8 encoded character "ą" at the end of the chunk
# utf-8 encoding of "ą" is b'\xc4\x85'
df.iloc[2047] = "a" * 127 + "ą"
with tm.ensure_clean("bug-gh43540.csv") as fname:
df.to_csv(fname, index=False, header=False, encoding="utf-8")
if parser.engine == "pyarrow":
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(fname, header=None, memory_map=True)
return
dfr = parser.read_csv(fname, header=None, memory_map=True)
tm.assert_frame_equal(dfr, df)
def test_readcsv_memmap_utf8(all_parsers):
"""
GH 43787
Test correct handling of UTF-8 chars when memory_map=True and encoding is UTF-8
"""
lines = []
line_length = 128
start_char = " "
end_char = "\U00010080"
# This for loop creates a list of 128-char strings
# consisting of consecutive Unicode chars
for lnum in range(ord(start_char), ord(end_char), line_length):
line = "".join([chr(c) for c in range(lnum, lnum + 0x80)]) + "\n"
try:
line.encode("utf-8")
except UnicodeEncodeError:
continue
lines.append(line)
parser = all_parsers
df = DataFrame(lines)
with tm.ensure_clean("utf8test.csv") as fname:
df.to_csv(fname, index=False, header=False, encoding="utf-8")
if parser.engine == "pyarrow":
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(fname, header=None, memory_map=True, encoding="utf-8")
return
dfr = parser.read_csv(fname, header=None, memory_map=True, encoding="utf-8")
tm.assert_frame_equal(df, dfr)
@pytest.mark.usefixtures("pyarrow_xfail")
@pytest.mark.parametrize("mode", ["w+b", "w+t"])
def test_not_readable(all_parsers, mode):
# GH43439
parser = all_parsers
content = b"abcd"
if "t" in mode:
content = "abcd"
with tempfile.SpooledTemporaryFile(mode=mode, encoding="utf-8") as handle:
handle.write(content)
handle.seek(0)
df = parser.read_csv(handle)
expected = DataFrame([], columns=["abcd"])
tm.assert_frame_equal(df, expected)

View File

@ -0,0 +1,733 @@
"""
Tests that the file header is properly handled or inferred
during parsing for all of the parsers defined in parsers.py
"""
from collections import namedtuple
from io import StringIO
import numpy as np
import pytest
from pandas.errors import ParserError
from pandas import (
DataFrame,
Index,
MultiIndex,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@xfail_pyarrow # TypeError: an integer is required
def test_read_with_bad_header(all_parsers):
parser = all_parsers
msg = r"but only \d+ lines in file"
with pytest.raises(ValueError, match=msg):
s = StringIO(",,")
parser.read_csv(s, header=[10])
def test_negative_header(all_parsers):
# see gh-27779
parser = all_parsers
data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
with pytest.raises(
ValueError,
match="Passing negative integer to header is invalid. "
"For no header, use header=None instead",
):
parser.read_csv(StringIO(data), header=-1)
@pytest.mark.parametrize("header", [([-1, 2, 4]), ([-5, 0])])
def test_negative_multi_index_header(all_parsers, header):
# see gh-27779
parser = all_parsers
data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
with pytest.raises(
ValueError, match="cannot specify multi-index header with negative integers"
):
parser.read_csv(StringIO(data), header=header)
@pytest.mark.parametrize("header", [True, False])
def test_bool_header_arg(all_parsers, header):
# see gh-6114
parser = all_parsers
data = """\
MyColumn
a
b
a
b"""
msg = "Passing a bool to header is invalid"
with pytest.raises(TypeError, match=msg):
parser.read_csv(StringIO(data), header=header)
@xfail_pyarrow # AssertionError: DataFrame are different
def test_header_with_index_col(all_parsers):
parser = all_parsers
data = """foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
names = ["A", "B", "C"]
result = parser.read_csv(StringIO(data), names=names)
expected = DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
index=["foo", "bar", "baz"],
columns=["A", "B", "C"],
)
tm.assert_frame_equal(result, expected)
def test_header_not_first_line(all_parsers):
parser = all_parsers
data = """got,to,ignore,this,line
got,to,ignore,this,line
index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
"""
data2 = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
"""
result = parser.read_csv(StringIO(data), header=2, index_col=0)
expected = parser.read_csv(StringIO(data2), header=0, index_col=0)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
def test_header_multi_index(all_parsers):
parser = all_parsers
data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2
C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""
result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1])
data_gen_f = lambda r, c: f"R{r}C{c}"
data = [[data_gen_f(r, c) for c in range(3)] for r in range(5)]
index = MultiIndex.from_arrays(
[[f"R_l0_g{i}" for i in range(5)], [f"R_l1_g{i}" for i in range(5)]],
names=["R0", "R1"],
)
columns = MultiIndex.from_arrays(
[
[f"C_l0_g{i}" for i in range(3)],
[f"C_l1_g{i}" for i in range(3)],
[f"C_l2_g{i}" for i in range(3)],
[f"C_l3_g{i}" for i in range(3)],
],
names=["C0", "C1", "C2", "C3"],
)
expected = DataFrame(data, columns=columns, index=index)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"kwargs,msg",
[
(
{"index_col": ["foo", "bar"]},
(
"index_col must only contain "
"row numbers when specifying "
"a multi-index header"
),
),
(
{"index_col": [0, 1], "names": ["foo", "bar"]},
("cannot specify names when specifying a multi-index header"),
),
(
{"index_col": [0, 1], "usecols": ["foo", "bar"]},
("cannot specify usecols when specifying a multi-index header"),
),
],
)
def test_header_multi_index_invalid(all_parsers, kwargs, msg):
data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2
C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""
parser = all_parsers
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), header=[0, 1, 2, 3], **kwargs)
_TestTuple = namedtuple("_TestTuple", ["first", "second"])
@xfail_pyarrow # TypeError: an integer is required
@pytest.mark.parametrize(
"kwargs",
[
{"header": [0, 1]},
{
"skiprows": 3,
"names": [
("a", "q"),
("a", "r"),
("a", "s"),
("b", "t"),
("c", "u"),
("c", "v"),
],
},
{
"skiprows": 3,
"names": [
_TestTuple("a", "q"),
_TestTuple("a", "r"),
_TestTuple("a", "s"),
_TestTuple("b", "t"),
_TestTuple("c", "u"),
_TestTuple("c", "v"),
],
},
],
)
def test_header_multi_index_common_format1(all_parsers, kwargs):
parser = all_parsers
expected = DataFrame(
[[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
index=["one", "two"],
columns=MultiIndex.from_tuples(
[("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
),
)
data = """,a,a,a,b,c,c
,q,r,s,t,u,v
,,,,,,
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""
result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
tm.assert_frame_equal(result, expected)
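# Illustrative sketch (not part of the original suite): the same "common
# format" parsed through the top-level pandas API instead of the all_parsers
# fixture -- the two header rows become the column MultiIndex and the leading
# unnamed column becomes the index when index_col=0.
def test_header_multi_index_common_format_sketch():
    from pandas import read_csv

    data = ",a,a\n,q,r\none,1,2\ntwo,3,4"
    result = read_csv(StringIO(data), header=[0, 1], index_col=0)
    expected = DataFrame(
        [[1, 2], [3, 4]],
        index=["one", "two"],
        columns=MultiIndex.from_tuples([("a", "q"), ("a", "r")]),
    )
    tm.assert_frame_equal(result, expected)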
@xfail_pyarrow # TypeError: an integer is required
@pytest.mark.parametrize(
"kwargs",
[
{"header": [0, 1]},
{
"skiprows": 2,
"names": [
("a", "q"),
("a", "r"),
("a", "s"),
("b", "t"),
("c", "u"),
("c", "v"),
],
},
{
"skiprows": 2,
"names": [
_TestTuple("a", "q"),
_TestTuple("a", "r"),
_TestTuple("a", "s"),
_TestTuple("b", "t"),
_TestTuple("c", "u"),
_TestTuple("c", "v"),
],
},
],
)
def test_header_multi_index_common_format2(all_parsers, kwargs):
parser = all_parsers
expected = DataFrame(
[[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
index=["one", "two"],
columns=MultiIndex.from_tuples(
[("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
),
)
data = """,a,a,a,b,c,c
,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""
result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
@pytest.mark.parametrize(
"kwargs",
[
{"header": [0, 1]},
{
"skiprows": 2,
"names": [
("a", "q"),
("a", "r"),
("a", "s"),
("b", "t"),
("c", "u"),
("c", "v"),
],
},
{
"skiprows": 2,
"names": [
_TestTuple("a", "q"),
_TestTuple("a", "r"),
_TestTuple("a", "s"),
_TestTuple("b", "t"),
_TestTuple("c", "u"),
_TestTuple("c", "v"),
],
},
],
)
def test_header_multi_index_common_format3(all_parsers, kwargs):
parser = all_parsers
expected = DataFrame(
[[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
index=["one", "two"],
columns=MultiIndex.from_tuples(
[("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
),
)
expected = expected.reset_index(drop=True)
data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""
result = parser.read_csv(StringIO(data), index_col=None, **kwargs)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
def test_header_multi_index_common_format_malformed1(all_parsers):
parser = all_parsers
expected = DataFrame(
np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"),
index=Index([1, 7]),
columns=MultiIndex(
levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]],
codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
names=["a", "q"],
),
)
data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""
result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
tm.assert_frame_equal(expected, result)
@xfail_pyarrow # TypeError: an integer is required
def test_header_multi_index_common_format_malformed2(all_parsers):
parser = all_parsers
expected = DataFrame(
np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"),
index=Index([1, 7]),
columns=MultiIndex(
levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]],
codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
names=[None, "q"],
),
)
data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""
result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
tm.assert_frame_equal(expected, result)
@xfail_pyarrow # TypeError: an integer is required
def test_header_multi_index_common_format_malformed3(all_parsers):
parser = all_parsers
expected = DataFrame(
np.array([[3, 4, 5, 6], [9, 10, 11, 12]], dtype="int64"),
index=MultiIndex(levels=[[1, 7], [2, 8]], codes=[[0, 1], [0, 1]]),
columns=MultiIndex(
levels=[["a", "b", "c"], ["s", "t", "u", "v"]],
codes=[[0, 1, 2, 2], [0, 1, 2, 3]],
names=[None, "q"],
),
)
data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""
result = parser.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1])
tm.assert_frame_equal(expected, result)
@xfail_pyarrow # TypeError: an integer is required
def test_header_multi_index_blank_line(all_parsers):
# GH 40442
parser = all_parsers
data = [[None, None], [1, 2], [3, 4]]
columns = MultiIndex.from_tuples([("a", "A"), ("b", "B")])
expected = DataFrame(data, columns=columns)
data = "a,b\nA,B\n,\n1,2\n3,4"
result = parser.read_csv(StringIO(data), header=[0, 1])
tm.assert_frame_equal(expected, result)
@pytest.mark.parametrize(
"data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)]
)
def test_header_names_backward_compat(all_parsers, data, header, request):
# see gh-2539
parser = all_parsers
if parser.engine == "pyarrow" and header is not None:
mark = pytest.mark.xfail(reason="DataFrame.columns are different")
request.applymarker(mark)
expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), names=["a", "b", "c"])
result = parser.read_csv(StringIO(data), names=["a", "b", "c"], header=header)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block: cannot infer
@pytest.mark.parametrize("kwargs", [{}, {"index_col": False}])
def test_read_only_header_no_rows(all_parsers, kwargs):
# See gh-7773
parser = all_parsers
expected = DataFrame(columns=["a", "b", "c"])
result = parser.read_csv(StringIO("a,b,c"), **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"kwargs,names",
[
({}, [0, 1, 2, 3, 4]),
(
{"names": ["foo", "bar", "baz", "quux", "panda"]},
["foo", "bar", "baz", "quux", "panda"],
),
],
)
def test_no_header(all_parsers, kwargs, names):
parser = all_parsers
data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
expected = DataFrame(
[[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], columns=names
)
result = parser.read_csv(StringIO(data), header=None, **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("header", [["a", "b"], "string_header"])
def test_non_int_header(all_parsers, header):
# see gh-16338
msg = "header must be integer or list of integers"
data = """1,2\n3,4"""
parser = all_parsers
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), header=header)
@xfail_pyarrow # TypeError: an integer is required
def test_singleton_header(all_parsers):
# see gh-7757
data = """a,b,c\n0,1,2\n1,2,3"""
parser = all_parsers
expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
result = parser.read_csv(StringIO(data), header=[0])
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
@pytest.mark.parametrize(
"data,expected",
[
(
"A,A,A,B\none,one,one,two\n0,40,34,0.1",
DataFrame(
[[0, 40, 34, 0.1]],
columns=MultiIndex.from_tuples(
[("A", "one"), ("A", "one.1"), ("A", "one.2"), ("B", "two")]
),
),
),
(
"A,A,A,B\none,one,one.1,two\n0,40,34,0.1",
DataFrame(
[[0, 40, 34, 0.1]],
columns=MultiIndex.from_tuples(
[("A", "one"), ("A", "one.1"), ("A", "one.1.1"), ("B", "two")]
),
),
),
(
"A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1",
DataFrame(
[[0, 40, 34, 0.1, 0.1]],
columns=MultiIndex.from_tuples(
[
("A", "one"),
("A", "one.1"),
("A", "one.1.1"),
("B", "two"),
("B", "two.1"),
]
),
),
),
],
)
def test_mangles_multi_index(all_parsers, data, expected):
# see gh-18062
parser = all_parsers
result = parser.read_csv(StringIO(data), header=[0, 1])
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
@pytest.mark.parametrize("index_col", [None, [0]])
@pytest.mark.parametrize(
"columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])]
)
def test_multi_index_unnamed(all_parsers, index_col, columns):
# see gh-23687
#
# When specifying a multi-index header, make sure that
# we don't error just because one of the rows in our header
# has ALL column names containing the string "Unnamed". The
# correct condition to check is whether the row contains
# ALL columns that did not have names (and instead were given
# placeholder ones).
parser = all_parsers
header = [0, 1]
if index_col is None:
data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n"
else:
data = ",".join([""] + (columns or ["", ""])) + "\n,0,1\n0,2,3\n1,4,5\n"
result = parser.read_csv(StringIO(data), header=header, index_col=index_col)
exp_columns = []
if columns is None:
columns = ["", "", ""]
for i, col in enumerate(columns):
if not col: # Unnamed.
col = f"Unnamed: {i if index_col is None else i + 1}_level_0"
exp_columns.append(col)
columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"]))
expected = DataFrame([[2, 3], [4, 5]], columns=columns)
tm.assert_frame_equal(result, expected)
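# Illustrative sketch (not part of the original suite): the placeholder naming
# described in the comment above, shown through the top-level pandas API -- a
# blank header cell is replaced by an "Unnamed: {position}_level_{level}"
# placeholder rather than invalidating the whole header row.
def test_multi_index_unnamed_placeholder_sketch():
    from pandas import read_csv

    data = ",Unnamed\n0,1\n2,3\n4,5\n"
    result = read_csv(StringIO(data), header=[0, 1])
    assert result.columns.get_level_values(0)[0] == "Unnamed: 0_level_0"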
@skip_pyarrow # CSV parse error: Expected 2 columns, got 3
def test_names_longer_than_header_but_equal_with_data_rows(all_parsers):
# GH#38453
parser = all_parsers
data = """a, b
1,2,3
5,6,4
"""
result = parser.read_csv(StringIO(data), header=0, names=["A", "B", "C"])
expected = DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 4]})
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
def test_read_csv_multiindex_columns(all_parsers):
# GH#6051
parser = all_parsers
s1 = "Male, Male, Male, Female, Female\nR, R, L, R, R\n.86, .67, .88, .78, .81"
s2 = (
"Male, Male, Male, Female, Female\n"
"R, R, L, R, R\n"
".86, .67, .88, .78, .81\n"
".86, .67, .88, .78, .82"
)
mi = MultiIndex.from_tuples(
[
("Male", "R"),
(" Male", " R"),
(" Male", " L"),
(" Female", " R"),
(" Female", " R.1"),
]
)
expected = DataFrame(
[[0.86, 0.67, 0.88, 0.78, 0.81], [0.86, 0.67, 0.88, 0.78, 0.82]], columns=mi
)
df1 = parser.read_csv(StringIO(s1), header=[0, 1])
tm.assert_frame_equal(df1, expected.iloc[:1])
df2 = parser.read_csv(StringIO(s2), header=[0, 1])
tm.assert_frame_equal(df2, expected)
@xfail_pyarrow # TypeError: an integer is required
def test_read_csv_multi_header_length_check(all_parsers):
# GH#43102
parser = all_parsers
case = """row11,row12,row13
row21,row22, row23
row31,row32
"""
with pytest.raises(
ParserError, match="Header rows must have an equal number of columns."
):
parser.read_csv(StringIO(case), header=[0, 2])
@skip_pyarrow # CSV parse error: Expected 3 columns, got 2
def test_header_none_and_implicit_index(all_parsers):
# GH#22144
parser = all_parsers
data = "x,1,5\ny,2\nz,3\n"
result = parser.read_csv(StringIO(data), names=["a", "b"], header=None)
expected = DataFrame(
{"a": [1, 2, 3], "b": [5, np.nan, np.nan]}, index=["x", "y", "z"]
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # regex mismatch "CSV parse error: Expected 2 columns, got "
def test_header_none_and_implicit_index_in_second_row(all_parsers):
# GH#22144
parser = all_parsers
data = "x,1\ny,2,5\nz,3\n"
with pytest.raises(ParserError, match="Expected 2 fields in line 2, saw 3"):
parser.read_csv(StringIO(data), names=["a", "b"], header=None)
def test_header_none_and_on_bad_lines_skip(all_parsers):
# GH#22144
parser = all_parsers
data = "x,1\ny,2,5\nz,3\n"
result = parser.read_csv(
StringIO(data), names=["a", "b"], header=None, on_bad_lines="skip"
)
expected = DataFrame({"a": ["x", "z"], "b": [1, 3]})
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
def test_header_missing_rows(all_parsers):
# GH#47400
parser = all_parsers
data = """a,b
1,2
"""
msg = r"Passed header=\[0,1,2\], len of 3, but only 2 lines in file"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), header=[0, 1, 2])
# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine
@xfail_pyarrow
def test_header_multiple_whitespaces(all_parsers):
# GH#54931
parser = all_parsers
data = """aa bb(1,1) cc(1,1)
0 2 3.5"""
result = parser.read_csv(StringIO(data), sep=r"\s+")
expected = DataFrame({"aa": [0], "bb(1,1)": 2, "cc(1,1)": 3.5})
tm.assert_frame_equal(result, expected)
# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine
@xfail_pyarrow
def test_header_delim_whitespace(all_parsers):
# GH#54918
parser = all_parsers
data = """a,b
1,2
3,4
"""
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(StringIO(data), delim_whitespace=True)
expected = DataFrame({"a,b": ["1,2", "3,4"]})
tm.assert_frame_equal(result, expected)
def test_usecols_no_header_pyarrow(pyarrow_parser_only):
parser = pyarrow_parser_only
data = """
a,i,x
b,j,y
"""
result = parser.read_csv(
StringIO(data),
header=None,
usecols=[0, 1],
dtype="string[pyarrow]",
dtype_backend="pyarrow",
engine="pyarrow",
)
expected = DataFrame([["a", "i"], ["b", "j"]], dtype="string[pyarrow]")
tm.assert_frame_equal(result, expected)


@ -0,0 +1,376 @@
"""
Tests that the specified index column (a.k.a "index_col")
is properly handled or inferred during parsing for all of
the parsers defined in parsers.py
"""
from io import StringIO
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
MultiIndex,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@pytest.mark.parametrize("with_header", [True, False])
def test_index_col_named(all_parsers, with_header):
parser = all_parsers
no_header = """\
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""
header = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n"
if with_header:
data = header + no_header
result = parser.read_csv(StringIO(data), index_col="ID")
expected = parser.read_csv(StringIO(data), header=0).set_index("ID")
tm.assert_frame_equal(result, expected)
else:
data = no_header
msg = "Index ID invalid"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), index_col="ID")
def test_index_col_named2(all_parsers):
parser = all_parsers
data = """\
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
"""
expected = DataFrame(
{"a": [1, 5, 9], "b": [2, 6, 10], "c": [3, 7, 11], "d": [4, 8, 12]},
index=Index(["hello", "world", "foo"], name="message"),
)
names = ["a", "b", "c", "d", "message"]
result = parser.read_csv(StringIO(data), names=names, index_col=["message"])
tm.assert_frame_equal(result, expected)
def test_index_col_is_true(all_parsers):
# see gh-9798
data = "a,b\n1,2"
parser = all_parsers
msg = "The value of index_col couldn't be 'True'"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), index_col=True)
@skip_pyarrow # CSV parse error: Expected 3 columns, got 4
def test_infer_index_col(all_parsers):
data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
parser = all_parsers
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
index=["foo", "bar", "baz"],
columns=["A", "B", "C"],
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize(
"index_col,kwargs",
[
(None, {"columns": ["x", "y", "z"]}),
(False, {"columns": ["x", "y", "z"]}),
(0, {"columns": ["y", "z"], "index": Index([], name="x")}),
(1, {"columns": ["x", "z"], "index": Index([], name="y")}),
("x", {"columns": ["y", "z"], "index": Index([], name="x")}),
("y", {"columns": ["x", "z"], "index": Index([], name="y")}),
(
[0, 1],
{
"columns": ["z"],
"index": MultiIndex.from_arrays([[]] * 2, names=["x", "y"]),
},
),
(
["x", "y"],
{
"columns": ["z"],
"index": MultiIndex.from_arrays([[]] * 2, names=["x", "y"]),
},
),
(
[1, 0],
{
"columns": ["z"],
"index": MultiIndex.from_arrays([[]] * 2, names=["y", "x"]),
},
),
(
["y", "x"],
{
"columns": ["z"],
"index": MultiIndex.from_arrays([[]] * 2, names=["y", "x"]),
},
),
],
)
def test_index_col_empty_data(all_parsers, index_col, kwargs):
data = "x,y,z"
parser = all_parsers
result = parser.read_csv(StringIO(data), index_col=index_col)
expected = DataFrame(**kwargs)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block
def test_empty_with_index_col_false(all_parsers):
# see gh-10413
data = "x,y"
parser = all_parsers
result = parser.read_csv(StringIO(data), index_col=False)
expected = DataFrame(columns=["x", "y"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"index_names",
[
["", ""],
["foo", ""],
["", "bar"],
["foo", "bar"],
["NotReallyUnnamed", "Unnamed: 0"],
],
)
def test_multi_index_naming(all_parsers, index_names, request):
parser = all_parsers
if parser.engine == "pyarrow" and "" in index_names:
mark = pytest.mark.xfail(reason="One case raises, others are wrong")
request.applymarker(mark)
# We don't want empty index names being replaced with "Unnamed: 0"
data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"])
result = parser.read_csv(StringIO(data), index_col=[0, 1])
expected = DataFrame(
{"col": [1, 2, 3, 4]}, index=MultiIndex.from_product([["a", "b"], ["c", "d"]])
)
expected.index.names = [name if name else None for name in index_names]
tm.assert_frame_equal(result, expected)
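# Illustrative sketch (not part of the original suite): when the header does
# provide names for the index columns, read_csv keeps them on the resulting
# MultiIndex instead of substituting placeholders.
def test_multi_index_named_levels_sketch():
    from pandas import read_csv

    data = "foo,bar,col\na,c,1\na,d,2\nb,c,3\nb,d,4"
    result = read_csv(StringIO(data), index_col=[0, 1])
    assert list(result.index.names) == ["foo", "bar"]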
@xfail_pyarrow # ValueError: Found non-unique column index
def test_multi_index_naming_not_all_at_beginning(all_parsers):
parser = all_parsers
data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4"
result = parser.read_csv(StringIO(data), index_col=[0, 2])
expected = DataFrame(
{"Unnamed: 2": ["c", "d", "c", "d"]},
index=MultiIndex(
levels=[["a", "b"], [1, 2, 3, 4]], codes=[[0, 0, 1, 1], [0, 1, 2, 3]]
),
)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: Found non-unique column index
def test_no_multi_index_level_names_empty(all_parsers):
# GH 10984
parser = all_parsers
midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)])
expected = DataFrame(
np.random.default_rng(2).standard_normal((3, 3)),
index=midx,
columns=["x", "y", "z"],
)
with tm.ensure_clean() as path:
expected.to_csv(path)
result = parser.read_csv(path, index_col=[0, 1, 2])
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
def test_header_with_index_col(all_parsers):
# GH 33476
parser = all_parsers
data = """
I11,A,A
I12,B,B
I2,1,3
"""
midx = MultiIndex.from_tuples([("A", "B"), ("A", "B.1")], names=["I11", "I12"])
idx = Index(["I2"])
expected = DataFrame([[1, 3]], index=idx, columns=midx)
result = parser.read_csv(StringIO(data), index_col=0, header=[0, 1])
tm.assert_frame_equal(result, expected)
col_idx = Index(["A", "A.1"])
idx = Index(["I12", "I2"], name="I11")
expected = DataFrame([["B", "B"], ["1", "3"]], index=idx, columns=col_idx)
result = parser.read_csv(StringIO(data), index_col="I11", header=0)
tm.assert_frame_equal(result, expected)
@pytest.mark.slow
def test_index_col_large_csv(all_parsers, monkeypatch):
# https://github.com/pandas-dev/pandas/issues/37094
parser = all_parsers
ARR_LEN = 100
df = DataFrame(
{
"a": range(ARR_LEN + 1),
"b": np.random.default_rng(2).standard_normal(ARR_LEN + 1),
}
)
with tm.ensure_clean() as path:
df.to_csv(path, index=False)
with monkeypatch.context() as m:
m.setattr("pandas.core.algorithms._MINIMUM_COMP_ARR_LEN", ARR_LEN)
result = parser.read_csv(path, index_col=[0])
tm.assert_frame_equal(result, df.set_index("a"))
@xfail_pyarrow # TypeError: an integer is required
def test_index_col_multiindex_columns_no_data(all_parsers):
# GH#38292
parser = all_parsers
result = parser.read_csv(
StringIO("a0,a1,a2\nb0,b1,b2\n"), header=[0, 1], index_col=0
)
expected = DataFrame(
[],
index=Index([]),
columns=MultiIndex.from_arrays(
[["a1", "a2"], ["b1", "b2"]], names=["a0", "b0"]
),
)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
def test_index_col_header_no_data(all_parsers):
# GH#38292
parser = all_parsers
result = parser.read_csv(StringIO("a0,a1,a2\n"), header=[0], index_col=0)
expected = DataFrame(
[],
columns=["a1", "a2"],
index=Index([], name="a0"),
)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
def test_multiindex_columns_no_data(all_parsers):
# GH#38292
parser = all_parsers
result = parser.read_csv(StringIO("a0,a1,a2\nb0,b1,b2\n"), header=[0, 1])
expected = DataFrame(
[], columns=MultiIndex.from_arrays([["a0", "a1", "a2"], ["b0", "b1", "b2"]])
)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
def test_multiindex_columns_index_col_with_data(all_parsers):
# GH#38292
parser = all_parsers
result = parser.read_csv(
StringIO("a0,a1,a2\nb0,b1,b2\ndata,data,data"), header=[0, 1], index_col=0
)
expected = DataFrame(
[["data", "data"]],
columns=MultiIndex.from_arrays(
[["a1", "a2"], ["b1", "b2"]], names=["a0", "b0"]
),
index=Index(["data"]),
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Empty CSV file or block
def test_infer_types_boolean_sum(all_parsers):
# GH#44079
parser = all_parsers
result = parser.read_csv(
StringIO("0,1"),
names=["a", "b"],
index_col=["a"],
dtype={"a": "UInt8"},
)
expected = DataFrame(
data={
"a": [
0,
],
"b": [1],
}
).set_index("a")
# Not checking index type now, because the C parser will return an
# index column of dtype 'object', and the Python parser will return an
# index column of dtype 'int64'.
tm.assert_frame_equal(result, expected, check_index_type=False)
@pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)])
def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
# GH#9435
data = "a,b\n01,2"
parser = all_parsers
if dtype == object and parser.engine == "pyarrow":
request.applymarker(
pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine")
)
result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype})
expected = DataFrame({"b": [2]}, index=Index([val], name="a", dtype=dtype))
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # TypeError: an integer is required
def test_multiindex_columns_not_leading_index_col(all_parsers):
# GH#38549
parser = all_parsers
data = """a,b,c,d
e,f,g,h
x,y,1,2
"""
result = parser.read_csv(
StringIO(data),
header=[0, 1],
index_col=1,
)
cols = MultiIndex.from_tuples(
[("a", "e"), ("c", "g"), ("d", "h")], names=["b", "f"]
)
expected = DataFrame([["x", 1, 2]], columns=cols, index=["y"])
tm.assert_frame_equal(result, expected)


@ -0,0 +1,182 @@
"""
Tests that duplicate columns are handled appropriately when parsed by the
CSV engine. In general, the expected result is that they are either thoroughly
de-duplicated (if mangling requested) or ignored otherwise.
"""
from io import StringIO
import pytest
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@xfail_pyarrow # ValueError: Found non-unique column index
def test_basic(all_parsers):
parser = all_parsers
data = "a,a,b,b,b\n1,2,3,4,5"
result = parser.read_csv(StringIO(data), sep=",")
expected = DataFrame([[1, 2, 3, 4, 5]], columns=["a", "a.1", "b", "b.1", "b.2"])
tm.assert_frame_equal(result, expected)
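# Illustrative sketch (not part of the original suite): the same de-duplication
# through the top-level pandas API -- repeated column names get ".1", ".2", ...
# suffixes in order of appearance.
def test_mangle_dupe_cols_sketch():
    from pandas import read_csv

    result = read_csv(StringIO("a,a,b,b,b\n1,2,3,4,5"))
    assert list(result.columns) == ["a", "a.1", "b", "b.1", "b.2"]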
@xfail_pyarrow # ValueError: Found non-unique column index
def test_basic_names(all_parsers):
# See gh-7160
parser = all_parsers
data = "a,b,a\n0,1,2\n3,4,5"
expected = DataFrame([[0, 1, 2], [3, 4, 5]], columns=["a", "b", "a.1"])
result = parser.read_csv(StringIO(data))
tm.assert_frame_equal(result, expected)
def test_basic_names_raise(all_parsers):
# See gh-7160
parser = all_parsers
data = "0,1,2\n3,4,5"
with pytest.raises(ValueError, match="Duplicate names"):
parser.read_csv(StringIO(data), names=["a", "b", "a"])
@xfail_pyarrow # ValueError: Found non-unique column index
@pytest.mark.parametrize(
"data,expected",
[
("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.2", "a.1"])),
(
"a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6",
DataFrame(
[[1, 2, 3, 4, 5, 6]],
columns=["a", "a.2", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
),
),
(
"a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7",
DataFrame(
[[1, 2, 3, 4, 5, 6, 7]],
columns=["a", "a.4", "a.3", "a.1", "a.2", "a.5", "a.6"],
),
),
],
)
def test_thorough_mangle_columns(all_parsers, data, expected):
# see gh-17060
parser = all_parsers
result = parser.read_csv(StringIO(data))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data,names,expected",
[
(
"a,b,b\n1,2,3",
["a.1", "a.1", "a.1.1"],
DataFrame(
[["a", "b", "b"], ["1", "2", "3"]], columns=["a.1", "a.1.1", "a.1.1.1"]
),
),
(
"a,b,c,d,e,f\n1,2,3,4,5,6",
["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
DataFrame(
[["a", "b", "c", "d", "e", "f"], ["1", "2", "3", "4", "5", "6"]],
columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"],
),
),
(
"a,b,c,d,e,f,g\n1,2,3,4,5,6,7",
["a", "a", "a.3", "a.1", "a.2", "a", "a"],
DataFrame(
[
["a", "b", "c", "d", "e", "f", "g"],
["1", "2", "3", "4", "5", "6", "7"],
],
columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"],
),
),
],
)
def test_thorough_mangle_names(all_parsers, data, names, expected):
# see gh-17095
parser = all_parsers
with pytest.raises(ValueError, match="Duplicate names"):
parser.read_csv(StringIO(data), names=names)
@xfail_pyarrow # AssertionError: DataFrame.columns are different
def test_mangled_unnamed_placeholders(all_parsers):
# xref gh-13017
orig_key = "0"
parser = all_parsers
orig_value = [1, 2, 3]
df = DataFrame({orig_key: orig_value})
# This test recursively updates `df`.
for i in range(3):
expected = DataFrame(columns=Index([], dtype="str"))
for j in range(i + 1):
col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1)
expected.insert(loc=0, column=col_name, value=[0, 1, 2])
expected[orig_key] = orig_value
df = parser.read_csv(StringIO(df.to_csv()))
tm.assert_frame_equal(df, expected)
@xfail_pyarrow # ValueError: Found non-unique column index
def test_mangle_dupe_cols_already_exists(all_parsers):
# GH#14704
parser = all_parsers
data = "a,a,a.1,a,a.3,a.1,a.1.1\n1,2,3,4,5,6,7"
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[[1, 2, 3, 4, 5, 6, 7]],
columns=["a", "a.2", "a.1", "a.4", "a.3", "a.1.2", "a.1.1"],
)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: Found non-unique column index
def test_mangle_dupe_cols_already_exists_unnamed_col(all_parsers):
# GH#14704
parser = all_parsers
data = ",Unnamed: 0,,Unnamed: 2\n1,2,3,4"
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[[1, 2, 3, 4]],
columns=["Unnamed: 0.1", "Unnamed: 0", "Unnamed: 2.1", "Unnamed: 2"],
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("usecol, engine", [([0, 1, 1], "python"), ([0, 1, 1], "c")])
def test_mangle_cols_names(all_parsers, usecol, engine):
# GH 11823
parser = all_parsers
data = "1,2,3"
names = ["A", "A", "B"]
with pytest.raises(ValueError, match="Duplicate names"):
parser.read_csv(StringIO(data), names=names, usecols=usecol, engine=engine)


@ -0,0 +1,157 @@
"""
Tests multithreading behaviour for reading and
parsing files for each parser defined in parsers.py
"""
from contextlib import ExitStack
from io import BytesIO
from multiprocessing.pool import ThreadPool
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame
import pandas._testing as tm
from pandas.util.version import Version
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
# We'll probably always skip these for pyarrow
# Maybe we'll add our own tests for pyarrow too
pytestmark = [
pytest.mark.single_cpu,
pytest.mark.slow,
]
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_multi_thread_string_io_read_csv(all_parsers, request):
# see gh-11786
parser = all_parsers
if parser.engine == "pyarrow":
pa = pytest.importorskip("pyarrow")
if Version(pa.__version__) < Version("16.0"):
request.applymarker(
pytest.mark.xfail(reason="# ValueError: Found non-unique column index")
)
max_row_range = 100
num_files = 10
bytes_to_df = (
"\n".join([f"{i:d},{i:d},{i:d}" for i in range(max_row_range)]).encode()
for _ in range(num_files)
)
# Read all files in many threads.
with ExitStack() as stack:
files = [stack.enter_context(BytesIO(b)) for b in bytes_to_df]
pool = stack.enter_context(ThreadPool(8))
results = pool.map(parser.read_csv, files)
first_result = results[0]
for result in results:
tm.assert_frame_equal(first_result, result)
def _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks):
"""
Generate a DataFrame via multi-thread.
Parameters
----------
parser : BaseParser
The parser object to use for reading the data.
path : str
The location of the CSV file to read.
num_rows : int
The number of rows to read per task.
num_tasks : int
The number of tasks to use for reading this DataFrame.
Returns
-------
df : DataFrame
"""
def reader(arg):
"""
Create a reader for part of the CSV.
Parameters
----------
arg : tuple
A tuple of the following:
* start : int
The row at which to start parsing the CSV.
* nrows : int
The number of rows to read.
Returns
-------
df : DataFrame
"""
start, nrows = arg
if not start:
return parser.read_csv(
path, index_col=0, header=0, nrows=nrows, parse_dates=["date"]
)
return parser.read_csv(
path,
index_col=0,
header=None,
skiprows=int(start) + 1,
nrows=nrows,
parse_dates=[9],
)
tasks = [
(num_rows * i // num_tasks, num_rows // num_tasks) for i in range(num_tasks)
]
with ThreadPool(processes=num_tasks) as pool:
results = pool.map(reader, tasks)
header = results[0].columns
for r in results[1:]:
r.columns = header
final_dataframe = pd.concat(results)
return final_dataframe
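# Illustrative sketch (not part of the original suite): for the values used in
# the test below (num_rows=48, num_tasks=4) the task list built above works out
# to four equal slices, each reading 12 rows after skipping the rows already
# covered by earlier workers.
def test_multi_thread_task_split_sketch():
    num_rows, num_tasks = 48, 4
    tasks = [
        (num_rows * i // num_tasks, num_rows // num_tasks) for i in range(num_tasks)
    ]
    assert tasks == [(0, 12), (12, 12), (24, 12), (36, 12)]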
@xfail_pyarrow # ValueError: The 'nrows' option is not supported
def test_multi_thread_path_multipart_read_csv(all_parsers):
# see gh-11786
num_tasks = 4
num_rows = 48
parser = all_parsers
file_name = "__thread_pool_reader__.csv"
df = DataFrame(
{
"a": np.random.default_rng(2).random(num_rows),
"b": np.random.default_rng(2).random(num_rows),
"c": np.random.default_rng(2).random(num_rows),
"d": np.random.default_rng(2).random(num_rows),
"e": np.random.default_rng(2).random(num_rows),
"foo": ["foo"] * num_rows,
"bar": ["bar"] * num_rows,
"baz": ["baz"] * num_rows,
"date": pd.date_range("20000101 09:00:00", periods=num_rows, freq="s"),
"int": np.arange(num_rows, dtype="int64"),
}
)
with tm.ensure_clean(file_name) as path:
df.to_csv(path)
final_dataframe = _generate_multi_thread_dataframe(
parser, path, num_rows, num_tasks
)
tm.assert_frame_equal(df, final_dataframe)


@ -0,0 +1,780 @@
"""
Tests that NA values are properly handled during
parsing for all of the parsers defined in parsers.py
"""
from io import StringIO
import numpy as np
import pytest
from pandas._libs.parsers import STR_NA_VALUES
from pandas import (
DataFrame,
Index,
MultiIndex,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
def test_string_nas(all_parsers):
parser = all_parsers
data = """A,B,C
a,b,c
d,,f
,g,h
"""
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[["a", "b", "c"], ["d", np.nan, "f"], [np.nan, "g", "h"]],
columns=["A", "B", "C"],
)
if parser.engine == "pyarrow":
expected.loc[2, "A"] = None
expected.loc[1, "B"] = None
tm.assert_frame_equal(result, expected)
def test_detect_string_na(all_parsers):
parser = all_parsers
data = """A,B
foo,bar
NA,baz
NaN,nan
"""
expected = DataFrame(
[["foo", "bar"], [np.nan, "baz"], [np.nan, np.nan]], columns=["A", "B"]
)
if parser.engine == "pyarrow":
expected.loc[[1, 2], "A"] = None
expected.loc[2, "B"] = None
result = parser.read_csv(StringIO(data))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"na_values",
[
["-999.0", "-999"],
[-999, -999.0],
[-999.0, -999],
["-999.0"],
["-999"],
[-999.0],
[-999],
],
)
@pytest.mark.parametrize(
"data",
[
"""A,B
-999,1.2
2,-999
3,4.5
""",
"""A,B
-999,1.200
2,-999.000
3,4.500
""",
],
)
def test_non_string_na_values(all_parsers, data, na_values, request):
# see gh-3611: with an odd float format, we can't match
# the string "999.0" exactly but still need float matching
parser = all_parsers
expected = DataFrame([[np.nan, 1.2], [2.0, np.nan], [3.0, 4.5]], columns=["A", "B"])
if parser.engine == "pyarrow" and not all(isinstance(x, str) for x in na_values):
msg = "The 'pyarrow' engine requires all na_values to be strings"
with pytest.raises(TypeError, match=msg):
parser.read_csv(StringIO(data), na_values=na_values)
return
elif parser.engine == "pyarrow" and "-999.000" in data:
# because the pyarrow engine does not include the float-ified version
# of "-999" -> -999, it does not match the entry with the trailing
# zeros, so "-999.000" is not treated as null.
mark = pytest.mark.xfail(
reason="pyarrow engined does not recognize equivalent floats"
)
request.applymarker(mark)
result = parser.read_csv(StringIO(data), na_values=na_values)
tm.assert_frame_equal(result, expected)
def test_default_na_values(all_parsers):
_NA_VALUES = {
"-1.#IND",
"1.#QNAN",
"1.#IND",
"-1.#QNAN",
"#N/A",
"N/A",
"n/a",
"NA",
"<NA>",
"#NA",
"NULL",
"null",
"NaN",
"nan",
"-NaN",
"-nan",
"#N/A N/A",
"",
"None",
}
assert _NA_VALUES == STR_NA_VALUES
parser = all_parsers
nv = len(_NA_VALUES)
def f(i, v):
if i == 0:
buf = ""
elif i > 0:
buf = "".join([","] * i)
buf = f"{buf}{v}"
if i < nv - 1:
joined = "".join([","] * (nv - i - 1))
buf = f"{buf}{joined}"
return buf
data = StringIO("\n".join([f(i, v) for i, v in enumerate(_NA_VALUES)]))
expected = DataFrame(np.nan, columns=range(nv), index=range(nv))
result = parser.read_csv(data, header=None)
tm.assert_frame_equal(result, expected)
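# Illustrative sketch (not part of the original suite): the f(i, v) helper above
# puts the i-th NA sentinel in the i-th field of row i and pads the remaining
# fields with commas, so each sentinel appears exactly once and the parsed frame
# is an all-NaN square. Shown here with three hypothetical sentinels.
def test_default_na_values_row_layout_sketch():
    sentinels = ["NA", "NULL", "NaN"]
    nv = len(sentinels)
    rows = [
        ",".join(v if j == i else "" for j in range(nv))
        for i, v in enumerate(sentinels)
    ]
    assert rows == ["NA,,", ",NULL,", ",,NaN"]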
@pytest.mark.parametrize("na_values", ["baz", ["baz"]])
def test_custom_na_values(all_parsers, na_values):
parser = all_parsers
data = """A,B,C
ignore,this,row
1,NA,3
-1.#IND,5,baz
7,8,NaN
"""
expected = DataFrame(
[[1.0, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], columns=["A", "B", "C"]
)
if parser.engine == "pyarrow":
msg = "skiprows argument must be an integer when using engine='pyarrow'"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1])
return
result = parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1])
tm.assert_frame_equal(result, expected)
def test_bool_na_values(all_parsers):
data = """A,B,C
True,False,True
NA,True,False
False,NA,True"""
parser = all_parsers
result = parser.read_csv(StringIO(data))
expected = DataFrame(
{
"A": np.array([True, np.nan, False], dtype=object),
"B": np.array([False, True, np.nan], dtype=object),
"C": [True, False, True],
}
)
if parser.engine == "pyarrow":
expected.loc[1, "A"] = None
expected.loc[2, "B"] = None
tm.assert_frame_equal(result, expected)
def test_na_value_dict(all_parsers):
data = """A,B,C
foo,bar,NA
bar,foo,foo
foo,bar,NA
bar,foo,foo"""
parser = all_parsers
if parser.engine == "pyarrow":
msg = "pyarrow engine doesn't support passing a dict for na_values"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]})
return
df = parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]})
expected = DataFrame(
{
"A": [np.nan, "bar", np.nan, "bar"],
"B": [np.nan, "foo", np.nan, "foo"],
"C": [np.nan, "foo", np.nan, "foo"],
}
)
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize(
"index_col,expected",
[
(
[0],
DataFrame({"b": [np.nan], "c": [1], "d": [5]}, index=Index([0], name="a")),
),
(
[0, 2],
DataFrame(
{"b": [np.nan], "d": [5]},
index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]),
),
),
(
["a", "c"],
DataFrame(
{"b": [np.nan], "d": [5]},
index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]),
),
),
],
)
def test_na_value_dict_multi_index(all_parsers, index_col, expected):
data = """\
a,b,c,d
0,NA,1,5
"""
parser = all_parsers
result = parser.read_csv(StringIO(data), na_values=set(), index_col=index_col)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"kwargs,expected",
[
(
{},
DataFrame(
{
"A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
"B": [1, 2, 3, 4, 5, 6, 7],
"C": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
}
),
),
(
{"na_values": {"A": [], "C": []}, "keep_default_na": False},
DataFrame(
{
"A": ["a", "b", "", "d", "e", "nan", "g"],
"B": [1, 2, 3, 4, 5, 6, 7],
"C": ["one", "two", "three", "nan", "five", "", "seven"],
}
),
),
(
{"na_values": ["a"], "keep_default_na": False},
DataFrame(
{
"A": [np.nan, "b", "", "d", "e", "nan", "g"],
"B": [1, 2, 3, 4, 5, 6, 7],
"C": ["one", "two", "three", "nan", "five", "", "seven"],
}
),
),
(
{"na_values": {"A": [], "C": []}},
DataFrame(
{
"A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
"B": [1, 2, 3, 4, 5, 6, 7],
"C": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
}
),
),
],
)
def test_na_values_keep_default(
all_parsers, kwargs, expected, request, using_infer_string
):
data = """\
A,B,C
a,1,one
b,2,two
,3,three
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
parser = all_parsers
if parser.engine == "pyarrow":
if "na_values" in kwargs and isinstance(kwargs["na_values"], dict):
msg = "The pyarrow engine doesn't support passing a dict for na_values"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
return
if not using_infer_string or "na_values" in kwargs:
mark = pytest.mark.xfail()
request.applymarker(mark)
result = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(result, expected)
def test_no_na_values_no_keep_default(all_parsers):
# see gh-4318: passing na_values=None and
# keep_default_na=False yields 'None' as a na_value
data = """\
A,B,C
a,1,None
b,2,two
,3,None
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
parser = all_parsers
result = parser.read_csv(StringIO(data), keep_default_na=False)
expected = DataFrame(
{
"A": ["a", "b", "", "d", "e", "nan", "g"],
"B": [1, 2, 3, 4, 5, 6, 7],
"C": ["None", "two", "None", "nan", "five", "", "seven"],
}
)
tm.assert_frame_equal(result, expected)
def test_no_keep_default_na_dict_na_values(all_parsers):
# see gh-19227
data = "a,b\n,2"
parser = all_parsers
if parser.engine == "pyarrow":
msg = "The pyarrow engine doesn't support passing a dict for na_values"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data), na_values={"b": ["2"]}, keep_default_na=False
)
return
result = parser.read_csv(
StringIO(data), na_values={"b": ["2"]}, keep_default_na=False
)
expected = DataFrame({"a": [""], "b": [np.nan]})
tm.assert_frame_equal(result, expected)
def test_no_keep_default_na_dict_na_scalar_values(all_parsers):
# see gh-19227
#
# Scalar values shouldn't cause the parsing to crash or fail.
data = "a,b\n1,2"
parser = all_parsers
if parser.engine == "pyarrow":
msg = "The pyarrow engine doesn't support passing a dict for na_values"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False)
return
df = parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False)
expected = DataFrame({"a": [1], "b": [np.nan]})
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize("col_zero_na_values", [113125, "113125"])
def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values):
# see gh-19227
data = """\
113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008
729639,"qwer","",asdfkj,466.681,,252.373
"""
parser = all_parsers
expected = DataFrame(
{
0: [np.nan, 729639.0],
1: [np.nan, "qwer"],
2: ["/blaha", np.nan],
3: ["kjsdkj", "asdfkj"],
4: [412.166, 466.681],
5: ["225.874", ""],
6: [np.nan, 252.373],
}
)
if parser.engine == "pyarrow":
msg = "The pyarrow engine doesn't support passing a dict for na_values"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data),
header=None,
keep_default_na=False,
na_values={2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values},
)
return
result = parser.read_csv(
StringIO(data),
header=None,
keep_default_na=False,
na_values={2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values},
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"na_filter,row_data",
[
(True, [[1, "A"], [np.nan, np.nan], [3, "C"]]),
(False, [["1", "A"], ["nan", "B"], ["3", "C"]]),
],
)
def test_na_values_na_filter_override(
request, all_parsers, na_filter, row_data, using_infer_string
):
parser = all_parsers
if parser.engine == "pyarrow":
# mismatched dtypes in both cases, FutureWarning in the True case
if not (using_infer_string and na_filter):
mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
request.applymarker(mark)
data = """\
A,B
1,A
nan,B
3,C
"""
result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter)
expected = DataFrame(row_data, columns=["A", "B"])
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Expected 8 columns, got 5:
def test_na_trailing_columns(all_parsers):
parser = all_parsers
data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax
2012-03-14,USD,AAPL,BUY,1000
2012-05-12,USD,SBUX,SELL,500"""
# Trailing columns should be all NaN.
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[
["2012-03-14", "USD", "AAPL", "BUY", 1000, np.nan, np.nan, np.nan],
["2012-05-12", "USD", "SBUX", "SELL", 500, np.nan, np.nan, np.nan],
],
columns=[
"Date",
"Currency",
"Symbol",
"Type",
"Units",
"UnitPrice",
"Cost",
"Tax",
],
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"na_values,row_data",
[
(1, [[np.nan, 2.0], [2.0, np.nan]]),
({"a": 2, "b": 1}, [[1.0, 2.0], [np.nan, np.nan]]),
],
)
def test_na_values_scalar(all_parsers, na_values, row_data):
# see gh-12224
parser = all_parsers
names = ["a", "b"]
data = "1,2\n2,1"
if parser.engine == "pyarrow" and isinstance(na_values, dict):
if isinstance(na_values, dict):
err = ValueError
msg = "The pyarrow engine doesn't support passing a dict for na_values"
else:
err = TypeError
msg = "The 'pyarrow' engine requires all na_values to be strings"
with pytest.raises(err, match=msg):
parser.read_csv(StringIO(data), names=names, na_values=na_values)
return
elif parser.engine == "pyarrow":
msg = "The 'pyarrow' engine requires all na_values to be strings"
with pytest.raises(TypeError, match=msg):
parser.read_csv(StringIO(data), names=names, na_values=na_values)
return
result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
expected = DataFrame(row_data, columns=names)
tm.assert_frame_equal(result, expected)
def test_na_values_dict_aliasing(all_parsers):
parser = all_parsers
na_values = {"a": 2, "b": 1}
na_values_copy = na_values.copy()
names = ["a", "b"]
data = "1,2\n2,1"
expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names)
if parser.engine == "pyarrow":
msg = "The pyarrow engine doesn't support passing a dict for na_values"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), names=names, na_values=na_values)
return
result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
tm.assert_frame_equal(result, expected)
tm.assert_dict_equal(na_values, na_values_copy)
def test_na_values_dict_col_index(all_parsers):
# see gh-14203
data = "a\nfoo\n1"
parser = all_parsers
na_values = {0: "foo"}
if parser.engine == "pyarrow":
msg = "The pyarrow engine doesn't support passing a dict for na_values"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), na_values=na_values)
return
result = parser.read_csv(StringIO(data), na_values=na_values)
expected = DataFrame({"a": [np.nan, 1]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data,kwargs,expected",
[
(
str(2**63) + "\n" + str(2**63 + 1),
{"na_values": [2**63]},
DataFrame([str(2**63), str(2**63 + 1)]),
),
(str(2**63) + ",1" + "\n,2", {}, DataFrame([[str(2**63), 1], ["", 2]])),
(str(2**63) + "\n1", {"na_values": [2**63]}, DataFrame([np.nan, 1])),
],
)
def test_na_values_uint64(all_parsers, data, kwargs, expected, request):
# see gh-14983
parser = all_parsers
if parser.engine == "pyarrow" and "na_values" in kwargs:
msg = "The 'pyarrow' engine requires all na_values to be strings"
with pytest.raises(TypeError, match=msg):
parser.read_csv(StringIO(data), header=None, **kwargs)
return
elif parser.engine == "pyarrow":
mark = pytest.mark.xfail(reason="Returns float64 instead of object")
request.applymarker(mark)
result = parser.read_csv(StringIO(data), header=None, **kwargs)
tm.assert_frame_equal(result, expected)
def test_empty_na_values_no_default_with_index(all_parsers):
# see gh-15835
data = "a,1\nb,2"
parser = all_parsers
expected = DataFrame({"1": [2]}, index=Index(["b"], name="a"))
result = parser.read_csv(StringIO(data), index_col=0, keep_default_na=False)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])]
)
def test_no_na_filter_on_index(all_parsers, na_filter, index_data, request):
# see gh-5239
#
# Don't parse NA-values in index unless na_filter=True
parser = all_parsers
data = "a,b,c\n1,,3\n4,5,6"
if parser.engine == "pyarrow" and na_filter is False:
mark = pytest.mark.xfail(reason="mismatched index result")
request.applymarker(mark)
expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=Index(index_data, name="b"))
result = parser.read_csv(StringIO(data), index_col=[1], na_filter=na_filter)
tm.assert_frame_equal(result, expected)
def test_inf_na_values_with_int_index(all_parsers):
# see gh-17128
parser = all_parsers
data = "idx,col1,col2\n1,3,4\n2,inf,-inf"
# Don't fail with OverflowError with inf's and integer index column.
out = parser.read_csv(StringIO(data), index_col=[0], na_values=["inf", "-inf"])
expected = DataFrame(
{"col1": [3, np.nan], "col2": [4, np.nan]}, index=Index([1, 2], name="idx")
)
tm.assert_frame_equal(out, expected)
@xfail_pyarrow # mismatched shape
@pytest.mark.parametrize("na_filter", [True, False])
def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
# see gh-20377
parser = all_parsers
data = "a,b,c\n1,,3\n4,5,6"
# na_filter=True --> missing value becomes NaN.
# na_filter=False --> missing value remains empty string.
empty = np.nan if na_filter else ""
expected = DataFrame({"a": ["1", "4"], "b": [empty, "5"], "c": ["3", "6"]})
result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # mismatched exception message
@pytest.mark.parametrize(
"data, na_values",
[
("false,1\n,1\ntrue", None),
("false,1\nnull,1\ntrue", None),
("false,1\nnan,1\ntrue", None),
("false,1\nfoo,1\ntrue", "foo"),
("false,1\nfoo,1\ntrue", ["foo"]),
("false,1\nfoo,1\ntrue", {"a": "foo"}),
],
)
def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values):
parser = all_parsers
msg = "|".join(
[
"Bool column has NA values in column [0a]",
"cannot safely convert passed user dtype of "
"bool for object dtyped data in column 0",
]
)
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data),
header=None,
names=["a", "b"],
dtype={"a": "bool"},
na_values=na_values,
)
# TODO: this test isn't about the na_values keyword, it is about the empty entries
# being returned with NaN entries, whereas the pyarrow engine returns "nan"
@xfail_pyarrow # mismatched shapes
def test_str_nan_dropped(all_parsers):
# see gh-21131
parser = all_parsers
data = """File: small.csv,,
10010010233,0123,654
foo,,bar
01001000155,4530,898"""
result = parser.read_csv(
StringIO(data),
header=None,
names=["col1", "col2", "col3"],
dtype={"col1": str, "col2": str, "col3": str},
).dropna()
expected = DataFrame(
{
"col1": ["10010010233", "01001000155"],
"col2": ["0123", "4530"],
"col3": ["654", "898"],
},
index=[1, 3],
)
tm.assert_frame_equal(result, expected)
def test_nan_multi_index(all_parsers):
# GH 42446
parser = all_parsers
data = "A,B,B\nX,Y,Z\n1,2,inf"
if parser.engine == "pyarrow":
msg = "The pyarrow engine doesn't support passing a dict for na_values"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"}
)
return
result = parser.read_csv(
StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"}
)
expected = DataFrame(
{
("A", "X"): [1],
("B", "Y"): [2],
("B", "Z"): [np.nan],
}
)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # Failed: DID NOT RAISE <class 'ValueError'>; it casts the NaN to False
def test_bool_and_nan_to_bool(all_parsers):
# GH#42808
parser = all_parsers
data = """0
NaN
True
False
"""
with pytest.raises(ValueError, match="NA values"):
parser.read_csv(StringIO(data), dtype="bool")
def test_bool_and_nan_to_int(all_parsers):
# GH#42808
parser = all_parsers
data = """0
NaN
True
False
"""
with pytest.raises(ValueError, match="convert|NoneType"):
parser.read_csv(StringIO(data), dtype="int")
def test_bool_and_nan_to_float(all_parsers):
# GH#42808
parser = all_parsers
data = """0
NaN
True
False
"""
result = parser.read_csv(StringIO(data), dtype="float")
expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]})
tm.assert_frame_equal(result, expected)


@ -0,0 +1,327 @@
"""
Tests parsers' ability to read and parse non-local files,
which hence require a network connection to be read.
"""
from io import BytesIO
import logging
import re
import numpy as np
import pytest
import pandas.util._test_decorators as td
from pandas import DataFrame
import pandas._testing as tm
from pandas.io.feather_format import read_feather
from pandas.io.parsers import read_csv
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@pytest.mark.network
@pytest.mark.single_cpu
@pytest.mark.parametrize("mode", ["explicit", "infer"])
@pytest.mark.parametrize("engine", ["python", "c"])
def test_compressed_urls(
httpserver,
datapath,
salaries_table,
mode,
engine,
compression_only,
compression_to_extension,
):
# test reading compressed urls with various engines and
# extension inference
if compression_only == "tar":
pytest.skip("TODO: Add tar salaraies.csv to pandas/io/parsers/data")
extension = compression_to_extension[compression_only]
with open(datapath("io", "parser", "data", "salaries.csv" + extension), "rb") as f:
httpserver.serve_content(content=f.read())
url = httpserver.url + "/salaries.csv" + extension
if mode != "explicit":
compression_only = mode
url_table = read_csv(url, sep="\t", compression=compression_only, engine=engine)
tm.assert_frame_equal(url_table, salaries_table)
@pytest.mark.network
@pytest.mark.single_cpu
def test_url_encoding_csv(httpserver, datapath):
"""
read_csv should honor the requested encoding for URLs.
GH 10424
"""
with open(datapath("io", "parser", "data", "unicode_series.csv"), "rb") as f:
httpserver.serve_content(content=f.read())
df = read_csv(httpserver.url, encoding="latin-1", header=None)
assert df.loc[15, 1] == "Á köldum klaka (Cold Fever) (1994)"
@pytest.fixture
def tips_df(datapath):
"""DataFrame with the tips dataset."""
return read_csv(datapath("io", "data", "csv", "tips.csv"))
@pytest.mark.single_cpu
@pytest.mark.usefixtures("s3_resource")
@td.skip_if_not_us_locale()
class TestS3:
def test_parse_public_s3_bucket(self, s3_public_bucket_with_data, tips_df, s3so):
# more of an integration test due to the not-public contents portion
# can probably mock this though.
pytest.importorskip("s3fs")
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
df = read_csv(
f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
compression=comp,
storage_options=s3so,
)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(df, tips_df)
def test_parse_private_s3_bucket(self, s3_private_bucket_with_data, tips_df, s3so):
# Read public file from bucket with not-public contents
pytest.importorskip("s3fs")
df = read_csv(
f"s3://{s3_private_bucket_with_data.name}/tips.csv", storage_options=s3so
)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(df, tips_df)
def test_parse_public_s3n_bucket(self, s3_public_bucket_with_data, tips_df, s3so):
# Read from AWS s3 as "s3n" URL
df = read_csv(
f"s3n://{s3_public_bucket_with_data.name}/tips.csv",
nrows=10,
storage_options=s3so,
)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(tips_df.iloc[:10], df)
def test_parse_public_s3a_bucket(self, s3_public_bucket_with_data, tips_df, s3so):
# Read from AWS s3 as "s3a" URL
df = read_csv(
f"s3a://{s3_public_bucket_with_data.name}/tips.csv",
nrows=10,
storage_options=s3so,
)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(tips_df.iloc[:10], df)
def test_parse_public_s3_bucket_nrows(
self, s3_public_bucket_with_data, tips_df, s3so
):
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
df = read_csv(
f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
nrows=10,
compression=comp,
storage_options=s3so,
)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(tips_df.iloc[:10], df)
def test_parse_public_s3_bucket_chunked(
self, s3_public_bucket_with_data, tips_df, s3so
):
# Read with a chunksize
chunksize = 5
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
with read_csv(
f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
chunksize=chunksize,
compression=comp,
storage_options=s3so,
) as df_reader:
assert df_reader.chunksize == chunksize
for i_chunk in [0, 1, 2]:
# Read a couple of chunks and make sure we see them
# properly.
df = df_reader.get_chunk()
assert isinstance(df, DataFrame)
assert not df.empty
true_df = tips_df.iloc[
chunksize * i_chunk : chunksize * (i_chunk + 1)
]
tm.assert_frame_equal(true_df, df)
def test_parse_public_s3_bucket_chunked_python(
self, s3_public_bucket_with_data, tips_df, s3so
):
# Read with a chunksize using the Python parser
chunksize = 5
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
with read_csv(
f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
chunksize=chunksize,
compression=comp,
engine="python",
storage_options=s3so,
) as df_reader:
assert df_reader.chunksize == chunksize
for i_chunk in [0, 1, 2]:
# Read a couple of chunks and make sure we see them properly.
df = df_reader.get_chunk()
assert isinstance(df, DataFrame)
assert not df.empty
true_df = tips_df.iloc[
chunksize * i_chunk : chunksize * (i_chunk + 1)
]
tm.assert_frame_equal(true_df, df)
def test_parse_public_s3_bucket_python(
self, s3_public_bucket_with_data, tips_df, s3so
):
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
df = read_csv(
f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
engine="python",
compression=comp,
storage_options=s3so,
)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(df, tips_df)
def test_infer_s3_compression(self, s3_public_bucket_with_data, tips_df, s3so):
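        # With compression="infer", the codec is deduced from the key's extension,
        # so the plain, gzip and bz2 objects should all round-trip to the same frame.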
for ext in ["", ".gz", ".bz2"]:
df = read_csv(
f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
engine="python",
compression="infer",
storage_options=s3so,
)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(df, tips_df)
def test_parse_public_s3_bucket_nrows_python(
self, s3_public_bucket_with_data, tips_df, s3so
):
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
df = read_csv(
f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
engine="python",
nrows=10,
compression=comp,
storage_options=s3so,
)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(tips_df.iloc[:10], df)
def test_read_s3_fails(self, s3so):
msg = "The specified bucket does not exist"
with pytest.raises(OSError, match=msg):
read_csv("s3://nyqpug/asdf.csv", storage_options=s3so)
def test_read_s3_fails_private(self, s3_private_bucket, s3so):
msg = "The specified bucket does not exist"
# Receive a permission error when trying to read a private bucket.
# It's irrelevant here that this isn't actually a table.
with pytest.raises(OSError, match=msg):
read_csv(f"s3://{s3_private_bucket.name}/file.csv")
@pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False)
def test_write_s3_csv_fails(self, tips_df, s3so):
# GH 32486
# Attempting to write to an invalid S3 path should raise
import botocore
# GH 34087
# https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html
# Catch a ClientError since AWS Service Errors are defined dynamically
error = (FileNotFoundError, botocore.exceptions.ClientError)
with pytest.raises(error, match="The specified bucket does not exist"):
tips_df.to_csv(
"s3://an_s3_bucket_data_doesnt_exit/not_real.csv", storage_options=s3so
)
@pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False)
def test_write_s3_parquet_fails(self, tips_df, s3so):
# GH 27679
# Attempting to write to an invalid S3 path should raise
pytest.importorskip("pyarrow")
import botocore
# GH 34087
# https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html
# Catch a ClientError since AWS Service Errors are defined dynamically
error = (FileNotFoundError, botocore.exceptions.ClientError)
with pytest.raises(error, match="The specified bucket does not exist"):
tips_df.to_parquet(
"s3://an_s3_bucket_data_doesnt_exit/not_real.parquet",
storage_options=s3so,
)
@pytest.mark.single_cpu
def test_read_csv_handles_boto_s3_object(
self, s3_public_bucket_with_data, tips_file
):
# see gh-16135
s3_object = s3_public_bucket_with_data.Object("tips.csv")
with BytesIO(s3_object.get()["Body"].read()) as buffer:
result = read_csv(buffer, encoding="utf8")
assert isinstance(result, DataFrame)
assert not result.empty
expected = read_csv(tips_file)
tm.assert_frame_equal(result, expected)
@pytest.mark.single_cpu
def test_read_csv_chunked_download(self, s3_public_bucket, caplog, s3so):
# 8 MB, S3FS uses 5MB chunks
df = DataFrame(np.zeros((100000, 4)), columns=list("abcd"))
with BytesIO(df.to_csv().encode("utf-8")) as buf:
s3_public_bucket.put_object(Key="large-file.csv", Body=buf)
uri = f"{s3_public_bucket.name}/large-file.csv"
match_re = re.compile(rf"^Fetch: {uri}, 0-(?P<stop>\d+)$")
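        # s3fs logs each ranged GET as "Fetch: <key>, <start>-<stop>"; capture the
        # stop offset to verify that far less than the full 8 MB file was fetched.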
with caplog.at_level(logging.DEBUG, logger="s3fs"):
read_csv(
f"s3://{uri}",
nrows=5,
storage_options=s3so,
)
for log in caplog.messages:
if match := re.match(match_re, log):
# Less than 8 MB
assert int(match.group("stop")) < 8000000
def test_read_s3_with_hash_in_key(self, s3_public_bucket_with_data, tips_df, s3so):
# GH 25945
result = read_csv(
f"s3://{s3_public_bucket_with_data.name}/tips#1.csv", storage_options=s3so
)
tm.assert_frame_equal(tips_df, result)
def test_read_feather_s3_file_path(
self, s3_public_bucket_with_data, feather_file, s3so
):
# GH 29055
pytest.importorskip("pyarrow")
expected = read_feather(feather_file)
res = read_feather(
f"s3://{s3_public_bucket_with_data.name}/simple_dataset.feather",
storage_options=s3so,
)
tm.assert_frame_equal(expected, res)

File diff suppressed because it is too large

View File

@ -0,0 +1,566 @@
"""
Tests that apply specifically to the Python parser. Unless a test is
explicitly noted as covering a Python-specific issue, the goal is to
eventually move these tests out of this module once the C parser can accept
the relevant arguments.
"""
from __future__ import annotations
import csv
from io import (
BytesIO,
StringIO,
TextIOWrapper,
)
from typing import TYPE_CHECKING
import numpy as np
import pytest
from pandas.errors import (
ParserError,
ParserWarning,
)
from pandas import (
DataFrame,
Index,
MultiIndex,
)
import pandas._testing as tm
if TYPE_CHECKING:
from collections.abc import Iterator
def test_default_separator(python_parser_only):
# see gh-17333
#
# csv.Sniffer in Python treats "o" as separator.
data = "aob\n1o2\n3o4"
parser = python_parser_only
expected = DataFrame({"a": [1, 3], "b": [2, 4]})
result = parser.read_csv(StringIO(data), sep=None)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("skipfooter", ["foo", 1.5, True])
def test_invalid_skipfooter_non_int(python_parser_only, skipfooter):
# see gh-15925 (comment)
data = "a\n1\n2"
parser = python_parser_only
msg = "skipfooter must be an integer"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), skipfooter=skipfooter)
def test_invalid_skipfooter_negative(python_parser_only):
# see gh-15925 (comment)
data = "a\n1\n2"
parser = python_parser_only
msg = "skipfooter cannot be negative"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), skipfooter=-1)
@pytest.mark.parametrize("kwargs", [{"sep": None}, {"delimiter": "|"}])
def test_sniff_delimiter(python_parser_only, kwargs):
data = """index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
parser = python_parser_only
result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
expected = DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
columns=["A", "B", "C"],
index=Index(["foo", "bar", "baz"], name="index"),
)
tm.assert_frame_equal(result, expected)
def test_sniff_delimiter_comment(python_parser_only):
data = """# comment line
index|A|B|C
# comment line
foo|1|2|3 # ignore | this
bar|4|5|6
baz|7|8|9
"""
parser = python_parser_only
result = parser.read_csv(StringIO(data), index_col=0, sep=None, comment="#")
expected = DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
columns=["A", "B", "C"],
index=Index(["foo", "bar", "baz"], name="index"),
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("encoding", [None, "utf-8"])
def test_sniff_delimiter_encoding(python_parser_only, encoding):
parser = python_parser_only
data = """ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
if encoding is not None:
data = data.encode(encoding)
data = BytesIO(data)
data = TextIOWrapper(data, encoding=encoding)
else:
data = StringIO(data)
result = parser.read_csv(data, index_col=0, sep=None, skiprows=2, encoding=encoding)
expected = DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
columns=["A", "B", "C"],
index=Index(["foo", "bar", "baz"], name="index"),
)
tm.assert_frame_equal(result, expected)
def test_single_line(python_parser_only):
# see gh-6607: sniff separator
parser = python_parser_only
result = parser.read_csv(StringIO("1,2"), names=["a", "b"], header=None, sep=None)
expected = DataFrame({"a": [1], "b": [2]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("kwargs", [{"skipfooter": 2}, {"nrows": 3}])
def test_skipfooter(python_parser_only, kwargs):
# see gh-6607
data = """A,B,C
1,2,3
4,5,6
7,8,9
want to skip this
also also skip this
"""
parser = python_parser_only
result = parser.read_csv(StringIO(data), **kwargs)
expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["A", "B", "C"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"compression,klass", [("gzip", "GzipFile"), ("bz2", "BZ2File")]
)
def test_decompression_regex_sep(python_parser_only, csv1, compression, klass):
# see gh-6607
parser = python_parser_only
with open(csv1, "rb") as f:
data = f.read()
data = data.replace(b",", b"::")
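    # Swap in a two-character delimiter so the Python engine's regex-separator
    # path is exercised against gzip/bz2-compressed input.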
expected = parser.read_csv(csv1)
module = pytest.importorskip(compression)
klass = getattr(module, klass)
with tm.ensure_clean() as path:
with klass(path, mode="wb") as tmp:
tmp.write(data)
result = parser.read_csv(path, sep="::", compression=compression)
tm.assert_frame_equal(result, expected)
def test_read_csv_buglet_4x_multi_index(python_parser_only):
# see gh-6607
data = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
parser = python_parser_only
expected = DataFrame(
[
[-0.5109, -2.3358, -0.4645, 0.05076, 0.3640],
[0.4473, 1.4152, 0.2834, 1.00661, 0.1744],
[-0.6662, -0.5243, -0.3580, 0.89145, 2.5838],
],
columns=["A", "B", "C", "D", "E"],
index=MultiIndex.from_tuples(
[("a", "b", 10.0032, 5), ("a", "q", 20, 4), ("x", "q", 30, 3)],
names=["one", "two", "three", "four"],
),
)
result = parser.read_csv(StringIO(data), sep=r"\s+")
tm.assert_frame_equal(result, expected)
def test_read_csv_buglet_4x_multi_index2(python_parser_only):
# see gh-6893
data = " A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9"
parser = python_parser_only
expected = DataFrame.from_records(
[(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)],
columns=list("abcABC"),
index=list("abc"),
)
result = parser.read_csv(StringIO(data), sep=r"\s+")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("add_footer", [True, False])
def test_skipfooter_with_decimal(python_parser_only, add_footer):
# see gh-6971
data = "1#2\n3#4"
parser = python_parser_only
expected = DataFrame({"a": [1.2, 3.4]})
if add_footer:
# The stray footer line should not mess with the
# casting of the first two lines if we skip it.
kwargs = {"skipfooter": 1}
data += "\nFooter"
else:
kwargs = {}
result = parser.read_csv(StringIO(data), names=["a"], decimal="#", **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"sep", ["::", "#####", "!!!", "123", "#1!c5", "%!c!d", "@@#4:2", "_!pd#_"]
)
@pytest.mark.parametrize(
"encoding", ["utf-16", "utf-16-be", "utf-16-le", "utf-32", "cp037"]
)
def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding):
# see gh-3404
expected = DataFrame({"a": [1], "b": [2]})
parser = python_parser_only
data = "1" + sep + "2"
encoded_data = data.encode(encoding)
result = parser.read_csv(
BytesIO(encoded_data), sep=sep, names=["a", "b"], encoding=encoding
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
def test_multi_char_sep_quotes(python_parser_only, quoting):
# see gh-13374
kwargs = {"sep": ",,"}
parser = python_parser_only
data = 'a,,b\n1,,a\n2,,"2,,b"'
if quoting == csv.QUOTE_NONE:
msg = "Expected 2 fields in line 3, saw 3"
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), quoting=quoting, **kwargs)
else:
msg = "ignored when a multi-char delimiter is used"
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), quoting=quoting, **kwargs)
def test_none_delimiter(python_parser_only):
# see gh-13374 and gh-17465
parser = python_parser_only
data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9"
expected = DataFrame({"a": [0, 7], "b": [1, 8], "c": [2, 9]})
# We expect the third line in the data to be
# skipped because it is malformed, but we do
# not expect any errors to occur.
with tm.assert_produces_warning(
ParserWarning, match="Skipping line 3", check_stacklevel=False
):
result = parser.read_csv(
StringIO(data), header=0, sep=None, on_bad_lines="warn"
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz'])
@pytest.mark.parametrize("skipfooter", [0, 1])
def test_skipfooter_bad_row(python_parser_only, data, skipfooter):
# see gh-13879 and gh-15910
parser = python_parser_only
if skipfooter:
msg = "parsing errors in the skipped footer rows"
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), skipfooter=skipfooter)
else:
msg = "unexpected end of data|expected after"
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), skipfooter=skipfooter)
def test_malformed_skipfooter(python_parser_only):
parser = python_parser_only
data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
footer
"""
msg = "Expected 3 fields in line 4, saw 5"
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1)
def test_python_engine_file_no_next(python_parser_only):
parser = python_parser_only
class NoNextBuffer:
def __init__(self, csv_data) -> None:
self.data = csv_data
def __iter__(self) -> Iterator:
return self.data.__iter__()
def read(self):
return self.data
def readline(self):
return self.data
parser.read_csv(NoNextBuffer("a\n1"))
@pytest.mark.parametrize("bad_line_func", [lambda x: ["2", "3"], lambda x: x[:2]])
def test_on_bad_lines_callable(python_parser_only, bad_line_func):
# GH 5686
parser = python_parser_only
data = """a,b
1,2
2,3,4,5,6
3,4
"""
bad_sio = StringIO(data)
result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
tm.assert_frame_equal(result, expected)
def test_on_bad_lines_callable_write_to_external_list(python_parser_only):
# GH 5686
parser = python_parser_only
data = """a,b
1,2
2,3,4,5,6
3,4
"""
bad_sio = StringIO(data)
lst = []
def bad_line_func(bad_line: list[str]) -> list[str]:
lst.append(bad_line)
return ["2", "3"]
result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
tm.assert_frame_equal(result, expected)
assert lst == [["2", "3", "4", "5", "6"]]
@pytest.mark.parametrize("bad_line_func", [lambda x: ["foo", "bar"], lambda x: x[:2]])
@pytest.mark.parametrize("sep", [",", "111"])
def test_on_bad_lines_callable_iterator_true(python_parser_only, bad_line_func, sep):
# GH 5686
# iterator=True has a separate code path than iterator=False
parser = python_parser_only
data = f"""
0{sep}1
hi{sep}there
foo{sep}bar{sep}baz
good{sep}bye
"""
bad_sio = StringIO(data)
result_iter = parser.read_csv(
bad_sio, on_bad_lines=bad_line_func, chunksize=1, iterator=True, sep=sep
)
expecteds = [
{"0": "hi", "1": "there"},
{"0": "foo", "1": "bar"},
{"0": "good", "1": "bye"},
]
for i, (result, expected) in enumerate(zip(result_iter, expecteds)):
expected = DataFrame(expected, index=range(i, i + 1))
tm.assert_frame_equal(result, expected)
def test_on_bad_lines_callable_dont_swallow_errors(python_parser_only):
# GH 5686
parser = python_parser_only
data = """a,b
1,2
2,3,4,5,6
3,4
"""
bad_sio = StringIO(data)
msg = "This function is buggy."
def bad_line_func(bad_line):
raise ValueError(msg)
with pytest.raises(ValueError, match=msg):
parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
def test_on_bad_lines_callable_not_expected_length(python_parser_only):
# GH 5686
parser = python_parser_only
data = """a,b
1,2
2,3,4,5,6
3,4
"""
bad_sio = StringIO(data)
result = parser.read_csv_check_warnings(
ParserWarning, "Length of header or names", bad_sio, on_bad_lines=lambda x: x
)
expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
tm.assert_frame_equal(result, expected)
def test_on_bad_lines_callable_returns_none(python_parser_only):
# GH 5686
parser = python_parser_only
data = """a,b
1,2
2,3,4,5,6
3,4
"""
bad_sio = StringIO(data)
result = parser.read_csv(bad_sio, on_bad_lines=lambda x: None)
expected = DataFrame({"a": [1, 3], "b": [2, 4]})
tm.assert_frame_equal(result, expected)
def test_on_bad_lines_index_col_inferred(python_parser_only):
# GH 5686
parser = python_parser_only
data = """a,b
1,2,3
4,5,6
"""
bad_sio = StringIO(data)
result = parser.read_csv(bad_sio, on_bad_lines=lambda x: ["99", "99"])
expected = DataFrame({"a": [2, 5], "b": [3, 6]}, index=[1, 4])
tm.assert_frame_equal(result, expected)
def test_index_col_false_and_header_none(python_parser_only):
# GH#46955
parser = python_parser_only
data = """
0.5,0.03
0.1,0.2,0.3,2
"""
result = parser.read_csv_check_warnings(
ParserWarning,
"Length of header",
StringIO(data),
sep=",",
header=None,
index_col=False,
)
expected = DataFrame({0: [0.5, 0.1], 1: [0.03, 0.2]})
tm.assert_frame_equal(result, expected)
def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parser_only):
# GH#46569
parser = python_parser_only
data = StringIO("a\na,b\nc,d,e\nf,g,h")
result = parser.read_csv_check_warnings(
ParserWarning, "Length of header", data, engine="python", index_col=False
)
expected = DataFrame({"a": ["a", "c", "f"]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}]
)
def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, dtype):
# GH#50270
parser = python_parser_only
data = """\
a;b;c
0000.7995;16.000;0
3.03.001.00514;0;4.000
4923.600.041;23.000;131"""
result = parser.read_csv(
StringIO(data),
sep=";",
dtype=dtype,
thousands=".",
)
expected = DataFrame(
{
"a": ["0000.7995", "3.03.001.00514", "4923.600.041"],
"b": [16000, 0, 23000],
"c": [0, 4000, 131],
}
)
if dtype["a"] == object:
expected["a"] = expected["a"].astype(object)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"dtype,expected",
[
(
{"a": str, "b": np.float64, "c": np.int64},
DataFrame(
{
"b": [16000.1, 0, 23000],
"c": [0, 4001, 131],
}
),
),
(
str,
DataFrame(
{
"b": ["16,000.1", "0", "23,000"],
"c": ["0", "4,001", "131"],
}
),
),
],
)
def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, expected):
# GH#50270
parser = python_parser_only
data = """a;b;c
0000,7995;16,000.1;0
3,03,001,00514;0;4,001
4923,600,041;23,000;131
"""
result = parser.read_csv(
StringIO(data),
sep=";",
dtype=dtype,
thousands=",",
)
expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"])
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,183 @@
"""
Tests that quoting specifications are properly handled
during parsing for all of the parsers defined in parsers.py
"""
import csv
from io import StringIO
import pytest
from pandas.compat import PY311
from pandas.errors import ParserError
from pandas import DataFrame
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@pytest.mark.parametrize(
"kwargs,msg",
[
({"quotechar": "foo"}, '"quotechar" must be a(n)? 1-character string'),
(
{"quotechar": None, "quoting": csv.QUOTE_MINIMAL},
"quotechar must be set if quoting enabled",
),
({"quotechar": 2}, '"quotechar" must be string( or None)?, not int'),
],
)
@skip_pyarrow # ParserError: CSV parse error: Empty CSV file or block
def test_bad_quote_char(all_parsers, kwargs, msg):
data = "1,2,3"
parser = all_parsers
with pytest.raises(TypeError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
@pytest.mark.parametrize(
"quoting,msg",
[
("foo", '"quoting" must be an integer|Argument'),
(10, 'bad "quoting" value'), # quoting must be in the range [0, 3]
],
)
@xfail_pyarrow # ValueError: The 'quoting' option is not supported
def test_bad_quoting(all_parsers, quoting, msg):
data = "1,2,3"
parser = all_parsers
with pytest.raises(TypeError, match=msg):
parser.read_csv(StringIO(data), quoting=quoting)
def test_quote_char_basic(all_parsers):
parser = all_parsers
data = 'a,b,c\n1,2,"cat"'
expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"])
result = parser.read_csv(StringIO(data), quotechar='"')
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"])
def test_quote_char_various(all_parsers, quote_char):
parser = all_parsers
expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"])
data = 'a,b,c\n1,2,"cat"'
new_data = data.replace('"', quote_char)
result = parser.read_csv(StringIO(new_data), quotechar=quote_char)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: The 'quoting' option is not supported
@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
@pytest.mark.parametrize("quote_char", ["", None])
def test_null_quote_char(all_parsers, quoting, quote_char):
kwargs = {"quotechar": quote_char, "quoting": quoting}
data = "a,b,c\n1,2,3"
parser = all_parsers
if quoting != csv.QUOTE_NONE:
# Sanity checking.
msg = (
'"quotechar" must be a 1-character string'
if PY311 and all_parsers.engine == "python" and quote_char == ""
else "quotechar must be set if quoting enabled"
)
with pytest.raises(TypeError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
elif not (PY311 and all_parsers.engine == "python"):
        # Python 3.11+ doesn't support null/blank quote chars in its csv parser
expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
result = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"kwargs,exp_data",
[
({}, [[1, 2, "foo"]]), # Test default.
# QUOTE_MINIMAL only applies to CSV writing, so no effect on reading.
({"quotechar": '"', "quoting": csv.QUOTE_MINIMAL}, [[1, 2, "foo"]]),
# QUOTE_MINIMAL only applies to CSV writing, so no effect on reading.
({"quotechar": '"', "quoting": csv.QUOTE_ALL}, [[1, 2, "foo"]]),
# QUOTE_NONE tells the reader to do no special handling
# of quote characters and leave them alone.
({"quotechar": '"', "quoting": csv.QUOTE_NONE}, [[1, 2, '"foo"']]),
# QUOTE_NONNUMERIC tells the reader to cast
# all non-quoted fields to float
({"quotechar": '"', "quoting": csv.QUOTE_NONNUMERIC}, [[1.0, 2.0, "foo"]]),
],
)
@xfail_pyarrow # ValueError: The 'quoting' option is not supported
def test_quoting_various(all_parsers, kwargs, exp_data):
data = '1,2,"foo"'
parser = all_parsers
columns = ["a", "b", "c"]
result = parser.read_csv(StringIO(data), names=columns, **kwargs)
expected = DataFrame(exp_data, columns=columns)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])]
)
def test_double_quote(all_parsers, doublequote, exp_data, request):
parser = all_parsers
data = 'a,b\n3,"4 "" 5"'
if parser.engine == "pyarrow" and not doublequote:
mark = pytest.mark.xfail(reason="Mismatched result")
request.applymarker(mark)
result = parser.read_csv(StringIO(data), quotechar='"', doublequote=doublequote)
expected = DataFrame(exp_data, columns=["a", "b"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("quotechar", ['"', "\u0001"])
def test_quotechar_unicode(all_parsers, quotechar):
# see gh-14477
data = "a\n1"
parser = all_parsers
expected = DataFrame({"a": [1]})
result = parser.read_csv(StringIO(data), quotechar=quotechar)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("balanced", [True, False])
def test_unbalanced_quoting(all_parsers, balanced, request):
# see gh-22789.
parser = all_parsers
data = 'a,b,c\n1,2,"3'
if parser.engine == "pyarrow" and not balanced:
mark = pytest.mark.xfail(reason="Mismatched result")
request.applymarker(mark)
if balanced:
# Re-balance the quoting and read in without errors.
expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
result = parser.read_csv(StringIO(data + '"'))
tm.assert_frame_equal(result, expected)
else:
msg = (
"EOF inside string starting at row 1"
if parser.engine == "c"
else "unexpected end of data"
)
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data))

File diff suppressed because it is too large

View File

@ -0,0 +1,334 @@
"""
Tests that skipped rows are properly handled during
parsing for all of the parsers defined in parsers.py
"""
from datetime import datetime
from io import StringIO
import numpy as np
import pytest
from pandas.errors import EmptyDataError
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@xfail_pyarrow # ValueError: skiprows argument must be an integer
@pytest.mark.parametrize("skiprows", [list(range(6)), 6])
def test_skip_rows_bug(all_parsers, skiprows):
# see gh-505
parser = all_parsers
text = """#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
1/1/2000,1.,2.,3.
1/2/2000,4,5,6
1/3/2000,7,8,9
"""
result = parser.read_csv(
StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True
)
index = Index(
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0
)
expected = DataFrame(
np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index
)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: skiprows argument must be an integer
def test_deep_skip_rows(all_parsers):
# see gh-4382
parser = all_parsers
data = "a,b,c\n" + "\n".join(
[",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)]
)
condensed_data = "a,b,c\n" + "\n".join(
[",".join([str(i), str(i + 1), str(i + 2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]]
)
result = parser.read_csv(StringIO(data), skiprows=[6, 8])
condensed_result = parser.read_csv(StringIO(condensed_data))
tm.assert_frame_equal(result, condensed_result)
@xfail_pyarrow # AssertionError: DataFrame are different
def test_skip_rows_blank(all_parsers):
# see gh-9832
parser = all_parsers
text = """#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
1/1/2000,1.,2.,3.
1/2/2000,4,5,6
1/3/2000,7,8,9
"""
data = parser.read_csv(
StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True
)
index = Index(
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0
)
expected = DataFrame(
np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index
)
tm.assert_frame_equal(data, expected)
@pytest.mark.parametrize(
"data,kwargs,expected",
[
(
"""id,text,num_lines
1,"line 11
line 12",2
2,"line 21
line 22",2
3,"line 31",1""",
{"skiprows": [1]},
DataFrame(
[[2, "line 21\nline 22", 2], [3, "line 31", 1]],
columns=["id", "text", "num_lines"],
),
),
(
"a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~",
{"quotechar": "~", "skiprows": [2]},
DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]),
),
(
(
"Text,url\n~example\n "
"sentence\n one~,url1\n~"
"example\n sentence\n two~,url2\n~"
"example\n sentence\n three~,url3"
),
{"quotechar": "~", "skiprows": [1, 3]},
DataFrame([["example\n sentence\n two", "url2"]], columns=["Text", "url"]),
),
],
)
@xfail_pyarrow # ValueError: skiprows argument must be an integer
def test_skip_row_with_newline(all_parsers, data, kwargs, expected):
# see gh-12775 and gh-10911
parser = all_parsers
result = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: skiprows argument must be an integer
def test_skip_row_with_quote(all_parsers):
# see gh-12775 and gh-10911
parser = all_parsers
data = """id,text,num_lines
1,"line '11' line 12",2
2,"line '21' line 22",2
3,"line '31' line 32",1"""
exp_data = [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]]
expected = DataFrame(exp_data, columns=["id", "text", "num_lines"])
result = parser.read_csv(StringIO(data), skiprows=[1])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data,exp_data",
[
(
"""id,text,num_lines
1,"line \n'11' line 12",2
2,"line \n'21' line 22",2
3,"line \n'31' line 32",1""",
[[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]],
),
(
"""id,text,num_lines
1,"line '11\n' line 12",2
2,"line '21\n' line 22",2
3,"line '31\n' line 32",1""",
[[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]],
),
(
"""id,text,num_lines
1,"line '11\n' \r\tline 12",2
2,"line '21\n' \r\tline 22",2
3,"line '31\n' \r\tline 32",1""",
[[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]],
),
],
)
@xfail_pyarrow # ValueError: skiprows argument must be an integer
def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data):
# see gh-12775 and gh-10911
parser = all_parsers
result = parser.read_csv(StringIO(data), skiprows=[1])
expected = DataFrame(exp_data, columns=["id", "text", "num_lines"])
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: The 'delim_whitespace' option is not supported
@pytest.mark.parametrize(
"lineterminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR"
)
def test_skiprows_lineterminator(all_parsers, lineterminator, request):
# see gh-9079
parser = all_parsers
data = "\n".join(
[
"SMOSMANIA ThetaProbe-ML2X ",
"2007/01/01 01:00 0.2140 U M ",
"2007/01/01 02:00 0.2141 M O ",
"2007/01/01 04:00 0.2142 D M ",
]
)
expected = DataFrame(
[
["2007/01/01", "01:00", 0.2140, "U", "M"],
["2007/01/01", "02:00", 0.2141, "M", "O"],
["2007/01/01", "04:00", 0.2142, "D", "M"],
],
columns=["date", "time", "var", "flag", "oflag"],
)
if parser.engine == "python" and lineterminator == "\r":
        mark = pytest.mark.xfail(
            reason="'CR' is not respected by the Python parser yet"
        )
request.applymarker(mark)
data = data.replace("\n", lineterminator)
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(
StringIO(data),
skiprows=1,
delim_whitespace=True,
names=["date", "time", "var", "flag", "oflag"],
)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # AssertionError: DataFrame are different
def test_skiprows_infield_quote(all_parsers):
# see gh-14459
parser = all_parsers
data = 'a"\nb"\na\n1'
expected = DataFrame({"a": [1]})
result = parser.read_csv(StringIO(data), skiprows=2)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: skiprows argument must be an integer
@pytest.mark.parametrize(
"kwargs,expected",
[
({}, DataFrame({"1": [3, 5]})),
({"header": 0, "names": ["foo"]}, DataFrame({"foo": [3, 5]})),
],
)
def test_skip_rows_callable(all_parsers, kwargs, expected):
parser = all_parsers
data = "a\n1\n2\n3\n4\n5"
result = parser.read_csv(StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: skiprows argument must be an integer
def test_skip_rows_callable_not_in(all_parsers):
parser = all_parsers
data = "0,a\n1,b\n2,c\n3,d\n4,e"
expected = DataFrame([[1, "b"], [3, "d"]])
result = parser.read_csv(
StringIO(data), header=None, skiprows=lambda x: x not in [1, 3]
)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # ValueError: skiprows argument must be an integer
def test_skip_rows_skip_all(all_parsers):
parser = all_parsers
data = "a\n1\n2\n3\n4\n5"
msg = "No columns to parse from file"
with pytest.raises(EmptyDataError, match=msg):
parser.read_csv(StringIO(data), skiprows=lambda x: True)
@xfail_pyarrow # ValueError: skiprows argument must be an integer
def test_skip_rows_bad_callable(all_parsers):
msg = "by zero"
parser = all_parsers
data = "a\n1\n2\n3\n4\n5"
with pytest.raises(ZeroDivisionError, match=msg):
parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0)
@xfail_pyarrow # ValueError: skiprows argument must be an integer
def test_skip_rows_and_n_rows(all_parsers):
# GH#44021
data = """a,b
1,a
2,b
3,c
4,d
5,e
6,f
7,g
8,h
"""
parser = all_parsers
result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6])
expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]})
tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_skip_rows_with_chunks(all_parsers):
# GH 55677
data = """col_a
10
20
30
40
50
60
70
80
90
100
"""
parser = all_parsers
reader = parser.read_csv(
        StringIO(data), skiprows=lambda x: x in [1, 4, 5], chunksize=4
)
df1 = next(reader)
df2 = next(reader)
tm.assert_frame_equal(df1, DataFrame({"col_a": [20, 30, 60, 70]}))
tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6]))

View File

@ -0,0 +1,342 @@
"""
Tests the TextReader class in parsers.pyx, which
is integral to the C engine in parsers.py
"""
from io import (
BytesIO,
StringIO,
)
import numpy as np
import pytest
import pandas._libs.parsers as parser
from pandas._libs.parsers import TextReader
from pandas.errors import ParserWarning
from pandas import DataFrame
import pandas._testing as tm
from pandas.io.parsers import (
TextFileReader,
read_csv,
)
from pandas.io.parsers.c_parser_wrapper import ensure_dtype_objs
class TestTextReader:
@pytest.fixture
def csv_path(self, datapath):
return datapath("io", "data", "csv", "test1.csv")
def test_file_handle(self, csv_path):
with open(csv_path, "rb") as f:
reader = TextReader(f)
reader.read()
def test_file_handle_mmap(self, csv_path):
# this was never using memory_map=True
with open(csv_path, "rb") as f:
reader = TextReader(f, header=None)
reader.read()
def test_StringIO(self, csv_path):
with open(csv_path, "rb") as f:
text = f.read()
src = BytesIO(text)
reader = TextReader(src, header=None)
reader.read()
def test_string_factorize(self):
# should this be optional?
data = "a\nb\na\nb\na"
reader = TextReader(StringIO(data), header=None)
result = reader.read()
assert len(set(map(id, result[0]))) == 2
def test_skipinitialspace(self):
data = "a, b\na, b\na, b\na, b"
reader = TextReader(StringIO(data), skipinitialspace=True, header=None)
result = reader.read()
tm.assert_numpy_array_equal(
result[0], np.array(["a", "a", "a", "a"], dtype=np.object_)
)
tm.assert_numpy_array_equal(
result[1], np.array(["b", "b", "b", "b"], dtype=np.object_)
)
def test_parse_booleans(self):
data = "True\nFalse\nTrue\nTrue"
reader = TextReader(StringIO(data), header=None)
result = reader.read()
assert result[0].dtype == np.bool_
def test_delimit_whitespace(self):
data = 'a b\na\t\t "b"\n"a"\t \t b'
reader = TextReader(StringIO(data), delim_whitespace=True, header=None)
result = reader.read()
tm.assert_numpy_array_equal(
result[0], np.array(["a", "a", "a"], dtype=np.object_)
)
tm.assert_numpy_array_equal(
result[1], np.array(["b", "b", "b"], dtype=np.object_)
)
def test_embedded_newline(self):
data = 'a\n"hello\nthere"\nthis'
reader = TextReader(StringIO(data), header=None)
result = reader.read()
expected = np.array(["a", "hello\nthere", "this"], dtype=np.object_)
tm.assert_numpy_array_equal(result[0], expected)
def test_euro_decimal(self):
data = "12345,67\n345,678"
reader = TextReader(StringIO(data), delimiter=":", decimal=",", header=None)
result = reader.read()
expected = np.array([12345.67, 345.678])
tm.assert_almost_equal(result[0], expected)
def test_integer_thousands(self):
data = "123,456\n12,500"
reader = TextReader(StringIO(data), delimiter=":", thousands=",", header=None)
result = reader.read()
expected = np.array([123456, 12500], dtype=np.int64)
tm.assert_almost_equal(result[0], expected)
def test_integer_thousands_alt(self):
data = "123.456\n12.500"
reader = TextFileReader(
StringIO(data), delimiter=":", thousands=".", header=None
)
result = reader.read()
expected = DataFrame([123456, 12500])
tm.assert_frame_equal(result, expected)
def test_skip_bad_lines(self):
# too many lines, see #2430 for why
data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r"
reader = TextReader(StringIO(data), delimiter=":", header=None)
msg = r"Error tokenizing data\. C error: Expected 3 fields in line 4, saw 4"
with pytest.raises(parser.ParserError, match=msg):
reader.read()
reader = TextReader(
StringIO(data), delimiter=":", header=None, on_bad_lines=2 # Skip
)
result = reader.read()
expected = {
0: np.array(["a", "d", "g", "l"], dtype=object),
1: np.array(["b", "e", "h", "m"], dtype=object),
2: np.array(["c", "f", "i", "n"], dtype=object),
}
assert_array_dicts_equal(result, expected)
with tm.assert_produces_warning(ParserWarning, match="Skipping line"):
reader = TextReader(
StringIO(data), delimiter=":", header=None, on_bad_lines=1 # Warn
)
reader.read()
def test_header_not_enough_lines(self):
data = "skip this\nskip this\na,b,c\n1,2,3\n4,5,6"
reader = TextReader(StringIO(data), delimiter=",", header=2)
header = reader.header
expected = [["a", "b", "c"]]
assert header == expected
recs = reader.read()
expected = {
0: np.array([1, 4], dtype=np.int64),
1: np.array([2, 5], dtype=np.int64),
2: np.array([3, 6], dtype=np.int64),
}
assert_array_dicts_equal(recs, expected)
def test_escapechar(self):
data = '\\"hello world"\n\\"hello world"\n\\"hello world"'
reader = TextReader(StringIO(data), delimiter=",", header=None, escapechar="\\")
result = reader.read()
expected = {0: np.array(['"hello world"'] * 3, dtype=object)}
assert_array_dicts_equal(result, expected)
def test_eof_has_eol(self):
# handling of new line at EOF
pass
def test_na_substitution(self):
pass
def test_numpy_string_dtype(self):
data = """\
a,1
aa,2
aaa,3
aaaa,4
aaaaa,5"""
def _make_reader(**kwds):
if "dtype" in kwds:
kwds["dtype"] = ensure_dtype_objs(kwds["dtype"])
return TextReader(StringIO(data), delimiter=",", header=None, **kwds)
reader = _make_reader(dtype="S5,i4")
result = reader.read()
assert result[0].dtype == "S5"
ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaaa"], dtype="S5")
assert (result[0] == ex_values).all()
assert result[1].dtype == "i4"
reader = _make_reader(dtype="S4")
result = reader.read()
assert result[0].dtype == "S4"
ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaa"], dtype="S4")
assert (result[0] == ex_values).all()
assert result[1].dtype == "S4"
def test_pass_dtype(self):
data = """\
one,two
1,a
2,b
3,c
4,d"""
def _make_reader(**kwds):
if "dtype" in kwds:
kwds["dtype"] = ensure_dtype_objs(kwds["dtype"])
return TextReader(StringIO(data), delimiter=",", **kwds)
reader = _make_reader(dtype={"one": "u1", 1: "S1"})
result = reader.read()
assert result[0].dtype == "u1"
assert result[1].dtype == "S1"
reader = _make_reader(dtype={"one": np.uint8, 1: object})
result = reader.read()
assert result[0].dtype == "u1"
assert result[1].dtype == "O"
reader = _make_reader(dtype={"one": np.dtype("u1"), 1: np.dtype("O")})
result = reader.read()
assert result[0].dtype == "u1"
assert result[1].dtype == "O"
def test_usecols(self):
data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""
def _make_reader(**kwds):
return TextReader(StringIO(data), delimiter=",", **kwds)
reader = _make_reader(usecols=(1, 2))
result = reader.read()
exp = _make_reader().read()
assert len(result) == 2
assert (result[1] == exp[1]).all()
assert (result[2] == exp[2]).all()
@pytest.mark.parametrize(
"text, kwargs",
[
("a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12", {"delimiter": ","}),
(
"a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12",
{"delim_whitespace": True},
),
("a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12", {"delimiter": ","}),
(
(
"A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r"
"AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r"
",BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0"
),
{"delimiter": ","},
),
("A B C\r 2 3\r4 5 6", {"delim_whitespace": True}),
("A B C\r2 3\r4 5 6", {"delim_whitespace": True}),
],
)
def test_cr_delimited(self, text, kwargs):
nice_text = text.replace("\r", "\r\n")
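        # Bare-CR line endings should tokenize exactly like their CRLF-normalized
        # counterpart.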
result = TextReader(StringIO(text), **kwargs).read()
expected = TextReader(StringIO(nice_text), **kwargs).read()
assert_array_dicts_equal(result, expected)
def test_empty_field_eof(self):
data = "a,b,c\n1,2,3\n4,,"
result = TextReader(StringIO(data), delimiter=",").read()
expected = {
0: np.array([1, 4], dtype=np.int64),
1: np.array(["2", ""], dtype=object),
2: np.array(["3", ""], dtype=object),
}
assert_array_dicts_equal(result, expected)
@pytest.mark.parametrize("repeat", range(10))
def test_empty_field_eof_mem_access_bug(self, repeat):
# GH5664
a = DataFrame([["b"], [np.nan]], columns=["a"], index=["a", "c"])
b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], columns=list("abcd"), index=[1, 1])
c = DataFrame(
[
[1, 2, 3, 4],
[6, np.nan, np.nan, np.nan],
[8, 9, 10, 11],
[13, 14, np.nan, np.nan],
],
columns=list("abcd"),
index=[0, 5, 7, 12],
)
df = read_csv(StringIO("a,b\nc\n"), skiprows=0, names=["a"], engine="c")
tm.assert_frame_equal(df, a)
df = read_csv(
StringIO("1,1,1,1,0\n" * 2 + "\n" * 2), names=list("abcd"), engine="c"
)
tm.assert_frame_equal(df, b)
df = read_csv(
StringIO("0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14"),
names=list("abcd"),
engine="c",
)
tm.assert_frame_equal(df, c)
def test_empty_csv_input(self):
# GH14867
with read_csv(
StringIO(), chunksize=20, header=None, names=["a", "b", "c"]
) as df:
assert isinstance(df, TextFileReader)
def assert_array_dicts_equal(left, right):
for k, v in left.items():
tm.assert_numpy_array_equal(np.asarray(v), np.asarray(right[k]))

View File

@ -0,0 +1,226 @@
"""
Tests that features that are currently unsupported in
either the Python or C parser are actually enforced
and are clearly communicated to the user.
Ultimately, the goal is to remove test cases from this
test suite as new feature support is added to the parsers.
"""
from io import StringIO
import os
from pathlib import Path
import pytest
from pandas.errors import ParserError
import pandas._testing as tm
from pandas.io.parsers import read_csv
import pandas.io.parsers.readers as parsers
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val)
def python_engine(request):
return request.param
class TestUnsupportedFeatures:
def test_mangle_dupe_cols_false(self):
# see gh-12935
data = "a b c\n1 2 3"
for engine in ("c", "python"):
with pytest.raises(TypeError, match="unexpected keyword"):
read_csv(StringIO(data), engine=engine, mangle_dupe_cols=True)
def test_c_engine(self):
# see gh-6607
data = "a b c\n1 2 3"
msg = "does not support"
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
# specify C engine with unsupported options (raise)
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
read_csv(StringIO(data), engine="c", sep=None, delim_whitespace=False)
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), engine="c", sep=r"\s")
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), engine="c", sep="\t", quotechar=chr(128))
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), engine="c", skipfooter=1)
# specify C-unsupported options without python-unsupported options
with tm.assert_produces_warning((parsers.ParserWarning, FutureWarning)):
read_csv(StringIO(data), sep=None, delim_whitespace=False)
with tm.assert_produces_warning(parsers.ParserWarning):
read_csv(StringIO(data), sep=r"\s")
with tm.assert_produces_warning(parsers.ParserWarning):
read_csv(StringIO(data), sep="\t", quotechar=chr(128))
with tm.assert_produces_warning(parsers.ParserWarning):
read_csv(StringIO(data), skipfooter=1)
text = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
msg = "Error tokenizing data"
with pytest.raises(ParserError, match=msg):
read_csv(StringIO(text), sep="\\s+")
with pytest.raises(ParserError, match=msg):
read_csv(StringIO(text), engine="c", sep="\\s+")
msg = "Only length-1 thousands markers supported"
data = """A|B|C
1|2,334|5
10|13|10.
"""
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), thousands=",,")
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), thousands="")
msg = "Only length-1 line terminators supported"
data = "a,b,c~~1,2,3~~4,5,6"
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), lineterminator="~~")
def test_python_engine(self, python_engine):
from pandas.io.parsers.readers import _python_unsupported as py_unsupported
data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
for default in py_unsupported:
msg = (
f"The {repr(default)} option is not "
f"supported with the {repr(python_engine)} engine"
)
kwargs = {default: object()}
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), engine=python_engine, **kwargs)
def test_python_engine_file_no_iter(self, python_engine):
# see gh-16530
class NoNextBuffer:
def __init__(self, csv_data) -> None:
self.data = csv_data
def __next__(self):
return self.data.__next__()
def read(self):
return self.data
def readline(self):
return self.data
data = "a\n1"
msg = "'NoNextBuffer' object is not iterable|argument 1 must be an iterator"
with pytest.raises(TypeError, match=msg):
read_csv(NoNextBuffer(data), engine=python_engine)
def test_pyarrow_engine(self):
from pandas.io.parsers.readers import _pyarrow_unsupported as pa_unsupported
data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
for default in pa_unsupported:
msg = (
f"The {repr(default)} option is not "
f"supported with the 'pyarrow' engine"
)
kwargs = {default: object()}
default_needs_bool = {"warn_bad_lines", "error_bad_lines"}
if default == "dialect":
kwargs[default] = "excel" # test a random dialect
elif default in default_needs_bool:
kwargs[default] = True
elif default == "on_bad_lines":
kwargs[default] = "warn"
warn = None
depr_msg = None
if "delim_whitespace" in kwargs:
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
warn = FutureWarning
if "verbose" in kwargs:
depr_msg = "The 'verbose' keyword in pd.read_csv is deprecated"
warn = FutureWarning
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(warn, match=depr_msg):
read_csv(StringIO(data), engine="pyarrow", **kwargs)
def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers):
# GH 5686
# GH 54643
sio = StringIO("a,b\n1,2")
bad_lines_func = lambda x: x
parser = all_parsers
if all_parsers.engine not in ["python", "pyarrow"]:
msg = (
"on_bad_line can only be a callable "
"function if engine='python' or 'pyarrow'"
)
with pytest.raises(ValueError, match=msg):
parser.read_csv(sio, on_bad_lines=bad_lines_func)
else:
parser.read_csv(sio, on_bad_lines=bad_lines_func)
def test_close_file_handle_on_invalid_usecols(all_parsers):
# GH 45384
parser = all_parsers
error = ValueError
if parser.engine == "pyarrow":
# Raises pyarrow.lib.ArrowKeyError
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
with tm.ensure_clean("test.csv") as fname:
Path(fname).write_text("col1,col2\na,b\n1,2", encoding="utf-8")
with tm.assert_produces_warning(False):
with pytest.raises(error, match="col3"):
parser.read_csv(fname, usecols=["col1", "col2", "col3"])
# unlink fails on windows if file handles still point to it
os.unlink(fname)
def test_invalid_file_inputs(request, all_parsers):
# GH#45957
parser = all_parsers
if parser.engine == "python":
request.applymarker(
pytest.mark.xfail(reason=f"{parser.engine} engine supports lists.")
)
with pytest.raises(ValueError, match="Invalid"):
parser.read_csv([])
def test_invalid_dtype_backend(all_parsers):
parser = all_parsers
msg = (
"dtype_backend numpy is invalid, only 'numpy_nullable' and "
"'pyarrow' are allowed."
)
with pytest.raises(ValueError, match=msg):
parser.read_csv("test", dtype_backend="numpy")

View File

@ -0,0 +1,102 @@
import numpy as np
import pytest
from pandas._libs.parsers import (
_maybe_upcast,
na_values,
)
import pandas as pd
from pandas import NA
import pandas._testing as tm
from pandas.core.arrays import (
ArrowStringArray,
BooleanArray,
FloatingArray,
IntegerArray,
StringArray,
)
def test_maybe_upcast(any_real_numpy_dtype):
# GH#36712
dtype = np.dtype(any_real_numpy_dtype)
na_value = na_values[dtype]
arr = np.array([1, 2, na_value], dtype=dtype)
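    # The parser marks missing entries with a per-dtype sentinel (na_values[dtype]);
    # _maybe_upcast is expected to convert those sentinels into the nullable
    # array's mask.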
result = _maybe_upcast(arr, use_dtype_backend=True)
expected_mask = np.array([False, False, True])
if issubclass(dtype.type, np.integer):
expected = IntegerArray(arr, mask=expected_mask)
else:
expected = FloatingArray(arr, mask=expected_mask)
tm.assert_extension_array_equal(result, expected)
def test_maybe_upcast_no_na(any_real_numpy_dtype):
# GH#36712
arr = np.array([1, 2, 3], dtype=any_real_numpy_dtype)
result = _maybe_upcast(arr, use_dtype_backend=True)
expected_mask = np.array([False, False, False])
if issubclass(np.dtype(any_real_numpy_dtype).type, np.integer):
expected = IntegerArray(arr, mask=expected_mask)
else:
expected = FloatingArray(arr, mask=expected_mask)
tm.assert_extension_array_equal(result, expected)
def test_maybe_upcaste_bool():
# GH#36712
dtype = np.bool_
na_value = na_values[dtype]
arr = np.array([True, False, na_value], dtype="uint8").view(dtype)
result = _maybe_upcast(arr, use_dtype_backend=True)
expected_mask = np.array([False, False, True])
expected = BooleanArray(arr, mask=expected_mask)
tm.assert_extension_array_equal(result, expected)
def test_maybe_upcaste_bool_no_nan():
# GH#36712
dtype = np.bool_
arr = np.array([True, False, False], dtype="uint8").view(dtype)
result = _maybe_upcast(arr, use_dtype_backend=True)
expected_mask = np.array([False, False, False])
expected = BooleanArray(arr, mask=expected_mask)
tm.assert_extension_array_equal(result, expected)
def test_maybe_upcaste_all_nan():
# GH#36712
dtype = np.int64
na_value = na_values[dtype]
arr = np.array([na_value, na_value], dtype=dtype)
result = _maybe_upcast(arr, use_dtype_backend=True)
expected_mask = np.array([True, True])
expected = IntegerArray(arr, mask=expected_mask)
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("val", [na_values[np.object_], "c"])
def test_maybe_upcast_object(val, string_storage):
# GH#36712
pa = pytest.importorskip("pyarrow")
with pd.option_context("mode.string_storage", string_storage):
arr = np.array(["a", "b", val], dtype=np.object_)
result = _maybe_upcast(arr, use_dtype_backend=True)
if string_storage == "python":
exp_val = "c" if val == "c" else NA
expected = StringArray(np.array(["a", "b", exp_val], dtype=np.object_))
else:
exp_val = "c" if val == "c" else None
expected = ArrowStringArray(pa.array(["a", "b", exp_val]))
tm.assert_extension_array_equal(result, expected)

View File

@ -0,0 +1,194 @@
"""
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import pytest
from pandas import (
DataFrame,
Index,
Timestamp,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
_msg_pyarrow_requires_names = (
"The pyarrow engine does not allow 'usecols' to be integer column "
"positions. Pass a list of string column names instead."
)
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
def test_usecols_with_parse_dates(all_parsers, usecols):
# see gh-9755
data = """a,b,c,d,e
0,1,2014-01-01,09:00,4
0,1,2014-01-02,10:00,4"""
parser = all_parsers
parse_dates = [[1, 2]]
depr_msg = (
"Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
)
cols = {
"a": [0, 0],
"c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
}
expected = DataFrame(cols, columns=["c_d", "a"])
if parser.engine == "pyarrow":
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(
StringIO(data), usecols=usecols, parse_dates=parse_dates
)
return
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(
StringIO(data), usecols=usecols, parse_dates=parse_dates
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # pyarrow.lib.ArrowKeyError: Column 'fdate' in include_columns
def test_usecols_with_parse_dates2(all_parsers):
# see gh-13604
parser = all_parsers
data = """2008-02-07 09:40,1032.43
2008-02-07 09:50,1042.54
2008-02-07 10:00,1051.65"""
names = ["date", "values"]
usecols = names[:]
parse_dates = [0]
index = Index(
[
Timestamp("2008-02-07 09:40"),
Timestamp("2008-02-07 09:50"),
Timestamp("2008-02-07 10:00"),
],
name="date",
)
cols = {"values": [1032.43, 1042.54, 1051.65]}
expected = DataFrame(cols, index=index)
result = parser.read_csv(
StringIO(data),
parse_dates=parse_dates,
index_col=0,
usecols=usecols,
header=None,
names=names,
)
tm.assert_frame_equal(result, expected)
def test_usecols_with_parse_dates3(all_parsers):
# see gh-14792
parser = all_parsers
data = """a,b,c,d,e,f,g,h,i,j
2016/09/21,1,1,2,3,4,5,6,7,8"""
usecols = list("abcdefghij")
parse_dates = [0]
cols = {
"a": Timestamp("2016-09-21").as_unit("ns"),
"b": [1],
"c": [1],
"d": [2],
"e": [3],
"f": [4],
"g": [5],
"h": [6],
"i": [7],
"j": [8],
}
expected = DataFrame(cols, columns=usecols)
result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
tm.assert_frame_equal(result, expected)
def test_usecols_with_parse_dates4(all_parsers):
data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
usecols = list("abcdefghij")
parse_dates = [[0, 1]]
parser = all_parsers
cols = {
"a_b": "2016/09/21 1",
"c": [1],
"d": [2],
"e": [3],
"f": [4],
"g": [5],
"h": [6],
"i": [7],
"j": [8],
}
expected = DataFrame(cols, columns=["a_b"] + list("cdefghij"))
depr_msg = (
"Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
)
with tm.assert_produces_warning(
(FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(
StringIO(data),
usecols=usecols,
parse_dates=parse_dates,
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
@pytest.mark.parametrize(
"names",
[
list("abcde"), # Names span all columns in original data.
list("acd"), # Names span only the selected columns.
],
)
def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names, request):
# see gh-9755
s = """0,1,2014-01-01,09:00,4
0,1,2014-01-02,10:00,4"""
parse_dates = [[1, 2]]
parser = all_parsers
if parser.engine == "pyarrow" and not (len(names) == 3 and usecols[0] == 0):
mark = pytest.mark.xfail(
reason="Length mismatch in some cases, UserWarning in other"
)
request.applymarker(mark)
cols = {
"a": [0, 0],
"c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
}
expected = DataFrame(cols, columns=["c_d", "a"])
depr_msg = (
"Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
)
with tm.assert_produces_warning(
(FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(
StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols
)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,96 @@
"""
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import pytest
from pandas import DataFrame
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
def test_usecols_with_unicode_strings(all_parsers):
# see gh-13219
data = """AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
parser = all_parsers
exp_data = {
"AAA": {
0: 0.056674972999999997,
1: 2.6132309819999997,
2: 3.5689350380000002,
},
"BBB": {0: 8, 1: 2, 2: 7},
}
expected = DataFrame(exp_data)
result = parser.read_csv(StringIO(data), usecols=["AAA", "BBB"])
tm.assert_frame_equal(result, expected)
def test_usecols_with_single_byte_unicode_strings(all_parsers):
# see gh-13219
data = """A,B,C,D
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
parser = all_parsers
exp_data = {
"A": {
0: 0.056674972999999997,
1: 2.6132309819999997,
2: 3.5689350380000002,
},
"B": {0: 8, 1: 2, 2: 7},
}
expected = DataFrame(exp_data)
result = parser.read_csv(StringIO(data), usecols=["A", "B"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]])
def test_usecols_with_mixed_encoding_strings(all_parsers, usecols):
data = """AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
parser = all_parsers
_msg_validate_usecols_arg = (
"'usecols' must either be list-like "
"of all strings, all unicode, all "
"integers or a callable."
)
with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
parser.read_csv(StringIO(data), usecols=usecols)
@pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]])
def test_usecols_with_multi_byte_characters(all_parsers, usecols):
data = """あああ,いい,ううう,ええええ
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
parser = all_parsers
exp_data = {
"あああ": {
0: 0.056674972999999997,
1: 2.6132309819999997,
2: 3.5689350380000002,
},
"いい": {0: 8, 1: 2, 2: 7},
}
expected = DataFrame(exp_data)
result = parser.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,563 @@
"""
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import numpy as np
import pytest
from pandas.errors import ParserError
from pandas import (
DataFrame,
Index,
array,
)
import pandas._testing as tm
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
_msg_validate_usecols_arg = (
"'usecols' must either be list-like "
"of all strings, all unicode, all "
"integers or a callable."
)
_msg_validate_usecols_names = (
"Usecols do not match columns, columns expected but not found: {0}"
)
_msg_pyarrow_requires_names = (
"The pyarrow engine does not allow 'usecols' to be integer column "
"positions. Pass a list of string column names instead."
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning"
)
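# A minimal sketch of the pyarrow restriction described in the message above
# (assumes the optional pyarrow dependency is installed; data is a placeholder):
#
#     import pandas as pd
#     from io import StringIO
#
#     data = "a,b,c\n1,2,3"
#     pd.read_csv(StringIO(data), usecols=["a", "c"], engine="pyarrow")  # accepted
#     pd.read_csv(StringIO(data), usecols=[0, 2], engine="pyarrow")      # ValueError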
def test_raise_on_mixed_dtype_usecols(all_parsers):
# See gh-12678
data = """a,b,c
1000,2000,3000
4000,5000,6000
"""
usecols = [0, "b", 2]
parser = all_parsers
with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
parser.read_csv(StringIO(data), usecols=usecols)
@pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")])
def test_usecols(all_parsers, usecols, request):
data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""
parser = all_parsers
if parser.engine == "pyarrow" and isinstance(usecols[0], int):
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
parser.read_csv(StringIO(data), usecols=usecols)
return
result = parser.read_csv(StringIO(data), usecols=usecols)
expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"])
tm.assert_frame_equal(result, expected)
def test_usecols_with_names(all_parsers):
data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""
parser = all_parsers
names = ["foo", "bar"]
if parser.engine == "pyarrow":
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0)
return
result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0)
expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])]
)
def test_usecols_relative_to_names(all_parsers, names, usecols):
data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
parser = all_parsers
if parser.engine == "pyarrow" and not isinstance(usecols[0], int):
# ArrowKeyError: Column 'fb' in include_columns does not exist
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
result = parser.read_csv(StringIO(data), names=names, header=None, usecols=usecols)
expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"])
tm.assert_frame_equal(result, expected)
def test_usecols_relative_to_names2(all_parsers):
# see gh-5766
data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
parser = all_parsers
result = parser.read_csv(
StringIO(data), names=["a", "b"], header=None, usecols=[0, 1]
)
expected = DataFrame([[1, 2], [4, 5], [7, 8], [10, 11]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
# regex mismatch: "Length mismatch: Expected axis has 1 elements"
@xfail_pyarrow
def test_usecols_name_length_conflict(all_parsers):
data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
parser = all_parsers
msg = "Number of passed names did not match number of header fields in the file"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1])
def test_usecols_single_string(all_parsers):
# see gh-20558
parser = all_parsers
data = """foo, bar, baz
1000, 2000, 3000
4000, 5000, 6000"""
with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
parser.read_csv(StringIO(data), usecols="foo")
@skip_pyarrow # CSV parse error in one case, AttributeError in another
@pytest.mark.parametrize(
"data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"]
)
def test_usecols_index_col_false(all_parsers, data):
# see gh-9082
parser = all_parsers
usecols = ["a", "c", "d"]
expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]})
result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("index_col", ["b", 0])
@pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]])
def test_usecols_index_col_conflict(all_parsers, usecols, index_col, request):
# see gh-4201: test that index_col as integer reflects usecols
parser = all_parsers
data = "a,b,c,d\nA,a,1,one\nB,b,2,two"
if parser.engine == "pyarrow" and isinstance(usecols[0], int):
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col)
return
expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b"))
result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col)
tm.assert_frame_equal(result, expected)
def test_usecols_index_col_conflict2(all_parsers):
# see gh-4201: test that index_col as integer reflects usecols
parser = all_parsers
data = "a,b,c,d\nA,a,1,one\nB,b,2,two"
expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")})
expected = expected.set_index(["b", "c"])
result = parser.read_csv(
StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"]
)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Expected 3 columns, got 4
def test_usecols_implicit_index_col(all_parsers):
# see gh-2654
parser = all_parsers
data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10"
result = parser.read_csv(StringIO(data), usecols=["a", "b"])
expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
tm.assert_frame_equal(result, expected)
def test_usecols_index_col_middle(all_parsers):
# GH#9098
parser = all_parsers
data = """a,b,c,d
1,2,3,4
"""
result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], index_col="c")
expected = DataFrame({"b": [2], "d": [4]}, index=Index([3], name="c"))
tm.assert_frame_equal(result, expected)
def test_usecols_index_col_end(all_parsers):
# GH#9098
parser = all_parsers
data = """a,b,c,d
1,2,3,4
"""
result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], index_col="d")
expected = DataFrame({"b": [2], "c": [3]}, index=Index([4], name="d"))
tm.assert_frame_equal(result, expected)
def test_usecols_regex_sep(all_parsers):
# see gh-2733
parser = all_parsers
data = "a b c\n4 apple bat 5.7\n8 orange cow 10"
if parser.engine == "pyarrow":
msg = "the 'pyarrow' engine does not support regex separators"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b"))
return
result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b"))
expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
tm.assert_frame_equal(result, expected)
def test_usecols_with_whitespace(all_parsers):
parser = all_parsers
data = "a b c\n4 apple bat 5.7\n8 orange cow 10"
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
if parser.engine == "pyarrow":
msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(
StringIO(data), delim_whitespace=True, usecols=("a", "b")
)
return
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(
StringIO(data), delim_whitespace=True, usecols=("a", "b")
)
expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"usecols,expected",
[
# Column selection by index.
([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])),
# Column selection by name.
(
["0", "1"],
DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"]),
),
],
)
def test_usecols_with_integer_like_header(all_parsers, usecols, expected, request):
parser = all_parsers
data = """2,0,1
1000,2000,3000
4000,5000,6000"""
if parser.engine == "pyarrow" and isinstance(usecols[0], int):
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
parser.read_csv(StringIO(data), usecols=usecols)
return
result = parser.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(result, expected)
@xfail_pyarrow # mismatched shape
def test_empty_usecols(all_parsers):
data = "a,b,c\n1,2,3\n4,5,6"
expected = DataFrame(columns=Index([]))
parser = all_parsers
result = parser.read_csv(StringIO(data), usecols=set())
tm.assert_frame_equal(result, expected)
def test_np_array_usecols(all_parsers):
# see gh-12546
parser = all_parsers
data = "a,b,c\n1,2,3"
usecols = np.array(["a", "b"])
expected = DataFrame([[1, 2]], columns=usecols)
result = parser.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"usecols,expected",
[
(
lambda x: x.upper() in ["AAA", "BBB", "DDD"],
DataFrame(
{
"AaA": {
0: 0.056674972999999997,
1: 2.6132309819999997,
2: 3.5689350380000002,
},
"bBb": {0: 8, 1: 2, 2: 7},
"ddd": {0: "a", 1: "b", 2: "a"},
}
),
),
(lambda x: False, DataFrame(columns=Index([]))),
],
)
def test_callable_usecols(all_parsers, usecols, expected):
# see gh-14154
data = """AaA,bBb,CCC,ddd
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
parser = all_parsers
if parser.engine == "pyarrow":
msg = "The pyarrow engine does not allow 'usecols' to be a callable"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), usecols=usecols)
return
result = parser.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(result, expected)
# ArrowKeyError: Column 'fa' in include_columns does not exist in CSV file
@skip_pyarrow
@pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]])
def test_incomplete_first_row(all_parsers, usecols):
# see gh-6710
data = "1,2\n1,2,3"
parser = all_parsers
names = ["a", "b", "c"]
expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]})
result = parser.read_csv(StringIO(data), names=names, usecols=usecols)
tm.assert_frame_equal(result, expected)
@skip_pyarrow # CSV parse error: Expected 3 columns, got 4
@pytest.mark.parametrize(
"data,usecols,kwargs,expected",
[
# see gh-8985
(
"19,29,39\n" * 2 + "10,20,30,40",
[0, 1, 2],
{"header": None},
DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]),
),
# see gh-9549
(
("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"),
["A", "B", "C"],
{},
DataFrame(
{
"A": [1, 3, 1, 1, 1, 5],
"B": [2, 4, 2, 2, 2, 6],
"C": [3, 5, 4, 3, 3, 7],
}
),
),
],
)
def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected):
# see gh-8985
parser = all_parsers
result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"usecols,kwargs,expected,msg",
[
(
["a", "b", "c", "d"],
{},
DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}),
None,
),
(
["a", "b", "c", "f"],
{},
None,
_msg_validate_usecols_names.format(r"\['f'\]"),
),
(["a", "b", "f"], {}, None, _msg_validate_usecols_names.format(r"\['f'\]")),
(
["a", "b", "f", "g"],
{},
None,
_msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"),
),
# see gh-14671
(
None,
{"header": 0, "names": ["A", "B", "C", "D"]},
DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}),
None,
),
(
["A", "B", "C", "f"],
{"header": 0, "names": ["A", "B", "C", "D"]},
None,
_msg_validate_usecols_names.format(r"\['f'\]"),
),
(
["A", "B", "f"],
{"names": ["A", "B", "C", "D"]},
None,
_msg_validate_usecols_names.format(r"\['f'\]"),
),
],
)
def test_raises_on_usecols_names_mismatch(
all_parsers, usecols, kwargs, expected, msg, request
):
data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
kwargs.update(usecols=usecols)
parser = all_parsers
if parser.engine == "pyarrow" and not (
usecols is not None and expected is not None
):
# everything but the first case
# ArrowKeyError: Column 'f' in include_columns does not exist in CSV file
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
if expected is None:
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
else:
result = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]])
def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, request):
data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
names = ["A", "B", "C", "D"]
parser = all_parsers
if parser.engine == "pyarrow":
if isinstance(usecols[0], int):
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
return
# "pyarrow.lib.ArrowKeyError: Column 'A' in include_columns does not exist"
pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
expected = DataFrame({"A": [1, 5], "C": [3, 7]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("names", [None, ["a", "b"]])
def test_usecols_indices_out_of_bounds(all_parsers, names):
# GH#25623 & GH 41130; enforced in 2.0
parser = all_parsers
data = """
a,b
1,2
"""
err = ParserError
msg = "Defining usecols with out-of-bounds"
if parser.engine == "pyarrow":
err = ValueError
msg = _msg_pyarrow_requires_names
with pytest.raises(err, match=msg):
parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0)
def test_usecols_additional_columns(all_parsers):
# GH#46997
parser = all_parsers
usecols = lambda header: header.strip() in ["a", "b", "c"]
if parser.engine == "pyarrow":
msg = "The pyarrow engine does not allow 'usecols' to be a callable"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols)
return
result = parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols)
expected = DataFrame({"a": ["x"], "b": "y"})
tm.assert_frame_equal(result, expected)
def test_usecols_additional_columns_integer_columns(all_parsers):
# GH#46997
parser = all_parsers
usecols = lambda header: header.strip() in ["0", "1"]
if parser.engine == "pyarrow":
msg = "The pyarrow engine does not allow 'usecols' to be a callable"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols)
return
result = parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols)
expected = DataFrame({"0": ["x"], "1": "y"})
tm.assert_frame_equal(result, expected)
def test_usecols_dtype(all_parsers):
parser = all_parsers
data = """
col1,col2,col3
a,1,x
b,2,y
"""
result = parser.read_csv(
StringIO(data),
usecols=["col1", "col2"],
dtype={"col1": "string", "col2": "uint8", "col3": "string"},
)
expected = DataFrame(
{"col1": array(["a", "b"]), "col2": np.array([1, 2], dtype="uint8")}
)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,50 @@
from collections.abc import Generator
from contextlib import contextmanager
import pathlib
import tempfile
import pytest
from pandas.io.pytables import HDFStore
tables = pytest.importorskip("tables")
# set these parameters so we don't have file sharing
tables.parameters.MAX_NUMEXPR_THREADS = 1
tables.parameters.MAX_BLOSC_THREADS = 1
tables.parameters.MAX_THREADS = 1
def safe_close(store):
try:
if store is not None:
store.close()
except OSError:
pass
# contextmanager to ensure the file cleanup
@contextmanager
def ensure_clean_store(
path, mode="a", complevel=None, complib=None, fletcher32=False
) -> Generator[HDFStore, None, None]:
with tempfile.TemporaryDirectory() as tmpdirname:
tmp_path = pathlib.Path(tmpdirname, path)
with HDFStore(
tmp_path,
mode=mode,
complevel=complevel,
complib=complib,
fletcher32=fletcher32,
) as store:
yield store
def _maybe_remove(store, key):
"""
For tests using tables, try removing the table to be sure there is
no content from previous tests using the same table name.
"""
try:
store.remove(key)
except (ValueError, KeyError):
pass
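# A minimal usage sketch of the helpers above (the key and DataFrame are
# placeholders):
#
#     import pandas as pd
#
#     with ensure_clean_store("example.h5") as store:
#         _maybe_remove(store, "df")                       # drop any leftover table
#         store.append("df", pd.DataFrame({"a": [1, 2]}))  # write in table format
#         result = store.select("df")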

View File

@ -0,0 +1,9 @@
import uuid
import pytest
@pytest.fixture
def setup_path():
"""Fixture for setup path"""
return f"tmp.__{uuid.uuid4()}__.h5"

File diff suppressed because it is too large

View File

@ -0,0 +1,214 @@
import numpy as np
import pytest
from pandas import (
Categorical,
DataFrame,
Series,
_testing as tm,
concat,
read_hdf,
)
from pandas.tests.io.pytables.common import (
_maybe_remove,
ensure_clean_store,
)
pytestmark = [pytest.mark.single_cpu]
def test_categorical(setup_path):
with ensure_clean_store(setup_path) as store:
# Basic
_maybe_remove(store, "s")
s = Series(
Categorical(
["a", "b", "b", "a", "a", "c"],
categories=["a", "b", "c", "d"],
ordered=False,
)
)
store.append("s", s, format="table")
result = store.select("s")
tm.assert_series_equal(s, result)
_maybe_remove(store, "s_ordered")
s = Series(
Categorical(
["a", "b", "b", "a", "a", "c"],
categories=["a", "b", "c", "d"],
ordered=True,
)
)
store.append("s_ordered", s, format="table")
result = store.select("s_ordered")
tm.assert_series_equal(s, result)
_maybe_remove(store, "df")
df = DataFrame({"s": s, "vals": [1, 2, 3, 4, 5, 6]})
store.append("df", df, format="table")
result = store.select("df")
tm.assert_frame_equal(result, df)
# Dtypes
_maybe_remove(store, "si")
s = Series([1, 1, 2, 2, 3, 4, 5]).astype("category")
store.append("si", s)
result = store.select("si")
tm.assert_series_equal(result, s)
_maybe_remove(store, "si2")
s = Series([1, 1, np.nan, 2, 3, 4, 5]).astype("category")
store.append("si2", s)
result = store.select("si2")
tm.assert_series_equal(result, s)
# Multiple
_maybe_remove(store, "df2")
df2 = df.copy()
df2["s2"] = Series(list("abcdefg")).astype("category")
store.append("df2", df2)
result = store.select("df2")
tm.assert_frame_equal(result, df2)
# Make sure the metadata is OK
info = store.info()
assert "/df2 " in info
# df2._mgr.blocks[0] and df2._mgr.blocks[2] are Categorical
assert "/df2/meta/values_block_0/meta" in info
assert "/df2/meta/values_block_2/meta" in info
# unordered
_maybe_remove(store, "s2")
s = Series(
Categorical(
["a", "b", "b", "a", "a", "c"],
categories=["a", "b", "c", "d"],
ordered=False,
)
)
store.append("s2", s, format="table")
result = store.select("s2")
tm.assert_series_equal(result, s)
# Query
_maybe_remove(store, "df3")
store.append("df3", df, data_columns=["s"])
expected = df[df.s.isin(["b", "c"])]
result = store.select("df3", where=['s in ["b","c"]'])
tm.assert_frame_equal(result, expected)
expected = df[df.s.isin(["b", "c"])]
result = store.select("df3", where=['s = ["b","c"]'])
tm.assert_frame_equal(result, expected)
expected = df[df.s.isin(["d"])]
result = store.select("df3", where=['s in ["d"]'])
tm.assert_frame_equal(result, expected)
expected = df[df.s.isin(["f"])]
result = store.select("df3", where=['s in ["f"]'])
tm.assert_frame_equal(result, expected)
# Appending with same categories is ok
store.append("df3", df)
df = concat([df, df])
expected = df[df.s.isin(["b", "c"])]
result = store.select("df3", where=['s in ["b","c"]'])
tm.assert_frame_equal(result, expected)
# Appending must have the same categories
df3 = df.copy()
df3["s"] = df3["s"].cat.remove_unused_categories()
msg = "cannot append a categorical with different categories to the existing"
with pytest.raises(ValueError, match=msg):
store.append("df3", df3)
# Remove, and make sure the metadata is removed too (it's a recursive
# removal, so it should be).
result = store.select("df3/meta/s/meta")
assert result is not None
store.remove("df3")
with pytest.raises(
KeyError, match="'No object named df3/meta/s/meta in the file'"
):
store.select("df3/meta/s/meta")
def test_categorical_conversion(tmp_path, setup_path):
# GH13322
# Check that read_hdf with categorical columns doesn't return rows if
# the where criteria are not met.
obsids = ["ESP_012345_6789", "ESP_987654_3210"]
imgids = ["APF00006np", "APF0001imm"]
data = [4.3, 9.8]
# Test without categories
df = DataFrame({"obsids": obsids, "imgids": imgids, "data": data})
# We are expecting an empty DataFrame matching types of df
expected = df.iloc[[], :]
path = tmp_path / setup_path
df.to_hdf(path, key="df", format="table", data_columns=True)
result = read_hdf(path, "df", where="obsids=B")
tm.assert_frame_equal(result, expected)
# Test with categories
df.obsids = df.obsids.astype("category")
df.imgids = df.imgids.astype("category")
# We are expecting an empty DataFrame matching types of df
expected = df.iloc[[], :]
path = tmp_path / setup_path
df.to_hdf(path, key="df", format="table", data_columns=True)
result = read_hdf(path, "df", where="obsids=B")
tm.assert_frame_equal(result, expected)
def test_categorical_nan_only_columns(tmp_path, setup_path):
# GH18413
# Check that categorical columns containing only NaN values round-trip
# through to_hdf/read_hdf.
df = DataFrame(
{
"a": ["a", "b", "c", np.nan],
"b": [np.nan, np.nan, np.nan, np.nan],
"c": [1, 2, 3, 4],
"d": Series([None] * 4, dtype=object),
}
)
df["a"] = df.a.astype("category")
df["b"] = df.b.astype("category")
df["d"] = df.b.astype("category")
expected = df
path = tmp_path / setup_path
df.to_hdf(path, key="df", format="table", data_columns=True)
result = read_hdf(path, "df")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"where, df, expected",
[
('col=="q"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": []})),
('col=="a"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": ["a"]})),
],
)
def test_convert_value(
tmp_path, setup_path, where: str, df: DataFrame, expected: DataFrame
):
# GH39420
# Check that read_hdf with categorical columns can filter by where condition.
df.col = df.col.astype("category")
max_widths = {"col": 1}
categorical_values = sorted(df.col.unique())
expected.col = expected.col.astype("category")
expected.col = expected.col.cat.set_categories(categorical_values)
path = tmp_path / setup_path
df.to_hdf(path, key="df", format="table", min_itemsize=max_widths)
result = read_hdf(path, where=where)
tm.assert_frame_equal(result, expected)
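# Note on ``min_itemsize`` as used above: for table-format writes it pins the
# on-disk string width of a column instead of letting it be inferred from the
# first chunk written, e.g. (placeholder names):
#
#     df.to_hdf(path, key="df", format="table", min_itemsize={"col": 10})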

View File

@ -0,0 +1,75 @@
import pytest
import pandas as pd
import pandas._testing as tm
tables = pytest.importorskip("tables")
@pytest.fixture
def pytables_hdf5_file(tmp_path):
"""
Use PyTables to create a simple HDF5 file.
"""
table_schema = {
"c0": tables.Time64Col(pos=0),
"c1": tables.StringCol(5, pos=1),
"c2": tables.Int64Col(pos=2),
}
t0 = 1_561_105_000.0
testsamples = [
{"c0": t0, "c1": "aaaaa", "c2": 1},
{"c0": t0 + 1, "c1": "bbbbb", "c2": 2},
{"c0": t0 + 2, "c1": "ccccc", "c2": 10**5},
{"c0": t0 + 3, "c1": "ddddd", "c2": 4_294_967_295},
]
objname = "pandas_test_timeseries"
path = tmp_path / "written_with_pytables.h5"
with tables.open_file(path, mode="w") as f:
t = f.create_table("/", name=objname, description=table_schema)
for sample in testsamples:
for key, value in sample.items():
t.row[key] = value
t.row.append()
yield path, objname, pd.DataFrame(testsamples)
class TestReadPyTablesHDF5:
"""
A group of tests which covers reading HDF5 files written by plain PyTables
(not written by pandas).
Was introduced for regression-testing issue 11188.
"""
def test_read_complete(self, pytables_hdf5_file):
path, objname, df = pytables_hdf5_file
result = pd.read_hdf(path, key=objname)
expected = df
tm.assert_frame_equal(result, expected, check_index_type=True)
def test_read_with_start(self, pytables_hdf5_file):
path, objname, df = pytables_hdf5_file
# This is a regression test for pandas-dev/pandas/issues/11188
result = pd.read_hdf(path, key=objname, start=1)
expected = df[1:].reset_index(drop=True)
tm.assert_frame_equal(result, expected, check_index_type=True)
def test_read_with_stop(self, pytables_hdf5_file):
path, objname, df = pytables_hdf5_file
# This is a regression test for pandas-dev/pandas/issues/11188
result = pd.read_hdf(path, key=objname, stop=1)
expected = df[:1].reset_index(drop=True)
tm.assert_frame_equal(result, expected, check_index_type=True)
def test_read_with_startstop(self, pytables_hdf5_file):
path, objname, df = pytables_hdf5_file
# This is a regression test for pandas-dev/pandas/issues/11188
result = pd.read_hdf(path, key=objname, start=1, stop=2)
expected = df[1:2].reset_index(drop=True)
tm.assert_frame_equal(result, expected, check_index_type=True)
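# A minimal sketch of what this class exercises, using the fixture's file and
# object names: a file written by plain PyTables is read back through pandas,
# optionally with row bounds.
#
#     import pandas as pd
#
#     df = pd.read_hdf(
#         "written_with_pytables.h5", key="pandas_test_timeseries", start=1, stop=2
#     )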

View File

@ -0,0 +1,195 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
from pandas.tests.io.pytables.common import ensure_clean_store
from pandas.io.pytables import read_hdf
def test_complex_fixed(tmp_path, setup_path):
df = DataFrame(
np.random.default_rng(2).random((4, 5)).astype(np.complex64),
index=list("abcd"),
columns=list("ABCDE"),
)
path = tmp_path / setup_path
df.to_hdf(path, key="df")
reread = read_hdf(path, "df")
tm.assert_frame_equal(df, reread)
df = DataFrame(
np.random.default_rng(2).random((4, 5)).astype(np.complex128),
index=list("abcd"),
columns=list("ABCDE"),
)
path = tmp_path / setup_path
df.to_hdf(path, key="df")
reread = read_hdf(path, "df")
tm.assert_frame_equal(df, reread)
def test_complex_table(tmp_path, setup_path):
df = DataFrame(
np.random.default_rng(2).random((4, 5)).astype(np.complex64),
index=list("abcd"),
columns=list("ABCDE"),
)
path = tmp_path / setup_path
df.to_hdf(path, key="df", format="table")
reread = read_hdf(path, key="df")
tm.assert_frame_equal(df, reread)
df = DataFrame(
np.random.default_rng(2).random((4, 5)).astype(np.complex128),
index=list("abcd"),
columns=list("ABCDE"),
)
path = tmp_path / setup_path
df.to_hdf(path, key="df", format="table", mode="w")
reread = read_hdf(path, "df")
tm.assert_frame_equal(df, reread)
def test_complex_mixed_fixed(tmp_path, setup_path):
complex64 = np.array(
[1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64
)
complex128 = np.array(
[1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128
)
df = DataFrame(
{
"A": [1, 2, 3, 4],
"B": ["a", "b", "c", "d"],
"C": complex64,
"D": complex128,
"E": [1.0, 2.0, 3.0, 4.0],
},
index=list("abcd"),
)
path = tmp_path / setup_path
df.to_hdf(path, key="df")
reread = read_hdf(path, "df")
tm.assert_frame_equal(df, reread)
def test_complex_mixed_table(tmp_path, setup_path):
complex64 = np.array(
[1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64
)
complex128 = np.array(
[1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128
)
df = DataFrame(
{
"A": [1, 2, 3, 4],
"B": ["a", "b", "c", "d"],
"C": complex64,
"D": complex128,
"E": [1.0, 2.0, 3.0, 4.0],
},
index=list("abcd"),
)
with ensure_clean_store(setup_path) as store:
store.append("df", df, data_columns=["A", "B"])
result = store.select("df", where="A>2")
tm.assert_frame_equal(df.loc[df.A > 2], result)
path = tmp_path / setup_path
df.to_hdf(path, key="df", format="table")
reread = read_hdf(path, "df")
tm.assert_frame_equal(df, reread)
def test_complex_across_dimensions_fixed(tmp_path, setup_path):
complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j])
s = Series(complex128, index=list("abcd"))
df = DataFrame({"A": s, "B": s})
objs = [s, df]
comps = [tm.assert_series_equal, tm.assert_frame_equal]
for obj, comp in zip(objs, comps):
path = tmp_path / setup_path
obj.to_hdf(path, key="obj", format="fixed")
reread = read_hdf(path, "obj")
comp(obj, reread)
def test_complex_across_dimensions(tmp_path, setup_path):
complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j])
s = Series(complex128, index=list("abcd"))
df = DataFrame({"A": s, "B": s})
path = tmp_path / setup_path
df.to_hdf(path, key="obj", format="table")
reread = read_hdf(path, "obj")
tm.assert_frame_equal(df, reread)
def test_complex_indexing_error(setup_path):
complex128 = np.array(
[1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128
)
df = DataFrame(
{"A": [1, 2, 3, 4], "B": ["a", "b", "c", "d"], "C": complex128},
index=list("abcd"),
)
msg = (
"Columns containing complex values can be stored "
"but cannot be indexed when using table format. "
"Either use fixed format, set index=False, "
"or do not include the columns containing complex "
"values to data_columns when initializing the table."
)
with ensure_clean_store(setup_path) as store:
with pytest.raises(TypeError, match=msg):
store.append("df", df, data_columns=["C"])
def test_complex_series_error(tmp_path, setup_path):
complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j])
s = Series(complex128, index=list("abcd"))
msg = (
"Columns containing complex values can be stored "
"but cannot be indexed when using table format. "
"Either use fixed format, set index=False, "
"or do not include the columns containing complex "
"values to data_columns when initializing the table."
)
path = tmp_path / setup_path
with pytest.raises(TypeError, match=msg):
s.to_hdf(path, key="obj", format="t")
path = tmp_path / setup_path
s.to_hdf(path, key="obj", format="t", index=False)
reread = read_hdf(path, "obj")
tm.assert_series_equal(s, reread)
def test_complex_append(setup_path):
df = DataFrame(
{
"a": np.random.default_rng(2).standard_normal(100).astype(np.complex128),
"b": np.random.default_rng(2).standard_normal(100),
}
)
with ensure_clean_store(setup_path) as store:
store.append("df", df, data_columns=["b"])
store.append("df", df)
result = store.select("df")
tm.assert_frame_equal(pd.concat([df, df], axis=0), result)

View File

@ -0,0 +1,256 @@
import datetime
from io import BytesIO
import re
import numpy as np
import pytest
from pandas import (
CategoricalIndex,
DataFrame,
HDFStore,
Index,
MultiIndex,
_testing as tm,
date_range,
read_hdf,
)
from pandas.tests.io.pytables.common import ensure_clean_store
from pandas.io.pytables import (
Term,
_maybe_adjust_name,
)
pytestmark = [pytest.mark.single_cpu]
def test_pass_spec_to_storer(setup_path):
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
)
with ensure_clean_store(setup_path) as store:
store.put("df", df)
msg = (
"cannot pass a column specification when reading a Fixed format "
"store. this store must be selected in its entirety"
)
with pytest.raises(TypeError, match=msg):
store.select("df", columns=["A"])
msg = (
"cannot pass a where specification when reading from a Fixed "
"format store. this store must be selected in its entirety"
)
with pytest.raises(TypeError, match=msg):
store.select("df", where=[("columns=A")])
def test_table_index_incompatible_dtypes(setup_path):
df1 = DataFrame({"a": [1, 2, 3]})
df2 = DataFrame({"a": [4, 5, 6]}, index=date_range("1/1/2000", periods=3))
with ensure_clean_store(setup_path) as store:
store.put("frame", df1, format="table")
msg = re.escape("incompatible kind in col [integer - datetime64[ns]]")
with pytest.raises(TypeError, match=msg):
store.put("frame", df2, format="table", append=True)
def test_unimplemented_dtypes_table_columns(setup_path):
with ensure_clean_store(setup_path) as store:
dtypes = [("date", datetime.date(2001, 1, 2))]
# currently not supported dtypes ####
for n, f in dtypes:
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
)
df[n] = f
msg = re.escape(f"[{n}] is not implemented as a table column")
with pytest.raises(TypeError, match=msg):
store.append(f"df1_{n}", df)
# frame
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
)
df["obj1"] = "foo"
df["obj2"] = "bar"
df["datetime1"] = datetime.date(2001, 1, 2)
df = df._consolidate()
with ensure_clean_store(setup_path) as store:
# this fails because we have a date in the object block......
msg = "|".join(
[
re.escape(
"Cannot serialize the column [datetime1]\nbecause its data "
"contents are not [string] but [date] object dtype"
),
re.escape("[date] is not implemented as a table column"),
]
)
with pytest.raises(TypeError, match=msg):
store.append("df_unimplemented", df)
def test_invalid_terms(tmp_path, setup_path):
with ensure_clean_store(setup_path) as store:
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=10, freq="B"),
)
df["string"] = "foo"
df.loc[df.index[0:4], "string"] = "bar"
store.put("df", df, format="table")
# some invalid terms
msg = re.escape("__init__() missing 1 required positional argument: 'where'")
with pytest.raises(TypeError, match=msg):
Term()
# more invalid
msg = re.escape(
"cannot process expression [df.index[3]], "
"[2000-01-06 00:00:00] is not a valid condition"
)
with pytest.raises(ValueError, match=msg):
store.select("df", "df.index[3]")
msg = "invalid syntax"
with pytest.raises(SyntaxError, match=msg):
store.select("df", "index>")
# from the docs
path = tmp_path / setup_path
dfq = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=list("ABCD"),
index=date_range("20130101", periods=10),
)
dfq.to_hdf(path, key="dfq", format="table", data_columns=True)
# check ok
read_hdf(path, "dfq", where="index>Timestamp('20130104') & columns=['A', 'B']")
read_hdf(path, "dfq", where="A>0 or C>0")
# catch the invalid reference
path = tmp_path / setup_path
dfq = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=list("ABCD"),
index=date_range("20130101", periods=10),
)
dfq.to_hdf(path, key="dfq", format="table")
msg = (
r"The passed where expression: A>0 or C>0\n\s*"
r"contains an invalid variable reference\n\s*"
r"all of the variable references must be a reference to\n\s*"
r"an axis \(e.g. 'index' or 'columns'\), or a data_column\n\s*"
r"The currently defined references are: index,columns\n"
)
with pytest.raises(ValueError, match=msg):
read_hdf(path, "dfq", where="A>0 or C>0")
def test_append_with_diff_col_name_types_raises_value_error(setup_path):
df = DataFrame(np.random.default_rng(2).standard_normal((10, 1)))
df2 = DataFrame({"a": np.random.default_rng(2).standard_normal(10)})
df3 = DataFrame({(1, 2): np.random.default_rng(2).standard_normal(10)})
df4 = DataFrame({("1", 2): np.random.default_rng(2).standard_normal(10)})
df5 = DataFrame({("1", 2, object): np.random.default_rng(2).standard_normal(10)})
with ensure_clean_store(setup_path) as store:
name = "df_diff_valerror"
store.append(name, df)
for d in (df2, df3, df4, df5):
msg = re.escape(
"cannot match existing table structure for [0] on appending data"
)
with pytest.raises(ValueError, match=msg):
store.append(name, d)
def test_invalid_complib(setup_path):
df = DataFrame(
np.random.default_rng(2).random((4, 5)),
index=list("abcd"),
columns=list("ABCDE"),
)
with tm.ensure_clean(setup_path) as path:
msg = r"complib only supports \[.*\] compression."
with pytest.raises(ValueError, match=msg):
df.to_hdf(path, key="df", complib="foolib")
@pytest.mark.parametrize(
"idx",
[
date_range("2019", freq="D", periods=3, tz="UTC"),
CategoricalIndex(list("abc")),
],
)
def test_to_hdf_multiindex_extension_dtype(idx, tmp_path, setup_path):
# GH 7775
mi = MultiIndex.from_arrays([idx, idx])
df = DataFrame(0, index=mi, columns=["a"])
path = tmp_path / setup_path
with pytest.raises(NotImplementedError, match="Saving a MultiIndex"):
df.to_hdf(path, key="df")
def test_unsupported_hdf_file_error(datapath):
# GH 9539
data_path = datapath("io", "data", "legacy_hdf/incompatible_dataset.h5")
message = (
r"Dataset\(s\) incompatible with Pandas data types, "
"not table, or no datasets found in HDF5 file."
)
with pytest.raises(ValueError, match=message):
read_hdf(data_path)
def test_read_hdf_errors(setup_path, tmp_path):
df = DataFrame(
np.random.default_rng(2).random((4, 5)),
index=list("abcd"),
columns=list("ABCDE"),
)
path = tmp_path / setup_path
msg = r"File [\S]* does not exist"
with pytest.raises(OSError, match=msg):
read_hdf(path, "key")
df.to_hdf(path, key="df")
store = HDFStore(path, mode="r")
store.close()
msg = "The HDFStore must be open for reading."
with pytest.raises(OSError, match=msg):
read_hdf(store, "df")
def test_read_hdf_generic_buffer_errors():
msg = "Support for generic buffers has not been implemented."
with pytest.raises(NotImplementedError, match=msg):
read_hdf(BytesIO(b""), "df")
@pytest.mark.parametrize("bad_version", [(1, 2), (1,), [], "12", "123"])
def test_maybe_adjust_name_bad_version_raises(bad_version):
msg = "Version is incorrect, expected sequence of 3 integers"
with pytest.raises(ValueError, match=msg):
_maybe_adjust_name("values_block_0", version=bad_version)

View File

@ -0,0 +1,517 @@
import os
import numpy as np
import pytest
from pandas.compat import (
PY311,
is_ci_environment,
is_platform_linux,
is_platform_little_endian,
)
from pandas.errors import (
ClosedFileError,
PossibleDataLossError,
)
from pandas import (
DataFrame,
HDFStore,
Index,
Series,
_testing as tm,
date_range,
read_hdf,
)
from pandas.tests.io.pytables.common import (
_maybe_remove,
ensure_clean_store,
tables,
)
from pandas.io import pytables
from pandas.io.pytables import Term
pytestmark = [pytest.mark.single_cpu]
@pytest.mark.parametrize("mode", ["r", "r+", "a", "w"])
def test_mode(setup_path, tmp_path, mode, using_infer_string):
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=10, freq="B"),
)
msg = r"[\S]* does not exist"
path = tmp_path / setup_path
# constructor
if mode in ["r", "r+"]:
with pytest.raises(OSError, match=msg):
HDFStore(path, mode=mode)
else:
with HDFStore(path, mode=mode) as store:
assert store._handle.mode == mode
path = tmp_path / setup_path
# context
if mode in ["r", "r+"]:
with pytest.raises(OSError, match=msg):
with HDFStore(path, mode=mode) as store:
pass
else:
with HDFStore(path, mode=mode) as store:
assert store._handle.mode == mode
path = tmp_path / setup_path
# conv write
if mode in ["r", "r+"]:
with pytest.raises(OSError, match=msg):
df.to_hdf(path, key="df", mode=mode)
df.to_hdf(path, key="df", mode="w")
else:
df.to_hdf(path, key="df", mode=mode)
# conv read
if mode in ["w"]:
msg = (
"mode w is not allowed while performing a read. "
r"Allowed modes are r, r\+ and a."
)
with pytest.raises(ValueError, match=msg):
read_hdf(path, "df", mode=mode)
else:
result = read_hdf(path, "df", mode=mode)
if using_infer_string:
df.columns = df.columns.astype("str")
tm.assert_frame_equal(result, df)
def test_default_mode(tmp_path, setup_path, using_infer_string):
# read_hdf uses default mode
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=10, freq="B"),
)
path = tmp_path / setup_path
df.to_hdf(path, key="df", mode="w")
result = read_hdf(path, "df")
expected = df.copy()
if using_infer_string:
expected.columns = expected.columns.astype("str")
tm.assert_frame_equal(result, expected)
def test_reopen_handle(tmp_path, setup_path):
path = tmp_path / setup_path
store = HDFStore(path, mode="a")
store["a"] = Series(
np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
)
msg = (
r"Re-opening the file \[[\S]*\] with mode \[a\] will delete the "
"current file!"
)
# invalid mode change
with pytest.raises(PossibleDataLossError, match=msg):
store.open("w")
store.close()
assert not store.is_open
# truncation ok here
store.open("w")
assert store.is_open
assert len(store) == 0
store.close()
assert not store.is_open
store = HDFStore(path, mode="a")
store["a"] = Series(
np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
)
# reopen as read
store.open("r")
assert store.is_open
assert len(store) == 1
assert store._mode == "r"
store.close()
assert not store.is_open
# reopen as append
store.open("a")
assert store.is_open
assert len(store) == 1
assert store._mode == "a"
store.close()
assert not store.is_open
# reopen as append (again)
store.open("a")
assert store.is_open
assert len(store) == 1
assert store._mode == "a"
store.close()
assert not store.is_open
def test_open_args(setup_path, using_infer_string):
with tm.ensure_clean(setup_path) as path:
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
)
# create an in memory store
store = HDFStore(
path, mode="a", driver="H5FD_CORE", driver_core_backing_store=0
)
store["df"] = df
store.append("df2", df)
expected = df.copy()
if using_infer_string:
expected.index = expected.index.astype("str")
expected.columns = expected.columns.astype("str")
tm.assert_frame_equal(store["df"], expected)
tm.assert_frame_equal(store["df2"], expected)
store.close()
# the file should not have actually been written
assert not os.path.exists(path)
def test_flush(setup_path):
with ensure_clean_store(setup_path) as store:
store["a"] = Series(range(5))
store.flush()
store.flush(fsync=True)
def test_complibs_default_settings(tmp_path, setup_path, using_infer_string):
# GH15943
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
)
# Set complevel and check if complib is automatically set to
# default value
tmpfile = tmp_path / setup_path
df.to_hdf(tmpfile, key="df", complevel=9)
result = read_hdf(tmpfile, "df")
expected = df.copy()
if using_infer_string:
expected.index = expected.index.astype("str")
expected.columns = expected.columns.astype("str")
tm.assert_frame_equal(result, expected)
with tables.open_file(tmpfile, mode="r") as h5file:
for node in h5file.walk_nodes(where="/df", classname="Leaf"):
assert node.filters.complevel == 9
assert node.filters.complib == "zlib"
# Set complib and check to see if compression is disabled
tmpfile = tmp_path / setup_path
df.to_hdf(tmpfile, key="df", complib="zlib")
result = read_hdf(tmpfile, "df")
expected = df.copy()
if using_infer_string:
expected.index = expected.index.astype("str")
expected.columns = expected.columns.astype("str")
tm.assert_frame_equal(result, expected)
with tables.open_file(tmpfile, mode="r") as h5file:
for node in h5file.walk_nodes(where="/df", classname="Leaf"):
assert node.filters.complevel == 0
assert node.filters.complib is None
# Check if not setting complib or complevel results in no compression
tmpfile = tmp_path / setup_path
df.to_hdf(tmpfile, key="df")
result = read_hdf(tmpfile, "df")
expected = df.copy()
if using_infer_string:
expected.index = expected.index.astype("str")
expected.columns = expected.columns.astype("str")
tm.assert_frame_equal(result, expected)
with tables.open_file(tmpfile, mode="r") as h5file:
for node in h5file.walk_nodes(where="/df", classname="Leaf"):
assert node.filters.complevel == 0
assert node.filters.complib is None
def test_complibs_default_settings_override(tmp_path, setup_path):
# Check if file-defaults can be overridden on a per table basis
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
)
tmpfile = tmp_path / setup_path
store = HDFStore(tmpfile)
store.append("dfc", df, complevel=9, complib="blosc")
store.append("df", df)
store.close()
with tables.open_file(tmpfile, mode="r") as h5file:
for node in h5file.walk_nodes(where="/df", classname="Leaf"):
assert node.filters.complevel == 0
assert node.filters.complib is None
for node in h5file.walk_nodes(where="/dfc", classname="Leaf"):
assert node.filters.complevel == 9
assert node.filters.complib == "blosc"
@pytest.mark.parametrize("lvl", range(10))
@pytest.mark.parametrize("lib", tables.filters.all_complibs)
@pytest.mark.filterwarnings("ignore:object name is not a valid")
@pytest.mark.skipif(
not PY311 and is_ci_environment() and is_platform_linux(),
reason="Segfaulting in a CI environment"
# with xfail, would sometimes raise UnicodeDecodeError
# invalid state byte
)
def test_complibs(tmp_path, lvl, lib, request):
# GH14478
if PY311 and is_platform_linux() and lib == "blosc2" and lvl != 0:
request.applymarker(
pytest.mark.xfail(reason=f"Fails for {lib} on Linux and PY > 3.11")
)
df = DataFrame(
np.ones((30, 4)), columns=list("ABCD"), index=np.arange(30).astype(np.str_)
)
# Skip the test if lzo is not available on this platform
if not tables.which_lib_version("lzo"):
pytest.skip("lzo not available")
# Skip the test if bzip2 is not available on this platform
if not tables.which_lib_version("bzip2"):
pytest.skip("bzip2 not available")
tmpfile = tmp_path / f"{lvl}_{lib}.h5"
gname = f"{lvl}_{lib}"
# Write and read file to see if data is consistent
df.to_hdf(tmpfile, key=gname, complib=lib, complevel=lvl)
result = read_hdf(tmpfile, gname)
tm.assert_frame_equal(result, df)
# Open file and check metadata for correct amount of compression
with tables.open_file(tmpfile, mode="r") as h5table:
for node in h5table.walk_nodes(where="/" + gname, classname="Leaf"):
assert node.filters.complevel == lvl
if lvl == 0:
assert node.filters.complib is None
else:
assert node.filters.complib == lib
@pytest.mark.skipif(
not is_platform_little_endian(), reason="platform is not little endian"
)
def test_encoding(setup_path):
with ensure_clean_store(setup_path) as store:
df = DataFrame({"A": "foo", "B": "bar"}, index=range(5))
df.loc[2, "A"] = np.nan
df.loc[3, "B"] = np.nan
_maybe_remove(store, "df")
store.append("df", df, encoding="ascii")
tm.assert_frame_equal(store["df"], df)
expected = df.reindex(columns=["A"])
result = store.select("df", Term("columns=A", encoding="ascii"))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"val",
[
[b"E\xc9, 17", b"", b"a", b"b", b"c"],
[b"E\xc9, 17", b"a", b"b", b"c"],
[b"EE, 17", b"", b"a", b"b", b"c"],
[b"E\xc9, 17", b"\xf8\xfc", b"a", b"b", b"c"],
[b"", b"a", b"b", b"c"],
[b"\xf8\xfc", b"a", b"b", b"c"],
[b"A\xf8\xfc", b"", b"a", b"b", b"c"],
[np.nan, b"", b"b", b"c"],
[b"A\xf8\xfc", np.nan, b"", b"b", b"c"],
],
)
@pytest.mark.parametrize("dtype", ["category", None])
def test_latin_encoding(tmp_path, setup_path, dtype, val):
enc = "latin-1"
nan_rep = ""
key = "data"
val = [x.decode(enc) if isinstance(x, bytes) else x for x in val]
ser = Series(val, dtype=dtype)
store = tmp_path / setup_path
ser.to_hdf(store, key=key, format="table", encoding=enc, nan_rep=nan_rep)
retr = read_hdf(store, key)
# TODO:(3.0): once Categorical replace deprecation is enforced,
# we may be able to re-simplify the construction of s_nan
if dtype == "category":
if nan_rep in ser.cat.categories:
s_nan = ser.cat.remove_categories([nan_rep])
else:
s_nan = ser
else:
s_nan = ser.replace(nan_rep, np.nan)
tm.assert_series_equal(s_nan, retr)
def test_multiple_open_close(tmp_path, setup_path):
# gh-4409: open & close multiple times
path = tmp_path / setup_path
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
)
df.to_hdf(path, key="df", mode="w", format="table")
# single
store = HDFStore(path)
assert "CLOSED" not in store.info()
assert store.is_open
store.close()
assert "CLOSED" in store.info()
assert not store.is_open
path = tmp_path / setup_path
if pytables._table_file_open_policy_is_strict:
# multiples
store1 = HDFStore(path)
msg = (
r"The file [\S]* is already opened\. Please close it before "
r"reopening in write mode\."
)
with pytest.raises(ValueError, match=msg):
HDFStore(path)
store1.close()
else:
# multiples
store1 = HDFStore(path)
store2 = HDFStore(path)
assert "CLOSED" not in store1.info()
assert "CLOSED" not in store2.info()
assert store1.is_open
assert store2.is_open
store1.close()
assert "CLOSED" in store1.info()
assert not store1.is_open
assert "CLOSED" not in store2.info()
assert store2.is_open
store2.close()
assert "CLOSED" in store1.info()
assert "CLOSED" in store2.info()
assert not store1.is_open
assert not store2.is_open
# nested close
store = HDFStore(path, mode="w")
store.append("df", df)
store2 = HDFStore(path)
store2.append("df2", df)
store2.close()
assert "CLOSED" in store2.info()
assert not store2.is_open
store.close()
assert "CLOSED" in store.info()
assert not store.is_open
# double closing
store = HDFStore(path, mode="w")
store.append("df", df)
store2 = HDFStore(path)
store.close()
assert "CLOSED" in store.info()
assert not store.is_open
store2.close()
assert "CLOSED" in store2.info()
assert not store2.is_open
# ops on a closed store
path = tmp_path / setup_path
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
)
df.to_hdf(path, key="df", mode="w", format="table")
store = HDFStore(path)
store.close()
msg = r"[\S]* file is not open!"
with pytest.raises(ClosedFileError, match=msg):
store.keys()
with pytest.raises(ClosedFileError, match=msg):
"df" in store
with pytest.raises(ClosedFileError, match=msg):
len(store)
with pytest.raises(ClosedFileError, match=msg):
store["df"]
with pytest.raises(ClosedFileError, match=msg):
store.select("df")
with pytest.raises(ClosedFileError, match=msg):
store.get("df")
with pytest.raises(ClosedFileError, match=msg):
store.append("df2", df)
with pytest.raises(ClosedFileError, match=msg):
store.put("df3", df)
with pytest.raises(ClosedFileError, match=msg):
store.get_storer("df2")
with pytest.raises(ClosedFileError, match=msg):
store.remove("df2")
with pytest.raises(ClosedFileError, match=msg):
store.select("df")
msg = "'HDFStore' object has no attribute 'df'"
with pytest.raises(AttributeError, match=msg):
store.df
def test_fspath():
with tm.ensure_clean("foo.h5") as path:
with HDFStore(path) as store:
assert os.fspath(store) == str(path)

Some files were not shown because too many files have changed in this diff