lib/python3.11/site-packages/pandas/tests/io/sas/test_byteswap.py (new file, 55 lines)
@@ -0,0 +1,55 @@
from hypothesis import (
    assume,
    example,
    given,
    strategies as st,
)
import numpy as np
import pytest

from pandas._libs.byteswap import (
    read_double_with_byteswap,
    read_float_with_byteswap,
    read_uint16_with_byteswap,
    read_uint32_with_byteswap,
    read_uint64_with_byteswap,
)

import pandas._testing as tm


@given(read_offset=st.integers(0, 11), number=st.integers(min_value=0))
@example(number=2**16, read_offset=0)
@example(number=2**32, read_offset=0)
@example(number=2**64, read_offset=0)
@pytest.mark.parametrize("int_type", [np.uint16, np.uint32, np.uint64])
@pytest.mark.parametrize("should_byteswap", [True, False])
def test_int_byteswap(read_offset, number, int_type, should_byteswap):
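    # Hypothesis can generate arbitrarily large integers; discard any that do
    # not fit in the target unsigned type before round-tripping.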
    assume(number < 2 ** (8 * int_type(0).itemsize))
    _test(number, int_type, read_offset, should_byteswap)


@pytest.mark.filterwarnings("ignore:overflow encountered:RuntimeWarning")
@given(read_offset=st.integers(0, 11), number=st.floats())
@pytest.mark.parametrize("float_type", [np.float32, np.float64])
@pytest.mark.parametrize("should_byteswap", [True, False])
def test_float_byteswap(read_offset, number, float_type, should_byteswap):
    _test(number, float_type, read_offset, should_byteswap)


def _test(number, number_type, read_offset, should_byteswap):
    number = number_type(number)
    data = np.random.default_rng(2).integers(0, 256, size=20, dtype="uint8")
    data[read_offset : read_offset + number.itemsize] = number[None].view("uint8")
    swap_func = {
        np.float32: read_float_with_byteswap,
        np.float64: read_double_with_byteswap,
        np.uint16: read_uint16_with_byteswap,
        np.uint32: read_uint32_with_byteswap,
        np.uint64: read_uint64_with_byteswap,
    }[type(number)]
    output_number = number_type(swap_func(bytes(data), read_offset, should_byteswap))
    if should_byteswap:
        tm.assert_equal(output_number, number.byteswap())
    else:
        tm.assert_equal(output_number, number)
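
# A minimal pure-Python sketch of the behavior the helpers above are expected
# to have, inferred only from the call signature used in _test (the function
# name and NumPy-based implementation are illustrative, not pandas internals):
def read_uint16_with_byteswap_sketch(data: bytes, offset: int, byteswap: bool) -> int:
    # Read a native-endian uint16 at `offset`, optionally reversing its bytes.
    value = np.frombuffer(data, dtype=np.uint16, count=1, offset=offset)[0]
    return int(value.byteswap()) if byteswap else int(value)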
lib/python3.11/site-packages/pandas/tests/io/sas/test_sas.py (new file, 34 lines)
@@ -0,0 +1,34 @@
from io import StringIO

import pytest

from pandas import read_sas
import pandas._testing as tm


class TestSas:
    def test_sas_buffer_format(self):
        # see gh-14947
        b = StringIO("")

        msg = (
            "If this is a buffer object rather than a string "
            "name, you must specify a format string"
        )
        with pytest.raises(ValueError, match=msg):
            read_sas(b)

    def test_sas_read_no_format_or_extension(self):
        # see gh-24548
        msg = "unable to infer format of SAS file.+"
        with tm.ensure_clean("test_file_no_extension") as path:
            with pytest.raises(ValueError, match=msg):
                read_sas(path)


def test_sas_archive(datapath):
    fname_uncompressed = datapath("io", "sas", "data", "airline.sas7bdat")
    df_uncompressed = read_sas(fname_uncompressed)
    fname_compressed = datapath("io", "sas", "data", "airline.sas7bdat.gz")
    df_compressed = read_sas(fname_compressed, format="sas7bdat")
    tm.assert_frame_equal(df_uncompressed, df_compressed)
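
# The buffer error case above in positive form, as a usage sketch: passing
# format= satisfies the check exercised by test_sas_buffer_format. The file
# path here is illustrative.
from io import BytesIO
from pandas import read_sas

with open("airline.sas7bdat", "rb") as f:
    df = read_sas(BytesIO(f.read()), format="sas7bdat")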

lib/python3.11/site-packages/pandas/tests/io/sas/test_sas7bdat.py (new file, 421 lines)
@@ -0,0 +1,421 @@
import contextlib
from datetime import datetime
import io
import os
from pathlib import Path

import numpy as np
import pytest

from pandas.compat import IS64
from pandas.errors import EmptyDataError
import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm

from pandas.io.sas.sas7bdat import SAS7BDATReader


@pytest.fixture
def dirpath(datapath):
    return datapath("io", "sas", "data")


@pytest.fixture(params=[(1, range(1, 16)), (2, [16])])
def data_test_ix(request, dirpath):
    i, test_ix = request.param
    fname = os.path.join(dirpath, f"test_sas7bdat_{i}.csv")
    df = pd.read_csv(fname)
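    # SAS stores dates and datetimes as offsets from the 1960-01-01 epoch;
    # Column4/Column12 hold day counts, hence unit="d" below.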
    epoch = datetime(1960, 1, 1)
    t1 = pd.to_timedelta(df["Column4"], unit="d")
    df["Column4"] = (epoch + t1).astype("M8[s]")
    t2 = pd.to_timedelta(df["Column12"], unit="d")
    df["Column12"] = (epoch + t2).astype("M8[s]")
    for k in range(df.shape[1]):
        col = df.iloc[:, k]
        if col.dtype == np.int64:
            df.isetitem(k, df.iloc[:, k].astype(np.float64))
    return df, test_ix


# https://github.com/cython/cython/issues/1720
class TestSAS7BDAT:
    @pytest.mark.slow
    def test_from_file(self, dirpath, data_test_ix):
        expected, test_ix = data_test_ix
        for k in test_ix:
            fname = os.path.join(dirpath, f"test{k}.sas7bdat")
            df = pd.read_sas(fname, encoding="utf-8")
            tm.assert_frame_equal(df, expected)

    @pytest.mark.slow
    def test_from_buffer(self, dirpath, data_test_ix):
        expected, test_ix = data_test_ix
        for k in test_ix:
            fname = os.path.join(dirpath, f"test{k}.sas7bdat")
            with open(fname, "rb") as f:
                byts = f.read()
            buf = io.BytesIO(byts)
            with pd.read_sas(
                buf, format="sas7bdat", iterator=True, encoding="utf-8"
            ) as rdr:
                df = rdr.read()
            tm.assert_frame_equal(df, expected)

    @pytest.mark.slow
    def test_from_iterator(self, dirpath, data_test_ix):
        expected, test_ix = data_test_ix
        for k in test_ix:
            fname = os.path.join(dirpath, f"test{k}.sas7bdat")
            with pd.read_sas(fname, iterator=True, encoding="utf-8") as rdr:
                df = rdr.read(2)
                tm.assert_frame_equal(df, expected.iloc[0:2, :])
                df = rdr.read(3)
                tm.assert_frame_equal(df, expected.iloc[2:5, :])

    @pytest.mark.slow
    def test_path_pathlib(self, dirpath, data_test_ix):
        expected, test_ix = data_test_ix
        for k in test_ix:
            fname = Path(os.path.join(dirpath, f"test{k}.sas7bdat"))
            df = pd.read_sas(fname, encoding="utf-8")
            tm.assert_frame_equal(df, expected)

    @td.skip_if_no("py.path")
    @pytest.mark.slow
    def test_path_localpath(self, dirpath, data_test_ix):
        from py.path import local as LocalPath

        expected, test_ix = data_test_ix
        for k in test_ix:
            fname = LocalPath(os.path.join(dirpath, f"test{k}.sas7bdat"))
            df = pd.read_sas(fname, encoding="utf-8")
            tm.assert_frame_equal(df, expected)

    @pytest.mark.slow
    @pytest.mark.parametrize("chunksize", (3, 5, 10, 11))
    @pytest.mark.parametrize("k", range(1, 17))
    def test_iterator_loop(self, dirpath, k, chunksize):
        # github #13654
        fname = os.path.join(dirpath, f"test{k}.sas7bdat")
        with pd.read_sas(fname, chunksize=chunksize, encoding="utf-8") as rdr:
            y = 0
            for x in rdr:
                y += x.shape[0]
        assert y == rdr.row_count

    def test_iterator_read_too_much(self, dirpath):
        # github #14734
        fname = os.path.join(dirpath, "test1.sas7bdat")
        with pd.read_sas(
            fname, format="sas7bdat", iterator=True, encoding="utf-8"
        ) as rdr:
            d1 = rdr.read(rdr.row_count + 20)

        with pd.read_sas(fname, iterator=True, encoding="utf-8") as rdr:
            d2 = rdr.read(rdr.row_count + 20)
        tm.assert_frame_equal(d1, d2)


def test_encoding_options(datapath):
    fname = datapath("io", "sas", "data", "test1.sas7bdat")
    df1 = pd.read_sas(fname)
    df2 = pd.read_sas(fname, encoding="utf-8")
    for col in df1.columns:
        try:
            df1[col] = df1[col].str.decode("utf-8")
        except AttributeError:
            pass
    tm.assert_frame_equal(df1, df2)

    with contextlib.closing(SAS7BDATReader(fname, convert_header_text=False)) as rdr:
        df3 = rdr.read()
    for x, y in zip(df1.columns, df3.columns):
        assert x == y.decode()


def test_encoding_infer(datapath):
    fname = datapath("io", "sas", "data", "test1.sas7bdat")

    with pd.read_sas(fname, encoding="infer", iterator=True) as df1_reader:
        # check: is encoding inferred correctly from file
        assert df1_reader.inferred_encoding == "cp1252"
        df1 = df1_reader.read()

    with pd.read_sas(fname, encoding="cp1252", iterator=True) as df2_reader:
        df2 = df2_reader.read()

    # check: reader reads correct information
    tm.assert_frame_equal(df1, df2)


def test_productsales(datapath):
    fname = datapath("io", "sas", "data", "productsales.sas7bdat")
    df = pd.read_sas(fname, encoding="utf-8")
    fname = datapath("io", "sas", "data", "productsales.csv")
    df0 = pd.read_csv(fname, parse_dates=["MONTH"])
    vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"]
    df0[vn] = df0[vn].astype(np.float64)

    df0["MONTH"] = df0["MONTH"].astype("M8[s]")
    tm.assert_frame_equal(df, df0)


def test_12659(datapath):
    fname = datapath("io", "sas", "data", "test_12659.sas7bdat")
    df = pd.read_sas(fname)
    fname = datapath("io", "sas", "data", "test_12659.csv")
    df0 = pd.read_csv(fname)
    df0 = df0.astype(np.float64)
    tm.assert_frame_equal(df, df0)


def test_airline(datapath):
    fname = datapath("io", "sas", "data", "airline.sas7bdat")
    df = pd.read_sas(fname)
    fname = datapath("io", "sas", "data", "airline.csv")
    df0 = pd.read_csv(fname)
    df0 = df0.astype(np.float64)
    tm.assert_frame_equal(df, df0)


def test_date_time(datapath):
    # Support of different SAS date/datetime formats (PR #15871)
    fname = datapath("io", "sas", "data", "datetime.sas7bdat")
    df = pd.read_sas(fname)
    fname = datapath("io", "sas", "data", "datetime.csv")
    df0 = pd.read_csv(
        fname, parse_dates=["Date1", "Date2", "DateTime", "DateTimeHi", "Taiw"]
    )
    # GH 19732: Timestamps imported from sas will incur floating point errors
    # See GH#56014 for discussion of the correct "expected" results
    #  We are really just testing that we are "close". This only seems to be
    #  an issue near the implementation bounds.

    df[df.columns[3]] = df.iloc[:, 3].dt.round("us")
    df0["Date1"] = df0["Date1"].astype("M8[s]")
    df0["Date2"] = df0["Date2"].astype("M8[s]")
    df0["DateTime"] = df0["DateTime"].astype("M8[ms]")
    df0["Taiw"] = df0["Taiw"].astype("M8[s]")

    res = df0["DateTimeHi"].astype("M8[us]").dt.round("ms")
    df0["DateTimeHi"] = res.astype("M8[ms]")

    if not IS64:
        # No good reason for this, just what we get on the CI
        df0.loc[0, "DateTimeHi"] += np.timedelta64(1, "ms")
        df0.loc[[2, 3], "DateTimeHi"] -= np.timedelta64(1, "ms")
    tm.assert_frame_equal(df, df0)


@pytest.mark.parametrize("column", ["WGT", "CYL"])
def test_compact_numerical_values(datapath, column):
    # Regression test for #21616
    fname = datapath("io", "sas", "data", "cars.sas7bdat")
    df = pd.read_sas(fname, encoding="latin-1")
    # The two columns CYL and WGT in cars.sas7bdat have column
    # width < 8 and only contain integral values.
    # Test that pandas doesn't corrupt the numbers by adding
    # decimals.
    result = df[column]
    expected = df[column].round()
    tm.assert_series_equal(result, expected, check_exact=True)


def test_many_columns(datapath):
    # Test for looking for column information in more places (PR #22628)
    fname = datapath("io", "sas", "data", "many_columns.sas7bdat")

    df = pd.read_sas(fname, encoding="latin-1")

    fname = datapath("io", "sas", "data", "many_columns.csv")
    df0 = pd.read_csv(fname, encoding="latin-1")
    tm.assert_frame_equal(df, df0)


def test_inconsistent_number_of_rows(datapath):
    # Regression test for issue #16615. (PR #22628)
    fname = datapath("io", "sas", "data", "load_log.sas7bdat")
    df = pd.read_sas(fname, encoding="latin-1")
    assert len(df) == 2097


def test_zero_variables(datapath):
    # Check if the SAS file has zero variables (PR #18184)
    fname = datapath("io", "sas", "data", "zero_variables.sas7bdat")
    with pytest.raises(EmptyDataError, match="No columns to parse from file"):
        pd.read_sas(fname)


@pytest.mark.parametrize("encoding", [None, "utf8"])
def test_zero_rows(datapath, encoding):
    # GH 18198
    fname = datapath("io", "sas", "data", "zero_rows.sas7bdat")
    result = pd.read_sas(fname, encoding=encoding)
    str_value = b"a" if encoding is None else "a"
    expected = pd.DataFrame([{"char_field": str_value, "num_field": 1.0}]).iloc[:0]
    tm.assert_frame_equal(result, expected)


def test_corrupt_read(datapath):
    # We don't really care about the exact failure, the important thing is
    # that the resource should be cleaned up afterwards (BUG #35566)
    fname = datapath("io", "sas", "data", "corrupt.sas7bdat")
    msg = "'SAS7BDATReader' object has no attribute 'row_count'"
    with pytest.raises(AttributeError, match=msg):
        pd.read_sas(fname)


def test_max_sas_date(datapath):
    # GH 20927
    # NB. max datetime in SAS dataset is 31DEC9999:23:59:59.999
    #    but this is read as 29DEC9999:23:59:59.998993 by a buggy
    #    sas7bdat module
    # See also GH#56014 for discussion of the correct "expected" results.
    fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat")
    df = pd.read_sas(fname, encoding="iso-8859-1")

    expected = pd.DataFrame(
        {
            "text": ["max", "normal"],
            "dt_as_float": [253717747199.999, 1880323199.999],
            "dt_as_dt": np.array(
                [
                    datetime(9999, 12, 29, 23, 59, 59, 999000),
                    datetime(2019, 8, 1, 23, 59, 59, 999000),
                ],
                dtype="M8[ms]",
            ),
            "date_as_float": [2936547.0, 21762.0],
            "date_as_date": np.array(
                [
                    datetime(9999, 12, 29),
                    datetime(2019, 8, 1),
                ],
                dtype="M8[s]",
            ),
        },
        columns=["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"],
    )

    if not IS64:
        # No good reason for this, just what we get on the CI
        expected.loc[:, "dt_as_dt"] -= np.timedelta64(1, "ms")

    tm.assert_frame_equal(df, expected)
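
# A quick sanity check on the raw values above, assuming the standard SAS
# convention of seconds (datetimes) and days (dates) since 1960-01-01;
# the name _sas_epoch is illustrative, not part of the test suite:
#   _sas_epoch = pd.Timestamp("1960-01-01")
#   _sas_epoch + pd.to_timedelta(1880323199.999, unit="s")
#       -> ~2019-08-01 23:59:59.999 (subject to the GH 19732 rounding caveat)
#   _sas_epoch + pd.to_timedelta(21762.0, unit="D")
#       -> 2019-08-01 00:00:00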


def test_max_sas_date_iterator(datapath):
    # GH 20927
    # when called as an iterator, only those chunks with a date > pd.Timestamp.max
    # are returned as datetime.datetime; if this happens, that whole chunk is
    # returned as datetime.datetime
    col_order = ["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"]
    fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat")
    results = []
    for df in pd.read_sas(fname, encoding="iso-8859-1", chunksize=1):
        # GH 19732: Timestamps imported from sas will incur floating point errors
        df.reset_index(inplace=True, drop=True)
        results.append(df)
    expected = [
        pd.DataFrame(
            {
                "text": ["max"],
                "dt_as_float": [253717747199.999],
                "dt_as_dt": np.array(
                    [datetime(9999, 12, 29, 23, 59, 59, 999000)], dtype="M8[ms]"
                ),
                "date_as_float": [2936547.0],
                "date_as_date": np.array([datetime(9999, 12, 29)], dtype="M8[s]"),
            },
            columns=col_order,
        ),
        pd.DataFrame(
            {
                "text": ["normal"],
                "dt_as_float": [1880323199.999],
                "dt_as_dt": np.array(["2019-08-01 23:59:59.999"], dtype="M8[ms]"),
                "date_as_float": [21762.0],
                "date_as_date": np.array(["2019-08-01"], dtype="M8[s]"),
            },
            columns=col_order,
        ),
    ]
    if not IS64:
        # No good reason for this, just what we get on the CI
        expected[0].loc[0, "dt_as_dt"] -= np.timedelta64(1, "ms")
        expected[1].loc[0, "dt_as_dt"] -= np.timedelta64(1, "ms")

    tm.assert_frame_equal(results[0], expected[0])
    tm.assert_frame_equal(results[1], expected[1])


def test_null_date(datapath):
    fname = datapath("io", "sas", "data", "dates_null.sas7bdat")
    df = pd.read_sas(fname, encoding="utf-8")

    expected = pd.DataFrame(
        {
            "datecol": np.array(
                [
                    datetime(9999, 12, 29),
                    np.datetime64("NaT"),
                ],
                dtype="M8[s]",
            ),
            "datetimecol": np.array(
                [
                    datetime(9999, 12, 29, 23, 59, 59, 999000),
                    np.datetime64("NaT"),
                ],
                dtype="M8[ms]",
            ),
        },
    )
    if not IS64:
        # No good reason for this, just what we get on the CI
        expected.loc[0, "datetimecol"] -= np.timedelta64(1, "ms")
    tm.assert_frame_equal(df, expected)


def test_meta2_page(datapath):
    # GH 35545
    fname = datapath("io", "sas", "data", "test_meta2_page.sas7bdat")
    df = pd.read_sas(fname)
    assert len(df) == 1000


@pytest.mark.parametrize(
    "test_file, override_offset, override_value, expected_msg",
    [
        ("test2.sas7bdat", 0x10000 + 55229, 0x80 | 0x0F, "Out of bounds"),
        ("test2.sas7bdat", 0x10000 + 55229, 0x10, "unknown control byte"),
        ("test3.sas7bdat", 118170, 184, "Out of bounds"),
    ],
)
def test_rle_rdc_exceptions(
    datapath, test_file, override_offset, override_value, expected_msg
):
    """Errors in RLE/RDC decompression should propagate."""
    with open(datapath("io", "sas", "data", test_file), "rb") as fd:
        data = bytearray(fd.read())
    data[override_offset] = override_value
    with pytest.raises(Exception, match=expected_msg):
        pd.read_sas(io.BytesIO(data), format="sas7bdat")


def test_0x40_control_byte(datapath):
    # GH 31243
    fname = datapath("io", "sas", "data", "0x40controlbyte.sas7bdat")
    df = pd.read_sas(fname, encoding="ascii")
    fname = datapath("io", "sas", "data", "0x40controlbyte.csv")
    df0 = pd.read_csv(fname, dtype="str")
    tm.assert_frame_equal(df, df0)


def test_0x00_control_byte(datapath):
    # GH 47099
    fname = datapath("io", "sas", "data", "0x00controlbyte.sas7bdat.bz2")
    df = next(pd.read_sas(fname, chunksize=11_000))
    assert df.shape == (11_000, 20)
lib/python3.11/site-packages/pandas/tests/io/sas/test_xport.py (new file, 167 lines)
@@ -0,0 +1,167 @@
import numpy as np
import pytest

import pandas as pd
import pandas._testing as tm

from pandas.io.sas.sasreader import read_sas

# CSV versions of test xpt files were obtained using the R foreign library

# Numbers in a SAS xport file are always float64, so need to convert
# before making comparisons.


def numeric_as_float(data):
    for v in data.columns:
        if data[v].dtype is np.dtype("int64"):
            data[v] = data[v].astype(np.float64)


class TestXport:
    @pytest.fixture
    def file01(self, datapath):
        return datapath("io", "sas", "data", "DEMO_G.xpt")

    @pytest.fixture
    def file02(self, datapath):
        return datapath("io", "sas", "data", "SSHSV1_A.xpt")

    @pytest.fixture
    def file03(self, datapath):
        return datapath("io", "sas", "data", "DRXFCD_G.xpt")

    @pytest.fixture
    def file04(self, datapath):
        return datapath("io", "sas", "data", "paxraw_d_short.xpt")

    @pytest.fixture
    def file05(self, datapath):
        return datapath("io", "sas", "data", "DEMO_PUF.cpt")

    @pytest.mark.slow
    def test1_basic(self, file01):
        # Tests with DEMO_G.xpt (all numeric file)

        # Compare to this
        data_csv = pd.read_csv(file01.replace(".xpt", ".csv"))
        numeric_as_float(data_csv)

        # Read full file
        data = read_sas(file01, format="xport")
        tm.assert_frame_equal(data, data_csv)
        num_rows = data.shape[0]

        # Test reading beyond end of file
        with read_sas(file01, format="xport", iterator=True) as reader:
            data = reader.read(num_rows + 100)
        assert data.shape[0] == num_rows

        # Test incremental read with `read` method.
        with read_sas(file01, format="xport", iterator=True) as reader:
            data = reader.read(10)
        tm.assert_frame_equal(data, data_csv.iloc[0:10, :])

        # Test incremental read with `get_chunk` method.
        with read_sas(file01, format="xport", chunksize=10) as reader:
            data = reader.get_chunk()
        tm.assert_frame_equal(data, data_csv.iloc[0:10, :])

        # Test read in loop
        m = 0
        with read_sas(file01, format="xport", chunksize=100) as reader:
            for x in reader:
                m += x.shape[0]
        assert m == num_rows

        # Read full file with `read_sas` method
        data = read_sas(file01)
        tm.assert_frame_equal(data, data_csv)

    def test1_index(self, file01):
        # Tests with DEMO_G.xpt using index (all numeric file)

        # Compare to this
        data_csv = pd.read_csv(file01.replace(".xpt", ".csv"))
        data_csv = data_csv.set_index("SEQN")
        numeric_as_float(data_csv)

        # Read full file
        data = read_sas(file01, index="SEQN", format="xport")
        tm.assert_frame_equal(data, data_csv, check_index_type=False)

        # Test incremental read with `read` method.
        with read_sas(file01, index="SEQN", format="xport", iterator=True) as reader:
            data = reader.read(10)
        tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False)

        # Test incremental read with `get_chunk` method.
        with read_sas(file01, index="SEQN", format="xport", chunksize=10) as reader:
            data = reader.get_chunk()
        tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False)

    def test1_incremental(self, file01):
        # Test with DEMO_G.xpt, reading full file incrementally

        data_csv = pd.read_csv(file01.replace(".xpt", ".csv"))
        data_csv = data_csv.set_index("SEQN")
        numeric_as_float(data_csv)

        with read_sas(file01, index="SEQN", chunksize=1000) as reader:
            all_data = list(reader)
        data = pd.concat(all_data, axis=0)

        tm.assert_frame_equal(data, data_csv, check_index_type=False)

    def test2(self, file02):
        # Test with SSHSV1_A.xpt

        # Compare to this
        data_csv = pd.read_csv(file02.replace(".xpt", ".csv"))
        numeric_as_float(data_csv)

        data = read_sas(file02)
        tm.assert_frame_equal(data, data_csv)

    def test2_binary(self, file02):
        # Test with SSHSV1_A.xpt, read as a binary file

        # Compare to this
        data_csv = pd.read_csv(file02.replace(".xpt", ".csv"))
        numeric_as_float(data_csv)

        with open(file02, "rb") as fd:
            # GH#35693 ensure that if we pass an open file, we
            # don't incorrectly close it in read_sas
            data = read_sas(fd, format="xport")

        tm.assert_frame_equal(data, data_csv)

    def test_multiple_types(self, file03):
        # Test with DRXFCD_G.xpt (contains text and numeric variables)

        # Compare to this
        data_csv = pd.read_csv(file03.replace(".xpt", ".csv"))

        data = read_sas(file03, encoding="utf-8")
        tm.assert_frame_equal(data, data_csv)

    def test_truncated_float_support(self, file04):
        # Test with paxraw_d_short.xpt, a shortened version of:
        # http://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/PAXRAW_D.ZIP
        # This file has truncated floats (5 bytes in this case).

        # GH 11713

        data_csv = pd.read_csv(file04.replace(".xpt", ".csv"))

        data = read_sas(file04, format="xport")
        tm.assert_frame_equal(data.astype("int64"), data_csv)

    def test_cport_header_found_raises(self, file05):
        # Test with DEMO_PUF.cpt, the beginning of puf2019_1_fall.xpt
        # from https://www.cms.gov/files/zip/puf2019.zip
        # (despite the extension, it's a cpt file)
        msg = "Header record indicates a CPORT file, which is not readable."
        with pytest.raises(ValueError, match=msg):
            read_sas(file05, format="xport")
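
# A usage sketch tying these tests together (the path is illustrative; the
# file name comes from the file01 fixture above): read_sas infers the format
# from the ".xpt" extension when given a path, and chunked reads compose with
# the context-manager API exercised in test1_incremental.
#
#   df = pd.read_sas("DEMO_G.xpt")  # format inferred from extension
#   with pd.read_sas("DEMO_G.xpt", chunksize=1000) as reader:
#       df_chunks = pd.concat(reader, ignore_index=True)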