lib/python3.11/site-packages/pandas/tests/io/sas/test_byteswap.py (new file, 55 lines)
@@ -0,0 +1,55 @@
from hypothesis import (
    assume,
    example,
    given,
    strategies as st,
)
import numpy as np
import pytest

from pandas._libs.byteswap import (
    read_double_with_byteswap,
    read_float_with_byteswap,
    read_uint16_with_byteswap,
    read_uint32_with_byteswap,
    read_uint64_with_byteswap,
)

import pandas._testing as tm


@given(read_offset=st.integers(0, 11), number=st.integers(min_value=0))
@example(number=2**16, read_offset=0)
@example(number=2**32, read_offset=0)
@example(number=2**64, read_offset=0)
@pytest.mark.parametrize("int_type", [np.uint16, np.uint32, np.uint64])
@pytest.mark.parametrize("should_byteswap", [True, False])
def test_int_byteswap(read_offset, number, int_type, should_byteswap):
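    # Hypothesis can generate arbitrarily large integers; discard any that do
    # not fit in the target unsigned type before round-tripping.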
    assume(number < 2 ** (8 * int_type(0).itemsize))
    _test(number, int_type, read_offset, should_byteswap)


@pytest.mark.filterwarnings("ignore:overflow encountered:RuntimeWarning")
@given(read_offset=st.integers(0, 11), number=st.floats())
@pytest.mark.parametrize("float_type", [np.float32, np.float64])
@pytest.mark.parametrize("should_byteswap", [True, False])
def test_float_byteswap(read_offset, number, float_type, should_byteswap):
    _test(number, float_type, read_offset, should_byteswap)


def _test(number, number_type, read_offset, should_byteswap):
    number = number_type(number)
    data = np.random.default_rng(2).integers(0, 256, size=20, dtype="uint8")
    data[read_offset : read_offset + number.itemsize] = number[None].view("uint8")
    swap_func = {
        np.float32: read_float_with_byteswap,
        np.float64: read_double_with_byteswap,
        np.uint16: read_uint16_with_byteswap,
        np.uint32: read_uint32_with_byteswap,
        np.uint64: read_uint64_with_byteswap,
    }[type(number)]
    output_number = number_type(swap_func(bytes(data), read_offset, should_byteswap))
    if should_byteswap:
        tm.assert_equal(output_number, number.byteswap())
    else:
        tm.assert_equal(output_number, number)
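
# A minimal pure-Python sketch of the behavior the helpers above are expected
# to have, inferred only from the call signature used in _test (the function
# name and NumPy-based implementation are illustrative, not pandas internals):
def read_uint16_with_byteswap_sketch(data: bytes, offset: int, byteswap: bool) -> int:
    # Read a native-endian uint16 at `offset`, optionally reversing its bytes.
    value = np.frombuffer(data, dtype=np.uint16, count=1, offset=offset)[0]
    return int(value.byteswap()) if byteswap else int(value)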
lib/python3.11/site-packages/pandas/tests/io/sas/test_sas.py (new file, 34 lines)
@@ -0,0 +1,34 @@
from io import StringIO

import pytest

from pandas import read_sas
import pandas._testing as tm


class TestSas:
    def test_sas_buffer_format(self):
        # see gh-14947
        b = StringIO("")

        msg = (
            "If this is a buffer object rather than a string "
            "name, you must specify a format string"
        )
        with pytest.raises(ValueError, match=msg):
            read_sas(b)

    def test_sas_read_no_format_or_extension(self):
        # see gh-24548
        msg = "unable to infer format of SAS file.+"
        with tm.ensure_clean("test_file_no_extension") as path:
            with pytest.raises(ValueError, match=msg):
                read_sas(path)


def test_sas_archive(datapath):
    fname_uncompressed = datapath("io", "sas", "data", "airline.sas7bdat")
    df_uncompressed = read_sas(fname_uncompressed)
    fname_compressed = datapath("io", "sas", "data", "airline.sas7bdat.gz")
    df_compressed = read_sas(fname_compressed, format="sas7bdat")
    tm.assert_frame_equal(df_uncompressed, df_compressed)
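
# The buffer error case above in positive form, as a usage sketch: passing
# format= satisfies the check exercised by test_sas_buffer_format. The file
# path here is illustrative.
from io import BytesIO
from pandas import read_sas

with open("airline.sas7bdat", "rb") as f:
    df = read_sas(BytesIO(f.read()), format="sas7bdat")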

lib/python3.11/site-packages/pandas/tests/io/sas/test_sas7bdat.py (new file, 421 lines)
@@ -0,0 +1,421 @@
import contextlib
from datetime import datetime
import io
import os
from pathlib import Path

import numpy as np
import pytest

from pandas.compat import IS64
from pandas.errors import EmptyDataError
import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm

from pandas.io.sas.sas7bdat import SAS7BDATReader


@pytest.fixture
def dirpath(datapath):
    return datapath("io", "sas", "data")


@pytest.fixture(params=[(1, range(1, 16)), (2, [16])])
def data_test_ix(request, dirpath):
    i, test_ix = request.param
    fname = os.path.join(dirpath, f"test_sas7bdat_{i}.csv")
    df = pd.read_csv(fname)
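    # SAS stores dates and datetimes as offsets from the 1960-01-01 epoch;
    # Column4/Column12 hold day counts, hence unit="d" below.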
    epoch = datetime(1960, 1, 1)
    t1 = pd.to_timedelta(df["Column4"], unit="d")
    df["Column4"] = (epoch + t1).astype("M8[s]")
    t2 = pd.to_timedelta(df["Column12"], unit="d")
    df["Column12"] = (epoch + t2).astype("M8[s]")
    for k in range(df.shape[1]):
        col = df.iloc[:, k]
        if col.dtype == np.int64:
            df.isetitem(k, df.iloc[:, k].astype(np.float64))
    return df, test_ix


# https://github.com/cython/cython/issues/1720
class TestSAS7BDAT:
    @pytest.mark.slow
    def test_from_file(self, dirpath, data_test_ix):
        expected, test_ix = data_test_ix
        for k in test_ix:
            fname = os.path.join(dirpath, f"test{k}.sas7bdat")
            df = pd.read_sas(fname, encoding="utf-8")
            tm.assert_frame_equal(df, expected)

    @pytest.mark.slow
    def test_from_buffer(self, dirpath, data_test_ix):
        expected, test_ix = data_test_ix
        for k in test_ix:
            fname = os.path.join(dirpath, f"test{k}.sas7bdat")
            with open(fname, "rb") as f:
                byts = f.read()
            buf = io.BytesIO(byts)
            with pd.read_sas(
                buf, format="sas7bdat", iterator=True, encoding="utf-8"
            ) as rdr:
                df = rdr.read()
            tm.assert_frame_equal(df, expected)

    @pytest.mark.slow
    def test_from_iterator(self, dirpath, data_test_ix):
        expected, test_ix = data_test_ix
        for k in test_ix:
            fname = os.path.join(dirpath, f"test{k}.sas7bdat")
            with pd.read_sas(fname, iterator=True, encoding="utf-8") as rdr:
                df = rdr.read(2)
                tm.assert_frame_equal(df, expected.iloc[0:2, :])
                df = rdr.read(3)
                tm.assert_frame_equal(df, expected.iloc[2:5, :])

    @pytest.mark.slow
    def test_path_pathlib(self, dirpath, data_test_ix):
        expected, test_ix = data_test_ix
        for k in test_ix:
            fname = Path(os.path.join(dirpath, f"test{k}.sas7bdat"))
            df = pd.read_sas(fname, encoding="utf-8")
            tm.assert_frame_equal(df, expected)

    @td.skip_if_no("py.path")
    @pytest.mark.slow
    def test_path_localpath(self, dirpath, data_test_ix):
        from py.path import local as LocalPath

        expected, test_ix = data_test_ix
        for k in test_ix:
            fname = LocalPath(os.path.join(dirpath, f"test{k}.sas7bdat"))
            df = pd.read_sas(fname, encoding="utf-8")
            tm.assert_frame_equal(df, expected)

    @pytest.mark.slow
    @pytest.mark.parametrize("chunksize", (3, 5, 10, 11))
    @pytest.mark.parametrize("k", range(1, 17))
    def test_iterator_loop(self, dirpath, k, chunksize):
        # github #13654
        fname = os.path.join(dirpath, f"test{k}.sas7bdat")
        with pd.read_sas(fname, chunksize=chunksize, encoding="utf-8") as rdr:
            y = 0
            for x in rdr:
                y += x.shape[0]
        assert y == rdr.row_count

    def test_iterator_read_too_much(self, dirpath):
        # github #14734
        fname = os.path.join(dirpath, "test1.sas7bdat")
        with pd.read_sas(
            fname, format="sas7bdat", iterator=True, encoding="utf-8"
        ) as rdr:
            d1 = rdr.read(rdr.row_count + 20)

        with pd.read_sas(fname, iterator=True, encoding="utf-8") as rdr:
            d2 = rdr.read(rdr.row_count + 20)
        tm.assert_frame_equal(d1, d2)


def test_encoding_options(datapath):
    fname = datapath("io", "sas", "data", "test1.sas7bdat")
    df1 = pd.read_sas(fname)
    df2 = pd.read_sas(fname, encoding="utf-8")
    for col in df1.columns:
        try:
            df1[col] = df1[col].str.decode("utf-8")
        except AttributeError:
            pass
    tm.assert_frame_equal(df1, df2)

    with contextlib.closing(SAS7BDATReader(fname, convert_header_text=False)) as rdr:
        df3 = rdr.read()
    for x, y in zip(df1.columns, df3.columns):
        assert x == y.decode()


def test_encoding_infer(datapath):
    fname = datapath("io", "sas", "data", "test1.sas7bdat")

    with pd.read_sas(fname, encoding="infer", iterator=True) as df1_reader:
        # check: is encoding inferred correctly from file
        assert df1_reader.inferred_encoding == "cp1252"
        df1 = df1_reader.read()

    with pd.read_sas(fname, encoding="cp1252", iterator=True) as df2_reader:
        df2 = df2_reader.read()

    # check: reader reads correct information
    tm.assert_frame_equal(df1, df2)


def test_productsales(datapath):
    fname = datapath("io", "sas", "data", "productsales.sas7bdat")
    df = pd.read_sas(fname, encoding="utf-8")
    fname = datapath("io", "sas", "data", "productsales.csv")
    df0 = pd.read_csv(fname, parse_dates=["MONTH"])
    vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"]
    df0[vn] = df0[vn].astype(np.float64)

    df0["MONTH"] = df0["MONTH"].astype("M8[s]")
    tm.assert_frame_equal(df, df0)


def test_12659(datapath):
    fname = datapath("io", "sas", "data", "test_12659.sas7bdat")
    df = pd.read_sas(fname)
    fname = datapath("io", "sas", "data", "test_12659.csv")
    df0 = pd.read_csv(fname)
    df0 = df0.astype(np.float64)
    tm.assert_frame_equal(df, df0)


def test_airline(datapath):
    fname = datapath("io", "sas", "data", "airline.sas7bdat")
    df = pd.read_sas(fname)
    fname = datapath("io", "sas", "data", "airline.csv")
    df0 = pd.read_csv(fname)
    df0 = df0.astype(np.float64)
    tm.assert_frame_equal(df, df0)


def test_date_time(datapath):
    # Support of different SAS date/datetime formats (PR #15871)
    fname = datapath("io", "sas", "data", "datetime.sas7bdat")
    df = pd.read_sas(fname)
    fname = datapath("io", "sas", "data", "datetime.csv")
    df0 = pd.read_csv(
        fname, parse_dates=["Date1", "Date2", "DateTime", "DateTimeHi", "Taiw"]
    )
    # GH 19732: Timestamps imported from sas will incur floating point errors
    # See GH#56014 for discussion of the correct "expected" results
    #  We are really just testing that we are "close". This only seems to be
    #  an issue near the implementation bounds.

    df[df.columns[3]] = df.iloc[:, 3].dt.round("us")
    df0["Date1"] = df0["Date1"].astype("M8[s]")
    df0["Date2"] = df0["Date2"].astype("M8[s]")
    df0["DateTime"] = df0["DateTime"].astype("M8[ms]")
    df0["Taiw"] = df0["Taiw"].astype("M8[s]")

    res = df0["DateTimeHi"].astype("M8[us]").dt.round("ms")
    df0["DateTimeHi"] = res.astype("M8[ms]")

    if not IS64:
        # No good reason for this, just what we get on the CI
        df0.loc[0, "DateTimeHi"] += np.timedelta64(1, "ms")
        df0.loc[[2, 3], "DateTimeHi"] -= np.timedelta64(1, "ms")
    tm.assert_frame_equal(df, df0)


@pytest.mark.parametrize("column", ["WGT", "CYL"])
def test_compact_numerical_values(datapath, column):
    # Regression test for #21616
    fname = datapath("io", "sas", "data", "cars.sas7bdat")
    df = pd.read_sas(fname, encoding="latin-1")
    # The two columns CYL and WGT in cars.sas7bdat have column
    # width < 8 and only contain integral values.
    # Test that pandas doesn't corrupt the numbers by adding
    # decimals.
    result = df[column]
    expected = df[column].round()
    tm.assert_series_equal(result, expected, check_exact=True)


def test_many_columns(datapath):
    # Test for looking for column information in more places (PR #22628)
    fname = datapath("io", "sas", "data", "many_columns.sas7bdat")

    df = pd.read_sas(fname, encoding="latin-1")

    fname = datapath("io", "sas", "data", "many_columns.csv")
    df0 = pd.read_csv(fname, encoding="latin-1")
    tm.assert_frame_equal(df, df0)


def test_inconsistent_number_of_rows(datapath):
    # Regression test for issue #16615. (PR #22628)
    fname = datapath("io", "sas", "data", "load_log.sas7bdat")
    df = pd.read_sas(fname, encoding="latin-1")
    assert len(df) == 2097


def test_zero_variables(datapath):
    # Check if the SAS file has zero variables (PR #18184)
    fname = datapath("io", "sas", "data", "zero_variables.sas7bdat")
    with pytest.raises(EmptyDataError, match="No columns to parse from file"):
        pd.read_sas(fname)


@pytest.mark.parametrize("encoding", [None, "utf8"])
def test_zero_rows(datapath, encoding):
    # GH 18198
    fname = datapath("io", "sas", "data", "zero_rows.sas7bdat")
    result = pd.read_sas(fname, encoding=encoding)
    str_value = b"a" if encoding is None else "a"
    expected = pd.DataFrame([{"char_field": str_value, "num_field": 1.0}]).iloc[:0]
    tm.assert_frame_equal(result, expected)


def test_corrupt_read(datapath):
    # We don't really care about the exact failure, the important thing is
    # that the resource should be cleaned up afterwards (BUG #35566)
    fname = datapath("io", "sas", "data", "corrupt.sas7bdat")
    msg = "'SAS7BDATReader' object has no attribute 'row_count'"
    with pytest.raises(AttributeError, match=msg):
        pd.read_sas(fname)


def test_max_sas_date(datapath):
    # GH 20927
    # NB. max datetime in SAS dataset is 31DEC9999:23:59:59.999
    #    but this is read as 29DEC9999:23:59:59.998993 by a buggy
    #    sas7bdat module
    # See also GH#56014 for discussion of the correct "expected" results.
    fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat")
    df = pd.read_sas(fname, encoding="iso-8859-1")

    expected = pd.DataFrame(
        {
            "text": ["max", "normal"],
            "dt_as_float": [253717747199.999, 1880323199.999],
            "dt_as_dt": np.array(
                [
                    datetime(9999, 12, 29, 23, 59, 59, 999000),
                    datetime(2019, 8, 1, 23, 59, 59, 999000),
                ],
                dtype="M8[ms]",
            ),
            "date_as_float": [2936547.0, 21762.0],
            "date_as_date": np.array(
                [
                    datetime(9999, 12, 29),
                    datetime(2019, 8, 1),
                ],
                dtype="M8[s]",
            ),
        },
        columns=["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"],
    )

    if not IS64:
        # No good reason for this, just what we get on the CI
        expected.loc[:, "dt_as_dt"] -= np.timedelta64(1, "ms")

    tm.assert_frame_equal(df, expected)
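
# A quick sanity check on the raw values above, assuming the standard SAS
# convention of seconds (datetimes) and days (dates) since 1960-01-01;
# the name _sas_epoch is illustrative, not part of the test suite:
#   _sas_epoch = pd.Timestamp("1960-01-01")
#   _sas_epoch + pd.to_timedelta(1880323199.999, unit="s")
#       -> ~2019-08-01 23:59:59.999 (subject to the GH 19732 rounding caveat)
#   _sas_epoch + pd.to_timedelta(21762.0, unit="D")
#       -> 2019-08-01 00:00:00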


def test_max_sas_date_iterator(datapath):
    # GH 20927
    # when called as an iterator, only those chunks with a date > pd.Timestamp.max
    # are returned as datetime.datetime; if this happens, that whole chunk is
    # returned as datetime.datetime
    col_order = ["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"]
    fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat")
    results = []
    for df in pd.read_sas(fname, encoding="iso-8859-1", chunksize=1):
        # GH 19732: Timestamps imported from sas will incur floating point errors
        df.reset_index(inplace=True, drop=True)
        results.append(df)
    expected = [
        pd.DataFrame(
            {
                "text": ["max"],
                "dt_as_float": [253717747199.999],
                "dt_as_dt": np.array(
                    [datetime(9999, 12, 29, 23, 59, 59, 999000)], dtype="M8[ms]"
                ),
                "date_as_float": [2936547.0],
                "date_as_date": np.array([datetime(9999, 12, 29)], dtype="M8[s]"),
            },
            columns=col_order,
        ),
        pd.DataFrame(
            {
                "text": ["normal"],
                "dt_as_float": [1880323199.999],
                "dt_as_dt": np.array(["2019-08-01 23:59:59.999"], dtype="M8[ms]"),
                "date_as_float": [21762.0],
                "date_as_date": np.array(["2019-08-01"], dtype="M8[s]"),
            },
            columns=col_order,
        ),
    ]
    if not IS64:
        # No good reason for this, just what we get on the CI
        expected[0].loc[0, "dt_as_dt"] -= np.timedelta64(1, "ms")
        expected[1].loc[0, "dt_as_dt"] -= np.timedelta64(1, "ms")

    tm.assert_frame_equal(results[0], expected[0])
    tm.assert_frame_equal(results[1], expected[1])


def test_null_date(datapath):
    fname = datapath("io", "sas", "data", "dates_null.sas7bdat")
    df = pd.read_sas(fname, encoding="utf-8")

    expected = pd.DataFrame(
        {
            "datecol": np.array(
                [
                    datetime(9999, 12, 29),
                    np.datetime64("NaT"),
                ],
                dtype="M8[s]",
            ),
            "datetimecol": np.array(
                [
                    datetime(9999, 12, 29, 23, 59, 59, 999000),
                    np.datetime64("NaT"),
                ],
                dtype="M8[ms]",
            ),
        },
    )
    if not IS64:
        # No good reason for this, just what we get on the CI
        expected.loc[0, "datetimecol"] -= np.timedelta64(1, "ms")
    tm.assert_frame_equal(df, expected)


def test_meta2_page(datapath):
    # GH 35545
    fname = datapath("io", "sas", "data", "test_meta2_page.sas7bdat")
    df = pd.read_sas(fname)
    assert len(df) == 1000


@pytest.mark.parametrize(
    "test_file, override_offset, override_value, expected_msg",
    [
        ("test2.sas7bdat", 0x10000 + 55229, 0x80 | 0x0F, "Out of bounds"),
        ("test2.sas7bdat", 0x10000 + 55229, 0x10, "unknown control byte"),
        ("test3.sas7bdat", 118170, 184, "Out of bounds"),
    ],
)
def test_rle_rdc_exceptions(
    datapath, test_file, override_offset, override_value, expected_msg
):
    """Errors in RLE/RDC decompression should propagate."""
    with open(datapath("io", "sas", "data", test_file), "rb") as fd:
        data = bytearray(fd.read())
    data[override_offset] = override_value
    with pytest.raises(Exception, match=expected_msg):
        pd.read_sas(io.BytesIO(data), format="sas7bdat")


def test_0x40_control_byte(datapath):
    # GH 31243
    fname = datapath("io", "sas", "data", "0x40controlbyte.sas7bdat")
    df = pd.read_sas(fname, encoding="ascii")
    fname = datapath("io", "sas", "data", "0x40controlbyte.csv")
    df0 = pd.read_csv(fname, dtype="str")
    tm.assert_frame_equal(df, df0)


def test_0x00_control_byte(datapath):
    # GH 47099
    fname = datapath("io", "sas", "data", "0x00controlbyte.sas7bdat.bz2")
    df = next(pd.read_sas(fname, chunksize=11_000))
    assert df.shape == (11_000, 20)
lib/python3.11/site-packages/pandas/tests/io/sas/test_xport.py (new file, 167 lines)
@@ -0,0 +1,167 @@
import numpy as np
import pytest

import pandas as pd
import pandas._testing as tm

from pandas.io.sas.sasreader import read_sas

# CSV versions of test xpt files were obtained using the R foreign library

# Numbers in a SAS xport file are always float64, so need to convert
# before making comparisons.


def numeric_as_float(data):
    for v in data.columns:
        if data[v].dtype is np.dtype("int64"):
            data[v] = data[v].astype(np.float64)


class TestXport:
    @pytest.fixture
    def file01(self, datapath):
        return datapath("io", "sas", "data", "DEMO_G.xpt")

    @pytest.fixture
    def file02(self, datapath):
        return datapath("io", "sas", "data", "SSHSV1_A.xpt")

    @pytest.fixture
    def file03(self, datapath):
        return datapath("io", "sas", "data", "DRXFCD_G.xpt")

    @pytest.fixture
    def file04(self, datapath):
        return datapath("io", "sas", "data", "paxraw_d_short.xpt")

    @pytest.fixture
    def file05(self, datapath):
        return datapath("io", "sas", "data", "DEMO_PUF.cpt")

    @pytest.mark.slow
    def test1_basic(self, file01):
        # Tests with DEMO_G.xpt (all numeric file)

        # Compare to this
        data_csv = pd.read_csv(file01.replace(".xpt", ".csv"))
        numeric_as_float(data_csv)

        # Read full file
        data = read_sas(file01, format="xport")
        tm.assert_frame_equal(data, data_csv)
        num_rows = data.shape[0]

        # Test reading beyond end of file
        with read_sas(file01, format="xport", iterator=True) as reader:
            data = reader.read(num_rows + 100)
        assert data.shape[0] == num_rows

        # Test incremental read with `read` method.
        with read_sas(file01, format="xport", iterator=True) as reader:
            data = reader.read(10)
        tm.assert_frame_equal(data, data_csv.iloc[0:10, :])

        # Test incremental read with `get_chunk` method.
        with read_sas(file01, format="xport", chunksize=10) as reader:
            data = reader.get_chunk()
        tm.assert_frame_equal(data, data_csv.iloc[0:10, :])

        # Test read in loop
        m = 0
        with read_sas(file01, format="xport", chunksize=100) as reader:
            for x in reader:
                m += x.shape[0]
        assert m == num_rows

        # Read full file with `read_sas` method
        data = read_sas(file01)
        tm.assert_frame_equal(data, data_csv)

    def test1_index(self, file01):
        # Tests with DEMO_G.xpt using index (all numeric file)

        # Compare to this
        data_csv = pd.read_csv(file01.replace(".xpt", ".csv"))
        data_csv = data_csv.set_index("SEQN")
        numeric_as_float(data_csv)

        # Read full file
        data = read_sas(file01, index="SEQN", format="xport")
        tm.assert_frame_equal(data, data_csv, check_index_type=False)

        # Test incremental read with `read` method.
        with read_sas(file01, index="SEQN", format="xport", iterator=True) as reader:
            data = reader.read(10)
        tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False)

        # Test incremental read with `get_chunk` method.
        with read_sas(file01, index="SEQN", format="xport", chunksize=10) as reader:
            data = reader.get_chunk()
        tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False)

    def test1_incremental(self, file01):
        # Test with DEMO_G.xpt, reading full file incrementally

        data_csv = pd.read_csv(file01.replace(".xpt", ".csv"))
        data_csv = data_csv.set_index("SEQN")
        numeric_as_float(data_csv)

        with read_sas(file01, index="SEQN", chunksize=1000) as reader:
            all_data = list(reader)
        data = pd.concat(all_data, axis=0)

        tm.assert_frame_equal(data, data_csv, check_index_type=False)

    def test2(self, file02):
        # Test with SSHSV1_A.xpt

        # Compare to this
        data_csv = pd.read_csv(file02.replace(".xpt", ".csv"))
        numeric_as_float(data_csv)

        data = read_sas(file02)
        tm.assert_frame_equal(data, data_csv)

    def test2_binary(self, file02):
        # Test with SSHSV1_A.xpt, read as a binary file

        # Compare to this
        data_csv = pd.read_csv(file02.replace(".xpt", ".csv"))
        numeric_as_float(data_csv)

        with open(file02, "rb") as fd:
            # GH#35693 ensure that if we pass an open file, we
            # don't incorrectly close it in read_sas
            data = read_sas(fd, format="xport")

        tm.assert_frame_equal(data, data_csv)

    def test_multiple_types(self, file03):
        # Test with DRXFCD_G.xpt (contains text and numeric variables)

        # Compare to this
        data_csv = pd.read_csv(file03.replace(".xpt", ".csv"))

        data = read_sas(file03, encoding="utf-8")
        tm.assert_frame_equal(data, data_csv)

    def test_truncated_float_support(self, file04):
        # Test with paxraw_d_short.xpt, a shortened version of:
        # http://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/PAXRAW_D.ZIP
        # This file has truncated floats (5 bytes in this case).

        # GH 11713

        data_csv = pd.read_csv(file04.replace(".xpt", ".csv"))

        data = read_sas(file04, format="xport")
        tm.assert_frame_equal(data.astype("int64"), data_csv)

    def test_cport_header_found_raises(self, file05):
        # Test with DEMO_PUF.cpt, the beginning of puf2019_1_fall.xpt
        # from https://www.cms.gov/files/zip/puf2019.zip
        # (despite the extension, it's a cpt file)
        msg = "Header record indicates a CPORT file, which is not readable."
        with pytest.raises(ValueError, match=msg):
            read_sas(file05, format="xport")
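
# A usage sketch tying these tests together (the path is illustrative; the
# file name comes from the file01 fixture above): read_sas infers the format
# from the ".xpt" extension when given a path, and chunked reads compose with
# the context-manager API exercised in test1_incremental.
#
#   df = pd.read_sas("DEMO_G.xpt")  # format inferred from extension
#   with pd.read_sas("DEMO_G.xpt", chunksize=1000) as reader:
#       df_chunks = pd.concat(reader, ignore_index=True)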