done

2025-09-07 22:09:54 +02:00
parent e1b817252c
commit 2fc0d000b6
7796 changed files with 2159515 additions and 933 deletions
--- a/lib/python3.11/site-packages/pandas/io/excel/_pyxlsb.py
+++ b/lib/python3.11/site-packages/pandas/io/excel/_pyxlsb.py
@ -0,0 +1,127 @@
+# pyright: reportMissingImports=false
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from pandas.compat._optional import import_optional_dependency
+from pandas.util._decorators import doc
+
+from pandas.core.shared_docs import _shared_docs
+
+from pandas.io.excel._base import BaseExcelReader
+
+if TYPE_CHECKING:
+    from pyxlsb import Workbook
+
+    from pandas._typing import (
+        FilePath,
+        ReadBuffer,
+        Scalar,
+        StorageOptions,
+    )
+
+
+class PyxlsbReader(BaseExcelReader["Workbook"]):
+    @doc(storage_options=_shared_docs["storage_options"])
+    def __init__(
+        self,
+        filepath_or_buffer: FilePath | ReadBuffer[bytes],
+        storage_options: StorageOptions | None = None,
+        engine_kwargs: dict | None = None,
+    ) -> None:
+        """
+        Reader using pyxlsb engine.
+
+        Parameters
+        ----------
+        filepath_or_buffer : str, path object, or Workbook
+            Object to be parsed.
+        {storage_options}
+        engine_kwargs : dict, optional
+            Arbitrary keyword arguments passed to excel engine.
+        """
+        import_optional_dependency("pyxlsb")
+        # This will call load_workbook on the filepath or buffer
+        # And set the result to the book-attribute
+        super().__init__(
+            filepath_or_buffer,
+            storage_options=storage_options,
+            engine_kwargs=engine_kwargs,
+        )
+
+    @property
+    def _workbook_class(self) -> type[Workbook]:
+        from pyxlsb import Workbook
+
+        return Workbook
+
+    def load_workbook(
+        self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
+    ) -> Workbook:
+        from pyxlsb import open_workbook
+
+        # TODO: hack in buffer capability
+        # This might need some modifications to the Pyxlsb library
+        # Actual work for opening it is in xlsbpackage.py, line 20-ish
+
+        return open_workbook(filepath_or_buffer, **engine_kwargs)
+
+    @property
+    def sheet_names(self) -> list[str]:
+        return self.book.sheets
+
+    def get_sheet_by_name(self, name: str):
+        self.raise_if_bad_sheet_by_name(name)
+        return self.book.get_sheet(name)
+
+    def get_sheet_by_index(self, index: int):
+        self.raise_if_bad_sheet_by_index(index)
+        # pyxlsb sheets are indexed from 1 onwards
+        # There's a fix for this in the source, but the pypi package doesn't have it
+        return self.book.get_sheet(index + 1)
+
+    def _convert_cell(self, cell) -> Scalar:
+        # TODO: there is no way to distinguish between floats and datetimes in pyxlsb
+        # This means that there is no way to read datetime types from an xlsb file yet
+        if cell.v is None:
+            return ""  # Prevents non-named columns from not showing up as Unnamed: i
+        if isinstance(cell.v, float):
+            val = int(cell.v)
+            if val == cell.v:
+                return val
+            else:
+                return float(cell.v)
+
+        return cell.v
+
+    def get_sheet_data(
+        self,
+        sheet,
+        file_rows_needed: int | None = None,
+    ) -> list[list[Scalar]]:
+        data: list[list[Scalar]] = []
+        previous_row_number = -1
+        # When sparse=True the rows can have different lengths and empty rows are
+        # not returned. The cells are namedtuples of row, col, value (r, c, v).
+        for row in sheet.rows(sparse=True):
+            row_number = row[0].r
+            converted_row = [self._convert_cell(cell) for cell in row]
+            while converted_row and converted_row[-1] == "":
+                # trim trailing empty elements
+                converted_row.pop()
+            if converted_row:
+                data.extend([[]] * (row_number - previous_row_number - 1))
+                data.append(converted_row)
+                previous_row_number = row_number
+            if file_rows_needed is not None and len(data) >= file_rows_needed:
+                break
+        if data:
+            # extend rows to max_width
+            max_width = max(len(data_row) for data_row in data)
+            if min(len(data_row) for data_row in data) < max_width:
+                empty_cell: list[Scalar] = [""]
+                data = [
+                    data_row + (max_width - len(data_row)) * empty_cell
+                    for data_row in data
+                ]
+        return data