lib/python3.11/site-packages/pandas/io/__init__.py (new file, 13 lines)
@@ -0,0 +1,13 @@
# ruff: noqa: TCH004
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # import modules that have public classes/functions
    from pandas.io import (
        formats,
        json,
        stata,
    )

    # mark only those modules as public
    __all__ = ["formats", "json", "stata"]
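The TYPE_CHECKING guard above only affects static type checkers; at runtime the submodules remain importable in the usual way, and __all__ limits what a star import exposes. A minimal sketch (illustrative, not part of the committed file):

# Illustrative sketch only -- not part of the committed file.
from pandas.io import formats, json, stata  # regular runtime imports still work

# Star imports are limited to the names listed in __all__ above,
# and the submodules behave like normal modules:
print(json.read_json)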

lib/python3.11/site-packages/pandas/io/_util.py (new file, 94 lines)
@@ -0,0 +1,94 @@
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Literal,
)

import numpy as np

from pandas._config import using_string_dtype

from pandas._libs import lib
from pandas.compat import (
    pa_version_under18p0,
    pa_version_under19p0,
)
from pandas.compat._optional import import_optional_dependency

import pandas as pd

if TYPE_CHECKING:
    from collections.abc import Callable

    import pyarrow

    from pandas._typing import DtypeBackend


def _arrow_dtype_mapping() -> dict:
    pa = import_optional_dependency("pyarrow")
    return {
        pa.int8(): pd.Int8Dtype(),
        pa.int16(): pd.Int16Dtype(),
        pa.int32(): pd.Int32Dtype(),
        pa.int64(): pd.Int64Dtype(),
        pa.uint8(): pd.UInt8Dtype(),
        pa.uint16(): pd.UInt16Dtype(),
        pa.uint32(): pd.UInt32Dtype(),
        pa.uint64(): pd.UInt64Dtype(),
        pa.bool_(): pd.BooleanDtype(),
        pa.string(): pd.StringDtype(),
        pa.float32(): pd.Float32Dtype(),
        pa.float64(): pd.Float64Dtype(),
        pa.string(): pd.StringDtype(),
        pa.large_string(): pd.StringDtype(),
    }


def _arrow_string_types_mapper() -> Callable:
    pa = import_optional_dependency("pyarrow")

    mapping = {
        pa.string(): pd.StringDtype(na_value=np.nan),
        pa.large_string(): pd.StringDtype(na_value=np.nan),
    }
    if not pa_version_under18p0:
        mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan)

    return mapping.get


def arrow_table_to_pandas(
    table: pyarrow.Table,
    dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default,
    null_to_int64: bool = False,
    to_pandas_kwargs: dict | None = None,
) -> pd.DataFrame:
    if to_pandas_kwargs is None:
        to_pandas_kwargs = {}

    pa = import_optional_dependency("pyarrow")

    types_mapper: type[pd.ArrowDtype] | None | Callable
    if dtype_backend == "numpy_nullable":
        mapping = _arrow_dtype_mapping()
        if null_to_int64:
            # Modify the default mapping to also map null to Int64
            # (to match other engines - only for CSV parser)
            mapping[pa.null()] = pd.Int64Dtype()
        types_mapper = mapping.get
    elif dtype_backend == "pyarrow":
        types_mapper = pd.ArrowDtype
    elif using_string_dtype():
        if pa_version_under19p0:
            types_mapper = _arrow_string_types_mapper()
        else:
            types_mapper = None
    elif dtype_backend is lib.no_default or dtype_backend == "numpy":
        types_mapper = None
    else:
        raise NotImplementedError

    df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs)
    return df
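As a rough illustration of how arrow_table_to_pandas maps dtype_backend onto the types_mapper passed to pyarrow, the sketch below converts the same Arrow table three ways. It assumes pyarrow is installed, and it calls a private helper, so it is shown for illustration only:

# Illustrative sketch (assumes pyarrow is installed). arrow_table_to_pandas is
# a private helper; this only demonstrates the dtype_backend branches above.
import pyarrow as pa

from pandas.io._util import arrow_table_to_pandas

table = pa.table({"a": [1, 2, None], "b": ["x", None, "z"]})

df_numpy = arrow_table_to_pandas(table, dtype_backend="numpy")            # plain NumPy dtypes, NaN for nulls
df_masked = arrow_table_to_pandas(table, dtype_backend="numpy_nullable")  # Int64/string extension dtypes
df_arrow = arrow_table_to_pandas(table, dtype_backend="pyarrow")          # ArrowDtype-backed columns

print(df_masked.dtypes)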

lib/python3.11/site-packages/pandas/io/api.py (new file, 65 lines)
@@ -0,0 +1,65 @@
"""
Data IO api
"""

from pandas.io.clipboards import read_clipboard
from pandas.io.excel import (
    ExcelFile,
    ExcelWriter,
    read_excel,
)
from pandas.io.feather_format import read_feather
from pandas.io.gbq import read_gbq
from pandas.io.html import read_html
from pandas.io.json import read_json
from pandas.io.orc import read_orc
from pandas.io.parquet import read_parquet
from pandas.io.parsers import (
    read_csv,
    read_fwf,
    read_table,
)
from pandas.io.pickle import (
    read_pickle,
    to_pickle,
)
from pandas.io.pytables import (
    HDFStore,
    read_hdf,
)
from pandas.io.sas import read_sas
from pandas.io.spss import read_spss
from pandas.io.sql import (
    read_sql,
    read_sql_query,
    read_sql_table,
)
from pandas.io.stata import read_stata
from pandas.io.xml import read_xml

__all__ = [
    "ExcelFile",
    "ExcelWriter",
    "HDFStore",
    "read_clipboard",
    "read_csv",
    "read_excel",
    "read_feather",
    "read_fwf",
    "read_gbq",
    "read_hdf",
    "read_html",
    "read_json",
    "read_orc",
    "read_parquet",
    "read_pickle",
    "read_sas",
    "read_spss",
    "read_sql",
    "read_sql_query",
    "read_sql_table",
    "read_stata",
    "read_table",
    "read_xml",
    "to_pickle",
]
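pandas.io.api is what backs the familiar top-level reader namespace, so the names in __all__ above are normally reached through pandas itself. A small round-trip sketch (illustrative only):

# Illustrative sketch: the readers re-exported above are used via the top-level
# pandas namespace in everyday code.
from io import StringIO

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3.5, 4.5]})
csv_text = df.to_csv(index=False)
roundtripped = pd.read_csv(StringIO(csv_text))  # same function exported here as read_csv
print(roundtripped.equals(df))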

lib/python3.11/site-packages/pandas/io/clipboard/__init__.py (new file, 747 lines)
@@ -0,0 +1,747 @@
|
||||
"""
|
||||
Pyperclip
|
||||
|
||||
A cross-platform clipboard module for Python,
|
||||
with copy & paste functions for plain text.
|
||||
By Al Sweigart al@inventwithpython.com
|
||||
Licence at LICENSES/PYPERCLIP_LICENSE
|
||||
|
||||
Usage:
|
||||
import pyperclip
|
||||
pyperclip.copy('The text to be copied to the clipboard.')
|
||||
spam = pyperclip.paste()
|
||||
|
||||
if not pyperclip.is_available():
|
||||
print("Copy functionality unavailable!")
|
||||
|
||||
On Windows, no additional modules are needed.
|
||||
On Mac, the pyobjc module is used, falling back to the pbcopy and pbpaste cli
|
||||
commands. (These commands should come with OS X.).
|
||||
On Linux, install xclip, xsel, or wl-clipboard (for "wayland" sessions) via
|
||||
package manager.
|
||||
For example, in Debian:
|
||||
sudo apt-get install xclip
|
||||
sudo apt-get install xsel
|
||||
sudo apt-get install wl-clipboard
|
||||
|
||||
Otherwise on Linux, you will need the PyQt5 modules installed.
|
||||
|
||||
This module does not work with PyGObject yet.
|
||||
|
||||
Cygwin is currently not supported.
|
||||
|
||||
Security Note: This module runs programs with these names:
|
||||
- pbcopy
|
||||
- pbpaste
|
||||
- xclip
|
||||
- xsel
|
||||
- wl-copy/wl-paste
|
||||
- klipper
|
||||
- qdbus
|
||||
A malicious user could rename or add programs with these names, tricking
|
||||
Pyperclip into running them with whatever permissions the Python process has.
|
||||
|
||||
"""
|
||||
|
||||
__version__ = "1.8.2"
|
||||
|
||||
|
||||
import contextlib
|
||||
import ctypes
|
||||
from ctypes import (
|
||||
c_size_t,
|
||||
c_wchar,
|
||||
c_wchar_p,
|
||||
get_errno,
|
||||
sizeof,
|
||||
)
|
||||
import os
|
||||
import platform
|
||||
from shutil import which as _executable_exists
|
||||
import subprocess
|
||||
import time
|
||||
import warnings
|
||||
|
||||
from pandas.errors import (
|
||||
PyperclipException,
|
||||
PyperclipWindowsException,
|
||||
)
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
# `import PyQt4` sys.exit()s if DISPLAY is not in the environment.
|
||||
# Thus, we need to detect the presence of $DISPLAY manually
|
||||
# and not load PyQt4 if it is absent.
|
||||
HAS_DISPLAY = os.getenv("DISPLAY")
|
||||
|
||||
EXCEPT_MSG = """
|
||||
Pyperclip could not find a copy/paste mechanism for your system.
|
||||
For more information, please visit
|
||||
https://pyperclip.readthedocs.io/en/latest/index.html#not-implemented-error
|
||||
"""
|
||||
|
||||
ENCODING = "utf-8"
|
||||
|
||||
|
||||
class PyperclipTimeoutException(PyperclipException):
|
||||
pass
|
||||
|
||||
|
||||
def _stringifyText(text) -> str:
|
||||
acceptedTypes = (str, int, float, bool)
|
||||
if not isinstance(text, acceptedTypes):
|
||||
raise PyperclipException(
|
||||
f"only str, int, float, and bool values "
|
||||
f"can be copied to the clipboard, not {type(text).__name__}"
|
||||
)
|
||||
return str(text)
|
||||
|
||||
|
||||
def init_osx_pbcopy_clipboard():
|
||||
def copy_osx_pbcopy(text):
|
||||
text = _stringifyText(text) # Converts non-str values to str.
|
||||
with subprocess.Popen(
|
||||
["pbcopy", "w"], stdin=subprocess.PIPE, close_fds=True
|
||||
) as p:
|
||||
p.communicate(input=text.encode(ENCODING))
|
||||
|
||||
def paste_osx_pbcopy():
|
||||
with subprocess.Popen(
|
||||
["pbpaste", "r"], stdout=subprocess.PIPE, close_fds=True
|
||||
) as p:
|
||||
stdout = p.communicate()[0]
|
||||
return stdout.decode(ENCODING)
|
||||
|
||||
return copy_osx_pbcopy, paste_osx_pbcopy
|
||||
|
||||
|
||||
def init_osx_pyobjc_clipboard():
|
||||
def copy_osx_pyobjc(text):
|
||||
"""Copy string argument to clipboard"""
|
||||
text = _stringifyText(text) # Converts non-str values to str.
|
||||
newStr = Foundation.NSString.stringWithString_(text).nsstring()
|
||||
newData = newStr.dataUsingEncoding_(Foundation.NSUTF8StringEncoding)
|
||||
board = AppKit.NSPasteboard.generalPasteboard()
|
||||
board.declareTypes_owner_([AppKit.NSStringPboardType], None)
|
||||
board.setData_forType_(newData, AppKit.NSStringPboardType)
|
||||
|
||||
def paste_osx_pyobjc():
|
||||
"""Returns contents of clipboard"""
|
||||
board = AppKit.NSPasteboard.generalPasteboard()
|
||||
content = board.stringForType_(AppKit.NSStringPboardType)
|
||||
return content
|
||||
|
||||
return copy_osx_pyobjc, paste_osx_pyobjc
|
||||
|
||||
|
||||
def init_qt_clipboard():
|
||||
global QApplication
|
||||
# $DISPLAY should exist
|
||||
|
||||
# Try to import from qtpy, but if that fails try PyQt5 then PyQt4
|
||||
try:
|
||||
from qtpy.QtWidgets import QApplication
|
||||
except ImportError:
|
||||
try:
|
||||
from PyQt5.QtWidgets import QApplication
|
||||
except ImportError:
|
||||
from PyQt4.QtGui import QApplication
|
||||
|
||||
app = QApplication.instance()
|
||||
if app is None:
|
||||
app = QApplication([])
|
||||
|
||||
def copy_qt(text):
|
||||
text = _stringifyText(text) # Converts non-str values to str.
|
||||
cb = app.clipboard()
|
||||
cb.setText(text)
|
||||
|
||||
def paste_qt() -> str:
|
||||
cb = app.clipboard()
|
||||
return str(cb.text())
|
||||
|
||||
return copy_qt, paste_qt
|
||||
|
||||
|
||||
def init_xclip_clipboard():
|
||||
DEFAULT_SELECTION = "c"
|
||||
PRIMARY_SELECTION = "p"
|
||||
|
||||
def copy_xclip(text, primary=False):
|
||||
text = _stringifyText(text) # Converts non-str values to str.
|
||||
selection = DEFAULT_SELECTION
|
||||
if primary:
|
||||
selection = PRIMARY_SELECTION
|
||||
with subprocess.Popen(
|
||||
["xclip", "-selection", selection], stdin=subprocess.PIPE, close_fds=True
|
||||
) as p:
|
||||
p.communicate(input=text.encode(ENCODING))
|
||||
|
||||
def paste_xclip(primary=False):
|
||||
selection = DEFAULT_SELECTION
|
||||
if primary:
|
||||
selection = PRIMARY_SELECTION
|
||||
with subprocess.Popen(
|
||||
["xclip", "-selection", selection, "-o"],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
close_fds=True,
|
||||
) as p:
|
||||
stdout = p.communicate()[0]
|
||||
# Intentionally ignore extraneous output on stderr when clipboard is empty
|
||||
return stdout.decode(ENCODING)
|
||||
|
||||
return copy_xclip, paste_xclip
|
||||
|
||||
|
||||
def init_xsel_clipboard():
|
||||
DEFAULT_SELECTION = "-b"
|
||||
PRIMARY_SELECTION = "-p"
|
||||
|
||||
def copy_xsel(text, primary=False):
|
||||
text = _stringifyText(text) # Converts non-str values to str.
|
||||
selection_flag = DEFAULT_SELECTION
|
||||
if primary:
|
||||
selection_flag = PRIMARY_SELECTION
|
||||
with subprocess.Popen(
|
||||
["xsel", selection_flag, "-i"], stdin=subprocess.PIPE, close_fds=True
|
||||
) as p:
|
||||
p.communicate(input=text.encode(ENCODING))
|
||||
|
||||
def paste_xsel(primary=False):
|
||||
selection_flag = DEFAULT_SELECTION
|
||||
if primary:
|
||||
selection_flag = PRIMARY_SELECTION
|
||||
with subprocess.Popen(
|
||||
["xsel", selection_flag, "-o"], stdout=subprocess.PIPE, close_fds=True
|
||||
) as p:
|
||||
stdout = p.communicate()[0]
|
||||
return stdout.decode(ENCODING)
|
||||
|
||||
return copy_xsel, paste_xsel
|
||||
|
||||
|
||||
def init_wl_clipboard():
|
||||
PRIMARY_SELECTION = "-p"
|
||||
|
||||
def copy_wl(text, primary=False):
|
||||
text = _stringifyText(text) # Converts non-str values to str.
|
||||
args = ["wl-copy"]
|
||||
if primary:
|
||||
args.append(PRIMARY_SELECTION)
|
||||
if not text:
|
||||
args.append("--clear")
|
||||
subprocess.check_call(args, close_fds=True)
|
||||
else:
|
||||
p = subprocess.Popen(args, stdin=subprocess.PIPE, close_fds=True)
|
||||
p.communicate(input=text.encode(ENCODING))
|
||||
|
||||
def paste_wl(primary=False):
|
||||
args = ["wl-paste", "-n"]
|
||||
if primary:
|
||||
args.append(PRIMARY_SELECTION)
|
||||
p = subprocess.Popen(args, stdout=subprocess.PIPE, close_fds=True)
|
||||
stdout, _stderr = p.communicate()
|
||||
return stdout.decode(ENCODING)
|
||||
|
||||
return copy_wl, paste_wl
|
||||
|
||||
|
||||
def init_klipper_clipboard():
|
||||
def copy_klipper(text):
|
||||
text = _stringifyText(text) # Converts non-str values to str.
|
||||
with subprocess.Popen(
|
||||
[
|
||||
"qdbus",
|
||||
"org.kde.klipper",
|
||||
"/klipper",
|
||||
"setClipboardContents",
|
||||
text.encode(ENCODING),
|
||||
],
|
||||
stdin=subprocess.PIPE,
|
||||
close_fds=True,
|
||||
) as p:
|
||||
p.communicate(input=None)
|
||||
|
||||
def paste_klipper():
|
||||
with subprocess.Popen(
|
||||
["qdbus", "org.kde.klipper", "/klipper", "getClipboardContents"],
|
||||
stdout=subprocess.PIPE,
|
||||
close_fds=True,
|
||||
) as p:
|
||||
stdout = p.communicate()[0]
|
||||
|
||||
# Workaround for https://bugs.kde.org/show_bug.cgi?id=342874
|
||||
# TODO: https://github.com/asweigart/pyperclip/issues/43
|
||||
clipboardContents = stdout.decode(ENCODING)
|
||||
# even if blank, Klipper will append a newline at the end
|
||||
assert len(clipboardContents) > 0
|
||||
# make sure that newline is there
|
||||
assert clipboardContents.endswith("\n")
|
||||
if clipboardContents.endswith("\n"):
|
||||
clipboardContents = clipboardContents[:-1]
|
||||
return clipboardContents
|
||||
|
||||
return copy_klipper, paste_klipper
|
||||
|
||||
|
||||
def init_dev_clipboard_clipboard():
|
||||
def copy_dev_clipboard(text):
|
||||
text = _stringifyText(text) # Converts non-str values to str.
|
||||
if text == "":
|
||||
warnings.warn(
|
||||
"Pyperclip cannot copy a blank string to the clipboard on Cygwin. "
|
||||
"This is effectively a no-op.",
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
if "\r" in text:
|
||||
warnings.warn(
|
||||
"Pyperclip cannot handle \\r characters on Cygwin.",
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
|
||||
with open("/dev/clipboard", "w", encoding="utf-8") as fd:
|
||||
fd.write(text)
|
||||
|
||||
def paste_dev_clipboard() -> str:
|
||||
with open("/dev/clipboard", encoding="utf-8") as fd:
|
||||
content = fd.read()
|
||||
return content
|
||||
|
||||
return copy_dev_clipboard, paste_dev_clipboard
|
||||
|
||||
|
||||
def init_no_clipboard():
|
||||
class ClipboardUnavailable:
|
||||
def __call__(self, *args, **kwargs):
|
||||
raise PyperclipException(EXCEPT_MSG)
|
||||
|
||||
def __bool__(self) -> bool:
|
||||
return False
|
||||
|
||||
return ClipboardUnavailable(), ClipboardUnavailable()
|
||||
|
||||
|
||||
# Windows-related clipboard functions:
|
||||
class CheckedCall:
|
||||
def __init__(self, f) -> None:
|
||||
super().__setattr__("f", f)
|
||||
|
||||
def __call__(self, *args):
|
||||
ret = self.f(*args)
|
||||
if not ret and get_errno():
|
||||
raise PyperclipWindowsException("Error calling " + self.f.__name__)
|
||||
return ret
|
||||
|
||||
def __setattr__(self, key, value):
|
||||
setattr(self.f, key, value)
|
||||
|
||||
|
||||
def init_windows_clipboard():
|
||||
global HGLOBAL, LPVOID, DWORD, LPCSTR, INT
|
||||
global HWND, HINSTANCE, HMENU, BOOL, UINT, HANDLE
|
||||
from ctypes.wintypes import (
|
||||
BOOL,
|
||||
DWORD,
|
||||
HANDLE,
|
||||
HGLOBAL,
|
||||
HINSTANCE,
|
||||
HMENU,
|
||||
HWND,
|
||||
INT,
|
||||
LPCSTR,
|
||||
LPVOID,
|
||||
UINT,
|
||||
)
|
||||
|
||||
windll = ctypes.windll
|
||||
msvcrt = ctypes.CDLL("msvcrt")
|
||||
|
||||
safeCreateWindowExA = CheckedCall(windll.user32.CreateWindowExA)
|
||||
safeCreateWindowExA.argtypes = [
|
||||
DWORD,
|
||||
LPCSTR,
|
||||
LPCSTR,
|
||||
DWORD,
|
||||
INT,
|
||||
INT,
|
||||
INT,
|
||||
INT,
|
||||
HWND,
|
||||
HMENU,
|
||||
HINSTANCE,
|
||||
LPVOID,
|
||||
]
|
||||
safeCreateWindowExA.restype = HWND
|
||||
|
||||
safeDestroyWindow = CheckedCall(windll.user32.DestroyWindow)
|
||||
safeDestroyWindow.argtypes = [HWND]
|
||||
safeDestroyWindow.restype = BOOL
|
||||
|
||||
OpenClipboard = windll.user32.OpenClipboard
|
||||
OpenClipboard.argtypes = [HWND]
|
||||
OpenClipboard.restype = BOOL
|
||||
|
||||
safeCloseClipboard = CheckedCall(windll.user32.CloseClipboard)
|
||||
safeCloseClipboard.argtypes = []
|
||||
safeCloseClipboard.restype = BOOL
|
||||
|
||||
safeEmptyClipboard = CheckedCall(windll.user32.EmptyClipboard)
|
||||
safeEmptyClipboard.argtypes = []
|
||||
safeEmptyClipboard.restype = BOOL
|
||||
|
||||
safeGetClipboardData = CheckedCall(windll.user32.GetClipboardData)
|
||||
safeGetClipboardData.argtypes = [UINT]
|
||||
safeGetClipboardData.restype = HANDLE
|
||||
|
||||
safeSetClipboardData = CheckedCall(windll.user32.SetClipboardData)
|
||||
safeSetClipboardData.argtypes = [UINT, HANDLE]
|
||||
safeSetClipboardData.restype = HANDLE
|
||||
|
||||
safeGlobalAlloc = CheckedCall(windll.kernel32.GlobalAlloc)
|
||||
safeGlobalAlloc.argtypes = [UINT, c_size_t]
|
||||
safeGlobalAlloc.restype = HGLOBAL
|
||||
|
||||
safeGlobalLock = CheckedCall(windll.kernel32.GlobalLock)
|
||||
safeGlobalLock.argtypes = [HGLOBAL]
|
||||
safeGlobalLock.restype = LPVOID
|
||||
|
||||
safeGlobalUnlock = CheckedCall(windll.kernel32.GlobalUnlock)
|
||||
safeGlobalUnlock.argtypes = [HGLOBAL]
|
||||
safeGlobalUnlock.restype = BOOL
|
||||
|
||||
wcslen = CheckedCall(msvcrt.wcslen)
|
||||
wcslen.argtypes = [c_wchar_p]
|
||||
wcslen.restype = UINT
|
||||
|
||||
GMEM_MOVEABLE = 0x0002
|
||||
CF_UNICODETEXT = 13
|
||||
|
||||
@contextlib.contextmanager
|
||||
def window():
|
||||
"""
|
||||
Context that provides a valid Windows hwnd.
|
||||
"""
|
||||
# we really just need the hwnd, so setting "STATIC"
|
||||
# as predefined lpClass is just fine.
|
||||
hwnd = safeCreateWindowExA(
|
||||
0, b"STATIC", None, 0, 0, 0, 0, 0, None, None, None, None
|
||||
)
|
||||
try:
|
||||
yield hwnd
|
||||
finally:
|
||||
safeDestroyWindow(hwnd)
|
||||
|
||||
@contextlib.contextmanager
|
||||
def clipboard(hwnd):
|
||||
"""
|
||||
Context manager that opens the clipboard and prevents
|
||||
other applications from modifying the clipboard content.
|
||||
"""
|
||||
# We may not get the clipboard handle immediately because
|
||||
# some other application is accessing it (?)
|
||||
# We try for at least 500ms to get the clipboard.
|
||||
t = time.time() + 0.5
|
||||
success = False
|
||||
while time.time() < t:
|
||||
success = OpenClipboard(hwnd)
|
||||
if success:
|
||||
break
|
||||
time.sleep(0.01)
|
||||
if not success:
|
||||
raise PyperclipWindowsException("Error calling OpenClipboard")
|
||||
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
safeCloseClipboard()
|
||||
|
||||
def copy_windows(text):
|
||||
# This function is heavily based on
|
||||
# http://msdn.com/ms649016#_win32_Copying_Information_to_the_Clipboard
|
||||
|
||||
text = _stringifyText(text) # Converts non-str values to str.
|
||||
|
||||
with window() as hwnd:
|
||||
# http://msdn.com/ms649048
|
||||
# If an application calls OpenClipboard with hwnd set to NULL,
|
||||
# EmptyClipboard sets the clipboard owner to NULL;
|
||||
# this causes SetClipboardData to fail.
|
||||
# => We need a valid hwnd to copy something.
|
||||
with clipboard(hwnd):
|
||||
safeEmptyClipboard()
|
||||
|
||||
if text:
|
||||
# http://msdn.com/ms649051
|
||||
# If the hMem parameter identifies a memory object,
|
||||
# the object must have been allocated using the
|
||||
# function with the GMEM_MOVEABLE flag.
|
||||
count = wcslen(text) + 1
|
||||
handle = safeGlobalAlloc(GMEM_MOVEABLE, count * sizeof(c_wchar))
|
||||
locked_handle = safeGlobalLock(handle)
|
||||
|
||||
ctypes.memmove(
|
||||
c_wchar_p(locked_handle),
|
||||
c_wchar_p(text),
|
||||
count * sizeof(c_wchar),
|
||||
)
|
||||
|
||||
safeGlobalUnlock(handle)
|
||||
safeSetClipboardData(CF_UNICODETEXT, handle)
|
||||
|
||||
def paste_windows():
|
||||
with clipboard(None):
|
||||
handle = safeGetClipboardData(CF_UNICODETEXT)
|
||||
if not handle:
|
||||
# GetClipboardData may return NULL with errno == NO_ERROR
|
||||
# if the clipboard is empty.
|
||||
# (Also, it may return a handle to an empty buffer,
|
||||
# but technically that's not empty)
|
||||
return ""
|
||||
return c_wchar_p(handle).value
|
||||
|
||||
return copy_windows, paste_windows
|
||||
|
||||
|
||||
def init_wsl_clipboard():
|
||||
def copy_wsl(text):
|
||||
text = _stringifyText(text) # Converts non-str values to str.
|
||||
with subprocess.Popen(["clip.exe"], stdin=subprocess.PIPE, close_fds=True) as p:
|
||||
p.communicate(input=text.encode(ENCODING))
|
||||
|
||||
def paste_wsl():
|
||||
with subprocess.Popen(
|
||||
["powershell.exe", "-command", "Get-Clipboard"],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
close_fds=True,
|
||||
) as p:
|
||||
stdout = p.communicate()[0]
|
||||
# WSL appends "\r\n" to the contents.
|
||||
return stdout[:-2].decode(ENCODING)
|
||||
|
||||
return copy_wsl, paste_wsl
|
||||
|
||||
|
||||
# Automatic detection of clipboard mechanisms
|
||||
# and importing is done in determine_clipboard():
|
||||
def determine_clipboard():
|
||||
"""
|
||||
Determine the OS/platform and set the copy() and paste() functions
|
||||
accordingly.
|
||||
"""
|
||||
global Foundation, AppKit, qtpy, PyQt4, PyQt5
|
||||
|
||||
# Setup for the CYGWIN platform:
|
||||
if (
|
||||
"cygwin" in platform.system().lower()
|
||||
): # Cygwin has a variety of values returned by platform.system(),
|
||||
# such as 'CYGWIN_NT-6.1'
|
||||
# FIXME(pyperclip#55): pyperclip currently does not support Cygwin,
|
||||
# see https://github.com/asweigart/pyperclip/issues/55
|
||||
if os.path.exists("/dev/clipboard"):
|
||||
warnings.warn(
|
||||
"Pyperclip's support for Cygwin is not perfect, "
|
||||
"see https://github.com/asweigart/pyperclip/issues/55",
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
return init_dev_clipboard_clipboard()
|
||||
|
||||
# Setup for the WINDOWS platform:
|
||||
elif os.name == "nt" or platform.system() == "Windows":
|
||||
return init_windows_clipboard()
|
||||
|
||||
if platform.system() == "Linux":
|
||||
if _executable_exists("wslconfig.exe"):
|
||||
return init_wsl_clipboard()
|
||||
|
||||
# Setup for the macOS platform:
|
||||
if os.name == "mac" or platform.system() == "Darwin":
|
||||
try:
|
||||
import AppKit
|
||||
import Foundation # check if pyobjc is installed
|
||||
except ImportError:
|
||||
return init_osx_pbcopy_clipboard()
|
||||
else:
|
||||
return init_osx_pyobjc_clipboard()
|
||||
|
||||
# Setup for the LINUX platform:
|
||||
if HAS_DISPLAY:
|
||||
if os.environ.get("WAYLAND_DISPLAY") and _executable_exists("wl-copy"):
|
||||
return init_wl_clipboard()
|
||||
if _executable_exists("xsel"):
|
||||
return init_xsel_clipboard()
|
||||
if _executable_exists("xclip"):
|
||||
return init_xclip_clipboard()
|
||||
if _executable_exists("klipper") and _executable_exists("qdbus"):
|
||||
return init_klipper_clipboard()
|
||||
|
||||
try:
|
||||
# qtpy is a small abstraction layer that lets you write applications
|
||||
# using a single api call to either PyQt or PySide.
|
||||
# https://pypi.python.org/project/QtPy
|
||||
import qtpy # check if qtpy is installed
|
||||
except ImportError:
|
||||
# If qtpy isn't installed, fall back on importing PyQt4.
|
||||
try:
|
||||
import PyQt5 # check if PyQt5 is installed
|
||||
except ImportError:
|
||||
try:
|
||||
import PyQt4 # check if PyQt4 is installed
|
||||
except ImportError:
|
||||
pass # We want to fail fast for all non-ImportError exceptions.
|
||||
else:
|
||||
return init_qt_clipboard()
|
||||
else:
|
||||
return init_qt_clipboard()
|
||||
else:
|
||||
return init_qt_clipboard()
|
||||
|
||||
return init_no_clipboard()
|
||||
|
||||
|
||||
def set_clipboard(clipboard):
|
||||
"""
|
||||
Explicitly sets the clipboard mechanism. The "clipboard mechanism" is how
|
||||
the copy() and paste() functions interact with the operating system to
|
||||
implement the copy/paste feature. The clipboard parameter must be one of:
|
||||
- pbcopy
|
||||
- pyobjc (default on macOS)
|
||||
- qt
|
||||
- xclip
|
||||
- xsel
|
||||
- klipper
|
||||
- windows (default on Windows)
|
||||
- no (this is what is set when no clipboard mechanism can be found)
|
||||
"""
|
||||
global copy, paste
|
||||
|
||||
clipboard_types = {
|
||||
"pbcopy": init_osx_pbcopy_clipboard,
|
||||
"pyobjc": init_osx_pyobjc_clipboard,
|
||||
"qt": init_qt_clipboard, # TODO - split this into 'qtpy', 'pyqt4', and 'pyqt5'
|
||||
"xclip": init_xclip_clipboard,
|
||||
"xsel": init_xsel_clipboard,
|
||||
"wl-clipboard": init_wl_clipboard,
|
||||
"klipper": init_klipper_clipboard,
|
||||
"windows": init_windows_clipboard,
|
||||
"no": init_no_clipboard,
|
||||
}
|
||||
|
||||
if clipboard not in clipboard_types:
|
||||
allowed_clipboard_types = [repr(_) for _ in clipboard_types]
|
||||
raise ValueError(
|
||||
f"Argument must be one of {', '.join(allowed_clipboard_types)}"
|
||||
)
|
||||
|
||||
# Sets pyperclip's copy() and paste() functions:
|
||||
copy, paste = clipboard_types[clipboard]()
|
||||
|
||||
|
||||
def lazy_load_stub_copy(text):
|
||||
"""
|
||||
A stub function for copy(), which will load the real copy() function when
|
||||
called so that the real copy() function is used for later calls.
|
||||
|
||||
This allows users to import pyperclip without having determine_clipboard()
|
||||
automatically run, which will automatically select a clipboard mechanism.
|
||||
This could be a problem if it selects, say, the memory-heavy PyQt4 module
|
||||
but the user was just going to immediately call set_clipboard() to use a
|
||||
different clipboard mechanism.
|
||||
|
||||
The lazy loading this stub function implements gives the user a chance to
|
||||
call set_clipboard() to pick another clipboard mechanism. Or, if the user
|
||||
simply calls copy() or paste() without calling set_clipboard() first,
|
||||
will fall back on whatever clipboard mechanism that determine_clipboard()
|
||||
automatically chooses.
|
||||
"""
|
||||
global copy, paste
|
||||
copy, paste = determine_clipboard()
|
||||
return copy(text)
|
||||
|
||||
|
||||
def lazy_load_stub_paste():
|
||||
"""
|
||||
A stub function for paste(), which will load the real paste() function when
|
||||
called so that the real paste() function is used for later calls.
|
||||
|
||||
This allows users to import pyperclip without having determine_clipboard()
|
||||
automatically run, which will automatically select a clipboard mechanism.
|
||||
This could be a problem if it selects, say, the memory-heavy PyQt4 module
|
||||
but the user was just going to immediately call set_clipboard() to use a
|
||||
different clipboard mechanism.
|
||||
|
||||
The lazy loading this stub function implements gives the user a chance to
|
||||
call set_clipboard() to pick another clipboard mechanism. Or, if the user
|
||||
simply calls copy() or paste() without calling set_clipboard() first,
|
||||
will fall back on whatever clipboard mechanism that determine_clipboard()
|
||||
automatically chooses.
|
||||
"""
|
||||
global copy, paste
|
||||
copy, paste = determine_clipboard()
|
||||
return paste()
|
||||
|
||||
|
||||
def is_available() -> bool:
|
||||
return copy != lazy_load_stub_copy and paste != lazy_load_stub_paste
|
||||
|
||||
|
||||
# Initially, copy() and paste() are set to lazy loading wrappers which will
|
||||
# set `copy` and `paste` to real functions the first time they're used, unless
|
||||
# set_clipboard() or determine_clipboard() is called first.
|
||||
copy, paste = lazy_load_stub_copy, lazy_load_stub_paste
|
||||
|
||||
|
||||
def waitForPaste(timeout=None):
|
||||
"""This function call blocks until a non-empty text string exists on the
|
||||
clipboard. It returns this text.
|
||||
|
||||
This function raises PyperclipTimeoutException if timeout was set to
|
||||
a number of seconds that has elapsed without non-empty text being put on
|
||||
the clipboard."""
|
||||
startTime = time.time()
|
||||
while True:
|
||||
clipboardText = paste()
|
||||
if clipboardText != "":
|
||||
return clipboardText
|
||||
time.sleep(0.01)
|
||||
|
||||
if timeout is not None and time.time() > startTime + timeout:
|
||||
raise PyperclipTimeoutException(
|
||||
"waitForPaste() timed out after " + str(timeout) + " seconds."
|
||||
)
|
||||
|
||||
|
||||
def waitForNewPaste(timeout=None):
|
||||
"""This function call blocks until a new text string exists on the
|
||||
clipboard that is different from the text that was there when the function
|
||||
was first called. It returns this text.
|
||||
|
||||
This function raises PyperclipTimeoutException if timeout was set to
|
||||
a number of seconds that has elapsed without non-empty text being put on
|
||||
the clipboard."""
|
||||
startTime = time.time()
|
||||
originalText = paste()
|
||||
while True:
|
||||
currentText = paste()
|
||||
if currentText != originalText:
|
||||
return currentText
|
||||
time.sleep(0.01)
|
||||
|
||||
if timeout is not None and time.time() > startTime + timeout:
|
||||
raise PyperclipTimeoutException(
|
||||
"waitForNewPaste() timed out after " + str(timeout) + " seconds."
|
||||
)
|
||||
|
||||
|
||||
__all__ = [
|
||||
"copy",
|
||||
"paste",
|
||||
"waitForPaste",
|
||||
"waitForNewPaste",
|
||||
"set_clipboard",
|
||||
"determine_clipboard",
|
||||
]
|
||||
|
||||
# pandas aliases
|
||||
clipboard_get = paste
|
||||
clipboard_set = copy
|
||||
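Taken together, the vendored pyperclip module above gives pandas a self-contained clipboard backend, with clipboard_get/clipboard_set as the pandas-facing aliases. A minimal usage sketch (illustrative only; it assumes one of the backends listed in the module docstring, such as pbcopy/pbpaste, xclip/xsel/wl-clipboard, or the Windows API, is actually available):

# Illustrative sketch -- requires a working clipboard backend on the system.
from pandas.io import clipboard

clipboard.copy("hello from pandas.io.clipboard")
print(clipboard.paste())          # -> "hello from pandas.io.clipboard"
print(clipboard.is_available())   # True once the lazy stubs have been replaced

# A specific mechanism can also be forced instead of auto-detection:
# clipboard.set_clipboard("xclip")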

lib/python3.11/site-packages/pandas/io/clipboards.py (new file, 197 lines)
@@ -0,0 +1,197 @@
""" io on the clipboard """
from __future__ import annotations

from io import StringIO
from typing import TYPE_CHECKING
import warnings

from pandas._libs import lib
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import check_dtype_backend

from pandas.core.dtypes.generic import ABCDataFrame

from pandas import (
    get_option,
    option_context,
)

if TYPE_CHECKING:
    from pandas._typing import DtypeBackend


def read_clipboard(
    sep: str = r"\s+",
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    **kwargs,
):  # pragma: no cover
    r"""
    Read text from clipboard and pass to :func:`~pandas.read_csv`.

    Parses clipboard contents similar to how CSV files are parsed
    using :func:`~pandas.read_csv`.

    Parameters
    ----------
    sep : str, default '\\s+'
        A string or regex delimiter. The default of ``'\\s+'`` denotes
        one or more whitespace characters.

    dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    **kwargs
        See :func:`~pandas.read_csv` for the full argument list.

    Returns
    -------
    DataFrame
        A parsed :class:`~pandas.DataFrame` object.

    See Also
    --------
    DataFrame.to_clipboard : Copy object to the system clipboard.
    read_csv : Read a comma-separated values (csv) file into DataFrame.
    read_fwf : Read a table of fixed-width formatted lines into DataFrame.

    Examples
    --------
    >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
    >>> df.to_clipboard()  # doctest: +SKIP
    >>> pd.read_clipboard()  # doctest: +SKIP
         A  B  C
    0    1  2  3
    1    4  5  6
    """
    encoding = kwargs.pop("encoding", "utf-8")

    # only utf-8 is valid for passed value because that's what clipboard
    # supports
    if encoding is not None and encoding.lower().replace("-", "") != "utf8":
        raise NotImplementedError("reading from clipboard only supports utf-8 encoding")

    check_dtype_backend(dtype_backend)

    from pandas.io.clipboard import clipboard_get
    from pandas.io.parsers import read_csv

    text = clipboard_get()

    # Try to decode (if needed, as "text" might already be a string here).
    try:
        text = text.decode(kwargs.get("encoding") or get_option("display.encoding"))
    except AttributeError:
        pass

    # Excel copies into clipboard with \t separation
    # inspect no more then the 10 first lines, if they
    # all contain an equal number (>0) of tabs, infer
    # that this came from excel and set 'sep' accordingly
    lines = text[:10000].split("\n")[:-1][:10]

    # Need to remove leading white space, since read_csv
    # accepts:
    #    a  b
    # 0  1  2
    # 1  3  4

    counts = {x.lstrip(" ").count("\t") for x in lines}
    if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
        sep = "\t"
        # check the number of leading tabs in the first line
        # to account for index columns
        index_length = len(lines[0]) - len(lines[0].lstrip(" \t"))
        if index_length != 0:
            kwargs.setdefault("index_col", list(range(index_length)))

    # Edge case where sep is specified to be None, return to default
    if sep is None and kwargs.get("delim_whitespace") is None:
        sep = r"\s+"

    # Regex separator currently only works with python engine.
    # Default to python if separator is multi-character (regex)
    if len(sep) > 1 and kwargs.get("engine") is None:
        kwargs["engine"] = "python"
    elif len(sep) > 1 and kwargs.get("engine") == "c":
        warnings.warn(
            "read_clipboard with regex separator does not work properly with c engine.",
            stacklevel=find_stack_level(),
        )

    return read_csv(StringIO(text), sep=sep, dtype_backend=dtype_backend, **kwargs)


def to_clipboard(
    obj, excel: bool | None = True, sep: str | None = None, **kwargs
) -> None:  # pragma: no cover
    """
    Attempt to write text representation of object to the system clipboard
    The clipboard can be then pasted into Excel for example.

    Parameters
    ----------
    obj : the object to write to the clipboard
    excel : bool, defaults to True
        if True, use the provided separator, writing in a csv
        format for allowing easy pasting into excel.
        if False, write a string representation of the object
        to the clipboard
    sep : optional, defaults to tab
    other keywords are passed to to_csv

    Notes
    -----
    Requirements for your platform
      - Linux: xclip, or xsel (with PyQt4 modules)
      - Windows:
      - OS X:
    """
    encoding = kwargs.pop("encoding", "utf-8")

    # testing if an invalid encoding is passed to clipboard
    if encoding is not None and encoding.lower().replace("-", "") != "utf8":
        raise ValueError("clipboard only supports utf-8 encoding")

    from pandas.io.clipboard import clipboard_set

    if excel is None:
        excel = True

    if excel:
        try:
            if sep is None:
                sep = "\t"
            buf = StringIO()

            # clipboard_set (pyperclip) expects unicode
            obj.to_csv(buf, sep=sep, encoding="utf-8", **kwargs)
            text = buf.getvalue()

            clipboard_set(text)
            return
        except TypeError:
            warnings.warn(
                "to_clipboard in excel mode requires a single character separator.",
                stacklevel=find_stack_level(),
            )
    elif sep is not None:
        warnings.warn(
            "to_clipboard with excel=False ignores the sep argument.",
            stacklevel=find_stack_level(),
        )

    if isinstance(obj, ABCDataFrame):
        # str(df) has various unhelpful defaults, like truncation
        with option_context("display.max_colwidth", None):
            objstr = obj.to_string(**kwargs)
    else:
        objstr = str(obj)
    clipboard_set(objstr)
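The excel= switch in to_clipboard and the tab-detection branch in read_clipboard are designed to round-trip through a spreadsheet-style copy. A short sketch (illustrative only; needs a working clipboard backend):

# Illustrative sketch of the excel= handling above (requires a clipboard backend).
import pandas as pd

df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})

df.to_clipboard(excel=False)  # plain to_string() representation on the clipboard
df.to_clipboard(excel=True)   # tab-separated CSV, pastes cleanly into spreadsheet cells

round_trip = pd.read_clipboard()  # equal tab counts trigger the sep="\t" branch above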

lib/python3.11/site-packages/pandas/io/common.py (new file, 1267 lines)
(File diff suppressed because it is too large.)

lib/python3.11/site-packages/pandas/io/excel/__init__.py (new file, 19 lines)
@@ -0,0 +1,19 @@
from pandas.io.excel._base import (
    ExcelFile,
    ExcelWriter,
    read_excel,
)
from pandas.io.excel._odswriter import ODSWriter as _ODSWriter
from pandas.io.excel._openpyxl import OpenpyxlWriter as _OpenpyxlWriter
from pandas.io.excel._util import register_writer
from pandas.io.excel._xlsxwriter import XlsxWriter as _XlsxWriter

__all__ = ["read_excel", "ExcelWriter", "ExcelFile"]


register_writer(_OpenpyxlWriter)

register_writer(_XlsxWriter)


register_writer(_ODSWriter)
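The register_writer() calls above are what allow ExcelWriter to infer an engine from the target extension. A sketch of that behaviour (illustrative only; assumes the optional engines such as openpyxl and odfpy are installed, and "out.xlsx"/"out.ods" are placeholder paths):

# Illustrative sketch: engine inference via the writers registered above.
import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3]})

with pd.ExcelWriter("out.xlsx") as writer:  # resolves to a registered xlsx writer
    df.to_excel(writer, sheet_name="data")

with pd.ExcelWriter("out.ods") as writer:   # resolves to the ODSWriter registered above
    df.to_excel(writer, sheet_name="data")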

lib/python3.11/site-packages/pandas/io/excel/_base.py (new file, 1659 lines)
(File diff suppressed because it is too large.)

lib/python3.11/site-packages/pandas/io/excel/_calamine.py (new file, 121 lines)
@@ -0,0 +1,121 @@
from __future__ import annotations

from datetime import (
    date,
    datetime,
    time,
    timedelta,
)
from typing import (
    TYPE_CHECKING,
    Any,
    Union,
)

from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import doc

import pandas as pd
from pandas.core.shared_docs import _shared_docs

from pandas.io.excel._base import BaseExcelReader

if TYPE_CHECKING:
    from python_calamine import (
        CalamineSheet,
        CalamineWorkbook,
    )

    from pandas._typing import (
        FilePath,
        NaTType,
        ReadBuffer,
        Scalar,
        StorageOptions,
    )

_CellValue = Union[int, float, str, bool, time, date, datetime, timedelta]


class CalamineReader(BaseExcelReader["CalamineWorkbook"]):
    @doc(storage_options=_shared_docs["storage_options"])
    def __init__(
        self,
        filepath_or_buffer: FilePath | ReadBuffer[bytes],
        storage_options: StorageOptions | None = None,
        engine_kwargs: dict | None = None,
    ) -> None:
        """
        Reader using calamine engine (xlsx/xls/xlsb/ods).

        Parameters
        ----------
        filepath_or_buffer : str, path to be parsed or
            an open readable stream.
        {storage_options}
        engine_kwargs : dict, optional
            Arbitrary keyword arguments passed to excel engine.
        """
        import_optional_dependency("python_calamine")
        super().__init__(
            filepath_or_buffer,
            storage_options=storage_options,
            engine_kwargs=engine_kwargs,
        )

    @property
    def _workbook_class(self) -> type[CalamineWorkbook]:
        from python_calamine import CalamineWorkbook

        return CalamineWorkbook

    def load_workbook(
        self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs: Any
    ) -> CalamineWorkbook:
        from python_calamine import load_workbook

        return load_workbook(filepath_or_buffer, **engine_kwargs)

    @property
    def sheet_names(self) -> list[str]:
        from python_calamine import SheetTypeEnum

        return [
            sheet.name
            for sheet in self.book.sheets_metadata
            if sheet.typ == SheetTypeEnum.WorkSheet
        ]

    def get_sheet_by_name(self, name: str) -> CalamineSheet:
        self.raise_if_bad_sheet_by_name(name)
        return self.book.get_sheet_by_name(name)

    def get_sheet_by_index(self, index: int) -> CalamineSheet:
        self.raise_if_bad_sheet_by_index(index)
        return self.book.get_sheet_by_index(index)

    def get_sheet_data(
        self, sheet: CalamineSheet, file_rows_needed: int | None = None
    ) -> list[list[Scalar | NaTType | time]]:
        def _convert_cell(value: _CellValue) -> Scalar | NaTType | time:
            if isinstance(value, float):
                val = int(value)
                if val == value:
                    return val
                else:
                    return value
            elif isinstance(value, date):
                return pd.Timestamp(value)
            elif isinstance(value, timedelta):
                return pd.Timedelta(value)
            elif isinstance(value, time):
                return value

            return value

        rows: list[list[_CellValue]] = sheet.to_python(
            skip_empty_area=False, nrows=file_rows_needed
        )
        data = [[_convert_cell(cell) for cell in row] for row in rows]

        return data
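CalamineReader is selected through read_excel's engine argument rather than used directly. A sketch of the typical entry point (illustrative only; requires the optional python-calamine dependency, and "report.xlsx" is a placeholder path):

# Illustrative sketch: the calamine engine is reached via read_excel.
import pandas as pd

df = pd.read_excel("report.xlsx", engine="calamine", sheet_name=0)  # placeholder path
print(df.head())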

lib/python3.11/site-packages/pandas/io/excel/_odfreader.py (new file, 253 lines)
@@ -0,0 +1,253 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
cast,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import (
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
Scalar,
|
||||
StorageOptions,
|
||||
)
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.util._decorators import doc
|
||||
|
||||
import pandas as pd
|
||||
from pandas.core.shared_docs import _shared_docs
|
||||
|
||||
from pandas.io.excel._base import BaseExcelReader
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from odf.opendocument import OpenDocument
|
||||
|
||||
from pandas._libs.tslibs.nattype import NaTType
|
||||
|
||||
|
||||
@doc(storage_options=_shared_docs["storage_options"])
|
||||
class ODFReader(BaseExcelReader["OpenDocument"]):
|
||||
def __init__(
|
||||
self,
|
||||
filepath_or_buffer: FilePath | ReadBuffer[bytes],
|
||||
storage_options: StorageOptions | None = None,
|
||||
engine_kwargs: dict | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Read tables out of OpenDocument formatted files.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer : str, path to be parsed or
|
||||
an open readable stream.
|
||||
{storage_options}
|
||||
engine_kwargs : dict, optional
|
||||
Arbitrary keyword arguments passed to excel engine.
|
||||
"""
|
||||
import_optional_dependency("odf")
|
||||
super().__init__(
|
||||
filepath_or_buffer,
|
||||
storage_options=storage_options,
|
||||
engine_kwargs=engine_kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def _workbook_class(self) -> type[OpenDocument]:
|
||||
from odf.opendocument import OpenDocument
|
||||
|
||||
return OpenDocument
|
||||
|
||||
def load_workbook(
|
||||
self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
|
||||
) -> OpenDocument:
|
||||
from odf.opendocument import load
|
||||
|
||||
return load(filepath_or_buffer, **engine_kwargs)
|
||||
|
||||
@property
|
||||
def empty_value(self) -> str:
|
||||
"""Property for compat with other readers."""
|
||||
return ""
|
||||
|
||||
@property
|
||||
def sheet_names(self) -> list[str]:
|
||||
"""Return a list of sheet names present in the document"""
|
||||
from odf.table import Table
|
||||
|
||||
tables = self.book.getElementsByType(Table)
|
||||
return [t.getAttribute("name") for t in tables]
|
||||
|
||||
def get_sheet_by_index(self, index: int):
|
||||
from odf.table import Table
|
||||
|
||||
self.raise_if_bad_sheet_by_index(index)
|
||||
tables = self.book.getElementsByType(Table)
|
||||
return tables[index]
|
||||
|
||||
def get_sheet_by_name(self, name: str):
|
||||
from odf.table import Table
|
||||
|
||||
self.raise_if_bad_sheet_by_name(name)
|
||||
tables = self.book.getElementsByType(Table)
|
||||
|
||||
for table in tables:
|
||||
if table.getAttribute("name") == name:
|
||||
return table
|
||||
|
||||
self.close()
|
||||
raise ValueError(f"sheet {name} not found")
|
||||
|
||||
def get_sheet_data(
|
||||
self, sheet, file_rows_needed: int | None = None
|
||||
) -> list[list[Scalar | NaTType]]:
|
||||
"""
|
||||
Parse an ODF Table into a list of lists
|
||||
"""
|
||||
from odf.table import (
|
||||
CoveredTableCell,
|
||||
TableCell,
|
||||
TableRow,
|
||||
)
|
||||
|
||||
covered_cell_name = CoveredTableCell().qname
|
||||
table_cell_name = TableCell().qname
|
||||
cell_names = {covered_cell_name, table_cell_name}
|
||||
|
||||
sheet_rows = sheet.getElementsByType(TableRow)
|
||||
empty_rows = 0
|
||||
max_row_len = 0
|
||||
|
||||
table: list[list[Scalar | NaTType]] = []
|
||||
|
||||
for sheet_row in sheet_rows:
|
||||
sheet_cells = [
|
||||
x
|
||||
for x in sheet_row.childNodes
|
||||
if hasattr(x, "qname") and x.qname in cell_names
|
||||
]
|
||||
empty_cells = 0
|
||||
table_row: list[Scalar | NaTType] = []
|
||||
|
||||
for sheet_cell in sheet_cells:
|
||||
if sheet_cell.qname == table_cell_name:
|
||||
value = self._get_cell_value(sheet_cell)
|
||||
else:
|
||||
value = self.empty_value
|
||||
|
||||
column_repeat = self._get_column_repeat(sheet_cell)
|
||||
|
||||
# Queue up empty values, writing only if content succeeds them
|
||||
if value == self.empty_value:
|
||||
empty_cells += column_repeat
|
||||
else:
|
||||
table_row.extend([self.empty_value] * empty_cells)
|
||||
empty_cells = 0
|
||||
table_row.extend([value] * column_repeat)
|
||||
|
||||
if max_row_len < len(table_row):
|
||||
max_row_len = len(table_row)
|
||||
|
||||
row_repeat = self._get_row_repeat(sheet_row)
|
||||
if len(table_row) == 0:
|
||||
empty_rows += row_repeat
|
||||
else:
|
||||
# add blank rows to our table
|
||||
table.extend([[self.empty_value]] * empty_rows)
|
||||
empty_rows = 0
|
||||
table.extend(table_row for _ in range(row_repeat))
|
||||
if file_rows_needed is not None and len(table) >= file_rows_needed:
|
||||
break
|
||||
|
||||
# Make our table square
|
||||
for row in table:
|
||||
if len(row) < max_row_len:
|
||||
row.extend([self.empty_value] * (max_row_len - len(row)))
|
||||
|
||||
return table
|
||||
|
||||
def _get_row_repeat(self, row) -> int:
|
||||
"""
|
||||
Return number of times this row was repeated
|
||||
Repeating an empty row appeared to be a common way
|
||||
of representing sparse rows in the table.
|
||||
"""
|
||||
from odf.namespaces import TABLENS
|
||||
|
||||
return int(row.attributes.get((TABLENS, "number-rows-repeated"), 1))
|
||||
|
||||
def _get_column_repeat(self, cell) -> int:
|
||||
from odf.namespaces import TABLENS
|
||||
|
||||
return int(cell.attributes.get((TABLENS, "number-columns-repeated"), 1))
|
||||
|
||||
def _get_cell_value(self, cell) -> Scalar | NaTType:
|
||||
from odf.namespaces import OFFICENS
|
||||
|
||||
if str(cell) == "#N/A":
|
||||
return np.nan
|
||||
|
||||
cell_type = cell.attributes.get((OFFICENS, "value-type"))
|
||||
if cell_type == "boolean":
|
||||
if str(cell) == "TRUE":
|
||||
return True
|
||||
return False
|
||||
if cell_type is None:
|
||||
return self.empty_value
|
||||
elif cell_type == "float":
|
||||
# GH5394
|
||||
cell_value = float(cell.attributes.get((OFFICENS, "value")))
|
||||
val = int(cell_value)
|
||||
if val == cell_value:
|
||||
return val
|
||||
return cell_value
|
||||
elif cell_type == "percentage":
|
||||
cell_value = cell.attributes.get((OFFICENS, "value"))
|
||||
return float(cell_value)
|
||||
elif cell_type == "string":
|
||||
return self._get_cell_string_value(cell)
|
||||
elif cell_type == "currency":
|
||||
cell_value = cell.attributes.get((OFFICENS, "value"))
|
||||
return float(cell_value)
|
||||
elif cell_type == "date":
|
||||
cell_value = cell.attributes.get((OFFICENS, "date-value"))
|
||||
return pd.Timestamp(cell_value)
|
||||
elif cell_type == "time":
|
||||
stamp = pd.Timestamp(str(cell))
|
||||
# cast needed here because Scalar doesn't include datetime.time
|
||||
return cast(Scalar, stamp.time())
|
||||
else:
|
||||
self.close()
|
||||
raise ValueError(f"Unrecognized type {cell_type}")
|
||||
|
||||
def _get_cell_string_value(self, cell) -> str:
|
||||
"""
|
||||
Find and decode OpenDocument text:s tags that represent
|
||||
a run length encoded sequence of space characters.
|
||||
"""
|
||||
from odf.element import Element
|
||||
from odf.namespaces import TEXTNS
|
||||
from odf.office import Annotation
|
||||
from odf.text import S
|
||||
|
||||
office_annotation = Annotation().qname
|
||||
text_s = S().qname
|
||||
|
||||
value = []
|
||||
|
||||
for fragment in cell.childNodes:
|
||||
if isinstance(fragment, Element):
|
||||
if fragment.qname == text_s:
|
||||
spaces = int(fragment.attributes.get((TEXTNS, "c"), 1))
|
||||
value.append(" " * spaces)
|
||||
elif fragment.qname == office_annotation:
|
||||
continue
|
||||
else:
|
||||
# recursive impl needed in case of nested fragments
|
||||
# with multiple spaces
|
||||
# https://github.com/pandas-dev/pandas/pull/36175#discussion_r484639704
|
||||
value.append(self._get_cell_string_value(fragment))
|
||||
else:
|
||||
value.append(str(fragment).strip("\n"))
|
||||
return "".join(value)
|
||||

lib/python3.11/site-packages/pandas/io/excel/_odswriter.py (new file, 357 lines)
@@ -0,0 +1,357 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import defaultdict
|
||||
import datetime
|
||||
import json
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
DefaultDict,
|
||||
cast,
|
||||
overload,
|
||||
)
|
||||
|
||||
from pandas.io.excel._base import ExcelWriter
|
||||
from pandas.io.excel._util import (
|
||||
combine_kwargs,
|
||||
validate_freeze_panes,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
ExcelWriterIfSheetExists,
|
||||
FilePath,
|
||||
StorageOptions,
|
||||
WriteExcelBuffer,
|
||||
)
|
||||
|
||||
from pandas.io.formats.excel import ExcelCell
|
||||
|
||||
|
||||
class ODSWriter(ExcelWriter):
|
||||
_engine = "odf"
|
||||
_supported_extensions = (".ods",)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path: FilePath | WriteExcelBuffer | ExcelWriter,
|
||||
engine: str | None = None,
|
||||
date_format: str | None = None,
|
||||
datetime_format=None,
|
||||
mode: str = "w",
|
||||
storage_options: StorageOptions | None = None,
|
||||
if_sheet_exists: ExcelWriterIfSheetExists | None = None,
|
||||
engine_kwargs: dict[str, Any] | None = None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
from odf.opendocument import OpenDocumentSpreadsheet
|
||||
|
||||
if mode == "a":
|
||||
raise ValueError("Append mode is not supported with odf!")
|
||||
|
||||
engine_kwargs = combine_kwargs(engine_kwargs, kwargs)
|
||||
self._book = OpenDocumentSpreadsheet(**engine_kwargs)
|
||||
|
||||
super().__init__(
|
||||
path,
|
||||
mode=mode,
|
||||
storage_options=storage_options,
|
||||
if_sheet_exists=if_sheet_exists,
|
||||
engine_kwargs=engine_kwargs,
|
||||
)
|
||||
|
||||
self._style_dict: dict[str, str] = {}
|
||||
|
||||
@property
|
||||
def book(self):
|
||||
"""
|
||||
Book instance of class odf.opendocument.OpenDocumentSpreadsheet.
|
||||
|
||||
This attribute can be used to access engine-specific features.
|
||||
"""
|
||||
return self._book
|
||||
|
||||
@property
|
||||
def sheets(self) -> dict[str, Any]:
|
||||
"""Mapping of sheet names to sheet objects."""
|
||||
from odf.table import Table
|
||||
|
||||
result = {
|
||||
sheet.getAttribute("name"): sheet
|
||||
for sheet in self.book.getElementsByType(Table)
|
||||
}
|
||||
return result
|
||||
|
||||
def _save(self) -> None:
|
||||
"""
|
||||
Save workbook to disk.
|
||||
"""
|
||||
for sheet in self.sheets.values():
|
||||
self.book.spreadsheet.addElement(sheet)
|
||||
self.book.save(self._handles.handle)
|
||||
|
||||
def _write_cells(
|
||||
self,
|
||||
cells: list[ExcelCell],
|
||||
sheet_name: str | None = None,
|
||||
startrow: int = 0,
|
||||
startcol: int = 0,
|
||||
freeze_panes: tuple[int, int] | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Write the frame cells using odf
|
||||
"""
|
||||
from odf.table import (
|
||||
Table,
|
||||
TableCell,
|
||||
TableRow,
|
||||
)
|
||||
from odf.text import P
|
||||
|
||||
sheet_name = self._get_sheet_name(sheet_name)
|
||||
assert sheet_name is not None
|
||||
|
||||
if sheet_name in self.sheets:
|
||||
wks = self.sheets[sheet_name]
|
||||
else:
|
||||
wks = Table(name=sheet_name)
|
||||
self.book.spreadsheet.addElement(wks)
|
||||
|
||||
if validate_freeze_panes(freeze_panes):
|
||||
freeze_panes = cast(tuple[int, int], freeze_panes)
|
||||
self._create_freeze_panes(sheet_name, freeze_panes)
|
||||
|
||||
for _ in range(startrow):
|
||||
wks.addElement(TableRow())
|
||||
|
||||
rows: DefaultDict = defaultdict(TableRow)
|
||||
col_count: DefaultDict = defaultdict(int)
|
||||
|
||||
for cell in sorted(cells, key=lambda cell: (cell.row, cell.col)):
|
||||
# only add empty cells if the row is still empty
|
||||
if not col_count[cell.row]:
|
||||
for _ in range(startcol):
|
||||
rows[cell.row].addElement(TableCell())
|
||||
|
||||
# fill with empty cells if needed
|
||||
for _ in range(cell.col - col_count[cell.row]):
|
||||
rows[cell.row].addElement(TableCell())
|
||||
col_count[cell.row] += 1
|
||||
|
||||
pvalue, tc = self._make_table_cell(cell)
|
||||
rows[cell.row].addElement(tc)
|
||||
col_count[cell.row] += 1
|
||||
p = P(text=pvalue)
|
||||
tc.addElement(p)
|
||||
|
||||
# add all rows to the sheet
|
||||
if len(rows) > 0:
|
||||
for row_nr in range(max(rows.keys()) + 1):
|
||||
wks.addElement(rows[row_nr])
|
||||
|
||||
def _make_table_cell_attributes(self, cell) -> dict[str, int | str]:
|
||||
"""Convert cell attributes to OpenDocument attributes
|
||||
|
||||
Parameters
|
||||
----------
|
||||
cell : ExcelCell
|
||||
Spreadsheet cell data
|
||||
|
||||
Returns
|
||||
-------
|
||||
attributes : Dict[str, Union[int, str]]
|
||||
Dictionary with attributes and attribute values
|
||||
"""
|
||||
attributes: dict[str, int | str] = {}
|
||||
style_name = self._process_style(cell.style)
|
||||
if style_name is not None:
|
||||
attributes["stylename"] = style_name
|
||||
if cell.mergestart is not None and cell.mergeend is not None:
|
||||
attributes["numberrowsspanned"] = max(1, cell.mergestart)
|
||||
attributes["numbercolumnsspanned"] = cell.mergeend
|
||||
return attributes
|
||||
|
||||
def _make_table_cell(self, cell) -> tuple[object, Any]:
|
||||
"""Convert cell data to an OpenDocument spreadsheet cell
|
||||
|
||||
Parameters
|
||||
----------
|
||||
cell : ExcelCell
|
||||
Spreadsheet cell data
|
||||
|
||||
Returns
|
||||
-------
|
||||
pvalue, cell : Tuple[str, TableCell]
|
||||
Display value, Cell value
|
||||
"""
|
||||
from odf.table import TableCell
|
||||
|
||||
attributes = self._make_table_cell_attributes(cell)
|
||||
val, fmt = self._value_with_fmt(cell.val)
|
||||
pvalue = value = val
|
||||
if isinstance(val, bool):
|
||||
value = str(val).lower()
|
||||
pvalue = str(val).upper()
|
||||
return (
|
||||
pvalue,
|
||||
TableCell(
|
||||
valuetype="boolean",
|
||||
booleanvalue=value,
|
||||
attributes=attributes,
|
||||
),
|
||||
)
|
||||
elif isinstance(val, datetime.datetime):
|
||||
# Fast formatting
|
||||
value = val.isoformat()
|
||||
# Slow but locale-dependent
|
||||
pvalue = val.strftime("%c")
|
||||
return (
|
||||
pvalue,
|
||||
TableCell(valuetype="date", datevalue=value, attributes=attributes),
|
||||
)
|
||||
elif isinstance(val, datetime.date):
|
||||
# Fast formatting
|
||||
value = f"{val.year}-{val.month:02d}-{val.day:02d}"
|
||||
# Slow but locale-dependent
|
||||
pvalue = val.strftime("%x")
|
||||
return (
|
||||
pvalue,
|
||||
TableCell(valuetype="date", datevalue=value, attributes=attributes),
|
||||
)
|
||||
elif isinstance(val, str):
|
||||
return (
|
||||
pvalue,
|
||||
TableCell(
|
||||
valuetype="string",
|
||||
stringvalue=value,
|
||||
attributes=attributes,
|
||||
),
|
||||
)
|
||||
else:
|
||||
return (
|
||||
pvalue,
|
||||
TableCell(
|
||||
valuetype="float",
|
||||
value=value,
|
||||
attributes=attributes,
|
||||
),
|
||||
)
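# To summarize the mapping above: bool values become "boolean" cells with an
# upper-cased display value, datetime/date values become "date" cells with an
# ISO datevalue plus a locale-formatted display value, str values become
# "string" cells, and anything else is written as a "float" cell.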
|
||||
|
||||
@overload
|
||||
def _process_style(self, style: dict[str, Any]) -> str:
|
||||
...
|
||||
|
||||
@overload
|
||||
def _process_style(self, style: None) -> None:
|
||||
...
|
||||
|
||||
def _process_style(self, style: dict[str, Any] | None) -> str | None:
|
||||
"""Convert a style dictionary to a OpenDocument style sheet
|
||||
|
||||
Parameters
|
||||
----------
|
||||
style : Dict
|
||||
Style dictionary
|
||||
|
||||
Returns
|
||||
-------
|
||||
style_key : str
|
||||
Unique style key for later reference in sheet
|
||||
"""
|
||||
from odf.style import (
|
||||
ParagraphProperties,
|
||||
Style,
|
||||
TableCellProperties,
|
||||
TextProperties,
|
||||
)
|
||||
|
||||
if style is None:
|
||||
return None
|
||||
style_key = json.dumps(style)
|
||||
if style_key in self._style_dict:
|
||||
return self._style_dict[style_key]
|
||||
name = f"pd{len(self._style_dict)+1}"
|
||||
self._style_dict[style_key] = name
|
||||
odf_style = Style(name=name, family="table-cell")
|
||||
if "font" in style:
|
||||
font = style["font"]
|
||||
if font.get("bold", False):
|
||||
odf_style.addElement(TextProperties(fontweight="bold"))
|
||||
if "borders" in style:
|
||||
borders = style["borders"]
|
||||
for side, thickness in borders.items():
|
||||
thickness_translation = {"thin": "0.75pt solid #000000"}
|
||||
odf_style.addElement(
|
||||
TableCellProperties(
|
||||
attributes={f"border{side}": thickness_translation[thickness]}
|
||||
)
|
||||
)
|
||||
if "alignment" in style:
|
||||
alignment = style["alignment"]
|
||||
horizontal = alignment.get("horizontal")
|
||||
if horizontal:
|
||||
odf_style.addElement(ParagraphProperties(textalign=horizontal))
|
||||
vertical = alignment.get("vertical")
|
||||
if vertical:
|
||||
odf_style.addElement(TableCellProperties(verticalalign=vertical))
|
||||
self.book.styles.addElement(odf_style)
|
||||
return name
|
||||
|
||||
def _create_freeze_panes(
|
||||
self, sheet_name: str, freeze_panes: tuple[int, int]
|
||||
) -> None:
|
||||
"""
|
||||
Create freeze panes in the sheet.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sheet_name : str
|
||||
Name of the spreadsheet
|
||||
freeze_panes : tuple of (int, int)
|
||||
Freeze pane location x and y
|
||||
"""
|
||||
from odf.config import (
|
||||
ConfigItem,
|
||||
ConfigItemMapEntry,
|
||||
ConfigItemMapIndexed,
|
||||
ConfigItemMapNamed,
|
||||
ConfigItemSet,
|
||||
)
|
||||
|
||||
config_item_set = ConfigItemSet(name="ooo:view-settings")
|
||||
self.book.settings.addElement(config_item_set)
|
||||
|
||||
config_item_map_indexed = ConfigItemMapIndexed(name="Views")
|
||||
config_item_set.addElement(config_item_map_indexed)
|
||||
|
||||
config_item_map_entry = ConfigItemMapEntry()
|
||||
config_item_map_indexed.addElement(config_item_map_entry)
|
||||
|
||||
config_item_map_named = ConfigItemMapNamed(name="Tables")
|
||||
config_item_map_entry.addElement(config_item_map_named)
|
||||
|
||||
config_item_map_entry = ConfigItemMapEntry(name=sheet_name)
|
||||
config_item_map_named.addElement(config_item_map_entry)
|
||||
|
||||
config_item_map_entry.addElement(
|
||||
ConfigItem(name="HorizontalSplitMode", type="short", text="2")
|
||||
)
|
||||
config_item_map_entry.addElement(
|
||||
ConfigItem(name="VerticalSplitMode", type="short", text="2")
|
||||
)
|
||||
config_item_map_entry.addElement(
|
||||
ConfigItem(
|
||||
name="HorizontalSplitPosition", type="int", text=str(freeze_panes[0])
|
||||
)
|
||||
)
|
||||
config_item_map_entry.addElement(
|
||||
ConfigItem(
|
||||
name="VerticalSplitPosition", type="int", text=str(freeze_panes[1])
|
||||
)
|
||||
)
|
||||
config_item_map_entry.addElement(
|
||||
ConfigItem(name="PositionRight", type="int", text=str(freeze_panes[0]))
|
||||
)
|
||||
config_item_map_entry.addElement(
|
||||
ConfigItem(name="PositionBottom", type="int", text=str(freeze_panes[1]))
|
||||
)
|
||||
639
lib/python3.11/site-packages/pandas/io/excel/_openpyxl.py
Normal file
@ -0,0 +1,639 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import mmap
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
cast,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.util._decorators import doc
|
||||
|
||||
from pandas.core.shared_docs import _shared_docs
|
||||
|
||||
from pandas.io.excel._base import (
|
||||
BaseExcelReader,
|
||||
ExcelWriter,
|
||||
)
|
||||
from pandas.io.excel._util import (
|
||||
combine_kwargs,
|
||||
validate_freeze_panes,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from openpyxl import Workbook
|
||||
from openpyxl.descriptors.serialisable import Serialisable
|
||||
|
||||
from pandas._typing import (
|
||||
ExcelWriterIfSheetExists,
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
Scalar,
|
||||
StorageOptions,
|
||||
WriteExcelBuffer,
|
||||
)
|
||||
|
||||
|
||||
class OpenpyxlWriter(ExcelWriter):
|
||||
_engine = "openpyxl"
|
||||
_supported_extensions = (".xlsx", ".xlsm")
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path: FilePath | WriteExcelBuffer | ExcelWriter,
|
||||
engine: str | None = None,
|
||||
date_format: str | None = None,
|
||||
datetime_format: str | None = None,
|
||||
mode: str = "w",
|
||||
storage_options: StorageOptions | None = None,
|
||||
if_sheet_exists: ExcelWriterIfSheetExists | None = None,
|
||||
engine_kwargs: dict[str, Any] | None = None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
# Use the openpyxl module as the Excel writer.
|
||||
from openpyxl.workbook import Workbook
|
||||
|
||||
engine_kwargs = combine_kwargs(engine_kwargs, kwargs)
|
||||
|
||||
super().__init__(
|
||||
path,
|
||||
mode=mode,
|
||||
storage_options=storage_options,
|
||||
if_sheet_exists=if_sheet_exists,
|
||||
engine_kwargs=engine_kwargs,
|
||||
)
|
||||
|
||||
# ExcelWriter replaced "a" by "r+" to allow us to first read the excel file from
|
||||
# the file and later write to it
|
||||
if "r+" in self._mode: # Load from existing workbook
|
||||
from openpyxl import load_workbook
|
||||
|
||||
try:
|
||||
self._book = load_workbook(self._handles.handle, **engine_kwargs)
|
||||
except TypeError:
|
||||
self._handles.handle.close()
|
||||
raise
|
||||
self._handles.handle.seek(0)
|
||||
else:
|
||||
# Create workbook object with default optimized_write=True.
|
||||
try:
|
||||
self._book = Workbook(**engine_kwargs)
|
||||
except TypeError:
|
||||
self._handles.handle.close()
|
||||
raise
|
||||
|
||||
if self.book.worksheets:
|
||||
self.book.remove(self.book.worksheets[0])
|
||||
|
||||
@property
|
||||
def book(self) -> Workbook:
|
||||
"""
|
||||
Book instance of class openpyxl.workbook.Workbook.
|
||||
|
||||
This attribute can be used to access engine-specific features.
|
||||
"""
|
||||
return self._book
|
||||
|
||||
@property
|
||||
def sheets(self) -> dict[str, Any]:
|
||||
"""Mapping of sheet names to sheet objects."""
|
||||
result = {name: self.book[name] for name in self.book.sheetnames}
|
||||
return result
|
||||
|
||||
def _save(self) -> None:
|
||||
"""
|
||||
Save workbook to disk.
|
||||
"""
|
||||
self.book.save(self._handles.handle)
|
||||
if "r+" in self._mode and not isinstance(self._handles.handle, mmap.mmap):
|
||||
# truncate file to the written content
|
||||
self._handles.handle.truncate()
|
||||
|
||||
@classmethod
|
||||
def _convert_to_style_kwargs(cls, style_dict: dict) -> dict[str, Serialisable]:
|
||||
"""
|
||||
Convert a style_dict to a set of kwargs suitable for initializing
|
||||
or updating-on-copy an openpyxl v2 style object.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
style_dict : dict
|
||||
A dict with zero or more of the following keys (or their synonyms).
|
||||
'font'
|
||||
'fill'
|
||||
'border' ('borders')
|
||||
'alignment'
|
||||
'number_format'
|
||||
'protection'
|
||||
|
||||
Returns
|
||||
-------
|
||||
style_kwargs : dict
|
||||
A dict with the same, normalized keys as ``style_dict`` but each
|
||||
value has been replaced with a native openpyxl style object of the
|
||||
appropriate class.
|
||||
"""
|
||||
_style_key_map = {"borders": "border"}
|
||||
|
||||
style_kwargs: dict[str, Serialisable] = {}
|
||||
for k, v in style_dict.items():
|
||||
k = _style_key_map.get(k, k)
|
||||
_conv_to_x = getattr(cls, f"_convert_to_{k}", lambda x: None)
|
||||
new_v = _conv_to_x(v)
|
||||
if new_v:
|
||||
style_kwargs[k] = new_v
|
||||
|
||||
return style_kwargs
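# Illustration of the conversion above (input values are only an example):
# keys are normalized ("borders" -> "border") and each style group is turned
# into a native openpyxl object by the matching _convert_to_* classmethod.
# >>> OpenpyxlWriter._convert_to_style_kwargs(
# ...     {"font": {"bold": True}, "borders": {"bottom": "thin"}}
# ... )
# {"font": Font(bold=True), "border": Border(bottom=Side(style="thin"))}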
|
||||
|
||||
@classmethod
|
||||
def _convert_to_color(cls, color_spec):
|
||||
"""
|
||||
Convert ``color_spec`` to an openpyxl v2 Color object.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
color_spec : str, dict
|
||||
A 32-bit ARGB hex string, or a dict with zero or more of the
|
||||
following keys.
|
||||
'rgb'
|
||||
'indexed'
|
||||
'auto'
|
||||
'theme'
|
||||
'tint'
|
||||
'index'
|
||||
'type'
|
||||
|
||||
Returns
|
||||
-------
|
||||
color : openpyxl.styles.Color
|
||||
"""
|
||||
from openpyxl.styles import Color
|
||||
|
||||
if isinstance(color_spec, str):
|
||||
return Color(color_spec)
|
||||
else:
|
||||
return Color(**color_spec)
|
||||
|
||||
@classmethod
|
||||
def _convert_to_font(cls, font_dict):
|
||||
"""
|
||||
Convert ``font_dict`` to an openpyxl v2 Font object.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
font_dict : dict
|
||||
A dict with zero or more of the following keys (or their synonyms).
|
||||
'name'
|
||||
'size' ('sz')
|
||||
'bold' ('b')
|
||||
'italic' ('i')
|
||||
'underline' ('u')
|
||||
'strikethrough' ('strike')
|
||||
'color'
|
||||
'vertAlign' ('vertalign')
|
||||
'charset'
|
||||
'scheme'
|
||||
'family'
|
||||
'outline'
|
||||
'shadow'
|
||||
'condense'
|
||||
|
||||
Returns
|
||||
-------
|
||||
font : openpyxl.styles.Font
|
||||
"""
|
||||
from openpyxl.styles import Font
|
||||
|
||||
_font_key_map = {
|
||||
"sz": "size",
|
||||
"b": "bold",
|
||||
"i": "italic",
|
||||
"u": "underline",
|
||||
"strike": "strikethrough",
|
||||
"vertalign": "vertAlign",
|
||||
}
|
||||
|
||||
font_kwargs = {}
|
||||
for k, v in font_dict.items():
|
||||
k = _font_key_map.get(k, k)
|
||||
if k == "color":
|
||||
v = cls._convert_to_color(v)
|
||||
font_kwargs[k] = v
|
||||
|
||||
return Font(**font_kwargs)
|
||||
|
||||
@classmethod
|
||||
def _convert_to_stop(cls, stop_seq):
|
||||
"""
|
||||
Convert ``stop_seq`` to a list of openpyxl v2 Color objects,
|
||||
suitable for initializing the ``GradientFill`` ``stop`` parameter.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
stop_seq : iterable
|
||||
An iterable that yields objects suitable for consumption by
|
||||
``_convert_to_color``.
|
||||
|
||||
Returns
|
||||
-------
|
||||
stop : list of openpyxl.styles.Color
|
||||
"""
|
||||
return map(cls._convert_to_color, stop_seq)
|
||||
|
||||
@classmethod
|
||||
def _convert_to_fill(cls, fill_dict: dict[str, Any]):
|
||||
"""
|
||||
Convert ``fill_dict`` to an openpyxl v2 Fill object.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
fill_dict : dict
|
||||
A dict with one or more of the following keys (or their synonyms),
|
||||
'fill_type' ('patternType', 'patterntype')
|
||||
'start_color' ('fgColor', 'fgcolor')
|
||||
'end_color' ('bgColor', 'bgcolor')
|
||||
or one or more of the following keys (or their synonyms).
|
||||
'type' ('fill_type')
|
||||
'degree'
|
||||
'left'
|
||||
'right'
|
||||
'top'
|
||||
'bottom'
|
||||
'stop'
|
||||
|
||||
Returns
|
||||
-------
|
||||
fill : openpyxl.styles.Fill
|
||||
"""
|
||||
from openpyxl.styles import (
|
||||
GradientFill,
|
||||
PatternFill,
|
||||
)
|
||||
|
||||
_pattern_fill_key_map = {
|
||||
"patternType": "fill_type",
|
||||
"patterntype": "fill_type",
|
||||
"fgColor": "start_color",
|
||||
"fgcolor": "start_color",
|
||||
"bgColor": "end_color",
|
||||
"bgcolor": "end_color",
|
||||
}
|
||||
|
||||
_gradient_fill_key_map = {"fill_type": "type"}
|
||||
|
||||
pfill_kwargs = {}
|
||||
gfill_kwargs = {}
|
||||
for k, v in fill_dict.items():
|
||||
pk = _pattern_fill_key_map.get(k)
|
||||
gk = _gradient_fill_key_map.get(k)
|
||||
if pk in ["start_color", "end_color"]:
|
||||
v = cls._convert_to_color(v)
|
||||
if gk == "stop":
|
||||
v = cls._convert_to_stop(v)
|
||||
if pk:
|
||||
pfill_kwargs[pk] = v
|
||||
elif gk:
|
||||
gfill_kwargs[gk] = v
|
||||
else:
|
||||
pfill_kwargs[k] = v
|
||||
gfill_kwargs[k] = v
|
||||
|
||||
try:
|
||||
return PatternFill(**pfill_kwargs)
|
||||
except TypeError:
|
||||
return GradientFill(**gfill_kwargs)
|
||||
|
||||
@classmethod
|
||||
def _convert_to_side(cls, side_spec):
|
||||
"""
|
||||
Convert ``side_spec`` to an openpyxl v2 Side object.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
side_spec : str, dict
|
||||
A string specifying the border style, or a dict with zero or more
|
||||
of the following keys (or their synonyms).
|
||||
'style' ('border_style')
|
||||
'color'
|
||||
|
||||
Returns
|
||||
-------
|
||||
side : openpyxl.styles.Side
|
||||
"""
|
||||
from openpyxl.styles import Side
|
||||
|
||||
_side_key_map = {"border_style": "style"}
|
||||
|
||||
if isinstance(side_spec, str):
|
||||
return Side(style=side_spec)
|
||||
|
||||
side_kwargs = {}
|
||||
for k, v in side_spec.items():
|
||||
k = _side_key_map.get(k, k)
|
||||
if k == "color":
|
||||
v = cls._convert_to_color(v)
|
||||
side_kwargs[k] = v
|
||||
|
||||
return Side(**side_kwargs)
|
||||
|
||||
@classmethod
|
||||
def _convert_to_border(cls, border_dict):
|
||||
"""
|
||||
Convert ``border_dict`` to an openpyxl v2 Border object.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
border_dict : dict
|
||||
A dict with zero or more of the following keys (or their synonyms).
|
||||
'left'
|
||||
'right'
|
||||
'top'
|
||||
'bottom'
|
||||
'diagonal'
|
||||
'diagonal_direction'
|
||||
'vertical'
|
||||
'horizontal'
|
||||
'diagonalUp' ('diagonalup')
|
||||
'diagonalDown' ('diagonaldown')
|
||||
'outline'
|
||||
|
||||
Returns
|
||||
-------
|
||||
border : openpyxl.styles.Border
|
||||
"""
|
||||
from openpyxl.styles import Border
|
||||
|
||||
_border_key_map = {"diagonalup": "diagonalUp", "diagonaldown": "diagonalDown"}
|
||||
|
||||
border_kwargs = {}
|
||||
for k, v in border_dict.items():
|
||||
k = _border_key_map.get(k, k)
|
||||
if k == "color":
|
||||
v = cls._convert_to_color(v)
|
||||
if k in ["left", "right", "top", "bottom", "diagonal"]:
|
||||
v = cls._convert_to_side(v)
|
||||
border_kwargs[k] = v
|
||||
|
||||
return Border(**border_kwargs)
|
||||
|
||||
@classmethod
|
||||
def _convert_to_alignment(cls, alignment_dict):
|
||||
"""
|
||||
Convert ``alignment_dict`` to an openpyxl v2 Alignment object.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
alignment_dict : dict
|
||||
A dict with zero or more of the following keys (or their synonyms).
|
||||
'horizontal'
|
||||
'vertical'
|
||||
'text_rotation'
|
||||
'wrap_text'
|
||||
'shrink_to_fit'
|
||||
'indent'
|
||||
Returns
|
||||
-------
|
||||
alignment : openpyxl.styles.Alignment
|
||||
"""
|
||||
from openpyxl.styles import Alignment
|
||||
|
||||
return Alignment(**alignment_dict)
|
||||
|
||||
@classmethod
|
||||
def _convert_to_number_format(cls, number_format_dict):
|
||||
"""
|
||||
Convert ``number_format_dict`` to an openpyxl v2.1.0 number format
|
||||
initializer.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
number_format_dict : dict
|
||||
A dict with zero or more of the following keys.
|
||||
'format_code' : str
|
||||
|
||||
Returns
|
||||
-------
|
||||
number_format : str
|
||||
"""
|
||||
return number_format_dict["format_code"]
|
||||
|
||||
@classmethod
|
||||
def _convert_to_protection(cls, protection_dict):
|
||||
"""
|
||||
Convert ``protection_dict`` to an openpyxl v2 Protection object.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
protection_dict : dict
|
||||
A dict with zero or more of the following keys.
|
||||
'locked'
|
||||
'hidden'
|
||||
|
||||
Returns
|
||||
-------
|
||||
"""
|
||||
from openpyxl.styles import Protection
|
||||
|
||||
return Protection(**protection_dict)
|
||||
|
||||
def _write_cells(
|
||||
self,
|
||||
cells,
|
||||
sheet_name: str | None = None,
|
||||
startrow: int = 0,
|
||||
startcol: int = 0,
|
||||
freeze_panes: tuple[int, int] | None = None,
|
||||
) -> None:
|
||||
# Write the frame cells using openpyxl.
|
||||
sheet_name = self._get_sheet_name(sheet_name)
|
||||
|
||||
_style_cache: dict[str, dict[str, Serialisable]] = {}
|
||||
|
||||
if sheet_name in self.sheets and self._if_sheet_exists != "new":
|
||||
if "r+" in self._mode:
|
||||
if self._if_sheet_exists == "replace":
|
||||
old_wks = self.sheets[sheet_name]
|
||||
target_index = self.book.index(old_wks)
|
||||
del self.book[sheet_name]
|
||||
wks = self.book.create_sheet(sheet_name, target_index)
|
||||
elif self._if_sheet_exists == "error":
|
||||
raise ValueError(
|
||||
f"Sheet '{sheet_name}' already exists and "
|
||||
f"if_sheet_exists is set to 'error'."
|
||||
)
|
||||
elif self._if_sheet_exists == "overlay":
|
||||
wks = self.sheets[sheet_name]
|
||||
else:
|
||||
raise ValueError(
|
||||
f"'{self._if_sheet_exists}' is not valid for if_sheet_exists. "
|
||||
"Valid options are 'error', 'new', 'replace' and 'overlay'."
|
||||
)
|
||||
else:
|
||||
wks = self.sheets[sheet_name]
|
||||
else:
|
||||
wks = self.book.create_sheet()
|
||||
wks.title = sheet_name
|
||||
|
||||
if validate_freeze_panes(freeze_panes):
|
||||
freeze_panes = cast(tuple[int, int], freeze_panes)
|
||||
wks.freeze_panes = wks.cell(
|
||||
row=freeze_panes[0] + 1, column=freeze_panes[1] + 1
|
||||
)
|
||||
|
||||
for cell in cells:
|
||||
xcell = wks.cell(
|
||||
row=startrow + cell.row + 1, column=startcol + cell.col + 1
|
||||
)
|
||||
xcell.value, fmt = self._value_with_fmt(cell.val)
|
||||
if fmt:
|
||||
xcell.number_format = fmt
|
||||
|
||||
style_kwargs: dict[str, Serialisable] | None = {}
|
||||
if cell.style:
|
||||
key = str(cell.style)
|
||||
style_kwargs = _style_cache.get(key)
|
||||
if style_kwargs is None:
|
||||
style_kwargs = self._convert_to_style_kwargs(cell.style)
|
||||
_style_cache[key] = style_kwargs
|
||||
|
||||
if style_kwargs:
|
||||
for k, v in style_kwargs.items():
|
||||
setattr(xcell, k, v)
|
||||
|
||||
if cell.mergestart is not None and cell.mergeend is not None:
|
||||
wks.merge_cells(
|
||||
start_row=startrow + cell.row + 1,
|
||||
start_column=startcol + cell.col + 1,
|
||||
end_column=startcol + cell.mergeend + 1,
|
||||
end_row=startrow + cell.mergestart + 1,
|
||||
)
|
||||
|
||||
# When cells are merged only the top-left cell is preserved
|
||||
# The behaviour of the other cells in a merged range is
|
||||
# undefined
|
||||
if style_kwargs:
|
||||
first_row = startrow + cell.row + 1
|
||||
last_row = startrow + cell.mergestart + 1
|
||||
first_col = startcol + cell.col + 1
|
||||
last_col = startcol + cell.mergeend + 1
|
||||
|
||||
for row in range(first_row, last_row + 1):
|
||||
for col in range(first_col, last_col + 1):
|
||||
if row == first_row and col == first_col:
|
||||
# Ignore first cell. It is already handled.
|
||||
continue
|
||||
xcell = wks.cell(column=col, row=row)
|
||||
for k, v in style_kwargs.items():
|
||||
setattr(xcell, k, v)
|
||||
|
||||
|
||||
class OpenpyxlReader(BaseExcelReader["Workbook"]):
|
||||
@doc(storage_options=_shared_docs["storage_options"])
|
||||
def __init__(
|
||||
self,
|
||||
filepath_or_buffer: FilePath | ReadBuffer[bytes],
|
||||
storage_options: StorageOptions | None = None,
|
||||
engine_kwargs: dict | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Reader using openpyxl engine.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer : str, path object or Workbook
|
||||
Object to be parsed.
|
||||
{storage_options}
|
||||
engine_kwargs : dict, optional
|
||||
Arbitrary keyword arguments passed to excel engine.
|
||||
"""
|
||||
import_optional_dependency("openpyxl")
|
||||
super().__init__(
|
||||
filepath_or_buffer,
|
||||
storage_options=storage_options,
|
||||
engine_kwargs=engine_kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def _workbook_class(self) -> type[Workbook]:
|
||||
from openpyxl import Workbook
|
||||
|
||||
return Workbook
|
||||
|
||||
def load_workbook(
|
||||
self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
|
||||
) -> Workbook:
|
||||
from openpyxl import load_workbook
|
||||
|
||||
default_kwargs = {"read_only": True, "data_only": True, "keep_links": False}
|
||||
|
||||
return load_workbook(
|
||||
filepath_or_buffer,
|
||||
**(default_kwargs | engine_kwargs),
|
||||
)
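# Note: the dict union above lets caller-supplied engine_kwargs override the
# defaults, e.g. {"read_only": True, "data_only": True, "keep_links": False}
# | {"data_only": False} keeps read_only=True but flips data_only to False.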
|
||||
|
||||
@property
|
||||
def sheet_names(self) -> list[str]:
|
||||
return [sheet.title for sheet in self.book.worksheets]
|
||||
|
||||
def get_sheet_by_name(self, name: str):
|
||||
self.raise_if_bad_sheet_by_name(name)
|
||||
return self.book[name]
|
||||
|
||||
def get_sheet_by_index(self, index: int):
|
||||
self.raise_if_bad_sheet_by_index(index)
|
||||
return self.book.worksheets[index]
|
||||
|
||||
def _convert_cell(self, cell) -> Scalar:
|
||||
from openpyxl.cell.cell import (
|
||||
TYPE_ERROR,
|
||||
TYPE_NUMERIC,
|
||||
)
|
||||
|
||||
if cell.value is None:
|
||||
return "" # compat with xlrd
|
||||
elif cell.data_type == TYPE_ERROR:
|
||||
return np.nan
|
||||
elif cell.data_type == TYPE_NUMERIC:
|
||||
val = int(cell.value)
|
||||
if val == cell.value:
|
||||
return val
|
||||
return float(cell.value)
|
||||
|
||||
return cell.value
|
||||
|
||||
def get_sheet_data(
|
||||
self, sheet, file_rows_needed: int | None = None
|
||||
) -> list[list[Scalar]]:
|
||||
if self.book.read_only:
|
||||
sheet.reset_dimensions()
|
||||
|
||||
data: list[list[Scalar]] = []
|
||||
last_row_with_data = -1
|
||||
for row_number, row in enumerate(sheet.rows):
|
||||
converted_row = [self._convert_cell(cell) for cell in row]
|
||||
while converted_row and converted_row[-1] == "":
|
||||
# trim trailing empty elements
|
||||
converted_row.pop()
|
||||
if converted_row:
|
||||
last_row_with_data = row_number
|
||||
data.append(converted_row)
|
||||
if file_rows_needed is not None and len(data) >= file_rows_needed:
|
||||
break
|
||||
|
||||
# Trim trailing empty rows
|
||||
data = data[: last_row_with_data + 1]
|
||||
|
||||
if len(data) > 0:
|
||||
# extend rows to max width
|
||||
max_width = max(len(data_row) for data_row in data)
|
||||
if min(len(data_row) for data_row in data) < max_width:
|
||||
empty_cell: list[Scalar] = [""]
|
||||
data = [
|
||||
data_row + (max_width - len(data_row)) * empty_cell
|
||||
for data_row in data
|
||||
]
|
||||
|
||||
return data
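# Worked example of the padding step above (illustrative data): ragged rows
# are right-padded with "" up to the widest row, e.g.
# [["a", "b", "c"], ["d"]] -> [["a", "b", "c"], ["d", "", ""]].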
|
||||
127
lib/python3.11/site-packages/pandas/io/excel/_pyxlsb.py
Normal file
@ -0,0 +1,127 @@
|
||||
# pyright: reportMissingImports=false
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.util._decorators import doc
|
||||
|
||||
from pandas.core.shared_docs import _shared_docs
|
||||
|
||||
from pandas.io.excel._base import BaseExcelReader
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pyxlsb import Workbook
|
||||
|
||||
from pandas._typing import (
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
Scalar,
|
||||
StorageOptions,
|
||||
)
|
||||
|
||||
|
||||
class PyxlsbReader(BaseExcelReader["Workbook"]):
|
||||
@doc(storage_options=_shared_docs["storage_options"])
|
||||
def __init__(
|
||||
self,
|
||||
filepath_or_buffer: FilePath | ReadBuffer[bytes],
|
||||
storage_options: StorageOptions | None = None,
|
||||
engine_kwargs: dict | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Reader using pyxlsb engine.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer : str, path object, or Workbook
|
||||
Object to be parsed.
|
||||
{storage_options}
|
||||
engine_kwargs : dict, optional
|
||||
Arbitrary keyword arguments passed to excel engine.
|
||||
"""
|
||||
import_optional_dependency("pyxlsb")
|
||||
# This will call load_workbook on the filepath or buffer
|
||||
# And set the result to the book-attribute
|
||||
super().__init__(
|
||||
filepath_or_buffer,
|
||||
storage_options=storage_options,
|
||||
engine_kwargs=engine_kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def _workbook_class(self) -> type[Workbook]:
|
||||
from pyxlsb import Workbook
|
||||
|
||||
return Workbook
|
||||
|
||||
def load_workbook(
|
||||
self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
|
||||
) -> Workbook:
|
||||
from pyxlsb import open_workbook
|
||||
|
||||
# TODO: hack in buffer capability
|
||||
# This might need some modifications to the Pyxlsb library
|
||||
# Actual work for opening it is in xlsbpackage.py, line 20-ish
|
||||
|
||||
return open_workbook(filepath_or_buffer, **engine_kwargs)
|
||||
|
||||
@property
|
||||
def sheet_names(self) -> list[str]:
|
||||
return self.book.sheets
|
||||
|
||||
def get_sheet_by_name(self, name: str):
|
||||
self.raise_if_bad_sheet_by_name(name)
|
||||
return self.book.get_sheet(name)
|
||||
|
||||
def get_sheet_by_index(self, index: int):
|
||||
self.raise_if_bad_sheet_by_index(index)
|
||||
# pyxlsb sheets are indexed from 1 onwards
|
||||
# There's a fix for this in the source, but the pypi package doesn't have it
|
||||
return self.book.get_sheet(index + 1)
|
||||
|
||||
def _convert_cell(self, cell) -> Scalar:
|
||||
# TODO: there is no way to distinguish between floats and datetimes in pyxlsb
|
||||
# This means that there is no way to read datetime types from an xlsb file yet
|
||||
if cell.v is None:
|
||||
return "" # Prevents non-named columns from not showing up as Unnamed: i
|
||||
if isinstance(cell.v, float):
|
||||
val = int(cell.v)
|
||||
if val == cell.v:
|
||||
return val
|
||||
else:
|
||||
return float(cell.v)
|
||||
|
||||
return cell.v
|
||||
|
||||
def get_sheet_data(
|
||||
self,
|
||||
sheet,
|
||||
file_rows_needed: int | None = None,
|
||||
) -> list[list[Scalar]]:
|
||||
data: list[list[Scalar]] = []
|
||||
previous_row_number = -1
|
||||
# When sparse=True the rows can have different lengths and empty rows are
|
||||
# not returned. The cells are namedtuples of row, col, value (r, c, v).
|
||||
for row in sheet.rows(sparse=True):
|
||||
row_number = row[0].r
|
||||
converted_row = [self._convert_cell(cell) for cell in row]
|
||||
while converted_row and converted_row[-1] == "":
|
||||
# trim trailing empty elements
|
||||
converted_row.pop()
|
||||
if converted_row:
|
||||
data.extend([[]] * (row_number - previous_row_number - 1))
|
||||
data.append(converted_row)
|
||||
previous_row_number = row_number
|
||||
if file_rows_needed is not None and len(data) >= file_rows_needed:
|
||||
break
|
||||
if data:
|
||||
# extend rows to max_width
|
||||
max_width = max(len(data_row) for data_row in data)
|
||||
if min(len(data_row) for data_row in data) < max_width:
|
||||
empty_cell: list[Scalar] = [""]
|
||||
data = [
|
||||
data_row + (max_width - len(data_row)) * empty_cell
|
||||
for data_row in data
|
||||
]
|
||||
return data
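# Worked example of the sparse-row handling above (illustrative data): if
# pyxlsb yields rows only at indices 0 and 3, the two skipped rows are
# re-inserted as empty lists, giving [row0, [], [], row3].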
|
||||
334
lib/python3.11/site-packages/pandas/io/excel/_util.py
Normal file
@ -0,0 +1,334 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import (
|
||||
Hashable,
|
||||
Iterable,
|
||||
MutableMapping,
|
||||
Sequence,
|
||||
)
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Callable,
|
||||
Literal,
|
||||
TypeVar,
|
||||
overload,
|
||||
)
|
||||
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_integer,
|
||||
is_list_like,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas.io.excel._base import ExcelWriter
|
||||
|
||||
ExcelWriter_t = type[ExcelWriter]
|
||||
usecols_func = TypeVar("usecols_func", bound=Callable[[Hashable], object])
|
||||
|
||||
_writers: MutableMapping[str, ExcelWriter_t] = {}
|
||||
|
||||
|
||||
def register_writer(klass: ExcelWriter_t) -> None:
|
||||
"""
|
||||
Add engine to the excel writer registry (``io.excel``).
|
||||
|
||||
You must use this method to integrate with ``to_excel``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
klass : ExcelWriter
|
||||
"""
|
||||
if not callable(klass):
|
||||
raise ValueError("Can only register callables as engines")
|
||||
engine_name = klass._engine
|
||||
_writers[engine_name] = klass
|
||||
|
||||
|
||||
def get_default_engine(ext: str, mode: Literal["reader", "writer"] = "reader") -> str:
|
||||
"""
|
||||
Return the default reader/writer for the given extension.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ext : str
|
||||
The excel file extension for which to get the default engine.
|
||||
mode : str {'reader', 'writer'}
|
||||
Whether to get the default engine for reading or writing.
|
||||
Either 'reader' or 'writer'
|
||||
|
||||
Returns
|
||||
-------
|
||||
str
|
||||
The default engine for the extension.
|
||||
"""
|
||||
_default_readers = {
|
||||
"xlsx": "openpyxl",
|
||||
"xlsm": "openpyxl",
|
||||
"xlsb": "pyxlsb",
|
||||
"xls": "xlrd",
|
||||
"ods": "odf",
|
||||
}
|
||||
_default_writers = {
|
||||
"xlsx": "openpyxl",
|
||||
"xlsm": "openpyxl",
|
||||
"xlsb": "pyxlsb",
|
||||
"ods": "odf",
|
||||
}
|
||||
assert mode in ["reader", "writer"]
|
||||
if mode == "writer":
|
||||
# Prefer xlsxwriter over openpyxl if installed
|
||||
xlsxwriter = import_optional_dependency("xlsxwriter", errors="warn")
|
||||
if xlsxwriter:
|
||||
_default_writers["xlsx"] = "xlsxwriter"
|
||||
return _default_writers[ext]
|
||||
else:
|
||||
return _default_readers[ext]
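# For example (results independent of optional dependencies):
# >>> get_default_engine("xls")
# 'xlrd'
# >>> get_default_engine("ods", mode="writer")
# 'odf'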
|
||||
|
||||
|
||||
def get_writer(engine_name: str) -> ExcelWriter_t:
|
||||
try:
|
||||
return _writers[engine_name]
|
||||
except KeyError as err:
|
||||
raise ValueError(f"No Excel writer '{engine_name}'") from err
|
||||
|
||||
|
||||
def _excel2num(x: str) -> int:
|
||||
"""
|
||||
Convert Excel column name like 'AB' to 0-based column index.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : str
|
||||
The Excel column name to convert to a 0-based column index.
|
||||
|
||||
Returns
|
||||
-------
|
||||
num : int
|
||||
The column index corresponding to the name.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
Part of the Excel column name was invalid.
|
||||
"""
|
||||
index = 0
|
||||
|
||||
for c in x.upper().strip():
|
||||
cp = ord(c)
|
||||
|
||||
if cp < ord("A") or cp > ord("Z"):
|
||||
raise ValueError(f"Invalid column name: {x}")
|
||||
|
||||
index = index * 26 + cp - ord("A") + 1
|
||||
|
||||
return index - 1
|
||||
|
||||
|
||||
def _range2cols(areas: str) -> list[int]:
|
||||
"""
|
||||
Convert comma separated list of column names and ranges to indices.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
areas : str
|
||||
A string containing a sequence of column ranges (or areas).
|
||||
|
||||
Returns
|
||||
-------
|
||||
cols : list
|
||||
A list of 0-based column indices.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> _range2cols('A:E')
|
||||
[0, 1, 2, 3, 4]
|
||||
>>> _range2cols('A,C,Z:AB')
|
||||
[0, 2, 25, 26, 27]
|
||||
"""
|
||||
cols: list[int] = []
|
||||
|
||||
for rng in areas.split(","):
|
||||
if ":" in rng:
|
||||
rngs = rng.split(":")
|
||||
cols.extend(range(_excel2num(rngs[0]), _excel2num(rngs[1]) + 1))
|
||||
else:
|
||||
cols.append(_excel2num(rng))
|
||||
|
||||
return cols
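# A short worked example of the two helpers above ("AB" is bijective base-26:
# 1 * 26 + 2 - 1 = 27); the values mirror the docstring examples.
# >>> _excel2num("A"), _excel2num("AB")
# (0, 27)
# >>> _range2cols("A,C,Z:AB")
# [0, 2, 25, 26, 27]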
|
||||
|
||||
|
||||
@overload
|
||||
def maybe_convert_usecols(usecols: str | list[int]) -> list[int]:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def maybe_convert_usecols(usecols: list[str]) -> list[str]:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def maybe_convert_usecols(usecols: usecols_func) -> usecols_func:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def maybe_convert_usecols(usecols: None) -> None:
|
||||
...
|
||||
|
||||
|
||||
def maybe_convert_usecols(
|
||||
usecols: str | list[int] | list[str] | usecols_func | None,
|
||||
) -> None | list[int] | list[str] | usecols_func:
|
||||
"""
|
||||
Convert `usecols` into a compatible format for parsing in `parsers.py`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
usecols : object
|
||||
The use-columns object to potentially convert.
|
||||
|
||||
Returns
|
||||
-------
|
||||
converted : object
|
||||
The compatible format of `usecols`.
|
||||
"""
|
||||
if usecols is None:
|
||||
return usecols
|
||||
|
||||
if is_integer(usecols):
|
||||
raise ValueError(
|
||||
"Passing an integer for `usecols` is no longer supported. "
|
||||
"Please pass in a list of int from 0 to `usecols` inclusive instead."
|
||||
)
|
||||
|
||||
if isinstance(usecols, str):
|
||||
return _range2cols(usecols)
|
||||
|
||||
return usecols
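# For example: a string of Excel ranges becomes 0-based indices, lists pass
# through unchanged, and a bare integer raises ValueError.
# >>> maybe_convert_usecols("A,C:D")
# [0, 2, 3]
# >>> maybe_convert_usecols([0, 2])
# [0, 2]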
|
||||
|
||||
|
||||
@overload
|
||||
def validate_freeze_panes(freeze_panes: tuple[int, int]) -> Literal[True]:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def validate_freeze_panes(freeze_panes: None) -> Literal[False]:
|
||||
...
|
||||
|
||||
|
||||
def validate_freeze_panes(freeze_panes: tuple[int, int] | None) -> bool:
|
||||
if freeze_panes is not None:
|
||||
if len(freeze_panes) == 2 and all(
|
||||
isinstance(item, int) for item in freeze_panes
|
||||
):
|
||||
return True
|
||||
|
||||
raise ValueError(
|
||||
"freeze_panes must be of form (row, column) "
|
||||
"where row and column are integers"
|
||||
)
|
||||
|
||||
# freeze_panes wasn't specified, return False so it won't be applied
|
||||
# to output sheet
|
||||
return False
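# For example:
# >>> validate_freeze_panes((1, 2))
# True
# >>> validate_freeze_panes(None)
# False
# >>> validate_freeze_panes((1,))  # raises ValueError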
|
||||
|
||||
|
||||
def fill_mi_header(
|
||||
row: list[Hashable], control_row: list[bool]
|
||||
) -> tuple[list[Hashable], list[bool]]:
|
||||
"""
|
||||
Forward fill blank entries in row but only inside the same parent index.
|
||||
|
||||
Used for creating headers in Multiindex.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
row : list
|
||||
List of items in a single row.
|
||||
control_row : list of bool
|
||||
Helps to determine if particular column is in same parent index as the
|
||||
previous value. Used to stop propagation of empty cells between
|
||||
different indexes.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Returns changed row and control_row
|
||||
"""
|
||||
last = row[0]
|
||||
for i in range(1, len(row)):
|
||||
if not control_row[i]:
|
||||
last = row[i]
|
||||
|
||||
if row[i] == "" or row[i] is None:
|
||||
row[i] = last
|
||||
else:
|
||||
control_row[i] = False
|
||||
last = row[i]
|
||||
|
||||
return row, control_row
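# Worked example (illustrative data): blanks inherit the value to their left
# while control_row still marks the column as part of the same parent index.
# >>> fill_mi_header(["a", "", "b", ""], [True, True, True, True])
# (['a', 'a', 'b', 'b'], [True, True, False, True])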
|
||||
|
||||
|
||||
def pop_header_name(
|
||||
row: list[Hashable], index_col: int | Sequence[int]
|
||||
) -> tuple[Hashable | None, list[Hashable]]:
|
||||
"""
|
||||
Pop the header name for MultiIndex parsing.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
row : list
|
||||
The data row to parse for the header name.
|
||||
index_col : int, list
|
||||
The index columns for our data. Assumed to be non-null.
|
||||
|
||||
Returns
|
||||
-------
|
||||
header_name : str
|
||||
The extracted header name.
|
||||
trimmed_row : list
|
||||
The original data row with the header name removed.
|
||||
"""
|
||||
# Pop out header name and fill w/blank.
|
||||
if is_list_like(index_col):
|
||||
assert isinstance(index_col, Iterable)
|
||||
i = max(index_col)
|
||||
else:
|
||||
assert not isinstance(index_col, Iterable)
|
||||
i = index_col
|
||||
|
||||
header_name = row[i]
|
||||
header_name = None if header_name == "" else header_name
|
||||
|
||||
return header_name, row[:i] + [""] + row[i + 1 :]
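# Worked example (illustrative data): with index_col=0 the index name is
# popped out of the header row and replaced by an empty string.
# >>> pop_header_name(["idx", "col1", "col2"], index_col=0)
# ('idx', ['', 'col1', 'col2'])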
|
||||
|
||||
|
||||
def combine_kwargs(engine_kwargs: dict[str, Any] | None, kwargs: dict) -> dict:
|
||||
"""
|
||||
Used to combine two sources of kwargs for the backend engine.
|
||||
|
||||
Use of kwargs is deprecated, this function is solely for use in 1.3 and should
|
||||
be removed in 1.4/2.0. Also _base.ExcelWriter.__new__ ensures either engine_kwargs
|
||||
or kwargs must be None or empty respectively.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
engine_kwargs: dict
|
||||
kwargs to be passed through to the engine.
|
||||
kwargs: dict
|
||||
kwargs to be passed through to the engine (deprecated)
|
||||
|
||||
Returns
|
||||
-------
|
||||
engine_kwargs combined with kwargs
|
||||
"""
|
||||
if engine_kwargs is None:
|
||||
result = {}
|
||||
else:
|
||||
result = engine_kwargs.copy()
|
||||
result.update(kwargs)
|
||||
return result
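# For example, the (deprecated) kwargs take precedence on key collisions:
# >>> combine_kwargs({"a": 1}, {"a": 2, "b": 3})
# {'a': 2, 'b': 3}
# >>> combine_kwargs(None, {"b": 3})
# {'b': 3}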
|
||||
143
lib/python3.11/site-packages/pandas/io/excel/_xlrd.py
Normal file
@ -0,0 +1,143 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import time
|
||||
import math
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.util._decorators import doc
|
||||
|
||||
from pandas.core.shared_docs import _shared_docs
|
||||
|
||||
from pandas.io.excel._base import BaseExcelReader
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from xlrd import Book
|
||||
|
||||
from pandas._typing import (
|
||||
Scalar,
|
||||
StorageOptions,
|
||||
)
|
||||
|
||||
|
||||
class XlrdReader(BaseExcelReader["Book"]):
|
||||
@doc(storage_options=_shared_docs["storage_options"])
|
||||
def __init__(
|
||||
self,
|
||||
filepath_or_buffer,
|
||||
storage_options: StorageOptions | None = None,
|
||||
engine_kwargs: dict | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Reader using xlrd engine.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer : str, path object or Workbook
|
||||
Object to be parsed.
|
||||
{storage_options}
|
||||
engine_kwargs : dict, optional
|
||||
Arbitrary keyword arguments passed to excel engine.
|
||||
"""
|
||||
err_msg = "Install xlrd >= 2.0.1 for xls Excel support"
|
||||
import_optional_dependency("xlrd", extra=err_msg)
|
||||
super().__init__(
|
||||
filepath_or_buffer,
|
||||
storage_options=storage_options,
|
||||
engine_kwargs=engine_kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def _workbook_class(self) -> type[Book]:
|
||||
from xlrd import Book
|
||||
|
||||
return Book
|
||||
|
||||
def load_workbook(self, filepath_or_buffer, engine_kwargs) -> Book:
|
||||
from xlrd import open_workbook
|
||||
|
||||
if hasattr(filepath_or_buffer, "read"):
|
||||
data = filepath_or_buffer.read()
|
||||
return open_workbook(file_contents=data, **engine_kwargs)
|
||||
else:
|
||||
return open_workbook(filepath_or_buffer, **engine_kwargs)
|
||||
|
||||
@property
|
||||
def sheet_names(self):
|
||||
return self.book.sheet_names()
|
||||
|
||||
def get_sheet_by_name(self, name):
|
||||
self.raise_if_bad_sheet_by_name(name)
|
||||
return self.book.sheet_by_name(name)
|
||||
|
||||
def get_sheet_by_index(self, index):
|
||||
self.raise_if_bad_sheet_by_index(index)
|
||||
return self.book.sheet_by_index(index)
|
||||
|
||||
def get_sheet_data(
|
||||
self, sheet, file_rows_needed: int | None = None
|
||||
) -> list[list[Scalar]]:
|
||||
from xlrd import (
|
||||
XL_CELL_BOOLEAN,
|
||||
XL_CELL_DATE,
|
||||
XL_CELL_ERROR,
|
||||
XL_CELL_NUMBER,
|
||||
xldate,
|
||||
)
|
||||
|
||||
epoch1904 = self.book.datemode
|
||||
|
||||
def _parse_cell(cell_contents, cell_typ):
|
||||
"""
|
||||
Convert the contents of the cell into a pandas-appropriate object.
|
||||
"""
|
||||
if cell_typ == XL_CELL_DATE:
|
||||
# Use the newer xlrd datetime handling.
|
||||
try:
|
||||
cell_contents = xldate.xldate_as_datetime(cell_contents, epoch1904)
|
||||
except OverflowError:
|
||||
return cell_contents
|
||||
|
||||
# Excel doesn't distinguish between dates and time,
|
||||
# so we treat dates on the epoch as times only.
|
||||
# Also, Excel supports 1900 and 1904 epochs.
|
||||
year = (cell_contents.timetuple())[0:3]
|
||||
if (not epoch1904 and year == (1899, 12, 31)) or (
|
||||
epoch1904 and year == (1904, 1, 1)
|
||||
):
|
||||
cell_contents = time(
|
||||
cell_contents.hour,
|
||||
cell_contents.minute,
|
||||
cell_contents.second,
|
||||
cell_contents.microsecond,
|
||||
)
|
||||
|
||||
elif cell_typ == XL_CELL_ERROR:
|
||||
cell_contents = np.nan
|
||||
elif cell_typ == XL_CELL_BOOLEAN:
|
||||
cell_contents = bool(cell_contents)
|
||||
elif cell_typ == XL_CELL_NUMBER:
|
||||
# GH5394 - Excel 'numbers' are always floats
|
||||
# it's a minimal perf hit and less surprising
|
||||
if math.isfinite(cell_contents):
|
||||
# GH54564 - don't attempt to convert NaN/Inf
|
||||
val = int(cell_contents)
|
||||
if val == cell_contents:
|
||||
cell_contents = val
|
||||
return cell_contents
|
||||
|
||||
data = []
|
||||
|
||||
nrows = sheet.nrows
|
||||
if file_rows_needed is not None:
|
||||
nrows = min(nrows, file_rows_needed)
|
||||
for i in range(nrows):
|
||||
row = [
|
||||
_parse_cell(value, typ)
|
||||
for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
|
||||
]
|
||||
data.append(row)
|
||||
|
||||
return data
|
||||
284
lib/python3.11/site-packages/pandas/io/excel/_xlsxwriter.py
Normal file
@ -0,0 +1,284 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
)
|
||||
|
||||
from pandas.io.excel._base import ExcelWriter
|
||||
from pandas.io.excel._util import (
|
||||
combine_kwargs,
|
||||
validate_freeze_panes,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
ExcelWriterIfSheetExists,
|
||||
FilePath,
|
||||
StorageOptions,
|
||||
WriteExcelBuffer,
|
||||
)
|
||||
|
||||
|
||||
class _XlsxStyler:
|
||||
# Map from openpyxl-oriented styles to flatter xlsxwriter representation
|
||||
# Ordering necessary for both determinism and because some are keyed by
|
||||
# prefixes of others.
|
||||
STYLE_MAPPING: dict[str, list[tuple[tuple[str, ...], str]]] = {
|
||||
"font": [
|
||||
(("name",), "font_name"),
|
||||
(("sz",), "font_size"),
|
||||
(("size",), "font_size"),
|
||||
(("color", "rgb"), "font_color"),
|
||||
(("color",), "font_color"),
|
||||
(("b",), "bold"),
|
||||
(("bold",), "bold"),
|
||||
(("i",), "italic"),
|
||||
(("italic",), "italic"),
|
||||
(("u",), "underline"),
|
||||
(("underline",), "underline"),
|
||||
(("strike",), "font_strikeout"),
|
||||
(("vertAlign",), "font_script"),
|
||||
(("vertalign",), "font_script"),
|
||||
],
|
||||
"number_format": [(("format_code",), "num_format"), ((), "num_format")],
|
||||
"protection": [(("locked",), "locked"), (("hidden",), "hidden")],
|
||||
"alignment": [
|
||||
(("horizontal",), "align"),
|
||||
(("vertical",), "valign"),
|
||||
(("text_rotation",), "rotation"),
|
||||
(("wrap_text",), "text_wrap"),
|
||||
(("indent",), "indent"),
|
||||
(("shrink_to_fit",), "shrink"),
|
||||
],
|
||||
"fill": [
|
||||
(("patternType",), "pattern"),
|
||||
(("patterntype",), "pattern"),
|
||||
(("fill_type",), "pattern"),
|
||||
(("start_color", "rgb"), "fg_color"),
|
||||
(("fgColor", "rgb"), "fg_color"),
|
||||
(("fgcolor", "rgb"), "fg_color"),
|
||||
(("start_color",), "fg_color"),
|
||||
(("fgColor",), "fg_color"),
|
||||
(("fgcolor",), "fg_color"),
|
||||
(("end_color", "rgb"), "bg_color"),
|
||||
(("bgColor", "rgb"), "bg_color"),
|
||||
(("bgcolor", "rgb"), "bg_color"),
|
||||
(("end_color",), "bg_color"),
|
||||
(("bgColor",), "bg_color"),
|
||||
(("bgcolor",), "bg_color"),
|
||||
],
|
||||
"border": [
|
||||
(("color", "rgb"), "border_color"),
|
||||
(("color",), "border_color"),
|
||||
(("style",), "border"),
|
||||
(("top", "color", "rgb"), "top_color"),
|
||||
(("top", "color"), "top_color"),
|
||||
(("top", "style"), "top"),
|
||||
(("top",), "top"),
|
||||
(("right", "color", "rgb"), "right_color"),
|
||||
(("right", "color"), "right_color"),
|
||||
(("right", "style"), "right"),
|
||||
(("right",), "right"),
|
||||
(("bottom", "color", "rgb"), "bottom_color"),
|
||||
(("bottom", "color"), "bottom_color"),
|
||||
(("bottom", "style"), "bottom"),
|
||||
(("bottom",), "bottom"),
|
||||
(("left", "color", "rgb"), "left_color"),
|
||||
(("left", "color"), "left_color"),
|
||||
(("left", "style"), "left"),
|
||||
(("left",), "left"),
|
||||
],
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def convert(cls, style_dict, num_format_str=None):
|
||||
"""
|
||||
converts a style_dict to an xlsxwriter format dict
|
||||
|
||||
Parameters
|
||||
----------
|
||||
style_dict : style dictionary to convert
|
||||
num_format_str : optional number format string
|
||||
"""
|
||||
# Create a XlsxWriter format object.
|
||||
props = {}
|
||||
|
||||
if num_format_str is not None:
|
||||
props["num_format"] = num_format_str
|
||||
|
||||
if style_dict is None:
|
||||
return props
|
||||
|
||||
if "borders" in style_dict:
|
||||
style_dict = style_dict.copy()
|
||||
style_dict["border"] = style_dict.pop("borders")
|
||||
|
||||
for style_group_key, style_group in style_dict.items():
|
||||
for src, dst in cls.STYLE_MAPPING.get(style_group_key, []):
|
||||
# src is a sequence of keys into a nested dict
|
||||
# dst is a flat key
|
||||
if dst in props:
|
||||
continue
|
||||
v = style_group
|
||||
for k in src:
|
||||
try:
|
||||
v = v[k]
|
||||
except (KeyError, TypeError):
|
||||
break
|
||||
else:
|
||||
props[dst] = v
|
||||
|
||||
if isinstance(props.get("pattern"), str):
|
||||
# TODO: support other fill patterns
|
||||
props["pattern"] = 0 if props["pattern"] == "none" else 1
|
||||
|
||||
for k in ["border", "top", "right", "bottom", "left"]:
|
||||
if isinstance(props.get(k), str):
|
||||
try:
|
||||
props[k] = [
|
||||
"none",
|
||||
"thin",
|
||||
"medium",
|
||||
"dashed",
|
||||
"dotted",
|
||||
"thick",
|
||||
"double",
|
||||
"hair",
|
||||
"mediumDashed",
|
||||
"dashDot",
|
||||
"mediumDashDot",
|
||||
"dashDotDot",
|
||||
"mediumDashDotDot",
|
||||
"slantDashDot",
|
||||
].index(props[k])
|
||||
except ValueError:
|
||||
props[k] = 2
|
||||
|
||||
if isinstance(props.get("font_script"), str):
|
||||
props["font_script"] = ["baseline", "superscript", "subscript"].index(
|
||||
props["font_script"]
|
||||
)
|
||||
|
||||
if isinstance(props.get("underline"), str):
|
||||
props["underline"] = {
|
||||
"none": 0,
|
||||
"single": 1,
|
||||
"double": 2,
|
||||
"singleAccounting": 33,
|
||||
"doubleAccounting": 34,
|
||||
}[props["underline"]]
|
||||
|
||||
# GH 30107 - xlsxwriter uses different name
|
||||
if props.get("valign") == "center":
|
||||
props["valign"] = "vcenter"
|
||||
|
||||
return props
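# Illustration of the conversion above (input values are only an example):
# >>> _XlsxStyler.convert(
# ...     {"font": {"bold": True, "color": {"rgb": "FF0000"}},
# ...      "alignment": {"horizontal": "center", "vertical": "center"}},
# ...     num_format_str="0.00",
# ... )
# {'num_format': '0.00', 'font_color': 'FF0000', 'bold': True,
#  'align': 'center', 'valign': 'vcenter'}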
|
||||
|
||||
|
||||
class XlsxWriter(ExcelWriter):
|
||||
_engine = "xlsxwriter"
|
||||
_supported_extensions = (".xlsx",)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path: FilePath | WriteExcelBuffer | ExcelWriter,
|
||||
engine: str | None = None,
|
||||
date_format: str | None = None,
|
||||
datetime_format: str | None = None,
|
||||
mode: str = "w",
|
||||
storage_options: StorageOptions | None = None,
|
||||
if_sheet_exists: ExcelWriterIfSheetExists | None = None,
|
||||
engine_kwargs: dict[str, Any] | None = None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
# Use the xlsxwriter module as the Excel writer.
|
||||
from xlsxwriter import Workbook
|
||||
|
||||
engine_kwargs = combine_kwargs(engine_kwargs, kwargs)
|
||||
|
||||
if mode == "a":
|
||||
raise ValueError("Append mode is not supported with xlsxwriter!")
|
||||
|
||||
super().__init__(
|
||||
path,
|
||||
engine=engine,
|
||||
date_format=date_format,
|
||||
datetime_format=datetime_format,
|
||||
mode=mode,
|
||||
storage_options=storage_options,
|
||||
if_sheet_exists=if_sheet_exists,
|
||||
engine_kwargs=engine_kwargs,
|
||||
)
|
||||
|
||||
try:
|
||||
self._book = Workbook(self._handles.handle, **engine_kwargs)
|
||||
except TypeError:
|
||||
self._handles.handle.close()
|
||||
raise
|
||||
|
||||
@property
|
||||
def book(self):
|
||||
"""
|
||||
Book instance of class xlsxwriter.Workbook.
|
||||
|
||||
This attribute can be used to access engine-specific features.
|
||||
"""
|
||||
return self._book
|
||||
|
||||
@property
|
||||
def sheets(self) -> dict[str, Any]:
|
||||
result = self.book.sheetnames
|
||||
return result
|
||||
|
||||
def _save(self) -> None:
|
||||
"""
|
||||
Save workbook to disk.
|
||||
"""
|
||||
self.book.close()
|
||||
|
||||
def _write_cells(
|
||||
self,
|
||||
cells,
|
||||
sheet_name: str | None = None,
|
||||
startrow: int = 0,
|
||||
startcol: int = 0,
|
||||
freeze_panes: tuple[int, int] | None = None,
|
||||
) -> None:
|
||||
# Write the frame cells using xlsxwriter.
|
||||
sheet_name = self._get_sheet_name(sheet_name)
|
||||
|
||||
wks = self.book.get_worksheet_by_name(sheet_name)
|
||||
if wks is None:
|
||||
wks = self.book.add_worksheet(sheet_name)
|
||||
|
||||
style_dict = {"null": None}
|
||||
|
||||
if validate_freeze_panes(freeze_panes):
|
||||
wks.freeze_panes(*(freeze_panes))
|
||||
|
||||
for cell in cells:
|
||||
val, fmt = self._value_with_fmt(cell.val)
|
||||
|
||||
stylekey = json.dumps(cell.style)
|
||||
if fmt:
|
||||
stylekey += fmt
|
||||
|
||||
if stylekey in style_dict:
|
||||
style = style_dict[stylekey]
|
||||
else:
|
||||
style = self.book.add_format(_XlsxStyler.convert(cell.style, fmt))
|
||||
style_dict[stylekey] = style
|
||||
|
||||
if cell.mergestart is not None and cell.mergeend is not None:
|
||||
wks.merge_range(
|
||||
startrow + cell.row,
|
||||
startcol + cell.col,
|
||||
startrow + cell.mergestart,
|
||||
startcol + cell.mergeend,
|
||||
val,
|
||||
style,
|
||||
)
|
||||
else:
|
||||
wks.write(startrow + cell.row, startcol + cell.col, val, style)
|
||||
130
lib/python3.11/site-packages/pandas/io/feather_format.py
Normal file
@ -0,0 +1,130 @@
|
||||
""" feather-format compat """
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
)
|
||||
|
||||
from pandas._config import using_string_dtype
|
||||
|
||||
from pandas._libs import lib
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.util._decorators import doc
|
||||
from pandas.util._validators import check_dtype_backend
|
||||
|
||||
from pandas.core.api import DataFrame
|
||||
from pandas.core.shared_docs import _shared_docs
|
||||
|
||||
from pandas.io._util import arrow_table_to_pandas
|
||||
from pandas.io.common import get_handle
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import (
|
||||
Hashable,
|
||||
Sequence,
|
||||
)
|
||||
|
||||
from pandas._typing import (
|
||||
DtypeBackend,
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
StorageOptions,
|
||||
WriteBuffer,
|
||||
)
|
||||
|
||||
|
||||
@doc(storage_options=_shared_docs["storage_options"])
|
||||
def to_feather(
|
||||
df: DataFrame,
|
||||
path: FilePath | WriteBuffer[bytes],
|
||||
storage_options: StorageOptions | None = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""
|
||||
Write a DataFrame to the binary Feather format.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : DataFrame
|
||||
path : str, path object, or file-like object
|
||||
{storage_options}
|
||||
**kwargs :
|
||||
Additional keywords passed to `pyarrow.feather.write_feather`.
|
||||
|
||||
"""
|
||||
import_optional_dependency("pyarrow")
|
||||
from pyarrow import feather
|
||||
|
||||
if not isinstance(df, DataFrame):
|
||||
raise ValueError("feather only support IO with DataFrames")
|
||||
|
||||
with get_handle(
|
||||
path, "wb", storage_options=storage_options, is_text=False
|
||||
) as handles:
|
||||
feather.write_feather(df, handles.handle, **kwargs)
|
||||
|
||||
|
||||
@doc(storage_options=_shared_docs["storage_options"])
|
||||
def read_feather(
|
||||
path: FilePath | ReadBuffer[bytes],
|
||||
columns: Sequence[Hashable] | None = None,
|
||||
use_threads: bool = True,
|
||||
storage_options: StorageOptions | None = None,
|
||||
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
|
||||
) -> DataFrame:
|
||||
"""
|
||||
Load a feather-format object from the file path.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str, path object, or file-like object
|
||||
String, path object (implementing ``os.PathLike[str]``), or file-like
|
||||
object implementing a binary ``read()`` function. The string could be a URL.
|
||||
Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
|
||||
expected. A local file could be: ``file://localhost/path/to/table.feather``.
|
||||
columns : sequence, default None
|
||||
If not provided, all columns are read.
|
||||
use_threads : bool, default True
|
||||
Whether to parallelize reading using multiple threads.
|
||||
{storage_options}
|
||||
|
||||
dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
|
||||
Back-end data type applied to the resultant :class:`DataFrame`
|
||||
(still experimental). Behaviour is as follows:
|
||||
|
||||
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
|
||||
(default).
|
||||
* ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
|
||||
DataFrame.
|
||||
|
||||
.. versionadded:: 2.0
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
The object stored in the file, read back as a DataFrame.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = pd.read_feather("path/to/file.feather") # doctest: +SKIP
|
||||
"""
|
||||
import_optional_dependency("pyarrow")
|
||||
from pyarrow import feather
|
||||
|
||||
# import utils to register the pyarrow extension types
|
||||
import pandas.core.arrays.arrow.extension_types # pyright: ignore[reportUnusedImport] # noqa: F401
|
||||
|
||||
check_dtype_backend(dtype_backend)
|
||||
|
||||
with get_handle(
|
||||
path, "rb", storage_options=storage_options, is_text=False
|
||||
) as handles:
|
||||
if dtype_backend is lib.no_default and not using_string_dtype():
|
||||
return feather.read_feather(
|
||||
handles.handle, columns=columns, use_threads=bool(use_threads)
|
||||
)
|
||||
|
||||
pa_table = feather.read_table(
|
||||
handles.handle, columns=columns, use_threads=bool(use_threads)
|
||||
)
|
||||
return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)
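# Typical round trip through the public API that wraps these functions
# (path is illustrative; pyarrow must be installed):
# >>> df = pd.DataFrame({"a": [1, 2, 3]})
# >>> df.to_feather("example.feather")
# >>> pd.read_feather("example.feather")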
|
||||
9
lib/python3.11/site-packages/pandas/io/formats/__init__.py
Normal file
@ -0,0 +1,9 @@
|
||||
# ruff: noqa: TCH004
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
# import modules that have public classes/functions
|
||||
from pandas.io.formats import style
|
||||
|
||||
# and mark only those modules as public
|
||||
__all__ = ["style"]
|
||||
157
lib/python3.11/site-packages/pandas/io/formats/_color_data.py
Normal file
@ -0,0 +1,157 @@
# GH37967: Enable the use of CSS named colors, as defined in
# matplotlib.colors.CSS4_COLORS, when exporting to Excel.
# This data has been copied here, instead of being imported from matplotlib,
# not to have ``to_excel`` methods require matplotlib.
# source: matplotlib._color_data (3.3.3)
from __future__ import annotations

CSS4_COLORS = {
    "aliceblue": "F0F8FF",
    "antiquewhite": "FAEBD7",
    "aqua": "00FFFF",
    "aquamarine": "7FFFD4",
    "azure": "F0FFFF",
    "beige": "F5F5DC",
    "bisque": "FFE4C4",
    "black": "000000",
    "blanchedalmond": "FFEBCD",
    "blue": "0000FF",
    "blueviolet": "8A2BE2",
    "brown": "A52A2A",
    "burlywood": "DEB887",
    "cadetblue": "5F9EA0",
    "chartreuse": "7FFF00",
    "chocolate": "D2691E",
    "coral": "FF7F50",
    "cornflowerblue": "6495ED",
    "cornsilk": "FFF8DC",
    "crimson": "DC143C",
    "cyan": "00FFFF",
    "darkblue": "00008B",
    "darkcyan": "008B8B",
    "darkgoldenrod": "B8860B",
    "darkgray": "A9A9A9",
    "darkgreen": "006400",
    "darkgrey": "A9A9A9",
    "darkkhaki": "BDB76B",
    "darkmagenta": "8B008B",
    "darkolivegreen": "556B2F",
    "darkorange": "FF8C00",
    "darkorchid": "9932CC",
    "darkred": "8B0000",
    "darksalmon": "E9967A",
    "darkseagreen": "8FBC8F",
    "darkslateblue": "483D8B",
    "darkslategray": "2F4F4F",
    "darkslategrey": "2F4F4F",
    "darkturquoise": "00CED1",
    "darkviolet": "9400D3",
    "deeppink": "FF1493",
    "deepskyblue": "00BFFF",
    "dimgray": "696969",
    "dimgrey": "696969",
    "dodgerblue": "1E90FF",
    "firebrick": "B22222",
    "floralwhite": "FFFAF0",
    "forestgreen": "228B22",
    "fuchsia": "FF00FF",
    "gainsboro": "DCDCDC",
    "ghostwhite": "F8F8FF",
    "gold": "FFD700",
    "goldenrod": "DAA520",
    "gray": "808080",
    "green": "008000",
    "greenyellow": "ADFF2F",
    "grey": "808080",
    "honeydew": "F0FFF0",
    "hotpink": "FF69B4",
    "indianred": "CD5C5C",
    "indigo": "4B0082",
    "ivory": "FFFFF0",
    "khaki": "F0E68C",
    "lavender": "E6E6FA",
    "lavenderblush": "FFF0F5",
    "lawngreen": "7CFC00",
    "lemonchiffon": "FFFACD",
    "lightblue": "ADD8E6",
    "lightcoral": "F08080",
    "lightcyan": "E0FFFF",
    "lightgoldenrodyellow": "FAFAD2",
    "lightgray": "D3D3D3",
    "lightgreen": "90EE90",
    "lightgrey": "D3D3D3",
    "lightpink": "FFB6C1",
    "lightsalmon": "FFA07A",
    "lightseagreen": "20B2AA",
    "lightskyblue": "87CEFA",
    "lightslategray": "778899",
    "lightslategrey": "778899",
    "lightsteelblue": "B0C4DE",
    "lightyellow": "FFFFE0",
    "lime": "00FF00",
    "limegreen": "32CD32",
    "linen": "FAF0E6",
    "magenta": "FF00FF",
    "maroon": "800000",
    "mediumaquamarine": "66CDAA",
    "mediumblue": "0000CD",
    "mediumorchid": "BA55D3",
    "mediumpurple": "9370DB",
    "mediumseagreen": "3CB371",
    "mediumslateblue": "7B68EE",
    "mediumspringgreen": "00FA9A",
    "mediumturquoise": "48D1CC",
    "mediumvioletred": "C71585",
    "midnightblue": "191970",
    "mintcream": "F5FFFA",
    "mistyrose": "FFE4E1",
    "moccasin": "FFE4B5",
    "navajowhite": "FFDEAD",
    "navy": "000080",
    "oldlace": "FDF5E6",
    "olive": "808000",
    "olivedrab": "6B8E23",
    "orange": "FFA500",
    "orangered": "FF4500",
    "orchid": "DA70D6",
    "palegoldenrod": "EEE8AA",
    "palegreen": "98FB98",
    "paleturquoise": "AFEEEE",
    "palevioletred": "DB7093",
    "papayawhip": "FFEFD5",
    "peachpuff": "FFDAB9",
    "peru": "CD853F",
    "pink": "FFC0CB",
    "plum": "DDA0DD",
    "powderblue": "B0E0E6",
    "purple": "800080",
    "rebeccapurple": "663399",
    "red": "FF0000",
    "rosybrown": "BC8F8F",
    "royalblue": "4169E1",
    "saddlebrown": "8B4513",
    "salmon": "FA8072",
    "sandybrown": "F4A460",
    "seagreen": "2E8B57",
    "seashell": "FFF5EE",
    "sienna": "A0522D",
    "silver": "C0C0C0",
    "skyblue": "87CEEB",
    "slateblue": "6A5ACD",
    "slategray": "708090",
    "slategrey": "708090",
    "snow": "FFFAFA",
    "springgreen": "00FF7F",
    "steelblue": "4682B4",
    "tan": "D2B48C",
    "teal": "008080",
    "thistle": "D8BFD8",
    "tomato": "FF6347",
    "turquoise": "40E0D0",
    "violet": "EE82EE",
    "wheat": "F5DEB3",
    "white": "FFFFFF",
    "whitesmoke": "F5F5F5",
    "yellow": "FFFF00",
    "yellowgreen": "9ACD32",
}
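A small sketch (not part of the committed file) showing how the CSS4_COLORS table above is keyed: lowercase CSS color names map to bare RRGGBB hex strings without a leading '#', which is the form the Excel converter later in this commit expects.

from pandas.io.formats._color_data import CSS4_COLORS

# Values are six-character RRGGBB strings, e.g. "663399" for rebeccapurple.
assert CSS4_COLORS["rebeccapurple"] == "663399"
assert all(len(code) == 6 for code in CSS4_COLORS.values())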
94
lib/python3.11/site-packages/pandas/io/formats/console.py
Normal file
@ -0,0 +1,94 @@
"""
Internal module for console introspection
"""
from __future__ import annotations

from shutil import get_terminal_size


def get_console_size() -> tuple[int | None, int | None]:
    """
    Return console size as tuple = (width, height).

    Returns (None,None) in non-interactive session.
    """
    from pandas import get_option

    display_width = get_option("display.width")
    display_height = get_option("display.max_rows")

    # Consider
    # interactive shell terminal, can detect term size
    # interactive non-shell terminal (ipnb/ipqtconsole), cannot detect term
    # size non-interactive script, should disregard term size

    # in addition
    # width,height have default values, but setting to 'None' signals
    # should use Auto-Detection, But only in interactive shell-terminal.
    # Simple. yeah.

    if in_interactive_session():
        if in_ipython_frontend():
            # sane defaults for interactive non-shell terminal
            # match default for width,height in config_init
            from pandas._config.config import get_default_val

            terminal_width = get_default_val("display.width")
            terminal_height = get_default_val("display.max_rows")
        else:
            # pure terminal
            terminal_width, terminal_height = get_terminal_size()
    else:
        terminal_width, terminal_height = None, None

    # Note if the User sets width/Height to None (auto-detection)
    # and we're in a script (non-inter), this will return (None,None)
    # caller needs to deal.
    return display_width or terminal_width, display_height or terminal_height


# ----------------------------------------------------------------------
# Detect our environment


def in_interactive_session() -> bool:
    """
    Check if we're running in an interactive shell.

    Returns
    -------
    bool
        True if running under python/ipython interactive shell.
    """
    from pandas import get_option

    def check_main():
        try:
            import __main__ as main
        except ModuleNotFoundError:
            return get_option("mode.sim_interactive")
        return not hasattr(main, "__file__") or get_option("mode.sim_interactive")

    try:
        # error: Name '__IPYTHON__' is not defined
        return __IPYTHON__ or check_main()  # type: ignore[name-defined]
    except NameError:
        return check_main()


def in_ipython_frontend() -> bool:
    """
    Check if we're inside an IPython zmq frontend.

    Returns
    -------
    bool
    """
    try:
        # error: Name 'get_ipython' is not defined
        ip = get_ipython()  # type: ignore[name-defined]
        return "zmq" in str(type(ip)).lower()
    except NameError:
        pass

    return False
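A brief sketch (not part of the committed file) of how a caller might use get_console_size above; the 80x24 fallback is only an illustration, not a pandas default.

from pandas.io.formats.console import get_console_size, in_interactive_session

width, height = get_console_size()
if width is None:
    # Per the note in get_console_size: with display.width/display.max_rows set
    # to None in a non-interactive session this returns (None, None), so the
    # caller has to pick its own fallback.
    width, height = 80, 24
print(in_interactive_session(), width, height)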
421
lib/python3.11/site-packages/pandas/io/formats/css.py
Normal file
@ -0,0 +1,421 @@
"""
Utilities for interpreting CSS from Stylers for formatting non-HTML outputs.
"""
from __future__ import annotations

import re
from typing import (
    TYPE_CHECKING,
    Callable,
)
import warnings

from pandas.errors import CSSWarning
from pandas.util._exceptions import find_stack_level

if TYPE_CHECKING:
    from collections.abc import (
        Generator,
        Iterable,
        Iterator,
    )


def _side_expander(prop_fmt: str) -> Callable:
    """
    Wrapper to expand shorthand property into top, right, bottom, left properties

    Parameters
    ----------
    side : str
        The border side to expand into properties

    Returns
    -------
    function: Return to call when a 'border(-{side}): {value}' string is encountered
    """

    def expand(self, prop, value: str) -> Generator[tuple[str, str], None, None]:
        """
        Expand shorthand property into side-specific property (top, right, bottom, left)

        Parameters
        ----------
        prop (str): CSS property name
        value (str): String token for property

        Yields
        ------
        Tuple (str, str): Expanded property, value
        """
        tokens = value.split()
        try:
            mapping = self.SIDE_SHORTHANDS[len(tokens)]
        except KeyError:
            warnings.warn(
                f'Could not expand "{prop}: {value}"',
                CSSWarning,
                stacklevel=find_stack_level(),
            )
            return
        for key, idx in zip(self.SIDES, mapping):
            yield prop_fmt.format(key), tokens[idx]

    return expand


def _border_expander(side: str = "") -> Callable:
    """
    Wrapper to expand 'border' property into border color, style, and width properties

    Parameters
    ----------
    side : str
        The border side to expand into properties

    Returns
    -------
    function: Return to call when a 'border(-{side}): {value}' string is encountered
    """
    if side != "":
        side = f"-{side}"

    def expand(self, prop, value: str) -> Generator[tuple[str, str], None, None]:
        """
        Expand border into color, style, and width tuples

        Parameters
        ----------
        prop : str
            CSS property name passed to styler
        value : str
            Value passed to styler for property

        Yields
        ------
        Tuple (str, str): Expanded property, value
        """
        tokens = value.split()
        if len(tokens) == 0 or len(tokens) > 3:
            warnings.warn(
                f'Too many tokens provided to "{prop}" (expected 1-3)',
                CSSWarning,
                stacklevel=find_stack_level(),
            )

        # TODO: Can we use current color as initial value to comply with CSS standards?
        border_declarations = {
            f"border{side}-color": "black",
            f"border{side}-style": "none",
            f"border{side}-width": "medium",
        }
        for token in tokens:
            if token.lower() in self.BORDER_STYLES:
                border_declarations[f"border{side}-style"] = token
            elif any(ratio in token.lower() for ratio in self.BORDER_WIDTH_RATIOS):
                border_declarations[f"border{side}-width"] = token
            else:
                border_declarations[f"border{side}-color"] = token
            # TODO: Warn user if item entered more than once (e.g. "border: red green")

        # Per CSS, "border" will reset previous "border-*" definitions
        yield from self.atomize(border_declarations.items())

    return expand


class CSSResolver:
    """
    A callable for parsing and resolving CSS to atomic properties.
    """

    UNIT_RATIOS = {
        "pt": ("pt", 1),
        "em": ("em", 1),
        "rem": ("pt", 12),
        "ex": ("em", 0.5),
        # 'ch':
        "px": ("pt", 0.75),
        "pc": ("pt", 12),
        "in": ("pt", 72),
        "cm": ("in", 1 / 2.54),
        "mm": ("in", 1 / 25.4),
        "q": ("mm", 0.25),
        "!!default": ("em", 0),
    }

    FONT_SIZE_RATIOS = UNIT_RATIOS.copy()
    FONT_SIZE_RATIOS.update(
        {
            "%": ("em", 0.01),
            "xx-small": ("rem", 0.5),
            "x-small": ("rem", 0.625),
            "small": ("rem", 0.8),
            "medium": ("rem", 1),
            "large": ("rem", 1.125),
            "x-large": ("rem", 1.5),
            "xx-large": ("rem", 2),
            "smaller": ("em", 1 / 1.2),
            "larger": ("em", 1.2),
            "!!default": ("em", 1),
        }
    )

    MARGIN_RATIOS = UNIT_RATIOS.copy()
    MARGIN_RATIOS.update({"none": ("pt", 0)})

    BORDER_WIDTH_RATIOS = UNIT_RATIOS.copy()
    BORDER_WIDTH_RATIOS.update(
        {
            "none": ("pt", 0),
            "thick": ("px", 4),
            "medium": ("px", 2),
            "thin": ("px", 1),
            # Default: medium only if solid
        }
    )

    BORDER_STYLES = [
        "none",
        "hidden",
        "dotted",
        "dashed",
        "solid",
        "double",
        "groove",
        "ridge",
        "inset",
        "outset",
        "mediumdashdot",
        "dashdotdot",
        "hair",
        "mediumdashdotdot",
        "dashdot",
        "slantdashdot",
        "mediumdashed",
    ]

    SIDE_SHORTHANDS = {
        1: [0, 0, 0, 0],
        2: [0, 1, 0, 1],
        3: [0, 1, 2, 1],
        4: [0, 1, 2, 3],
    }

    SIDES = ("top", "right", "bottom", "left")

    CSS_EXPANSIONS = {
        **{
            (f"border-{prop}" if prop else "border"): _border_expander(prop)
            for prop in ["", "top", "right", "bottom", "left"]
        },
        **{
            f"border-{prop}": _side_expander(f"border-{{:s}}-{prop}")
            for prop in ["color", "style", "width"]
        },
        "margin": _side_expander("margin-{:s}"),
        "padding": _side_expander("padding-{:s}"),
    }

    def __call__(
        self,
        declarations: str | Iterable[tuple[str, str]],
        inherited: dict[str, str] | None = None,
    ) -> dict[str, str]:
        """
        The given declarations to atomic properties.

        Parameters
        ----------
        declarations_str : str | Iterable[tuple[str, str]]
            A CSS string or set of CSS declaration tuples
            e.g. "font-weight: bold; background: blue" or
            {("font-weight", "bold"), ("background", "blue")}
        inherited : dict, optional
            Atomic properties indicating the inherited style context in which
            declarations_str is to be resolved. ``inherited`` should already
            be resolved, i.e. valid output of this method.

        Returns
        -------
        dict
            Atomic CSS 2.2 properties.

        Examples
        --------
        >>> resolve = CSSResolver()
        >>> inherited = {'font-family': 'serif', 'font-weight': 'bold'}
        >>> out = resolve('''
        ...               border-color: BLUE RED;
        ...               font-size: 1em;
        ...               font-size: 2em;
        ...               font-weight: normal;
        ...               font-weight: inherit;
        ...               ''', inherited)
        >>> sorted(out.items())  # doctest: +NORMALIZE_WHITESPACE
        [('border-bottom-color', 'blue'),
         ('border-left-color', 'red'),
         ('border-right-color', 'red'),
         ('border-top-color', 'blue'),
         ('font-family', 'serif'),
         ('font-size', '24pt'),
         ('font-weight', 'bold')]
        """
        if isinstance(declarations, str):
            declarations = self.parse(declarations)
        props = dict(self.atomize(declarations))
        if inherited is None:
            inherited = {}

        props = self._update_initial(props, inherited)
        props = self._update_font_size(props, inherited)
        return self._update_other_units(props)

    def _update_initial(
        self,
        props: dict[str, str],
        inherited: dict[str, str],
    ) -> dict[str, str]:
        # 1. resolve inherited, initial
        for prop, val in inherited.items():
            if prop not in props:
                props[prop] = val

        new_props = props.copy()
        for prop, val in props.items():
            if val == "inherit":
                val = inherited.get(prop, "initial")

            if val in ("initial", None):
                # we do not define a complete initial stylesheet
                del new_props[prop]
            else:
                new_props[prop] = val
        return new_props

    def _update_font_size(
        self,
        props: dict[str, str],
        inherited: dict[str, str],
    ) -> dict[str, str]:
        # 2. resolve relative font size
        if props.get("font-size"):
            props["font-size"] = self.size_to_pt(
                props["font-size"],
                self._get_font_size(inherited),
                conversions=self.FONT_SIZE_RATIOS,
            )
        return props

    def _get_font_size(self, props: dict[str, str]) -> float | None:
        if props.get("font-size"):
            font_size_string = props["font-size"]
            return self._get_float_font_size_from_pt(font_size_string)
        return None

    def _get_float_font_size_from_pt(self, font_size_string: str) -> float:
        assert font_size_string.endswith("pt")
        return float(font_size_string.rstrip("pt"))

    def _update_other_units(self, props: dict[str, str]) -> dict[str, str]:
        font_size = self._get_font_size(props)
        # 3. TODO: resolve other font-relative units
        for side in self.SIDES:
            prop = f"border-{side}-width"
            if prop in props:
                props[prop] = self.size_to_pt(
                    props[prop],
                    em_pt=font_size,
                    conversions=self.BORDER_WIDTH_RATIOS,
                )

            for prop in [f"margin-{side}", f"padding-{side}"]:
                if prop in props:
                    # TODO: support %
                    props[prop] = self.size_to_pt(
                        props[prop],
                        em_pt=font_size,
                        conversions=self.MARGIN_RATIOS,
                    )
        return props

    def size_to_pt(self, in_val, em_pt=None, conversions=UNIT_RATIOS) -> str:
        def _error():
            warnings.warn(
                f"Unhandled size: {repr(in_val)}",
                CSSWarning,
                stacklevel=find_stack_level(),
            )
            return self.size_to_pt("1!!default", conversions=conversions)

        match = re.match(r"^(\S*?)([a-zA-Z%!].*)", in_val)
        if match is None:
            return _error()

        val, unit = match.groups()
        if val == "":
            # hack for 'large' etc.
            val = 1
        else:
            try:
                val = float(val)
            except ValueError:
                return _error()

        while unit != "pt":
            if unit == "em":
                if em_pt is None:
                    unit = "rem"
                else:
                    val *= em_pt
                    unit = "pt"
                continue

            try:
                unit, mul = conversions[unit]
            except KeyError:
                return _error()
            val *= mul

        val = round(val, 5)
        if int(val) == val:
            size_fmt = f"{int(val):d}pt"
        else:
            size_fmt = f"{val:f}pt"
        return size_fmt

    def atomize(self, declarations: Iterable) -> Generator[tuple[str, str], None, None]:
        for prop, value in declarations:
            prop = prop.lower()
            value = value.lower()
            if prop in self.CSS_EXPANSIONS:
                expand = self.CSS_EXPANSIONS[prop]
                yield from expand(self, prop, value)
            else:
                yield prop, value

    def parse(self, declarations_str: str) -> Iterator[tuple[str, str]]:
        """
        Generates (prop, value) pairs from declarations.

        In a future version may generate parsed tokens from tinycss/tinycss2

        Parameters
        ----------
        declarations_str : str
        """
        for decl in declarations_str.split(";"):
            if not decl.strip():
                continue
            prop, sep, val = decl.partition(":")
            prop = prop.strip().lower()
            # TODO: don't lowercase case sensitive parts of values (strings)
            val = val.strip().lower()
            if sep:
                yield prop, val
            else:
                warnings.warn(
                    f"Ill-formatted attribute: expected a colon in {repr(decl)}",
                    CSSWarning,
                    stacklevel=find_stack_level(),
                )
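A short sketch (not part of the committed file) exercising the CSSResolver defined above: shorthand properties are expanded per side, and pixel units are converted to points (1px == 0.75pt in UNIT_RATIOS).

from pandas.io.formats.css import CSSResolver

resolve = CSSResolver()
props = resolve("margin: 4px; border-top: 2px solid red")

assert props["margin-top"] == "3pt"  # 4px * 0.75
assert props["border-top-style"] == "solid"
assert props["border-top-color"] == "red"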
330
lib/python3.11/site-packages/pandas/io/formats/csvs.py
Normal file
@ -0,0 +1,330 @@
|
||||
"""
|
||||
Module for formatting output data into CSV files.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import (
|
||||
Hashable,
|
||||
Iterable,
|
||||
Iterator,
|
||||
Sequence,
|
||||
)
|
||||
import csv as csvlib
|
||||
import os
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
cast,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import writers as libwriters
|
||||
from pandas._typing import SequenceNotStr
|
||||
from pandas.util._decorators import cache_readonly
|
||||
|
||||
from pandas.core.dtypes.generic import (
|
||||
ABCDatetimeIndex,
|
||||
ABCIndex,
|
||||
ABCMultiIndex,
|
||||
ABCPeriodIndex,
|
||||
)
|
||||
from pandas.core.dtypes.missing import notna
|
||||
|
||||
from pandas.core.indexes.api import Index
|
||||
|
||||
from pandas.io.common import get_handle
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
CompressionOptions,
|
||||
FilePath,
|
||||
FloatFormatType,
|
||||
IndexLabel,
|
||||
StorageOptions,
|
||||
WriteBuffer,
|
||||
npt,
|
||||
)
|
||||
|
||||
from pandas.io.formats.format import DataFrameFormatter
|
||||
|
||||
|
||||
_DEFAULT_CHUNKSIZE_CELLS = 100_000
|
||||
|
||||
|
||||
class CSVFormatter:
|
||||
cols: npt.NDArray[np.object_]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
formatter: DataFrameFormatter,
|
||||
path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] = "",
|
||||
sep: str = ",",
|
||||
cols: Sequence[Hashable] | None = None,
|
||||
index_label: IndexLabel | None = None,
|
||||
mode: str = "w",
|
||||
encoding: str | None = None,
|
||||
errors: str = "strict",
|
||||
compression: CompressionOptions = "infer",
|
||||
quoting: int | None = None,
|
||||
lineterminator: str | None = "\n",
|
||||
chunksize: int | None = None,
|
||||
quotechar: str | None = '"',
|
||||
date_format: str | None = None,
|
||||
doublequote: bool = True,
|
||||
escapechar: str | None = None,
|
||||
storage_options: StorageOptions | None = None,
|
||||
) -> None:
|
||||
self.fmt = formatter
|
||||
|
||||
self.obj = self.fmt.frame
|
||||
|
||||
self.filepath_or_buffer = path_or_buf
|
||||
self.encoding = encoding
|
||||
self.compression: CompressionOptions = compression
|
||||
self.mode = mode
|
||||
self.storage_options = storage_options
|
||||
|
||||
self.sep = sep
|
||||
self.index_label = self._initialize_index_label(index_label)
|
||||
self.errors = errors
|
||||
self.quoting = quoting or csvlib.QUOTE_MINIMAL
|
||||
self.quotechar = self._initialize_quotechar(quotechar)
|
||||
self.doublequote = doublequote
|
||||
self.escapechar = escapechar
|
||||
self.lineterminator = lineterminator or os.linesep
|
||||
self.date_format = date_format
|
||||
self.cols = self._initialize_columns(cols)
|
||||
self.chunksize = self._initialize_chunksize(chunksize)
|
||||
|
||||
@property
|
||||
def na_rep(self) -> str:
|
||||
return self.fmt.na_rep
|
||||
|
||||
@property
|
||||
def float_format(self) -> FloatFormatType | None:
|
||||
return self.fmt.float_format
|
||||
|
||||
@property
|
||||
def decimal(self) -> str:
|
||||
return self.fmt.decimal
|
||||
|
||||
@property
|
||||
def header(self) -> bool | SequenceNotStr[str]:
|
||||
return self.fmt.header
|
||||
|
||||
@property
|
||||
def index(self) -> bool:
|
||||
return self.fmt.index
|
||||
|
||||
def _initialize_index_label(self, index_label: IndexLabel | None) -> IndexLabel:
|
||||
if index_label is not False:
|
||||
if index_label is None:
|
||||
return self._get_index_label_from_obj()
|
||||
elif not isinstance(index_label, (list, tuple, np.ndarray, ABCIndex)):
|
||||
# given a string for a DF with Index
|
||||
return [index_label]
|
||||
return index_label
|
||||
|
||||
def _get_index_label_from_obj(self) -> Sequence[Hashable]:
|
||||
if isinstance(self.obj.index, ABCMultiIndex):
|
||||
return self._get_index_label_multiindex()
|
||||
else:
|
||||
return self._get_index_label_flat()
|
||||
|
||||
def _get_index_label_multiindex(self) -> Sequence[Hashable]:
|
||||
return [name or "" for name in self.obj.index.names]
|
||||
|
||||
def _get_index_label_flat(self) -> Sequence[Hashable]:
|
||||
index_label = self.obj.index.name
|
||||
return [""] if index_label is None else [index_label]
|
||||
|
||||
def _initialize_quotechar(self, quotechar: str | None) -> str | None:
|
||||
if self.quoting != csvlib.QUOTE_NONE:
|
||||
# prevents crash in _csv
|
||||
return quotechar
|
||||
return None
|
||||
|
||||
@property
|
||||
def has_mi_columns(self) -> bool:
|
||||
return bool(isinstance(self.obj.columns, ABCMultiIndex))
|
||||
|
||||
def _initialize_columns(
|
||||
self, cols: Iterable[Hashable] | None
|
||||
) -> npt.NDArray[np.object_]:
|
||||
# validate mi options
|
||||
if self.has_mi_columns:
|
||||
if cols is not None:
|
||||
msg = "cannot specify cols with a MultiIndex on the columns"
|
||||
raise TypeError(msg)
|
||||
|
||||
if cols is not None:
|
||||
if isinstance(cols, ABCIndex):
|
||||
cols = cols._get_values_for_csv(**self._number_format)
|
||||
else:
|
||||
cols = list(cols)
|
||||
self.obj = self.obj.loc[:, cols]
|
||||
|
||||
# update columns to include possible multiplicity of dupes
|
||||
# and make sure cols is just a list of labels
|
||||
new_cols = self.obj.columns
|
||||
return new_cols._get_values_for_csv(**self._number_format)
|
||||
|
||||
def _initialize_chunksize(self, chunksize: int | None) -> int:
|
||||
if chunksize is None:
|
||||
return (_DEFAULT_CHUNKSIZE_CELLS // (len(self.cols) or 1)) or 1
|
||||
return int(chunksize)
|
||||
|
||||
@property
|
||||
def _number_format(self) -> dict[str, Any]:
|
||||
"""Dictionary used for storing number formatting settings."""
|
||||
return {
|
||||
"na_rep": self.na_rep,
|
||||
"float_format": self.float_format,
|
||||
"date_format": self.date_format,
|
||||
"quoting": self.quoting,
|
||||
"decimal": self.decimal,
|
||||
}
|
||||
|
||||
@cache_readonly
|
||||
def data_index(self) -> Index:
|
||||
data_index = self.obj.index
|
||||
if (
|
||||
isinstance(data_index, (ABCDatetimeIndex, ABCPeriodIndex))
|
||||
and self.date_format is not None
|
||||
):
|
||||
data_index = Index(
|
||||
[x.strftime(self.date_format) if notna(x) else "" for x in data_index]
|
||||
)
|
||||
elif isinstance(data_index, ABCMultiIndex):
|
||||
data_index = data_index.remove_unused_levels()
|
||||
return data_index
|
||||
|
||||
@property
|
||||
def nlevels(self) -> int:
|
||||
if self.index:
|
||||
return getattr(self.data_index, "nlevels", 1)
|
||||
else:
|
||||
return 0
|
||||
|
||||
@property
|
||||
def _has_aliases(self) -> bool:
|
||||
return isinstance(self.header, (tuple, list, np.ndarray, ABCIndex))
|
||||
|
||||
@property
|
||||
def _need_to_save_header(self) -> bool:
|
||||
return bool(self._has_aliases or self.header)
|
||||
|
||||
@property
|
||||
def write_cols(self) -> SequenceNotStr[Hashable]:
|
||||
if self._has_aliases:
|
||||
assert not isinstance(self.header, bool)
|
||||
if len(self.header) != len(self.cols):
|
||||
raise ValueError(
|
||||
f"Writing {len(self.cols)} cols but got {len(self.header)} aliases"
|
||||
)
|
||||
return self.header
|
||||
else:
|
||||
# self.cols is an ndarray derived from Index._get_values_for_csv,
|
||||
# so its entries are strings, i.e. hashable
|
||||
return cast(SequenceNotStr[Hashable], self.cols)
|
||||
|
||||
@property
|
||||
def encoded_labels(self) -> list[Hashable]:
|
||||
encoded_labels: list[Hashable] = []
|
||||
|
||||
if self.index and self.index_label:
|
||||
assert isinstance(self.index_label, Sequence)
|
||||
encoded_labels = list(self.index_label)
|
||||
|
||||
if not self.has_mi_columns or self._has_aliases:
|
||||
encoded_labels += list(self.write_cols)
|
||||
|
||||
return encoded_labels
|
||||
|
||||
def save(self) -> None:
|
||||
"""
|
||||
Create the writer & save.
|
||||
"""
|
||||
# apply compression and byte/text conversion
|
||||
with get_handle(
|
||||
self.filepath_or_buffer,
|
||||
self.mode,
|
||||
encoding=self.encoding,
|
||||
errors=self.errors,
|
||||
compression=self.compression,
|
||||
storage_options=self.storage_options,
|
||||
) as handles:
|
||||
# Note: self.encoding is irrelevant here
|
||||
self.writer = csvlib.writer(
|
||||
handles.handle,
|
||||
lineterminator=self.lineterminator,
|
||||
delimiter=self.sep,
|
||||
quoting=self.quoting,
|
||||
doublequote=self.doublequote,
|
||||
escapechar=self.escapechar,
|
||||
quotechar=self.quotechar,
|
||||
)
|
||||
|
||||
self._save()
|
||||
|
||||
def _save(self) -> None:
|
||||
if self._need_to_save_header:
|
||||
self._save_header()
|
||||
self._save_body()
|
||||
|
||||
def _save_header(self) -> None:
|
||||
if not self.has_mi_columns or self._has_aliases:
|
||||
self.writer.writerow(self.encoded_labels)
|
||||
else:
|
||||
for row in self._generate_multiindex_header_rows():
|
||||
self.writer.writerow(row)
|
||||
|
||||
def _generate_multiindex_header_rows(self) -> Iterator[list[Hashable]]:
|
||||
columns = self.obj.columns
|
||||
for i in range(columns.nlevels):
|
||||
# we need at least 1 index column to write our col names
|
||||
col_line = []
|
||||
if self.index:
|
||||
# name is the first column
|
||||
col_line.append(columns.names[i])
|
||||
|
||||
if isinstance(self.index_label, list) and len(self.index_label) > 1:
|
||||
col_line.extend([""] * (len(self.index_label) - 1))
|
||||
|
||||
col_line.extend(columns._get_level_values(i))
|
||||
yield col_line
|
||||
|
||||
# Write out the index line if it's not empty.
|
||||
# Otherwise, we will print out an extraneous
|
||||
# blank line between the mi and the data rows.
|
||||
if self.encoded_labels and set(self.encoded_labels) != {""}:
|
||||
yield self.encoded_labels + [""] * len(columns)
|
||||
|
||||
def _save_body(self) -> None:
|
||||
nrows = len(self.data_index)
|
||||
chunks = (nrows // self.chunksize) + 1
|
||||
for i in range(chunks):
|
||||
start_i = i * self.chunksize
|
||||
end_i = min(start_i + self.chunksize, nrows)
|
||||
if start_i >= end_i:
|
||||
break
|
||||
self._save_chunk(start_i, end_i)
|
||||
|
||||
def _save_chunk(self, start_i: int, end_i: int) -> None:
|
||||
# create the data for a chunk
|
||||
slicer = slice(start_i, end_i)
|
||||
df = self.obj.iloc[slicer]
|
||||
|
||||
res = df._get_values_for_csv(**self._number_format)
|
||||
data = list(res._iter_column_arrays())
|
||||
|
||||
ix = self.data_index[slicer]._get_values_for_csv(**self._number_format)
|
||||
libwriters.write_csv_rows(
|
||||
data,
|
||||
ix,
|
||||
self.nlevels,
|
||||
self.cols,
|
||||
self.writer,
|
||||
)
|
||||
962
lib/python3.11/site-packages/pandas/io/formats/excel.py
Normal file
@ -0,0 +1,962 @@
|
||||
"""
|
||||
Utilities for conversion to writer-agnostic Excel representation.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import (
|
||||
Hashable,
|
||||
Iterable,
|
||||
Mapping,
|
||||
Sequence,
|
||||
)
|
||||
import functools
|
||||
import itertools
|
||||
import re
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Callable,
|
||||
cast,
|
||||
)
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs.lib import is_list_like
|
||||
from pandas.util._decorators import doc
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
from pandas.core.dtypes import missing
|
||||
from pandas.core.dtypes.common import (
|
||||
is_float,
|
||||
is_scalar,
|
||||
)
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
PeriodIndex,
|
||||
)
|
||||
import pandas.core.common as com
|
||||
from pandas.core.shared_docs import _shared_docs
|
||||
|
||||
from pandas.io.formats._color_data import CSS4_COLORS
|
||||
from pandas.io.formats.css import (
|
||||
CSSResolver,
|
||||
CSSWarning,
|
||||
)
|
||||
from pandas.io.formats.format import get_level_lengths
|
||||
from pandas.io.formats.printing import pprint_thing
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
FilePath,
|
||||
IndexLabel,
|
||||
StorageOptions,
|
||||
WriteExcelBuffer,
|
||||
)
|
||||
|
||||
from pandas import ExcelWriter
|
||||
|
||||
|
||||
class ExcelCell:
|
||||
__fields__ = ("row", "col", "val", "style", "mergestart", "mergeend")
|
||||
__slots__ = __fields__
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
row: int,
|
||||
col: int,
|
||||
val,
|
||||
style=None,
|
||||
mergestart: int | None = None,
|
||||
mergeend: int | None = None,
|
||||
) -> None:
|
||||
self.row = row
|
||||
self.col = col
|
||||
self.val = val
|
||||
self.style = style
|
||||
self.mergestart = mergestart
|
||||
self.mergeend = mergeend
|
||||
|
||||
|
||||
class CssExcelCell(ExcelCell):
|
||||
def __init__(
|
||||
self,
|
||||
row: int,
|
||||
col: int,
|
||||
val,
|
||||
style: dict | None,
|
||||
css_styles: dict[tuple[int, int], list[tuple[str, Any]]] | None,
|
||||
css_row: int,
|
||||
css_col: int,
|
||||
css_converter: Callable | None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
if css_styles and css_converter:
|
||||
# Use dict to get only one (case-insensitive) declaration per property
|
||||
declaration_dict = {
|
||||
prop.lower(): val for prop, val in css_styles[css_row, css_col]
|
||||
}
|
||||
# Convert to frozenset for order-invariant caching
|
||||
unique_declarations = frozenset(declaration_dict.items())
|
||||
style = css_converter(unique_declarations)
|
||||
|
||||
super().__init__(row=row, col=col, val=val, style=style, **kwargs)
|
||||
|
||||
|
||||
class CSSToExcelConverter:
|
||||
"""
|
||||
A callable for converting CSS declarations to ExcelWriter styles
|
||||
|
||||
Supports parts of CSS 2.2, with minimal CSS 3.0 support (e.g. text-shadow),
|
||||
focusing on font styling, backgrounds, borders and alignment.
|
||||
|
||||
Operates by first computing CSS styles in a fairly generic
|
||||
way (see :meth:`compute_css`) then determining Excel style
|
||||
properties from CSS properties (see :meth:`build_xlstyle`).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
inherited : str, optional
|
||||
CSS declarations understood to be the containing scope for the
|
||||
CSS processed by :meth:`__call__`.
|
||||
"""
|
||||
|
||||
NAMED_COLORS = CSS4_COLORS
|
||||
|
||||
VERTICAL_MAP = {
|
||||
"top": "top",
|
||||
"text-top": "top",
|
||||
"middle": "center",
|
||||
"baseline": "bottom",
|
||||
"bottom": "bottom",
|
||||
"text-bottom": "bottom",
|
||||
# OpenXML also has 'justify', 'distributed'
|
||||
}
|
||||
|
||||
BOLD_MAP = {
|
||||
"bold": True,
|
||||
"bolder": True,
|
||||
"600": True,
|
||||
"700": True,
|
||||
"800": True,
|
||||
"900": True,
|
||||
"normal": False,
|
||||
"lighter": False,
|
||||
"100": False,
|
||||
"200": False,
|
||||
"300": False,
|
||||
"400": False,
|
||||
"500": False,
|
||||
}
|
||||
|
||||
ITALIC_MAP = {
|
||||
"normal": False,
|
||||
"italic": True,
|
||||
"oblique": True,
|
||||
}
|
||||
|
||||
FAMILY_MAP = {
|
||||
"serif": 1, # roman
|
||||
"sans-serif": 2, # swiss
|
||||
"cursive": 4, # script
|
||||
"fantasy": 5, # decorative
|
||||
}
|
||||
|
||||
BORDER_STYLE_MAP = {
|
||||
style.lower(): style
|
||||
for style in [
|
||||
"dashed",
|
||||
"mediumDashDot",
|
||||
"dashDotDot",
|
||||
"hair",
|
||||
"dotted",
|
||||
"mediumDashDotDot",
|
||||
"double",
|
||||
"dashDot",
|
||||
"slantDashDot",
|
||||
"mediumDashed",
|
||||
]
|
||||
}
|
||||
|
||||
# NB: Most of the methods here could be classmethods, as only __init__
|
||||
# and __call__ make use of instance attributes. We leave them as
|
||||
# instancemethods so that users can easily experiment with extensions
|
||||
# without monkey-patching.
|
||||
inherited: dict[str, str] | None
|
||||
|
||||
def __init__(self, inherited: str | None = None) -> None:
|
||||
if inherited is not None:
|
||||
self.inherited = self.compute_css(inherited)
|
||||
else:
|
||||
self.inherited = None
|
||||
# We should avoid cache on the __call__ method.
|
||||
# Otherwise once the method __call__ has been called
|
||||
# garbage collection no longer deletes the instance.
|
||||
self._call_cached = functools.cache(self._call_uncached)
|
||||
|
||||
compute_css = CSSResolver()
|
||||
|
||||
def __call__(
|
||||
self, declarations: str | frozenset[tuple[str, str]]
|
||||
) -> dict[str, dict[str, str]]:
|
||||
"""
|
||||
Convert CSS declarations to ExcelWriter style.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
declarations : str | frozenset[tuple[str, str]]
|
||||
CSS string or set of CSS declaration tuples.
|
||||
e.g. "font-weight: bold; background: blue" or
|
||||
{("font-weight", "bold"), ("background", "blue")}
|
||||
|
||||
Returns
|
||||
-------
|
||||
xlstyle : dict
|
||||
A style as interpreted by ExcelWriter when found in
|
||||
ExcelCell.style.
|
||||
"""
|
||||
return self._call_cached(declarations)
|
||||
|
||||
def _call_uncached(
|
||||
self, declarations: str | frozenset[tuple[str, str]]
|
||||
) -> dict[str, dict[str, str]]:
|
||||
properties = self.compute_css(declarations, self.inherited)
|
||||
return self.build_xlstyle(properties)
|
||||
|
||||
def build_xlstyle(self, props: Mapping[str, str]) -> dict[str, dict[str, str]]:
|
||||
out = {
|
||||
"alignment": self.build_alignment(props),
|
||||
"border": self.build_border(props),
|
||||
"fill": self.build_fill(props),
|
||||
"font": self.build_font(props),
|
||||
"number_format": self.build_number_format(props),
|
||||
}
|
||||
|
||||
# TODO: handle cell width and height: needs support in pandas.io.excel
|
||||
|
||||
def remove_none(d: dict[str, str | None]) -> None:
|
||||
"""Remove key where value is None, through nested dicts"""
|
||||
for k, v in list(d.items()):
|
||||
if v is None:
|
||||
del d[k]
|
||||
elif isinstance(v, dict):
|
||||
remove_none(v)
|
||||
if not v:
|
||||
del d[k]
|
||||
|
||||
remove_none(out)
|
||||
return out
|
||||
|
||||
def build_alignment(self, props: Mapping[str, str]) -> dict[str, bool | str | None]:
|
||||
# TODO: text-indent, padding-left -> alignment.indent
|
||||
return {
|
||||
"horizontal": props.get("text-align"),
|
||||
"vertical": self._get_vertical_alignment(props),
|
||||
"wrap_text": self._get_is_wrap_text(props),
|
||||
}
|
||||
|
||||
def _get_vertical_alignment(self, props: Mapping[str, str]) -> str | None:
|
||||
vertical_align = props.get("vertical-align")
|
||||
if vertical_align:
|
||||
return self.VERTICAL_MAP.get(vertical_align)
|
||||
return None
|
||||
|
||||
def _get_is_wrap_text(self, props: Mapping[str, str]) -> bool | None:
|
||||
if props.get("white-space") is None:
|
||||
return None
|
||||
return bool(props["white-space"] not in ("nowrap", "pre", "pre-line"))
|
||||
|
||||
def build_border(
|
||||
self, props: Mapping[str, str]
|
||||
) -> dict[str, dict[str, str | None]]:
|
||||
return {
|
||||
side: {
|
||||
"style": self._border_style(
|
||||
props.get(f"border-{side}-style"),
|
||||
props.get(f"border-{side}-width"),
|
||||
self.color_to_excel(props.get(f"border-{side}-color")),
|
||||
),
|
||||
"color": self.color_to_excel(props.get(f"border-{side}-color")),
|
||||
}
|
||||
for side in ["top", "right", "bottom", "left"]
|
||||
}
|
||||
|
||||
def _border_style(self, style: str | None, width: str | None, color: str | None):
|
||||
# convert styles and widths to openxml, one of:
|
||||
# 'dashDot'
|
||||
# 'dashDotDot'
|
||||
# 'dashed'
|
||||
# 'dotted'
|
||||
# 'double'
|
||||
# 'hair'
|
||||
# 'medium'
|
||||
# 'mediumDashDot'
|
||||
# 'mediumDashDotDot'
|
||||
# 'mediumDashed'
|
||||
# 'slantDashDot'
|
||||
# 'thick'
|
||||
# 'thin'
|
||||
if width is None and style is None and color is None:
|
||||
# Return None will remove "border" from style dictionary
|
||||
return None
|
||||
|
||||
if width is None and style is None:
|
||||
# Return "none" will keep "border" in style dictionary
|
||||
return "none"
|
||||
|
||||
if style in ("none", "hidden"):
|
||||
return "none"
|
||||
|
||||
width_name = self._get_width_name(width)
|
||||
if width_name is None:
|
||||
return "none"
|
||||
|
||||
if style in (None, "groove", "ridge", "inset", "outset", "solid"):
|
||||
# not handled
|
||||
return width_name
|
||||
|
||||
if style == "double":
|
||||
return "double"
|
||||
if style == "dotted":
|
||||
if width_name in ("hair", "thin"):
|
||||
return "dotted"
|
||||
return "mediumDashDotDot"
|
||||
if style == "dashed":
|
||||
if width_name in ("hair", "thin"):
|
||||
return "dashed"
|
||||
return "mediumDashed"
|
||||
elif style in self.BORDER_STYLE_MAP:
|
||||
# Excel-specific styles
|
||||
return self.BORDER_STYLE_MAP[style]
|
||||
else:
|
||||
warnings.warn(
|
||||
f"Unhandled border style format: {repr(style)}",
|
||||
CSSWarning,
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
return "none"
|
||||
|
||||
def _get_width_name(self, width_input: str | None) -> str | None:
|
||||
width = self._width_to_float(width_input)
|
||||
if width < 1e-5:
|
||||
return None
|
||||
elif width < 1.3:
|
||||
return "thin"
|
||||
elif width < 2.8:
|
||||
return "medium"
|
||||
return "thick"
|
||||
|
||||
def _width_to_float(self, width: str | None) -> float:
|
||||
if width is None:
|
||||
width = "2pt"
|
||||
return self._pt_to_float(width)
|
||||
|
||||
def _pt_to_float(self, pt_string: str) -> float:
|
||||
assert pt_string.endswith("pt")
|
||||
return float(pt_string.rstrip("pt"))
|
||||
|
||||
def build_fill(self, props: Mapping[str, str]):
|
||||
# TODO: perhaps allow for special properties
|
||||
# -excel-pattern-bgcolor and -excel-pattern-type
|
||||
fill_color = props.get("background-color")
|
||||
if fill_color not in (None, "transparent", "none"):
|
||||
return {"fgColor": self.color_to_excel(fill_color), "patternType": "solid"}
|
||||
|
||||
def build_number_format(self, props: Mapping[str, str]) -> dict[str, str | None]:
|
||||
fc = props.get("number-format")
|
||||
fc = fc.replace("§", ";") if isinstance(fc, str) else fc
|
||||
return {"format_code": fc}
|
||||
|
||||
def build_font(
|
||||
self, props: Mapping[str, str]
|
||||
) -> dict[str, bool | float | str | None]:
|
||||
font_names = self._get_font_names(props)
|
||||
decoration = self._get_decoration(props)
|
||||
return {
|
||||
"name": font_names[0] if font_names else None,
|
||||
"family": self._select_font_family(font_names),
|
||||
"size": self._get_font_size(props),
|
||||
"bold": self._get_is_bold(props),
|
||||
"italic": self._get_is_italic(props),
|
||||
"underline": ("single" if "underline" in decoration else None),
|
||||
"strike": ("line-through" in decoration) or None,
|
||||
"color": self.color_to_excel(props.get("color")),
|
||||
# shadow if nonzero digit before shadow color
|
||||
"shadow": self._get_shadow(props),
|
||||
}
|
||||
|
||||
def _get_is_bold(self, props: Mapping[str, str]) -> bool | None:
|
||||
weight = props.get("font-weight")
|
||||
if weight:
|
||||
return self.BOLD_MAP.get(weight)
|
||||
return None
|
||||
|
||||
def _get_is_italic(self, props: Mapping[str, str]) -> bool | None:
|
||||
font_style = props.get("font-style")
|
||||
if font_style:
|
||||
return self.ITALIC_MAP.get(font_style)
|
||||
return None
|
||||
|
||||
def _get_decoration(self, props: Mapping[str, str]) -> Sequence[str]:
|
||||
decoration = props.get("text-decoration")
|
||||
if decoration is not None:
|
||||
return decoration.split()
|
||||
else:
|
||||
return ()
|
||||
|
||||
def _get_underline(self, decoration: Sequence[str]) -> str | None:
|
||||
if "underline" in decoration:
|
||||
return "single"
|
||||
return None
|
||||
|
||||
def _get_shadow(self, props: Mapping[str, str]) -> bool | None:
|
||||
if "text-shadow" in props:
|
||||
return bool(re.search("^[^#(]*[1-9]", props["text-shadow"]))
|
||||
return None
|
||||
|
||||
def _get_font_names(self, props: Mapping[str, str]) -> Sequence[str]:
|
||||
font_names_tmp = re.findall(
|
||||
r"""(?x)
|
||||
(
|
||||
"(?:[^"]|\\")+"
|
||||
|
|
||||
'(?:[^']|\\')+'
|
||||
|
|
||||
[^'",]+
|
||||
)(?=,|\s*$)
|
||||
""",
|
||||
props.get("font-family", ""),
|
||||
)
|
||||
|
||||
font_names = []
|
||||
for name in font_names_tmp:
|
||||
if name[:1] == '"':
|
||||
name = name[1:-1].replace('\\"', '"')
|
||||
elif name[:1] == "'":
|
||||
name = name[1:-1].replace("\\'", "'")
|
||||
else:
|
||||
name = name.strip()
|
||||
if name:
|
||||
font_names.append(name)
|
||||
return font_names
|
||||
|
||||
def _get_font_size(self, props: Mapping[str, str]) -> float | None:
|
||||
size = props.get("font-size")
|
||||
if size is None:
|
||||
return size
|
||||
return self._pt_to_float(size)
|
||||
|
||||
def _select_font_family(self, font_names: Sequence[str]) -> int | None:
|
||||
family = None
|
||||
for name in font_names:
|
||||
family = self.FAMILY_MAP.get(name)
|
||||
if family:
|
||||
break
|
||||
|
||||
return family
|
||||
|
||||
def color_to_excel(self, val: str | None) -> str | None:
|
||||
if val is None:
|
||||
return None
|
||||
|
||||
if self._is_hex_color(val):
|
||||
return self._convert_hex_to_excel(val)
|
||||
|
||||
try:
|
||||
return self.NAMED_COLORS[val]
|
||||
except KeyError:
|
||||
warnings.warn(
|
||||
f"Unhandled color format: {repr(val)}",
|
||||
CSSWarning,
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
return None
|
||||
|
||||
def _is_hex_color(self, color_string: str) -> bool:
|
||||
return bool(color_string.startswith("#"))
|
||||
|
||||
def _convert_hex_to_excel(self, color_string: str) -> str:
|
||||
code = color_string.lstrip("#")
|
||||
if self._is_shorthand_color(color_string):
|
||||
return (code[0] * 2 + code[1] * 2 + code[2] * 2).upper()
|
||||
else:
|
||||
return code.upper()
|
||||
|
||||
def _is_shorthand_color(self, color_string: str) -> bool:
|
||||
"""Check if color code is shorthand.
|
||||
|
||||
#FFF is a shorthand as opposed to full #FFFFFF.
|
||||
"""
|
||||
code = color_string.lstrip("#")
|
||||
if len(code) == 3:
|
||||
return True
|
||||
elif len(code) == 6:
|
||||
return False
|
||||
else:
|
||||
raise ValueError(f"Unexpected color {color_string}")
|
||||
|
||||
|
||||
class ExcelFormatter:
|
||||
"""
|
||||
Class for formatting a DataFrame to a list of ExcelCells,
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : DataFrame or Styler
|
||||
na_rep: na representation
|
||||
float_format : str, default None
|
||||
Format string for floating point numbers
|
||||
cols : sequence, optional
|
||||
Columns to write
|
||||
header : bool or sequence of str, default True
|
||||
Write out column names. If a list of string is given it is
|
||||
assumed to be aliases for the column names
|
||||
index : bool, default True
|
||||
output row names (index)
|
||||
index_label : str or sequence, default None
|
||||
Column label for index column(s) if desired. If None is given, and
|
||||
`header` and `index` are True, then the index names are used. A
|
||||
sequence should be given if the DataFrame uses MultiIndex.
|
||||
merge_cells : bool, default False
|
||||
Format MultiIndex and Hierarchical Rows as merged cells.
|
||||
inf_rep : str, default `'inf'`
|
||||
representation for np.inf values (which aren't representable in Excel)
|
||||
A `'-'` sign will be added in front of -inf.
|
||||
style_converter : callable, optional
|
||||
This translates Styler styles (CSS) into ExcelWriter styles.
|
||||
Defaults to ``CSSToExcelConverter()``.
|
||||
It should have signature css_declarations string -> excel style.
|
||||
This is only called for body cells.
|
||||
"""
|
||||
|
||||
max_rows = 2**20
|
||||
max_cols = 2**14
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
df,
|
||||
na_rep: str = "",
|
||||
float_format: str | None = None,
|
||||
cols: Sequence[Hashable] | None = None,
|
||||
header: Sequence[Hashable] | bool = True,
|
||||
index: bool = True,
|
||||
index_label: IndexLabel | None = None,
|
||||
merge_cells: bool = False,
|
||||
inf_rep: str = "inf",
|
||||
style_converter: Callable | None = None,
|
||||
) -> None:
|
||||
self.rowcounter = 0
|
||||
self.na_rep = na_rep
|
||||
if not isinstance(df, DataFrame):
|
||||
self.styler = df
|
||||
self.styler._compute() # calculate applied styles
|
||||
df = df.data
|
||||
if style_converter is None:
|
||||
style_converter = CSSToExcelConverter()
|
||||
self.style_converter: Callable | None = style_converter
|
||||
else:
|
||||
self.styler = None
|
||||
self.style_converter = None
|
||||
self.df = df
|
||||
if cols is not None:
|
||||
# all missing, raise
|
||||
if not len(Index(cols).intersection(df.columns)):
|
||||
raise KeyError("passes columns are not ALL present dataframe")
|
||||
|
||||
if len(Index(cols).intersection(df.columns)) != len(set(cols)):
|
||||
# Deprecated in GH#17295, enforced in 1.0.0
|
||||
raise KeyError("Not all names specified in 'columns' are found")
|
||||
|
||||
self.df = df.reindex(columns=cols)
|
||||
|
||||
self.columns = self.df.columns
|
||||
self.float_format = float_format
|
||||
self.index = index
|
||||
self.index_label = index_label
|
||||
self.header = header
|
||||
self.merge_cells = merge_cells
|
||||
self.inf_rep = inf_rep
|
||||
|
||||
@property
|
||||
def header_style(self) -> dict[str, dict[str, str | bool]]:
|
||||
return {
|
||||
"font": {"bold": True},
|
||||
"borders": {
|
||||
"top": "thin",
|
||||
"right": "thin",
|
||||
"bottom": "thin",
|
||||
"left": "thin",
|
||||
},
|
||||
"alignment": {"horizontal": "center", "vertical": "top"},
|
||||
}
|
||||
|
||||
def _format_value(self, val):
|
||||
if is_scalar(val) and missing.isna(val):
|
||||
val = self.na_rep
|
||||
elif is_float(val):
|
||||
if missing.isposinf_scalar(val):
|
||||
val = self.inf_rep
|
||||
elif missing.isneginf_scalar(val):
|
||||
val = f"-{self.inf_rep}"
|
||||
elif self.float_format is not None:
|
||||
val = float(self.float_format % val)
|
||||
if getattr(val, "tzinfo", None) is not None:
|
||||
raise ValueError(
|
||||
"Excel does not support datetimes with "
|
||||
"timezones. Please ensure that datetimes "
|
||||
"are timezone unaware before writing to Excel."
|
||||
)
|
||||
return val
|
||||
|
||||
def _format_header_mi(self) -> Iterable[ExcelCell]:
|
||||
if self.columns.nlevels > 1:
|
||||
if not self.index:
|
||||
raise NotImplementedError(
|
||||
"Writing to Excel with MultiIndex columns and no "
|
||||
"index ('index'=False) is not yet implemented."
|
||||
)
|
||||
|
||||
if not (self._has_aliases or self.header):
|
||||
return
|
||||
|
||||
columns = self.columns
|
||||
level_strs = columns._format_multi(
|
||||
sparsify=self.merge_cells, include_names=False
|
||||
)
|
||||
level_lengths = get_level_lengths(level_strs)
|
||||
coloffset = 0
|
||||
lnum = 0
|
||||
|
||||
if self.index and isinstance(self.df.index, MultiIndex):
|
||||
coloffset = len(self.df.index[0]) - 1
|
||||
|
||||
if self.merge_cells:
|
||||
# Format multi-index as a merged cells.
|
||||
for lnum, name in enumerate(columns.names):
|
||||
yield ExcelCell(
|
||||
row=lnum,
|
||||
col=coloffset,
|
||||
val=name,
|
||||
style=self.header_style,
|
||||
)
|
||||
|
||||
for lnum, (spans, levels, level_codes) in enumerate(
|
||||
zip(level_lengths, columns.levels, columns.codes)
|
||||
):
|
||||
values = levels.take(level_codes)
|
||||
for i, span_val in spans.items():
|
||||
mergestart, mergeend = None, None
|
||||
if span_val > 1:
|
||||
mergestart, mergeend = lnum, coloffset + i + span_val
|
||||
yield CssExcelCell(
|
||||
row=lnum,
|
||||
col=coloffset + i + 1,
|
||||
val=values[i],
|
||||
style=self.header_style,
|
||||
css_styles=getattr(self.styler, "ctx_columns", None),
|
||||
css_row=lnum,
|
||||
css_col=i,
|
||||
css_converter=self.style_converter,
|
||||
mergestart=mergestart,
|
||||
mergeend=mergeend,
|
||||
)
|
||||
else:
|
||||
# Format in legacy format with dots to indicate levels.
|
||||
for i, values in enumerate(zip(*level_strs)):
|
||||
v = ".".join(map(pprint_thing, values))
|
||||
yield CssExcelCell(
|
||||
row=lnum,
|
||||
col=coloffset + i + 1,
|
||||
val=v,
|
||||
style=self.header_style,
|
||||
css_styles=getattr(self.styler, "ctx_columns", None),
|
||||
css_row=lnum,
|
||||
css_col=i,
|
||||
css_converter=self.style_converter,
|
||||
)
|
||||
|
||||
self.rowcounter = lnum
|
||||
|
||||
def _format_header_regular(self) -> Iterable[ExcelCell]:
|
||||
if self._has_aliases or self.header:
|
||||
coloffset = 0
|
||||
|
||||
if self.index:
|
||||
coloffset = 1
|
||||
if isinstance(self.df.index, MultiIndex):
|
||||
coloffset = len(self.df.index.names)
|
||||
|
||||
colnames = self.columns
|
||||
if self._has_aliases:
|
||||
self.header = cast(Sequence, self.header)
|
||||
if len(self.header) != len(self.columns):
|
||||
raise ValueError(
|
||||
f"Writing {len(self.columns)} cols "
|
||||
f"but got {len(self.header)} aliases"
|
||||
)
|
||||
colnames = self.header
|
||||
|
||||
for colindex, colname in enumerate(colnames):
|
||||
yield CssExcelCell(
|
||||
row=self.rowcounter,
|
||||
col=colindex + coloffset,
|
||||
val=colname,
|
||||
style=self.header_style,
|
||||
css_styles=getattr(self.styler, "ctx_columns", None),
|
||||
css_row=0,
|
||||
css_col=colindex,
|
||||
css_converter=self.style_converter,
|
||||
)
|
||||
|
||||
def _format_header(self) -> Iterable[ExcelCell]:
|
||||
gen: Iterable[ExcelCell]
|
||||
|
||||
if isinstance(self.columns, MultiIndex):
|
||||
gen = self._format_header_mi()
|
||||
else:
|
||||
gen = self._format_header_regular()
|
||||
|
||||
gen2: Iterable[ExcelCell] = ()
|
||||
|
||||
if self.df.index.names:
|
||||
row = [x if x is not None else "" for x in self.df.index.names] + [
|
||||
""
|
||||
] * len(self.columns)
|
||||
if functools.reduce(lambda x, y: x and y, (x != "" for x in row)):
|
||||
gen2 = (
|
||||
ExcelCell(self.rowcounter, colindex, val, self.header_style)
|
||||
for colindex, val in enumerate(row)
|
||||
)
|
||||
self.rowcounter += 1
|
||||
return itertools.chain(gen, gen2)
|
||||
|
||||
def _format_body(self) -> Iterable[ExcelCell]:
|
||||
if isinstance(self.df.index, MultiIndex):
|
||||
return self._format_hierarchical_rows()
|
||||
else:
|
||||
return self._format_regular_rows()
|
||||
|
||||
def _format_regular_rows(self) -> Iterable[ExcelCell]:
|
||||
if self._has_aliases or self.header:
|
||||
self.rowcounter += 1
|
||||
|
||||
# output index and index_label?
|
||||
if self.index:
|
||||
# check aliases
|
||||
# if list only take first as this is not a MultiIndex
|
||||
if self.index_label and isinstance(
|
||||
self.index_label, (list, tuple, np.ndarray, Index)
|
||||
):
|
||||
index_label = self.index_label[0]
|
||||
# if string good to go
|
||||
elif self.index_label and isinstance(self.index_label, str):
|
||||
index_label = self.index_label
|
||||
else:
|
||||
index_label = self.df.index.names[0]
|
||||
|
||||
if isinstance(self.columns, MultiIndex):
|
||||
self.rowcounter += 1
|
||||
|
||||
if index_label and self.header is not False:
|
||||
yield ExcelCell(self.rowcounter - 1, 0, index_label, self.header_style)
|
||||
|
||||
# write index_values
|
||||
index_values = self.df.index
|
||||
if isinstance(self.df.index, PeriodIndex):
|
||||
index_values = self.df.index.to_timestamp()
|
||||
|
||||
for idx, idxval in enumerate(index_values):
|
||||
yield CssExcelCell(
|
||||
row=self.rowcounter + idx,
|
||||
col=0,
|
||||
val=idxval,
|
||||
style=self.header_style,
|
||||
css_styles=getattr(self.styler, "ctx_index", None),
|
||||
css_row=idx,
|
||||
css_col=0,
|
||||
css_converter=self.style_converter,
|
||||
)
|
||||
coloffset = 1
|
||||
else:
|
||||
coloffset = 0
|
||||
|
||||
yield from self._generate_body(coloffset)
|
||||
|
||||
def _format_hierarchical_rows(self) -> Iterable[ExcelCell]:
|
||||
if self._has_aliases or self.header:
|
||||
self.rowcounter += 1
|
||||
|
||||
gcolidx = 0
|
||||
|
||||
if self.index:
|
||||
index_labels = self.df.index.names
|
||||
# check for aliases
|
||||
if self.index_label and isinstance(
|
||||
self.index_label, (list, tuple, np.ndarray, Index)
|
||||
):
|
||||
index_labels = self.index_label
|
||||
|
||||
# MultiIndex columns require an extra row
|
||||
# with index names (blank if None) for
|
||||
# unambiguous round-trip, unless not merging,
|
||||
# in which case the names all go on one row Issue #11328
|
||||
if isinstance(self.columns, MultiIndex) and self.merge_cells:
|
||||
self.rowcounter += 1
|
||||
|
||||
# if index labels are not empty go ahead and dump
|
||||
if com.any_not_none(*index_labels) and self.header is not False:
|
||||
for cidx, name in enumerate(index_labels):
|
||||
yield ExcelCell(self.rowcounter - 1, cidx, name, self.header_style)
|
||||
|
||||
if self.merge_cells:
|
||||
# Format hierarchical rows as merged cells.
|
||||
level_strs = self.df.index._format_multi(
|
||||
sparsify=True, include_names=False
|
||||
)
|
||||
level_lengths = get_level_lengths(level_strs)
|
||||
|
||||
for spans, levels, level_codes in zip(
|
||||
level_lengths, self.df.index.levels, self.df.index.codes
|
||||
):
|
||||
values = levels.take(
|
||||
level_codes,
|
||||
allow_fill=levels._can_hold_na,
|
||||
fill_value=levels._na_value,
|
||||
)
|
||||
|
||||
for i, span_val in spans.items():
|
||||
mergestart, mergeend = None, None
|
||||
if span_val > 1:
|
||||
mergestart = self.rowcounter + i + span_val - 1
|
||||
mergeend = gcolidx
|
||||
yield CssExcelCell(
|
||||
row=self.rowcounter + i,
|
||||
col=gcolidx,
|
||||
val=values[i],
|
||||
style=self.header_style,
|
||||
css_styles=getattr(self.styler, "ctx_index", None),
|
||||
css_row=i,
|
||||
css_col=gcolidx,
|
||||
css_converter=self.style_converter,
|
||||
mergestart=mergestart,
|
||||
mergeend=mergeend,
|
||||
)
|
||||
gcolidx += 1
|
||||
|
||||
else:
|
||||
# Format hierarchical rows with non-merged values.
|
||||
for indexcolvals in zip(*self.df.index):
|
||||
for idx, indexcolval in enumerate(indexcolvals):
|
||||
yield CssExcelCell(
|
||||
row=self.rowcounter + idx,
|
||||
col=gcolidx,
|
||||
val=indexcolval,
|
||||
style=self.header_style,
|
||||
css_styles=getattr(self.styler, "ctx_index", None),
|
||||
css_row=idx,
|
||||
css_col=gcolidx,
|
||||
css_converter=self.style_converter,
|
||||
)
|
||||
gcolidx += 1
|
||||
|
||||
yield from self._generate_body(gcolidx)
|
||||
|
||||
@property
|
||||
def _has_aliases(self) -> bool:
|
||||
"""Whether the aliases for column names are present."""
|
||||
return is_list_like(self.header)
|
||||
|
||||
def _generate_body(self, coloffset: int) -> Iterable[ExcelCell]:
|
||||
# Write the body of the frame data series by series.
|
||||
for colidx in range(len(self.columns)):
|
||||
series = self.df.iloc[:, colidx]
|
||||
for i, val in enumerate(series):
|
||||
yield CssExcelCell(
|
||||
row=self.rowcounter + i,
|
||||
col=colidx + coloffset,
|
||||
val=val,
|
||||
style=None,
|
||||
css_styles=getattr(self.styler, "ctx", None),
|
||||
css_row=i,
|
||||
css_col=colidx,
|
||||
css_converter=self.style_converter,
|
||||
)
|
||||
|
||||
def get_formatted_cells(self) -> Iterable[ExcelCell]:
|
||||
for cell in itertools.chain(self._format_header(), self._format_body()):
|
||||
cell.val = self._format_value(cell.val)
|
||||
yield cell
|
||||
|
||||
@doc(storage_options=_shared_docs["storage_options"])
|
||||
def write(
|
||||
self,
|
||||
writer: FilePath | WriteExcelBuffer | ExcelWriter,
|
||||
sheet_name: str = "Sheet1",
|
||||
startrow: int = 0,
|
||||
startcol: int = 0,
|
||||
freeze_panes: tuple[int, int] | None = None,
|
||||
engine: str | None = None,
|
||||
storage_options: StorageOptions | None = None,
|
||||
engine_kwargs: dict | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Parameters
----------
writer : path-like, file-like, or ExcelWriter object
|
||||
File path or existing ExcelWriter
|
||||
sheet_name : str, default 'Sheet1'
|
||||
Name of sheet which will contain DataFrame
|
||||
startrow : int, default 0
|
||||
upper left cell row to dump data frame
|
||||
startcol : int, default 0
|
||||
upper left cell column to dump data frame
|
||||
freeze_panes : tuple of integer (length 2), default None
|
||||
Specifies the one-based bottommost row and rightmost column that
|
||||
are to be frozen
|
||||
engine : str, default None
|
||||
write engine to use if writer is a path - you can also set this
|
||||
via the options ``io.excel.xlsx.writer``,
|
||||
or ``io.excel.xlsm.writer``.
|
||||
|
||||
{storage_options}
|
||||
|
||||
engine_kwargs : dict, optional
|
||||
Arbitrary keyword arguments passed to excel engine.
|
||||
"""
|
||||
from pandas.io.excel import ExcelWriter
|
||||
|
||||
num_rows, num_cols = self.df.shape
|
||||
if num_rows > self.max_rows or num_cols > self.max_cols:
|
||||
raise ValueError(
|
||||
f"This sheet is too large! Your sheet size is: {num_rows}, {num_cols} "
|
||||
f"Max sheet size is: {self.max_rows}, {self.max_cols}"
|
||||
)
|
||||
|
||||
if engine_kwargs is None:
|
||||
engine_kwargs = {}
|
||||
|
||||
formatted_cells = self.get_formatted_cells()
|
||||
if isinstance(writer, ExcelWriter):
|
||||
need_save = False
|
||||
else:
|
||||
writer = ExcelWriter(
|
||||
writer,
|
||||
engine=engine,
|
||||
storage_options=storage_options,
|
||||
engine_kwargs=engine_kwargs,
|
||||
)
|
||||
need_save = True
|
||||
|
||||
try:
|
||||
writer._write_cells(
|
||||
formatted_cells,
|
||||
sheet_name,
|
||||
startrow=startrow,
|
||||
startcol=startcol,
|
||||
freeze_panes=freeze_panes,
|
||||
)
|
||||
finally:
|
||||
# make sure to close opened file handles
|
||||
if need_save:
|
||||
writer.close()
|
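A minimal usage sketch of the writer above (not part of the pandas source; the file names are arbitrary and the openpyxl engine is assumed to be installed). Both DataFrame.to_excel and Styler.to_excel funnel through ExcelFormatter.write.

import pandas as pd

df = pd.DataFrame(
    {"a": [1, 2], "b": [3.0, 4.5]},
    index=pd.Index(["x", "y"], name="rows"),
)

# Plain write; freeze_panes=(1, 1) freezes the header row and the index column.
df.to_excel("example.xlsx", sheet_name="Sheet1", freeze_panes=(1, 1))

# A styled write takes the CssExcelCell path above via Styler.to_excel.
df.style.highlight_max(color="yellow").to_excel("styled.xlsx", engine="openpyxl")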
||||
2058
lib/python3.11/site-packages/pandas/io/formats/format.py
Normal file
File diff suppressed because it is too large
646
lib/python3.11/site-packages/pandas/io/formats/html.py
Normal file
@ -0,0 +1,646 @@
|
||||
"""
|
||||
Module for formatting output data in HTML.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from textwrap import dedent
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Final,
|
||||
cast,
|
||||
)
|
||||
|
||||
from pandas._config import get_option
|
||||
|
||||
from pandas._libs import lib
|
||||
|
||||
from pandas import (
|
||||
MultiIndex,
|
||||
option_context,
|
||||
)
|
||||
|
||||
from pandas.io.common import is_url
|
||||
from pandas.io.formats.format import (
|
||||
DataFrameFormatter,
|
||||
get_level_lengths,
|
||||
)
|
||||
from pandas.io.formats.printing import pprint_thing
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import (
|
||||
Hashable,
|
||||
Iterable,
|
||||
Mapping,
|
||||
)
|
||||
|
||||
|
||||
class HTMLFormatter:
|
||||
"""
|
||||
Internal class for formatting output data in html.
|
||||
This class is intended for shared functionality between
|
||||
DataFrame.to_html() and DataFrame._repr_html_().
|
||||
Any logic in common with other output formatting methods
|
||||
should ideally be inherited from classes in format.py
|
||||
and this class responsible for only producing html markup.
|
||||
"""
|
||||
|
||||
indent_delta: Final = 2
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
formatter: DataFrameFormatter,
|
||||
classes: str | list[str] | tuple[str, ...] | None = None,
|
||||
border: int | bool | None = None,
|
||||
table_id: str | None = None,
|
||||
render_links: bool = False,
|
||||
) -> None:
|
||||
self.fmt = formatter
|
||||
self.classes = classes
|
||||
|
||||
self.frame = self.fmt.frame
|
||||
self.columns = self.fmt.tr_frame.columns
|
||||
self.elements: list[str] = []
|
||||
self.bold_rows = self.fmt.bold_rows
|
||||
self.escape = self.fmt.escape
|
||||
self.show_dimensions = self.fmt.show_dimensions
|
||||
if border is None or border is True:
|
||||
border = cast(int, get_option("display.html.border"))
|
||||
elif not border:
|
||||
border = None
|
||||
|
||||
self.border = border
|
||||
self.table_id = table_id
|
||||
self.render_links = render_links
|
||||
|
||||
self.col_space = {}
|
||||
is_multi_index = isinstance(self.columns, MultiIndex)
|
||||
for column, value in self.fmt.col_space.items():
|
||||
col_space_value = f"{value}px" if isinstance(value, int) else value
|
||||
self.col_space[column] = col_space_value
|
||||
# GH 53885: Handling case where column is index
|
||||
# Flatten the data in the multi index and add in the map
|
||||
if is_multi_index and isinstance(column, tuple):
|
||||
for column_index in column:
|
||||
self.col_space[str(column_index)] = col_space_value
|
||||
|
||||
def to_string(self) -> str:
|
||||
lines = self.render()
|
||||
if any(isinstance(x, str) for x in lines):
|
||||
lines = [str(x) for x in lines]
|
||||
return "\n".join(lines)
|
||||
|
||||
def render(self) -> list[str]:
|
||||
self._write_table()
|
||||
|
||||
if self.should_show_dimensions:
|
||||
by = chr(215) # × # noqa: RUF003
|
||||
self.write(
|
||||
f"<p>{len(self.frame)} rows {by} {len(self.frame.columns)} columns</p>"
|
||||
)
|
||||
|
||||
return self.elements
|
||||
|
||||
@property
|
||||
def should_show_dimensions(self) -> bool:
|
||||
return self.fmt.should_show_dimensions
|
||||
|
||||
@property
|
||||
def show_row_idx_names(self) -> bool:
|
||||
return self.fmt.show_row_idx_names
|
||||
|
||||
@property
|
||||
def show_col_idx_names(self) -> bool:
|
||||
return self.fmt.show_col_idx_names
|
||||
|
||||
@property
|
||||
def row_levels(self) -> int:
|
||||
if self.fmt.index:
|
||||
# showing (row) index
|
||||
return self.frame.index.nlevels
|
||||
elif self.show_col_idx_names:
|
||||
# see gh-22579
|
||||
# Column misalignment also occurs for
|
||||
# a standard index when the columns index is named.
|
||||
# If the row index is not displayed a column of
|
||||
# blank cells need to be included before the DataFrame values.
|
||||
return 1
|
||||
# not showing (row) index
|
||||
return 0
|
||||
|
||||
def _get_columns_formatted_values(self) -> Iterable:
|
||||
return self.columns
|
||||
|
||||
@property
|
||||
def is_truncated(self) -> bool:
|
||||
return self.fmt.is_truncated
|
||||
|
||||
@property
|
||||
def ncols(self) -> int:
|
||||
return len(self.fmt.tr_frame.columns)
|
||||
|
||||
def write(self, s: Any, indent: int = 0) -> None:
|
||||
rs = pprint_thing(s)
|
||||
self.elements.append(" " * indent + rs)
|
||||
|
||||
def write_th(
|
||||
self, s: Any, header: bool = False, indent: int = 0, tags: str | None = None
|
||||
) -> None:
|
||||
"""
|
||||
Method for writing a formatted <th> cell.
|
||||
|
||||
If col_space is set on the formatter then that is used for
|
||||
the value of min-width.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s : object
|
||||
The data to be written inside the cell.
|
||||
header : bool, default False
|
||||
Set to True if the <th> is for use inside <thead>. This will
|
||||
cause min-width to be set if there is one.
|
||||
indent : int, default 0
|
||||
The indentation level of the cell.
|
||||
tags : str, default None
|
||||
Tags to include in the cell.
|
||||
|
||||
Returns
|
||||
-------
|
||||
A written <th> cell.
|
||||
"""
|
||||
col_space = self.col_space.get(s, None)
|
||||
|
||||
if header and col_space is not None:
|
||||
tags = tags or ""
|
||||
tags += f'style="min-width: {col_space};"'
|
||||
|
||||
self._write_cell(s, kind="th", indent=indent, tags=tags)
|
||||
|
||||
def write_td(self, s: Any, indent: int = 0, tags: str | None = None) -> None:
|
||||
self._write_cell(s, kind="td", indent=indent, tags=tags)
|
||||
|
||||
def _write_cell(
|
||||
self, s: Any, kind: str = "td", indent: int = 0, tags: str | None = None
|
||||
) -> None:
|
||||
if tags is not None:
|
||||
start_tag = f"<{kind} {tags}>"
|
||||
else:
|
||||
start_tag = f"<{kind}>"
|
||||
|
||||
if self.escape:
|
||||
# escape & first to prevent double escaping of &
|
||||
esc = {"&": r"&", "<": r"<", ">": r">"}
|
||||
else:
|
||||
esc = {}
|
||||
|
||||
rs = pprint_thing(s, escape_chars=esc).strip()
|
||||
|
||||
if self.render_links and is_url(rs):
|
||||
rs_unescaped = pprint_thing(s, escape_chars={}).strip()
|
||||
start_tag += f'<a href="{rs_unescaped}" target="_blank">'
|
||||
end_a = "</a>"
|
||||
else:
|
||||
end_a = ""
|
||||
|
||||
self.write(f"{start_tag}{rs}{end_a}</{kind}>", indent)
|
||||
|
||||
def write_tr(
|
||||
self,
|
||||
line: Iterable,
|
||||
indent: int = 0,
|
||||
indent_delta: int = 0,
|
||||
header: bool = False,
|
||||
align: str | None = None,
|
||||
tags: dict[int, str] | None = None,
|
||||
nindex_levels: int = 0,
|
||||
) -> None:
|
||||
if tags is None:
|
||||
tags = {}
|
||||
|
||||
if align is None:
|
||||
self.write("<tr>", indent)
|
||||
else:
|
||||
self.write(f'<tr style="text-align: {align};">', indent)
|
||||
indent += indent_delta
|
||||
|
||||
for i, s in enumerate(line):
|
||||
val_tag = tags.get(i, None)
|
||||
if header or (self.bold_rows and i < nindex_levels):
|
||||
self.write_th(s, indent=indent, header=header, tags=val_tag)
|
||||
else:
|
||||
self.write_td(s, indent, tags=val_tag)
|
||||
|
||||
indent -= indent_delta
|
||||
self.write("</tr>", indent)
|
||||
|
||||
def _write_table(self, indent: int = 0) -> None:
|
||||
_classes = ["dataframe"] # Default class.
|
||||
use_mathjax = get_option("display.html.use_mathjax")
|
||||
if not use_mathjax:
|
||||
_classes.append("tex2jax_ignore")
|
||||
if self.classes is not None:
|
||||
if isinstance(self.classes, str):
|
||||
self.classes = self.classes.split()
|
||||
if not isinstance(self.classes, (list, tuple)):
|
||||
raise TypeError(
|
||||
"classes must be a string, list, "
|
||||
f"or tuple, not {type(self.classes)}"
|
||||
)
|
||||
_classes.extend(self.classes)
|
||||
|
||||
if self.table_id is None:
|
||||
id_section = ""
|
||||
else:
|
||||
id_section = f' id="{self.table_id}"'
|
||||
|
||||
if self.border is None:
|
||||
border_attr = ""
|
||||
else:
|
||||
border_attr = f' border="{self.border}"'
|
||||
|
||||
self.write(
|
||||
f'<table{border_attr} class="{" ".join(_classes)}"{id_section}>',
|
||||
indent,
|
||||
)
|
||||
|
||||
if self.fmt.header or self.show_row_idx_names:
|
||||
self._write_header(indent + self.indent_delta)
|
||||
|
||||
self._write_body(indent + self.indent_delta)
|
||||
|
||||
self.write("</table>", indent)
|
||||
|
||||
def _write_col_header(self, indent: int) -> None:
|
||||
row: list[Hashable]
|
||||
is_truncated_horizontally = self.fmt.is_truncated_horizontally
|
||||
if isinstance(self.columns, MultiIndex):
|
||||
template = 'colspan="{span:d}" halign="left"'
|
||||
|
||||
sentinel: lib.NoDefault | bool
|
||||
if self.fmt.sparsify:
|
||||
# GH3547
|
||||
sentinel = lib.no_default
|
||||
else:
|
||||
sentinel = False
|
||||
levels = self.columns._format_multi(sparsify=sentinel, include_names=False)
|
||||
level_lengths = get_level_lengths(levels, sentinel)
|
||||
inner_lvl = len(level_lengths) - 1
|
||||
for lnum, (records, values) in enumerate(zip(level_lengths, levels)):
|
||||
if is_truncated_horizontally:
|
||||
# modify the header lines
|
||||
ins_col = self.fmt.tr_col_num
|
||||
if self.fmt.sparsify:
|
||||
recs_new = {}
|
||||
# Increment tags after ... col.
|
||||
for tag, span in list(records.items()):
|
||||
if tag >= ins_col:
|
||||
recs_new[tag + 1] = span
|
||||
elif tag + span > ins_col:
|
||||
recs_new[tag] = span + 1
|
||||
if lnum == inner_lvl:
|
||||
values = (
|
||||
values[:ins_col] + ("...",) + values[ins_col:]
|
||||
)
|
||||
else:
|
||||
# sparse col headers do not receive a ...
|
||||
values = (
|
||||
values[:ins_col]
|
||||
+ (values[ins_col - 1],)
|
||||
+ values[ins_col:]
|
||||
)
|
||||
else:
|
||||
recs_new[tag] = span
|
||||
# if ins_col lies between tags, all col headers
|
||||
# get ...
|
||||
if tag + span == ins_col:
|
||||
recs_new[ins_col] = 1
|
||||
values = values[:ins_col] + ("...",) + values[ins_col:]
|
||||
records = recs_new
|
||||
inner_lvl = len(level_lengths) - 1
|
||||
if lnum == inner_lvl:
|
||||
records[ins_col] = 1
|
||||
else:
|
||||
recs_new = {}
|
||||
for tag, span in list(records.items()):
|
||||
if tag >= ins_col:
|
||||
recs_new[tag + 1] = span
|
||||
else:
|
||||
recs_new[tag] = span
|
||||
recs_new[ins_col] = 1
|
||||
records = recs_new
|
||||
values = values[:ins_col] + ["..."] + values[ins_col:]
|
||||
|
||||
# see gh-22579
|
||||
# Column Offset Bug with to_html(index=False) with
|
||||
# MultiIndex Columns and Index.
|
||||
# Initially fill row with blank cells before column names.
|
||||
# TODO: Refactor to remove code duplication with code
|
||||
# block below for standard columns index.
|
||||
row = [""] * (self.row_levels - 1)
|
||||
if self.fmt.index or self.show_col_idx_names:
|
||||
# see gh-22747
|
||||
# If to_html(index_names=False) do not show columns
|
||||
# index names.
|
||||
# TODO: Refactor to use _get_column_name_list from
|
||||
# DataFrameFormatter class and create a
|
||||
# _get_formatted_column_labels function for code
|
||||
# parity with DataFrameFormatter class.
|
||||
if self.fmt.show_index_names:
|
||||
name = self.columns.names[lnum]
|
||||
row.append(pprint_thing(name or ""))
|
||||
else:
|
||||
row.append("")
|
||||
|
||||
tags = {}
|
||||
j = len(row)
|
||||
for i, v in enumerate(values):
|
||||
if i in records:
|
||||
if records[i] > 1:
|
||||
tags[j] = template.format(span=records[i])
|
||||
else:
|
||||
continue
|
||||
j += 1
|
||||
row.append(v)
|
||||
self.write_tr(row, indent, self.indent_delta, tags=tags, header=True)
|
||||
else:
|
||||
# see gh-22579
|
||||
# Column misalignment also occurs for
|
||||
# a standard index when the columns index is named.
|
||||
# Initially fill row with blank cells before column names.
|
||||
# TODO: Refactor to remove code duplication with code block
|
||||
# above for columns MultiIndex.
|
||||
row = [""] * (self.row_levels - 1)
|
||||
if self.fmt.index or self.show_col_idx_names:
|
||||
# see gh-22747
|
||||
# If to_html(index_names=False) do not show columns
|
||||
# index names.
|
||||
# TODO: Refactor to use _get_column_name_list from
|
||||
# DataFrameFormatter class.
|
||||
if self.fmt.show_index_names:
|
||||
row.append(self.columns.name or "")
|
||||
else:
|
||||
row.append("")
|
||||
row.extend(self._get_columns_formatted_values())
|
||||
align = self.fmt.justify
|
||||
|
||||
if is_truncated_horizontally:
|
||||
ins_col = self.row_levels + self.fmt.tr_col_num
|
||||
row.insert(ins_col, "...")
|
||||
|
||||
self.write_tr(row, indent, self.indent_delta, header=True, align=align)
|
||||
|
||||
def _write_row_header(self, indent: int) -> None:
|
||||
is_truncated_horizontally = self.fmt.is_truncated_horizontally
|
||||
row = [x if x is not None else "" for x in self.frame.index.names] + [""] * (
|
||||
self.ncols + (1 if is_truncated_horizontally else 0)
|
||||
)
|
||||
self.write_tr(row, indent, self.indent_delta, header=True)
|
||||
|
||||
def _write_header(self, indent: int) -> None:
|
||||
self.write("<thead>", indent)
|
||||
|
||||
if self.fmt.header:
|
||||
self._write_col_header(indent + self.indent_delta)
|
||||
|
||||
if self.show_row_idx_names:
|
||||
self._write_row_header(indent + self.indent_delta)
|
||||
|
||||
self.write("</thead>", indent)
|
||||
|
||||
def _get_formatted_values(self) -> dict[int, list[str]]:
|
||||
with option_context("display.max_colwidth", None):
|
||||
fmt_values = {i: self.fmt.format_col(i) for i in range(self.ncols)}
|
||||
return fmt_values
|
||||
|
||||
def _write_body(self, indent: int) -> None:
|
||||
self.write("<tbody>", indent)
|
||||
fmt_values = self._get_formatted_values()
|
||||
|
||||
# write values
|
||||
if self.fmt.index and isinstance(self.frame.index, MultiIndex):
|
||||
self._write_hierarchical_rows(fmt_values, indent + self.indent_delta)
|
||||
else:
|
||||
self._write_regular_rows(fmt_values, indent + self.indent_delta)
|
||||
|
||||
self.write("</tbody>", indent)
|
||||
|
||||
def _write_regular_rows(
|
||||
self, fmt_values: Mapping[int, list[str]], indent: int
|
||||
) -> None:
|
||||
is_truncated_horizontally = self.fmt.is_truncated_horizontally
|
||||
is_truncated_vertically = self.fmt.is_truncated_vertically
|
||||
|
||||
nrows = len(self.fmt.tr_frame)
|
||||
|
||||
if self.fmt.index:
|
||||
fmt = self.fmt._get_formatter("__index__")
|
||||
if fmt is not None:
|
||||
index_values = self.fmt.tr_frame.index.map(fmt)
|
||||
else:
|
||||
# only reached with non-Multi index
|
||||
index_values = self.fmt.tr_frame.index._format_flat(include_name=False)
|
||||
|
||||
row: list[str] = []
|
||||
for i in range(nrows):
|
||||
if is_truncated_vertically and i == (self.fmt.tr_row_num):
|
||||
str_sep_row = ["..."] * len(row)
|
||||
self.write_tr(
|
||||
str_sep_row,
|
||||
indent,
|
||||
self.indent_delta,
|
||||
tags=None,
|
||||
nindex_levels=self.row_levels,
|
||||
)
|
||||
|
||||
row = []
|
||||
if self.fmt.index:
|
||||
row.append(index_values[i])
|
||||
# see gh-22579
|
||||
# Column misalignment also occurs for
|
||||
# a standard index when the columns index is named.
|
||||
# Add blank cell before data cells.
|
||||
elif self.show_col_idx_names:
|
||||
row.append("")
|
||||
row.extend(fmt_values[j][i] for j in range(self.ncols))
|
||||
|
||||
if is_truncated_horizontally:
|
||||
dot_col_ix = self.fmt.tr_col_num + self.row_levels
|
||||
row.insert(dot_col_ix, "...")
|
||||
self.write_tr(
|
||||
row, indent, self.indent_delta, tags=None, nindex_levels=self.row_levels
|
||||
)
|
||||
|
||||
def _write_hierarchical_rows(
|
||||
self, fmt_values: Mapping[int, list[str]], indent: int
|
||||
) -> None:
|
||||
template = 'rowspan="{span}" valign="top"'
|
||||
|
||||
is_truncated_horizontally = self.fmt.is_truncated_horizontally
|
||||
is_truncated_vertically = self.fmt.is_truncated_vertically
|
||||
frame = self.fmt.tr_frame
|
||||
nrows = len(frame)
|
||||
|
||||
assert isinstance(frame.index, MultiIndex)
|
||||
idx_values = frame.index._format_multi(sparsify=False, include_names=False)
|
||||
idx_values = list(zip(*idx_values))
|
||||
|
||||
if self.fmt.sparsify:
|
||||
# GH3547
|
||||
sentinel = lib.no_default
|
||||
levels = frame.index._format_multi(sparsify=sentinel, include_names=False)
|
||||
|
||||
level_lengths = get_level_lengths(levels, sentinel)
|
||||
inner_lvl = len(level_lengths) - 1
|
||||
if is_truncated_vertically:
|
||||
# Insert ... row and adjust idx_values and
|
||||
# level_lengths to take this into account.
|
||||
ins_row = self.fmt.tr_row_num
|
||||
inserted = False
|
||||
for lnum, records in enumerate(level_lengths):
|
||||
rec_new = {}
|
||||
for tag, span in list(records.items()):
|
||||
if tag >= ins_row:
|
||||
rec_new[tag + 1] = span
|
||||
elif tag + span > ins_row:
|
||||
rec_new[tag] = span + 1
|
||||
|
||||
# GH 14882 - Make sure insertion done once
|
||||
if not inserted:
|
||||
dot_row = list(idx_values[ins_row - 1])
|
||||
dot_row[-1] = "..."
|
||||
idx_values.insert(ins_row, tuple(dot_row))
|
||||
inserted = True
|
||||
else:
|
||||
dot_row = list(idx_values[ins_row])
|
||||
dot_row[inner_lvl - lnum] = "..."
|
||||
idx_values[ins_row] = tuple(dot_row)
|
||||
else:
|
||||
rec_new[tag] = span
|
||||
# If ins_row lies between tags, all index columns
|
||||
# receive ...
|
||||
if tag + span == ins_row:
|
||||
rec_new[ins_row] = 1
|
||||
if lnum == 0:
|
||||
idx_values.insert(
|
||||
ins_row, tuple(["..."] * len(level_lengths))
|
||||
)
|
||||
|
||||
# GH 14882 - Place ... in correct level
|
||||
elif inserted:
|
||||
dot_row = list(idx_values[ins_row])
|
||||
dot_row[inner_lvl - lnum] = "..."
|
||||
idx_values[ins_row] = tuple(dot_row)
|
||||
level_lengths[lnum] = rec_new
|
||||
|
||||
level_lengths[inner_lvl][ins_row] = 1
|
||||
for ix_col in fmt_values:
|
||||
fmt_values[ix_col].insert(ins_row, "...")
|
||||
nrows += 1
|
||||
|
||||
for i in range(nrows):
|
||||
row = []
|
||||
tags = {}
|
||||
|
||||
sparse_offset = 0
|
||||
j = 0
|
||||
for records, v in zip(level_lengths, idx_values[i]):
|
||||
if i in records:
|
||||
if records[i] > 1:
|
||||
tags[j] = template.format(span=records[i])
|
||||
else:
|
||||
sparse_offset += 1
|
||||
continue
|
||||
|
||||
j += 1
|
||||
row.append(v)
|
||||
|
||||
row.extend(fmt_values[j][i] for j in range(self.ncols))
|
||||
if is_truncated_horizontally:
|
||||
row.insert(
|
||||
self.row_levels - sparse_offset + self.fmt.tr_col_num, "..."
|
||||
)
|
||||
self.write_tr(
|
||||
row,
|
||||
indent,
|
||||
self.indent_delta,
|
||||
tags=tags,
|
||||
nindex_levels=len(levels) - sparse_offset,
|
||||
)
|
||||
else:
|
||||
row = []
|
||||
for i in range(len(frame)):
|
||||
if is_truncated_vertically and i == (self.fmt.tr_row_num):
|
||||
str_sep_row = ["..."] * len(row)
|
||||
self.write_tr(
|
||||
str_sep_row,
|
||||
indent,
|
||||
self.indent_delta,
|
||||
tags=None,
|
||||
nindex_levels=self.row_levels,
|
||||
)
|
||||
|
||||
idx_values = list(
|
||||
zip(*frame.index._format_multi(sparsify=False, include_names=False))
|
||||
)
|
||||
row = []
|
||||
row.extend(idx_values[i])
|
||||
row.extend(fmt_values[j][i] for j in range(self.ncols))
|
||||
if is_truncated_horizontally:
|
||||
row.insert(self.row_levels + self.fmt.tr_col_num, "...")
|
||||
self.write_tr(
|
||||
row,
|
||||
indent,
|
||||
self.indent_delta,
|
||||
tags=None,
|
||||
nindex_levels=frame.index.nlevels,
|
||||
)
|
||||
|
||||
|
||||
class NotebookFormatter(HTMLFormatter):
|
||||
"""
|
||||
Internal class for formatting output data in html for display in Jupyter
|
||||
Notebooks. This class is intended for functionality specific to
|
||||
DataFrame._repr_html_() and DataFrame.to_html(notebook=True)
|
||||
"""
|
||||
|
||||
def _get_formatted_values(self) -> dict[int, list[str]]:
|
||||
return {i: self.fmt.format_col(i) for i in range(self.ncols)}
|
||||
|
||||
def _get_columns_formatted_values(self) -> list[str]:
|
||||
# only reached with non-Multi Index
|
||||
return self.columns._format_flat(include_name=False)
|
||||
|
||||
def write_style(self) -> None:
|
||||
# We use the "scoped" attribute here so that the desired
|
||||
# style properties for the data frame are not then applied
|
||||
# throughout the entire notebook.
|
||||
template_first = """\
|
||||
<style scoped>"""
|
||||
template_last = """\
|
||||
</style>"""
|
||||
template_select = """\
|
||||
.dataframe %s {
|
||||
%s: %s;
|
||||
}"""
|
||||
element_props = [
|
||||
("tbody tr th:only-of-type", "vertical-align", "middle"),
|
||||
("tbody tr th", "vertical-align", "top"),
|
||||
]
|
||||
if isinstance(self.columns, MultiIndex):
|
||||
element_props.append(("thead tr th", "text-align", "left"))
|
||||
if self.show_row_idx_names:
|
||||
element_props.append(
|
||||
("thead tr:last-of-type th", "text-align", "right")
|
||||
)
|
||||
else:
|
||||
element_props.append(("thead th", "text-align", "right"))
|
||||
template_mid = "\n\n".join(template_select % t for t in element_props)
|
||||
template = dedent(f"{template_first}\n{template_mid}\n{template_last}")
|
||||
self.write(template)
|
||||
|
||||
def render(self) -> list[str]:
|
||||
self.write("<div>")
|
||||
self.write_style()
|
||||
super().render()
|
||||
self.write("</div>")
|
||||
return self.elements
|
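For orientation, a small sketch (not part of the file) of how these formatters are reached from the public API; the keyword arguments map onto the HTMLFormatter constructor parameters above.

import pandas as pd

df = pd.DataFrame({"a": [1, 2]}, index=pd.Index(["x", "y"], name="idx"))

# DataFrame.to_html is backed by HTMLFormatter.
html = df.to_html(
    classes=["table", "table-striped"],  # -> HTMLFormatter(classes=...)
    table_id="demo",                     # -> table_id
    render_links=True,                   # -> render_links
    border=0,                            # -> border
)

# The rich notebook repr and to_html(notebook=True) use NotebookFormatter.
notebook_html = df.to_html(notebook=True)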
||||
1101
lib/python3.11/site-packages/pandas/io/formats/info.py
Normal file
File diff suppressed because it is too large
572
lib/python3.11/site-packages/pandas/io/formats/printing.py
Normal file
@ -0,0 +1,572 @@
|
||||
"""
|
||||
Printing tools.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import (
|
||||
Iterable,
|
||||
Mapping,
|
||||
Sequence,
|
||||
)
|
||||
import sys
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
TypeVar,
|
||||
Union,
|
||||
)
|
||||
from unicodedata import east_asian_width
|
||||
|
||||
from pandas._config import get_option
|
||||
|
||||
from pandas.core.dtypes.inference import is_sequence
|
||||
|
||||
from pandas.io.formats.console import get_console_size
|
||||
|
||||
EscapeChars = Union[Mapping[str, str], Iterable[str]]
|
||||
_KT = TypeVar("_KT")
|
||||
_VT = TypeVar("_VT")
|
||||
|
||||
|
||||
def adjoin(space: int, *lists: list[str], **kwargs) -> str:
|
||||
"""
|
||||
Glues together two sets of strings using the amount of space requested.
|
||||
The idea is to prettify.
|
||||
|
||||
Parameters
----------
|
||||
space : int
|
||||
number of spaces for padding
|
||||
lists : str
|
||||
lists of str to be joined
|
||||
strlen : callable
|
||||
function used to calculate the length of each str. Needed for unicode
|
||||
handling.
|
||||
justfunc : callable
|
||||
function used to justify str. Needed for unicode handling.
|
||||
"""
|
||||
strlen = kwargs.pop("strlen", len)
|
||||
justfunc = kwargs.pop("justfunc", _adj_justify)
|
||||
|
||||
newLists = []
|
||||
lengths = [max(map(strlen, x)) + space for x in lists[:-1]]
|
||||
# not the last one
|
||||
lengths.append(max(map(len, lists[-1])))
|
||||
maxLen = max(map(len, lists))
|
||||
for i, lst in enumerate(lists):
|
||||
nl = justfunc(lst, lengths[i], mode="left")
|
||||
nl = ([" " * lengths[i]] * (maxLen - len(lst))) + nl
|
||||
newLists.append(nl)
|
||||
toJoin = zip(*newLists)
|
||||
return "\n".join("".join(lines) for lines in toJoin)
|
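# Illustrative sketch (not part of pandas): with space=2, each column except
# the last is padded to the width of its widest entry plus two spaces before
# the rows are glued together. The name _example is only for illustration.
_example = adjoin(2, ["a", "bb"], ["ccc", "d"])
# _example == "a   ccc\nbb  d  "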
||||
|
||||
|
||||
def _adj_justify(texts: Iterable[str], max_len: int, mode: str = "right") -> list[str]:
|
||||
"""
|
||||
Perform ljust, center, rjust against string or list-like
|
||||
"""
|
||||
if mode == "left":
|
||||
return [x.ljust(max_len) for x in texts]
|
||||
elif mode == "center":
|
||||
return [x.center(max_len) for x in texts]
|
||||
else:
|
||||
return [x.rjust(max_len) for x in texts]
|
||||
|
||||
|
||||
# Unicode consolidation
|
||||
# ---------------------
|
||||
#
|
||||
# pprinting utility functions for generating Unicode text or
|
||||
# bytes(3.x)/str(2.x) representations of objects.
|
||||
# Try to use these as much as possible rather than rolling your own.
|
||||
#
|
||||
# When to use
|
||||
# -----------
|
||||
#
|
||||
# 1) If you're writing code internal to pandas (no I/O directly involved),
|
||||
# use pprint_thing().
|
||||
#
|
||||
# It will always return unicode text which can be handled by other
|
||||
# parts of the package without breakage.
|
||||
#
|
||||
# 2) if you need to write something out to file, use
|
||||
# pprint_thing_encoded(encoding).
|
||||
#
|
||||
# If no encoding is specified, it defaults to utf-8. Since encoding pure
|
||||
# ascii with utf-8 is a no-op you can safely use the default utf-8 if you're
|
||||
# working with straight ascii.
|
||||
|
||||
|
||||
def _pprint_seq(
|
||||
seq: Sequence, _nest_lvl: int = 0, max_seq_items: int | None = None, **kwds
|
||||
) -> str:
|
||||
"""
|
||||
internal. pprinter for iterables. you should probably use pprint_thing()
|
||||
rather than calling this directly.
|
||||
|
||||
bounds length of printed sequence, depending on options
|
||||
"""
|
||||
if isinstance(seq, set):
|
||||
fmt = "{{{body}}}"
|
||||
else:
|
||||
fmt = "[{body}]" if hasattr(seq, "__setitem__") else "({body})"
|
||||
|
||||
if max_seq_items is False:
|
||||
nitems = len(seq)
|
||||
else:
|
||||
nitems = max_seq_items or get_option("max_seq_items") or len(seq)
|
||||
|
||||
s = iter(seq)
|
||||
# handle sets, no slicing
|
||||
r = [
|
||||
pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)
|
||||
for i in range(min(nitems, len(seq)))
|
||||
]
|
||||
body = ", ".join(r)
|
||||
|
||||
if nitems < len(seq):
|
||||
body += ", ..."
|
||||
elif isinstance(seq, tuple) and len(seq) == 1:
|
||||
body += ","
|
||||
|
||||
return fmt.format(body=body)
|
||||
|
||||
|
||||
def _pprint_dict(
|
||||
seq: Mapping, _nest_lvl: int = 0, max_seq_items: int | None = None, **kwds
|
||||
) -> str:
|
||||
"""
|
||||
internal. pprinter for iterables. you should probably use pprint_thing()
|
||||
rather than calling this directly.
|
||||
"""
|
||||
fmt = "{{{things}}}"
|
||||
pairs = []
|
||||
|
||||
pfmt = "{key}: {val}"
|
||||
|
||||
if max_seq_items is False:
|
||||
nitems = len(seq)
|
||||
else:
|
||||
nitems = max_seq_items or get_option("max_seq_items") or len(seq)
|
||||
|
||||
for k, v in list(seq.items())[:nitems]:
|
||||
pairs.append(
|
||||
pfmt.format(
|
||||
key=pprint_thing(k, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds),
|
||||
val=pprint_thing(v, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds),
|
||||
)
|
||||
)
|
||||
|
||||
if nitems < len(seq):
|
||||
return fmt.format(things=", ".join(pairs) + ", ...")
|
||||
else:
|
||||
return fmt.format(things=", ".join(pairs))
|
||||
|
||||
|
||||
def pprint_thing(
|
||||
thing: Any,
|
||||
_nest_lvl: int = 0,
|
||||
escape_chars: EscapeChars | None = None,
|
||||
default_escapes: bool = False,
|
||||
quote_strings: bool = False,
|
||||
max_seq_items: int | None = None,
|
||||
) -> str:
|
||||
"""
|
||||
This function is the sanctioned way of converting objects
|
||||
to a string representation and properly handles nested sequences.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
thing : anything to be formatted
|
||||
_nest_lvl : internal use only. pprint_thing() is mutually-recursive
|
||||
with pprint_sequence, this argument is used to keep track of the
|
||||
current nesting level, and limit it.
|
||||
escape_chars : list or dict, optional
|
||||
Characters to escape. If a dict is passed the values are the
|
||||
replacements
|
||||
default_escapes : bool, default False
|
||||
Whether the input escape characters replaces or adds to the defaults
|
||||
max_seq_items : int or None, default None
|
||||
Pass through to other pretty printers to limit sequence printing
|
||||
|
||||
Returns
|
||||
-------
|
||||
str
|
||||
"""
|
||||
|
||||
def as_escaped_string(
|
||||
thing: Any, escape_chars: EscapeChars | None = escape_chars
|
||||
) -> str:
|
||||
translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r"}
|
||||
if isinstance(escape_chars, dict):
|
||||
if default_escapes:
|
||||
translate.update(escape_chars)
|
||||
else:
|
||||
translate = escape_chars
|
||||
escape_chars = list(escape_chars.keys())
|
||||
else:
|
||||
escape_chars = escape_chars or ()
|
||||
|
||||
result = str(thing)
|
||||
for c in escape_chars:
|
||||
result = result.replace(c, translate[c])
|
||||
return result
|
||||
|
||||
if hasattr(thing, "__next__"):
|
||||
return str(thing)
|
||||
elif isinstance(thing, dict) and _nest_lvl < get_option(
|
||||
"display.pprint_nest_depth"
|
||||
):
|
||||
result = _pprint_dict(
|
||||
thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items
|
||||
)
|
||||
elif is_sequence(thing) and _nest_lvl < get_option("display.pprint_nest_depth"):
|
||||
result = _pprint_seq(
|
||||
thing,
|
||||
_nest_lvl,
|
||||
escape_chars=escape_chars,
|
||||
quote_strings=quote_strings,
|
||||
max_seq_items=max_seq_items,
|
||||
)
|
||||
elif isinstance(thing, str) and quote_strings:
|
||||
result = f"'{as_escaped_string(thing)}'"
|
||||
else:
|
||||
result = as_escaped_string(thing)
|
||||
|
||||
return result
|
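# Illustrative sketch (not part of pandas): pprint_thing always returns str,
# escaping requested characters and recursing into containers. The names
# _tab and _nested are only for illustration.
_tab = pprint_thing("a\tb", escape_chars=("\t",))
# _tab == "a\\tb"  (the tab is replaced by the two characters backslash + t)
_nested = pprint_thing({"a": 1, "b": [1, 2]})
# _nested == "{'a': 1, 'b': [1, 2]}"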
||||
|
||||
|
||||
def pprint_thing_encoded(
|
||||
object, encoding: str = "utf-8", errors: str = "replace"
|
||||
) -> bytes:
|
||||
value = pprint_thing(object) # get unicode representation of object
|
||||
return value.encode(encoding, errors)
|
||||
|
||||
|
||||
def enable_data_resource_formatter(enable: bool) -> None:
|
||||
if "IPython" not in sys.modules:
|
||||
# definitely not in IPython
|
||||
return
|
||||
from IPython import get_ipython
|
||||
|
||||
ip = get_ipython()
|
||||
if ip is None:
|
||||
# still not in IPython
|
||||
return
|
||||
|
||||
formatters = ip.display_formatter.formatters
|
||||
mimetype = "application/vnd.dataresource+json"
|
||||
|
||||
if enable:
|
||||
if mimetype not in formatters:
|
||||
# define tableschema formatter
|
||||
from IPython.core.formatters import BaseFormatter
|
||||
from traitlets import ObjectName
|
||||
|
||||
class TableSchemaFormatter(BaseFormatter):
|
||||
print_method = ObjectName("_repr_data_resource_")
|
||||
_return_type = (dict,)
|
||||
|
||||
# register it:
|
||||
formatters[mimetype] = TableSchemaFormatter()
|
||||
# enable it if it's been disabled:
|
||||
formatters[mimetype].enabled = True
|
||||
# unregister tableschema mime-type
|
||||
elif mimetype in formatters:
|
||||
formatters[mimetype].enabled = False
|
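# In practice this hook is toggled through a display option rather than called
# directly (sketch, assumes an interactive IPython/Jupyter session):
import pandas as pd

pd.set_option("display.html.table_schema", True)   # registers the formatter
pd.DataFrame({"a": [1, 2]})  # repr now also publishes application/vnd.dataresource+json
pd.set_option("display.html.table_schema", False)  # disables it again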
||||
|
||||
|
||||
def default_pprint(thing: Any, max_seq_items: int | None = None) -> str:
|
||||
return pprint_thing(
|
||||
thing,
|
||||
escape_chars=("\t", "\r", "\n"),
|
||||
quote_strings=True,
|
||||
max_seq_items=max_seq_items,
|
||||
)
|
||||
|
||||
|
||||
def format_object_summary(
|
||||
obj,
|
||||
formatter: Callable,
|
||||
is_justify: bool = True,
|
||||
name: str | None = None,
|
||||
indent_for_name: bool = True,
|
||||
line_break_each_value: bool = False,
|
||||
) -> str:
|
||||
"""
|
||||
Return the formatted obj as a unicode string
|
||||
|
||||
Parameters
|
||||
----------
|
||||
obj : object
|
||||
must be iterable and support __getitem__
|
||||
formatter : callable
|
||||
string formatter for an element
|
||||
is_justify : bool
|
||||
should justify the display
|
||||
name : str, optional
|
||||
defaults to the class name of the obj
|
||||
indent_for_name : bool, default True
|
||||
Whether subsequent lines should be indented to
|
||||
align with the name.
|
||||
line_break_each_value : bool, default False
|
||||
If True, inserts a line break for each value of ``obj``.
|
||||
If False, only break lines when a line of values gets wider
|
||||
than the display width.
|
||||
|
||||
Returns
|
||||
-------
|
||||
summary string
|
||||
"""
|
||||
display_width, _ = get_console_size()
|
||||
if display_width is None:
|
||||
display_width = get_option("display.width") or 80
|
||||
if name is None:
|
||||
name = type(obj).__name__
|
||||
|
||||
if indent_for_name:
|
||||
name_len = len(name)
|
||||
space1 = f'\n{(" " * (name_len + 1))}'
|
||||
space2 = f'\n{(" " * (name_len + 2))}'
|
||||
else:
|
||||
space1 = "\n"
|
||||
space2 = "\n " # space for the opening '['
|
||||
|
||||
n = len(obj)
|
||||
if line_break_each_value:
|
||||
# If we want to vertically align on each value of obj, we need to
|
||||
# separate values by a line break and indent the values
|
||||
sep = ",\n " + " " * len(name)
|
||||
else:
|
||||
sep = ","
|
||||
max_seq_items = get_option("display.max_seq_items") or n
|
||||
|
||||
# are we a truncated display
|
||||
is_truncated = n > max_seq_items
|
||||
|
||||
# adj can optionally handle unicode eastern asian width
|
||||
adj = get_adjustment()
|
||||
|
||||
def _extend_line(
|
||||
s: str, line: str, value: str, display_width: int, next_line_prefix: str
|
||||
) -> tuple[str, str]:
|
||||
if adj.len(line.rstrip()) + adj.len(value.rstrip()) >= display_width:
|
||||
s += line.rstrip()
|
||||
line = next_line_prefix
|
||||
line += value
|
||||
return s, line
|
||||
|
||||
def best_len(values: list[str]) -> int:
|
||||
if values:
|
||||
return max(adj.len(x) for x in values)
|
||||
else:
|
||||
return 0
|
||||
|
||||
close = ", "
|
||||
|
||||
if n == 0:
|
||||
summary = f"[]{close}"
|
||||
elif n == 1 and not line_break_each_value:
|
||||
first = formatter(obj[0])
|
||||
summary = f"[{first}]{close}"
|
||||
elif n == 2 and not line_break_each_value:
|
||||
first = formatter(obj[0])
|
||||
last = formatter(obj[-1])
|
||||
summary = f"[{first}, {last}]{close}"
|
||||
else:
|
||||
if max_seq_items == 1:
|
||||
# If max_seq_items=1 show only last element
|
||||
head = []
|
||||
tail = [formatter(x) for x in obj[-1:]]
|
||||
elif n > max_seq_items:
|
||||
n = min(max_seq_items // 2, 10)
|
||||
head = [formatter(x) for x in obj[:n]]
|
||||
tail = [formatter(x) for x in obj[-n:]]
|
||||
else:
|
||||
head = []
|
||||
tail = [formatter(x) for x in obj]
|
||||
|
||||
# adjust all values to max length if needed
|
||||
if is_justify:
|
||||
if line_break_each_value:
|
||||
# Justify each string in the values of head and tail, so the
|
||||
# strings will right align when head and tail are stacked
|
||||
# vertically.
|
||||
head, tail = _justify(head, tail)
|
||||
elif is_truncated or not (
|
||||
len(", ".join(head)) < display_width
|
||||
and len(", ".join(tail)) < display_width
|
||||
):
|
||||
# Each string in head and tail should align with each other
|
||||
max_length = max(best_len(head), best_len(tail))
|
||||
head = [x.rjust(max_length) for x in head]
|
||||
tail = [x.rjust(max_length) for x in tail]
|
||||
# If we are not truncated and we are only a single
|
||||
# line, then don't justify
|
||||
|
||||
if line_break_each_value:
|
||||
# Now head and tail are of type List[Tuple[str]]. Below we
|
||||
# convert them into List[str], so there will be one string per
|
||||
# value. Also truncate items horizontally if wider than
|
||||
# max_space
|
||||
max_space = display_width - len(space2)
|
||||
value = tail[0]
|
||||
max_items = 1
|
||||
for num_items in reversed(range(1, len(value) + 1)):
|
||||
pprinted_seq = _pprint_seq(value, max_seq_items=num_items)
|
||||
if len(pprinted_seq) < max_space:
|
||||
max_items = num_items
|
||||
break
|
||||
head = [_pprint_seq(x, max_seq_items=max_items) for x in head]
|
||||
tail = [_pprint_seq(x, max_seq_items=max_items) for x in tail]
|
||||
|
||||
summary = ""
|
||||
line = space2
|
||||
|
||||
for head_value in head:
|
||||
word = head_value + sep + " "
|
||||
summary, line = _extend_line(summary, line, word, display_width, space2)
|
||||
|
||||
if is_truncated:
|
||||
# remove trailing space of last line
|
||||
summary += line.rstrip() + space2 + "..."
|
||||
line = space2
|
||||
|
||||
for tail_item in tail[:-1]:
|
||||
word = tail_item + sep + " "
|
||||
summary, line = _extend_line(summary, line, word, display_width, space2)
|
||||
|
||||
# last value: no sep added + 1 space of width used for trailing ','
|
||||
summary, line = _extend_line(summary, line, tail[-1], display_width - 2, space2)
|
||||
summary += line
|
||||
|
||||
# right now close is either '' or ', '
|
||||
# Now we want to include the ']', but not the maybe space.
|
||||
close = "]" + close.rstrip(" ")
|
||||
summary += close
|
||||
|
||||
if len(summary) > (display_width) or line_break_each_value:
|
||||
summary += space1
|
||||
else: # one row
|
||||
summary += " "
|
||||
|
||||
# remove initial space
|
||||
summary = "[" + summary[len(space2) :]
|
||||
|
||||
return summary
|
||||
|
||||
|
||||
def _justify(
|
||||
head: list[Sequence[str]], tail: list[Sequence[str]]
|
||||
) -> tuple[list[tuple[str, ...]], list[tuple[str, ...]]]:
|
||||
"""
|
||||
Justify items in head and tail, so they are right-aligned when stacked.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
head : list-like of list-likes of strings
|
||||
tail : list-like of list-likes of strings
|
||||
|
||||
Returns
|
||||
-------
|
||||
tuple of list of tuples of strings
|
||||
Same as head and tail, but items are right aligned when stacked
|
||||
vertically.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> _justify([['a', 'b']], [['abc', 'abcd']])
|
||||
([(' a', ' b')], [('abc', 'abcd')])
|
||||
"""
|
||||
combined = head + tail
|
||||
|
||||
# For each position for the sequences in ``combined``,
|
||||
# find the length of the largest string.
|
||||
max_length = [0] * len(combined[0])
|
||||
for inner_seq in combined:
|
||||
length = [len(item) for item in inner_seq]
|
||||
max_length = [max(x, y) for x, y in zip(max_length, length)]
|
||||
|
||||
# justify each item in each list-like in head and tail using max_length
|
||||
head_tuples = [
|
||||
tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) for seq in head
|
||||
]
|
||||
tail_tuples = [
|
||||
tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) for seq in tail
|
||||
]
|
||||
return head_tuples, tail_tuples
|
||||
|
||||
|
||||
class PrettyDict(dict[_KT, _VT]):
|
||||
"""Dict extension to support abbreviated __repr__"""
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return pprint_thing(self)
|
||||
|
||||
|
||||
class _TextAdjustment:
|
||||
def __init__(self) -> None:
|
||||
self.encoding = get_option("display.encoding")
|
||||
|
||||
def len(self, text: str) -> int:
|
||||
return len(text)
|
||||
|
||||
def justify(self, texts: Any, max_len: int, mode: str = "right") -> list[str]:
|
||||
"""
|
||||
Perform ljust, center, rjust against string or list-like
|
||||
"""
|
||||
if mode == "left":
|
||||
return [x.ljust(max_len) for x in texts]
|
||||
elif mode == "center":
|
||||
return [x.center(max_len) for x in texts]
|
||||
else:
|
||||
return [x.rjust(max_len) for x in texts]
|
||||
|
||||
def adjoin(self, space: int, *lists, **kwargs) -> str:
|
||||
return adjoin(space, *lists, strlen=self.len, justfunc=self.justify, **kwargs)
|
||||
|
||||
|
||||
class _EastAsianTextAdjustment(_TextAdjustment):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
if get_option("display.unicode.ambiguous_as_wide"):
|
||||
self.ambiguous_width = 2
|
||||
else:
|
||||
self.ambiguous_width = 1
|
||||
|
||||
# Definition of East Asian Width
|
||||
# https://unicode.org/reports/tr11/
|
||||
# Ambiguous width can be changed by option
|
||||
self._EAW_MAP = {"Na": 1, "N": 1, "W": 2, "F": 2, "H": 1}
|
||||
|
||||
def len(self, text: str) -> int:
|
||||
"""
|
||||
Calculate display width considering unicode East Asian Width
|
||||
"""
|
||||
if not isinstance(text, str):
|
||||
return len(text)
|
||||
|
||||
return sum(
|
||||
self._EAW_MAP.get(east_asian_width(c), self.ambiguous_width) for c in text
|
||||
)
|
||||
|
||||
def justify(
|
||||
self, texts: Iterable[str], max_len: int, mode: str = "right"
|
||||
) -> list[str]:
|
||||
# re-calculate padding space per str considering East Asian Width
|
||||
def _get_pad(t):
|
||||
return max_len - self.len(t) + len(t)
|
||||
|
||||
if mode == "left":
|
||||
return [x.ljust(_get_pad(x)) for x in texts]
|
||||
elif mode == "center":
|
||||
return [x.center(_get_pad(x)) for x in texts]
|
||||
else:
|
||||
return [x.rjust(_get_pad(x)) for x in texts]
|
||||
|
||||
|
||||
def get_adjustment() -> _TextAdjustment:
|
||||
use_east_asian_width = get_option("display.unicode.east_asian_width")
|
||||
if use_east_asian_width:
|
||||
return _EastAsianTextAdjustment()
|
||||
else:
|
||||
return _TextAdjustment()
|
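# Sketch of what the adjustment affects (not part of pandas): with the option
# enabled, wide CJK characters count as two display cells so columns stay
# aligned in the console repr.
import pandas as pd

s = pd.Series([1, 22], index=["あ", "いい"])
pd.set_option("display.unicode.east_asian_width", True)
print(s)  # repr built with _EastAsianTextAdjustment
pd.set_option("display.unicode.east_asian_width", False)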
||||
206
lib/python3.11/site-packages/pandas/io/formats/string.py
Normal file
@ -0,0 +1,206 @@
|
||||
"""
|
||||
Module for formatting output data in console (to string).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from shutil import get_terminal_size
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.io.formats.printing import pprint_thing
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterable
|
||||
|
||||
from pandas.io.formats.format import DataFrameFormatter
|
||||
|
||||
|
||||
class StringFormatter:
|
||||
"""Formatter for string representation of a dataframe."""
|
||||
|
||||
def __init__(self, fmt: DataFrameFormatter, line_width: int | None = None) -> None:
|
||||
self.fmt = fmt
|
||||
self.adj = fmt.adj
|
||||
self.frame = fmt.frame
|
||||
self.line_width = line_width
|
||||
|
||||
def to_string(self) -> str:
|
||||
text = self._get_string_representation()
|
||||
if self.fmt.should_show_dimensions:
|
||||
text = f"{text}{self.fmt.dimensions_info}"
|
||||
return text
|
||||
|
||||
def _get_strcols(self) -> list[list[str]]:
|
||||
strcols = self.fmt.get_strcols()
|
||||
if self.fmt.is_truncated:
|
||||
strcols = self._insert_dot_separators(strcols)
|
||||
return strcols
|
||||
|
||||
def _get_string_representation(self) -> str:
|
||||
if self.fmt.frame.empty:
|
||||
return self._empty_info_line
|
||||
|
||||
strcols = self._get_strcols()
|
||||
|
||||
if self.line_width is None:
|
||||
# no need to wrap around just print the whole frame
|
||||
return self.adj.adjoin(1, *strcols)
|
||||
|
||||
if self._need_to_wrap_around:
|
||||
return self._join_multiline(strcols)
|
||||
|
||||
return self._fit_strcols_to_terminal_width(strcols)
|
||||
|
||||
@property
|
||||
def _empty_info_line(self) -> str:
|
||||
return (
|
||||
f"Empty {type(self.frame).__name__}\n"
|
||||
f"Columns: {pprint_thing(self.frame.columns)}\n"
|
||||
f"Index: {pprint_thing(self.frame.index)}"
|
||||
)
|
||||
|
||||
@property
|
||||
def _need_to_wrap_around(self) -> bool:
|
||||
return bool(self.fmt.max_cols is None or self.fmt.max_cols > 0)
|
||||
|
||||
def _insert_dot_separators(self, strcols: list[list[str]]) -> list[list[str]]:
|
||||
str_index = self.fmt._get_formatted_index(self.fmt.tr_frame)
|
||||
index_length = len(str_index)
|
||||
|
||||
if self.fmt.is_truncated_horizontally:
|
||||
strcols = self._insert_dot_separator_horizontal(strcols, index_length)
|
||||
|
||||
if self.fmt.is_truncated_vertically:
|
||||
strcols = self._insert_dot_separator_vertical(strcols, index_length)
|
||||
|
||||
return strcols
|
||||
|
||||
@property
|
||||
def _adjusted_tr_col_num(self) -> int:
|
||||
return self.fmt.tr_col_num + 1 if self.fmt.index else self.fmt.tr_col_num
|
||||
|
||||
def _insert_dot_separator_horizontal(
|
||||
self, strcols: list[list[str]], index_length: int
|
||||
) -> list[list[str]]:
|
||||
strcols.insert(self._adjusted_tr_col_num, [" ..."] * index_length)
|
||||
return strcols
|
||||
|
||||
def _insert_dot_separator_vertical(
|
||||
self, strcols: list[list[str]], index_length: int
|
||||
) -> list[list[str]]:
|
||||
n_header_rows = index_length - len(self.fmt.tr_frame)
|
||||
row_num = self.fmt.tr_row_num
|
||||
for ix, col in enumerate(strcols):
|
||||
cwidth = self.adj.len(col[row_num])
|
||||
|
||||
if self.fmt.is_truncated_horizontally:
|
||||
is_dot_col = ix == self._adjusted_tr_col_num
|
||||
else:
|
||||
is_dot_col = False
|
||||
|
||||
if cwidth > 3 or is_dot_col:
|
||||
dots = "..."
|
||||
else:
|
||||
dots = ".."
|
||||
|
||||
if ix == 0 and self.fmt.index:
|
||||
dot_mode = "left"
|
||||
elif is_dot_col:
|
||||
cwidth = 4
|
||||
dot_mode = "right"
|
||||
else:
|
||||
dot_mode = "right"
|
||||
|
||||
dot_str = self.adj.justify([dots], cwidth, mode=dot_mode)[0]
|
||||
col.insert(row_num + n_header_rows, dot_str)
|
||||
return strcols
|
||||
|
||||
def _join_multiline(self, strcols_input: Iterable[list[str]]) -> str:
|
||||
lwidth = self.line_width
|
||||
adjoin_width = 1
|
||||
strcols = list(strcols_input)
|
||||
|
||||
if self.fmt.index:
|
||||
idx = strcols.pop(0)
|
||||
lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width
|
||||
|
||||
col_widths = [
|
||||
np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0
|
||||
for col in strcols
|
||||
]
|
||||
|
||||
assert lwidth is not None
|
||||
col_bins = _binify(col_widths, lwidth)
|
||||
nbins = len(col_bins)
|
||||
|
||||
str_lst = []
|
||||
start = 0
|
||||
for i, end in enumerate(col_bins):
|
||||
row = strcols[start:end]
|
||||
if self.fmt.index:
|
||||
row.insert(0, idx)
|
||||
if nbins > 1:
|
||||
nrows = len(row[-1])
|
||||
if end <= len(strcols) and i < nbins - 1:
|
||||
row.append([" \\"] + [" "] * (nrows - 1))
|
||||
else:
|
||||
row.append([" "] * nrows)
|
||||
str_lst.append(self.adj.adjoin(adjoin_width, *row))
|
||||
start = end
|
||||
return "\n\n".join(str_lst)
|
||||
|
||||
def _fit_strcols_to_terminal_width(self, strcols: list[list[str]]) -> str:
|
||||
from pandas import Series
|
||||
|
||||
lines = self.adj.adjoin(1, *strcols).split("\n")
|
||||
max_len = Series(lines).str.len().max()
|
||||
# plus truncate dot col
|
||||
width, _ = get_terminal_size()
|
||||
dif = max_len - width
|
||||
# '+ 1' to avoid too wide repr (GH PR #17023)
|
||||
adj_dif = dif + 1
|
||||
col_lens = Series([Series(ele).str.len().max() for ele in strcols])
|
||||
n_cols = len(col_lens)
|
||||
counter = 0
|
||||
while adj_dif > 0 and n_cols > 1:
|
||||
counter += 1
|
||||
mid = round(n_cols / 2)
|
||||
mid_ix = col_lens.index[mid]
|
||||
col_len = col_lens[mid_ix]
|
||||
# adjoin adds one
|
||||
adj_dif -= col_len + 1
|
||||
col_lens = col_lens.drop(mid_ix)
|
||||
n_cols = len(col_lens)
|
||||
|
||||
# subtract index column
|
||||
max_cols_fitted = n_cols - self.fmt.index
|
||||
# GH-21180. Ensure that we print at least two.
|
||||
max_cols_fitted = max(max_cols_fitted, 2)
|
||||
self.fmt.max_cols_fitted = max_cols_fitted
|
||||
|
||||
# Call again _truncate to cut frame appropriately
|
||||
# and then generate string representation
|
||||
self.fmt.truncate()
|
||||
strcols = self._get_strcols()
|
||||
return self.adj.adjoin(1, *strcols)
|
||||
|
||||
|
||||
def _binify(cols: list[int], line_width: int) -> list[int]:
|
||||
adjoin_width = 1
|
||||
bins = []
|
||||
curr_width = 0
|
||||
i_last_column = len(cols) - 1
|
||||
for i, w in enumerate(cols):
|
||||
w_adjoined = w + adjoin_width
|
||||
curr_width += w_adjoined
|
||||
if i_last_column == i:
|
||||
wrap = curr_width + 1 > line_width and i > 0
|
||||
else:
|
||||
wrap = curr_width + 2 > line_width and i > 0
|
||||
if wrap:
|
||||
bins.append(i)
|
||||
curr_width = w_adjoined
|
||||
|
||||
bins.append(len(cols))
|
||||
return bins
|
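A short sketch (not in the file) of the code path above: passing line_width to DataFrame.to_string sends StringFormatter through _join_multiline, which uses _binify to split the columns into chunks that fit the requested width.

import pandas as pd

df = pd.DataFrame({f"col_{i}": range(3) for i in range(12)})
print(df.to_string(line_width=60))  # columns are wrapped into several blocks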
||||
4136
lib/python3.11/site-packages/pandas/io/formats/style.py
Normal file
File diff suppressed because it is too large
2497
lib/python3.11/site-packages/pandas/io/formats/style_render.py
Normal file
File diff suppressed because it is too large
@ -0,0 +1,16 @@
|
||||
{# Update the html_style/table_structure.html documentation too #}
|
||||
{% if doctype_html %}
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="{{encoding}}">
|
||||
{% if not exclude_styles %}{% include html_style_tpl %}{% endif %}
|
||||
</head>
|
||||
<body>
|
||||
{% include html_table_tpl %}
|
||||
</body>
|
||||
</html>
|
||||
{% elif not doctype_html %}
|
||||
{% if not exclude_styles %}{% include html_style_tpl %}{% endif %}
|
||||
{% include html_table_tpl %}
|
||||
{% endif %}
|
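These Jinja2 blocks are what Styler.to_html renders; a minimal sketch (not part of the templates) of how the template variables above are driven from Python:

import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
html = df.style.to_html(
    doctype_html=True,     # -> the {% if doctype_html %} branch above
    exclude_styles=False,  # -> include the <style> template
)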
||||
@ -0,0 +1,26 @@
|
||||
{%- block before_style -%}{%- endblock before_style -%}
|
||||
{% block style %}
|
||||
<style type="text/css">
|
||||
{% block table_styles %}
|
||||
{% for s in table_styles %}
|
||||
#T_{{uuid}} {{s.selector}} {
|
||||
{% for p,val in s.props %}
|
||||
{{p}}: {{val}};
|
||||
{% endfor %}
|
||||
}
|
||||
{% endfor %}
|
||||
{% endblock table_styles %}
|
||||
{% block before_cellstyle %}{% endblock before_cellstyle %}
|
||||
{% block cellstyle %}
|
||||
{% for cs in [cellstyle, cellstyle_index, cellstyle_columns] %}
|
||||
{% for s in cs %}
|
||||
{% for selector in s.selectors %}{% if not loop.first %}, {% endif %}#T_{{uuid}}_{{selector}}{% endfor %} {
|
||||
{% for p,val in s.props %}
|
||||
{{p}}: {{val}};
|
||||
{% endfor %}
|
||||
}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
{% endblock cellstyle %}
|
||||
</style>
|
||||
{% endblock style %}
|
||||
@ -0,0 +1,63 @@
|
||||
{% block before_table %}{% endblock before_table %}
|
||||
{% block table %}
|
||||
{% if exclude_styles %}
|
||||
<table>
|
||||
{% else %}
|
||||
<table id="T_{{uuid}}"{% if table_attributes %} {{table_attributes}}{% endif %}>
|
||||
{% endif %}
|
||||
{% block caption %}
|
||||
{% if caption and caption is string %}
|
||||
<caption>{{caption}}</caption>
|
||||
{% elif caption and caption is sequence %}
|
||||
<caption>{{caption[0]}}</caption>
|
||||
{% endif %}
|
||||
{% endblock caption %}
|
||||
{% block thead %}
|
||||
<thead>
|
||||
{% block before_head_rows %}{% endblock %}
|
||||
{% for r in head %}
|
||||
{% block head_tr scoped %}
|
||||
<tr>
|
||||
{% if exclude_styles %}
|
||||
{% for c in r %}
|
||||
{% if c.is_visible != False %}
|
||||
<{{c.type}} {{c.attributes}}>{{c.display_value}}</{{c.type}}>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
{% for c in r %}
|
||||
{% if c.is_visible != False %}
|
||||
<{{c.type}} {%- if c.id is defined %} id="T_{{uuid}}_{{c.id}}" {%- endif %} class="{{c.class}}" {{c.attributes}}>{{c.display_value}}</{{c.type}}>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
</tr>
|
||||
{% endblock head_tr %}
|
||||
{% endfor %}
|
||||
{% block after_head_rows %}{% endblock %}
|
||||
</thead>
|
||||
{% endblock thead %}
|
||||
{% block tbody %}
|
||||
<tbody>
|
||||
{% block before_rows %}{% endblock before_rows %}
|
||||
{% for r in body %}
|
||||
{% block tr scoped %}
|
||||
<tr>
|
||||
{% if exclude_styles %}
|
||||
{% for c in r %}{% if c.is_visible != False %}
|
||||
<{{c.type}} {{c.attributes}}>{{c.display_value}}</{{c.type}}>
|
||||
{% endif %}{% endfor %}
|
||||
{% else %}
|
||||
{% for c in r %}{% if c.is_visible != False %}
|
||||
<{{c.type}} {%- if c.id is defined %} id="T_{{uuid}}_{{c.id}}" {%- endif %} class="{{c.class}}" {{c.attributes}}>{{c.display_value}}</{{c.type}}>
|
||||
{% endif %}{% endfor %}
|
||||
{% endif %}
|
||||
</tr>
|
||||
{% endblock tr %}
|
||||
{% endfor %}
|
||||
{% block after_rows %}{% endblock after_rows %}
|
||||
</tbody>
|
||||
{% endblock tbody %}
|
||||
</table>
|
||||
{% endblock table %}
|
||||
{% block after_table %}{% endblock after_table %}
|
||||
@ -0,0 +1,5 @@
|
||||
{% if environment == "longtable" %}
|
||||
{% include "latex_longtable.tpl" %}
|
||||
{% else %}
|
||||
{% include "latex_table.tpl" %}
|
||||
{% endif %}
|
||||
@ -0,0 +1,82 @@
|
||||
\begin{longtable}
|
||||
{%- set position = parse_table(table_styles, 'position') %}
|
||||
{%- if position is not none %}
|
||||
[{{position}}]
|
||||
{%- endif %}
|
||||
{%- set column_format = parse_table(table_styles, 'column_format') %}
|
||||
{% raw %}{{% endraw %}{{column_format}}{% raw %}}{% endraw %}
|
||||
|
||||
{% for style in table_styles %}
|
||||
{% if style['selector'] not in ['position', 'position_float', 'caption', 'toprule', 'midrule', 'bottomrule', 'column_format', 'label'] %}
|
||||
\{{style['selector']}}{{parse_table(table_styles, style['selector'])}}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% if caption and caption is string %}
|
||||
\caption{% raw %}{{% endraw %}{{caption}}{% raw %}}{% endraw %}
|
||||
{%- set label = parse_table(table_styles, 'label') %}
|
||||
{%- if label is not none %}
|
||||
\label{{label}}
|
||||
{%- endif %} \\
|
||||
{% elif caption and caption is sequence %}
|
||||
\caption[{{caption[1]}}]{% raw %}{{% endraw %}{{caption[0]}}{% raw %}}{% endraw %}
|
||||
{%- set label = parse_table(table_styles, 'label') %}
|
||||
{%- if label is not none %}
|
||||
\label{{label}}
|
||||
{%- endif %} \\
|
||||
{% else %}
|
||||
{%- set label = parse_table(table_styles, 'label') %}
|
||||
{%- if label is not none %}
|
||||
\label{{label}} \\
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% set toprule = parse_table(table_styles, 'toprule') %}
|
||||
{% if toprule is not none %}
|
||||
\{{toprule}}
|
||||
{% endif %}
|
||||
{% for row in head %}
|
||||
{% for c in row %}{%- if not loop.first %} & {% endif %}{{parse_header(c, multirow_align, multicol_align, siunitx)}}{% endfor %} \\
|
||||
{% endfor %}
|
||||
{% set midrule = parse_table(table_styles, 'midrule') %}
|
||||
{% if midrule is not none %}
|
||||
\{{midrule}}
|
||||
{% endif %}
|
||||
\endfirsthead
|
||||
{% if caption and caption is string %}
|
||||
\caption[]{% raw %}{{% endraw %}{{caption}}{% raw %}}{% endraw %} \\
|
||||
{% elif caption and caption is sequence %}
|
||||
\caption[]{% raw %}{{% endraw %}{{caption[0]}}{% raw %}}{% endraw %} \\
|
||||
{% endif %}
|
||||
{% if toprule is not none %}
|
||||
\{{toprule}}
|
||||
{% endif %}
|
||||
{% for row in head %}
|
||||
{% for c in row %}{%- if not loop.first %} & {% endif %}{{parse_header(c, multirow_align, multicol_align, siunitx)}}{% endfor %} \\
|
||||
{% endfor %}
|
||||
{% if midrule is not none %}
|
||||
\{{midrule}}
|
||||
{% endif %}
|
||||
\endhead
|
||||
{% if midrule is not none %}
|
||||
\{{midrule}}
|
||||
{% endif %}
|
||||
\multicolumn{% raw %}{{% endraw %}{{body[0]|length}}{% raw %}}{% endraw %}{r}{Continued on next page} \\
|
||||
{% if midrule is not none %}
|
||||
\{{midrule}}
|
||||
{% endif %}
|
||||
\endfoot
|
||||
{% set bottomrule = parse_table(table_styles, 'bottomrule') %}
|
||||
{% if bottomrule is not none %}
|
||||
\{{bottomrule}}
|
||||
{% endif %}
|
||||
\endlastfoot
|
||||
{% for row in body %}
|
||||
{% for c in row %}{% if not loop.first %} & {% endif %}
|
||||
{%- if c.type == 'th' %}{{parse_header(c, multirow_align, multicol_align)}}{% else %}{{parse_cell(c.cellstyle, c.display_value, convert_css)}}{% endif %}
|
||||
{%- endfor %} \\
|
||||
{% if clines and clines[loop.index] | length > 0 %}
|
||||
{%- for cline in clines[loop.index] %}{% if not loop.first %} {% endif %}{{ cline }}{% endfor %}
|
||||
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
\end{longtable}
|
||||
{% raw %}{% endraw %}
|
||||
@ -0,0 +1,57 @@
|
||||
{% if environment or parse_wrap(table_styles, caption) %}
|
||||
\begin{% raw %}{{% endraw %}{{environment if environment else "table"}}{% raw %}}{% endraw %}
|
||||
{%- set position = parse_table(table_styles, 'position') %}
|
||||
{%- if position is not none %}
|
||||
[{{position}}]
|
||||
{%- endif %}
|
||||
|
||||
{% set position_float = parse_table(table_styles, 'position_float') %}
|
||||
{% if position_float is not none%}
|
||||
\{{position_float}}
|
||||
{% endif %}
|
||||
{% if caption and caption is string %}
|
||||
\caption{% raw %}{{% endraw %}{{caption}}{% raw %}}{% endraw %}
|
||||
|
||||
{% elif caption and caption is sequence %}
|
||||
\caption[{{caption[1]}}]{% raw %}{{% endraw %}{{caption[0]}}{% raw %}}{% endraw %}
|
||||
|
||||
{% endif %}
|
||||
{% for style in table_styles %}
|
||||
{% if style['selector'] not in ['position', 'position_float', 'caption', 'toprule', 'midrule', 'bottomrule', 'column_format'] %}
|
||||
\{{style['selector']}}{{parse_table(table_styles, style['selector'])}}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
\begin{tabular}
|
||||
{%- set column_format = parse_table(table_styles, 'column_format') %}
|
||||
{% raw %}{{% endraw %}{{column_format}}{% raw %}}{% endraw %}
|
||||
|
||||
{% set toprule = parse_table(table_styles, 'toprule') %}
|
||||
{% if toprule is not none %}
|
||||
\{{toprule}}
|
||||
{% endif %}
|
||||
{% for row in head %}
|
||||
{% for c in row %}{%- if not loop.first %} & {% endif %}{{parse_header(c, multirow_align, multicol_align, siunitx, convert_css)}}{% endfor %} \\
|
||||
{% endfor %}
|
||||
{% set midrule = parse_table(table_styles, 'midrule') %}
|
||||
{% if midrule is not none %}
|
||||
\{{midrule}}
|
||||
{% endif %}
|
||||
{% for row in body %}
|
||||
{% for c in row %}{% if not loop.first %} & {% endif %}
|
||||
{%- if c.type == 'th' %}{{parse_header(c, multirow_align, multicol_align, False, convert_css)}}{% else %}{{parse_cell(c.cellstyle, c.display_value, convert_css)}}{% endif %}
|
||||
{%- endfor %} \\
|
||||
{% if clines and clines[loop.index] | length > 0 %}
|
||||
{%- for cline in clines[loop.index] %}{% if not loop.first %} {% endif %}{{ cline }}{% endfor %}
|
||||
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% set bottomrule = parse_table(table_styles, 'bottomrule') %}
|
||||
{% if bottomrule is not none %}
|
||||
\{{bottomrule}}
|
||||
{% endif %}
|
||||
\end{tabular}
|
||||
{% if environment or parse_wrap(table_styles, caption) %}
|
||||
\end{% raw %}{{% endraw %}{{environment if environment else "table"}}{% raw %}}{% endraw %}
|
||||
|
||||
{% endif %}
|
||||
@ -0,0 +1,12 @@
|
||||
{% for r in head %}
|
||||
{% for c in r %}{% if c["is_visible"] %}
|
||||
{{ c["display_value"] }}{% if not loop.last %}{{ delimiter }}{% endif %}
|
||||
{% endif %}{% endfor %}
|
||||
|
||||
{% endfor %}
|
||||
{% for r in body %}
|
||||
{% for c in r %}{% if c["is_visible"] %}
|
||||
{{ c["display_value"] }}{% if not loop.last %}{{ delimiter }}{% endif %}
|
||||
{% endif %}{% endfor %}
|
||||
|
||||
{% endfor %}
|
||||
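The last template above simply joins the visible header and body cells with a delimiter. A hedged usage sketch, assuming Styler.to_string exposes that delimiter (the frame is invented):

import pandas as pd

df = pd.DataFrame({"x": [1, 2], "y": ["a", "b"]})

# Each row is emitted as display values separated by the chosen delimiter.
print(df.style.to_string(delimiter=","))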
560
lib/python3.11/site-packages/pandas/io/formats/xml.py
Normal file
@ -0,0 +1,560 @@
|
||||
"""
|
||||
:mod:`pandas.io.formats.xml` is a module for formatting data in XML.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import codecs
|
||||
import io
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
final,
|
||||
)
|
||||
import warnings
|
||||
|
||||
from pandas.errors import AbstractMethodError
|
||||
from pandas.util._decorators import (
|
||||
cache_readonly,
|
||||
doc,
|
||||
)
|
||||
|
||||
from pandas.core.dtypes.common import is_list_like
|
||||
from pandas.core.dtypes.missing import isna
|
||||
|
||||
from pandas.core.shared_docs import _shared_docs
|
||||
|
||||
from pandas.io.common import get_handle
|
||||
from pandas.io.xml import (
|
||||
get_data_from_filepath,
|
||||
preprocess_data,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
CompressionOptions,
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
StorageOptions,
|
||||
WriteBuffer,
|
||||
)
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
@doc(
|
||||
storage_options=_shared_docs["storage_options"],
|
||||
compression_options=_shared_docs["compression_options"] % "path_or_buffer",
|
||||
)
|
||||
class _BaseXMLFormatter:
|
||||
"""
|
||||
Base class for formatting data in XML.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path_or_buffer : str or file-like
|
||||
This can be either a string of raw XML, a valid URL,
|
||||
file or file-like object.
|
||||
|
||||
index : bool
|
||||
Whether to include index in xml document.
|
||||
|
||||
root_name : str
Name for root of xml document. Default is 'data'.

row_name : str
Name for row elements of xml document. Default is 'row'.
|
||||
|
||||
na_rep : str
|
||||
Missing data representation.
|
||||
|
||||
attr_cols : list
|
||||
List of columns to write as attributes in row element.
|
||||
|
||||
elem_cols : list
|
||||
List of columns to write as children in row element.
|
||||
|
||||
namespaces : dict
|
||||
The namespaces to define in XML document as dicts with key
being the namespace prefix and value the URI.
|
||||
|
||||
prefix : str
|
||||
The prefix for each element in XML document including root.
|
||||
|
||||
encoding : str
|
||||
Encoding of xml object or document.
|
||||
|
||||
xml_declaration : bool
|
||||
Whether to include the xml declaration at the top of the document.
|
||||
|
||||
pretty_print : bool
|
||||
Whether to write xml document with line breaks and indentation.
|
||||
|
||||
stylesheet : str or file-like
|
||||
A URL, file, file-like object, or a raw string containing XSLT.
|
||||
|
||||
{compression_options}
|
||||
|
||||
.. versionchanged:: 1.4.0 Zstandard support.
|
||||
|
||||
{storage_options}
|
||||
|
||||
See also
|
||||
--------
|
||||
pandas.io.formats.xml.EtreeXMLFormatter
|
||||
pandas.io.formats.xml.LxmlXMLFormatter
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
frame: DataFrame,
|
||||
path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
|
||||
index: bool = True,
|
||||
root_name: str | None = "data",
|
||||
row_name: str | None = "row",
|
||||
na_rep: str | None = None,
|
||||
attr_cols: list[str] | None = None,
|
||||
elem_cols: list[str] | None = None,
|
||||
namespaces: dict[str | None, str] | None = None,
|
||||
prefix: str | None = None,
|
||||
encoding: str = "utf-8",
|
||||
xml_declaration: bool | None = True,
|
||||
pretty_print: bool | None = True,
|
||||
stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None,
|
||||
compression: CompressionOptions = "infer",
|
||||
storage_options: StorageOptions | None = None,
|
||||
) -> None:
|
||||
self.frame = frame
|
||||
self.path_or_buffer = path_or_buffer
|
||||
self.index = index
|
||||
self.root_name = root_name
|
||||
self.row_name = row_name
|
||||
self.na_rep = na_rep
|
||||
self.attr_cols = attr_cols
|
||||
self.elem_cols = elem_cols
|
||||
self.namespaces = namespaces
|
||||
self.prefix = prefix
|
||||
self.encoding = encoding
|
||||
self.xml_declaration = xml_declaration
|
||||
self.pretty_print = pretty_print
|
||||
self.stylesheet = stylesheet
|
||||
self.compression: CompressionOptions = compression
|
||||
self.storage_options = storage_options
|
||||
|
||||
self.orig_cols = self.frame.columns.tolist()
|
||||
self.frame_dicts = self._process_dataframe()
|
||||
|
||||
self._validate_columns()
|
||||
self._validate_encoding()
|
||||
self.prefix_uri = self._get_prefix_uri()
|
||||
self._handle_indexes()
|
||||
|
||||
def _build_tree(self) -> bytes:
|
||||
"""
|
||||
Build tree from data.
|
||||
|
||||
This method initializes the root and builds attributes and elements
|
||||
with optional namespaces.
|
||||
"""
|
||||
raise AbstractMethodError(self)
|
||||
|
||||
@final
|
||||
def _validate_columns(self) -> None:
|
||||
"""
|
||||
Validate elem_cols and attr_cols.

This method will check that the column arguments are list-like.

Raises
------
TypeError
* If a value is not list-like.
|
||||
"""
|
||||
if self.attr_cols and not is_list_like(self.attr_cols):
|
||||
raise TypeError(
|
||||
f"{type(self.attr_cols).__name__} is not a valid type for attr_cols"
|
||||
)
|
||||
|
||||
if self.elem_cols and not is_list_like(self.elem_cols):
|
||||
raise TypeError(
|
||||
f"{type(self.elem_cols).__name__} is not a valid type for elem_cols"
|
||||
)
|
||||
|
||||
@final
|
||||
def _validate_encoding(self) -> None:
|
||||
"""
|
||||
Validate encoding.
|
||||
|
||||
This method will check if encoding is among those listed under codecs.
|
||||
|
||||
Raises
|
||||
------
|
||||
LookupError
|
||||
* If encoding is not available in codecs.
|
||||
"""
|
||||
|
||||
codecs.lookup(self.encoding)
|
||||
|
||||
@final
|
||||
def _process_dataframe(self) -> dict[int | str, dict[str, Any]]:
|
||||
"""
|
||||
Adjust DataFrame to fit xml output.
|
||||
|
||||
This method will adjust underlying data frame for xml output,
|
||||
including optionally replacing missing values and including indexes.
|
||||
"""
|
||||
|
||||
df = self.frame
|
||||
|
||||
if self.index:
|
||||
df = df.reset_index()
|
||||
|
||||
if self.na_rep is not None:
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings(
|
||||
"ignore",
|
||||
"Downcasting object dtype arrays",
|
||||
category=FutureWarning,
|
||||
)
|
||||
df = df.fillna(self.na_rep)
|
||||
|
||||
return df.to_dict(orient="index")
|
||||
|
||||
@final
|
||||
def _handle_indexes(self) -> None:
|
||||
"""
|
||||
Handle indexes.
|
||||
|
||||
This method will add indexes into attr_cols or elem_cols.
|
||||
"""
|
||||
|
||||
if not self.index:
|
||||
return
|
||||
|
||||
first_key = next(iter(self.frame_dicts))
|
||||
indexes: list[str] = [
|
||||
x for x in self.frame_dicts[first_key].keys() if x not in self.orig_cols
|
||||
]
|
||||
|
||||
if self.attr_cols:
|
||||
self.attr_cols = indexes + self.attr_cols
|
||||
|
||||
if self.elem_cols:
|
||||
self.elem_cols = indexes + self.elem_cols
|
||||
|
||||
def _get_prefix_uri(self) -> str:
|
||||
"""
|
||||
Get uri of namespace prefix.
|
||||
|
||||
This method retrieves corresponding URI to prefix in namespaces.
|
||||
|
||||
Raises
|
||||
------
|
||||
KeyError
|
||||
* If prefix is not included in the namespaces dict.
|
||||
"""
|
||||
|
||||
raise AbstractMethodError(self)
|
||||
|
||||
@final
|
||||
def _other_namespaces(self) -> dict:
|
||||
"""
|
||||
Define other namespaces.
|
||||
|
||||
This method will build dictionary of namespaces attributes
|
||||
for root element, conditionally with optional namespaces and
|
||||
prefix.
|
||||
"""
|
||||
|
||||
nmsp_dict: dict[str, str] = {}
|
||||
if self.namespaces:
|
||||
nmsp_dict = {
|
||||
f"xmlns{p if p=='' else f':{p}'}": n
|
||||
for p, n in self.namespaces.items()
|
||||
if n != self.prefix_uri[1:-1]
|
||||
}
|
||||
|
||||
return nmsp_dict
|
||||
|
||||
@final
|
||||
def _build_attribs(self, d: dict[str, Any], elem_row: Any) -> Any:
|
||||
"""
|
||||
Create attributes of row.
|
||||
|
||||
This method adds attributes using attr_cols to row element and
|
||||
works with tuples for multiindex or hierarchical columns.
|
||||
"""
|
||||
|
||||
if not self.attr_cols:
|
||||
return elem_row
|
||||
|
||||
for col in self.attr_cols:
|
||||
attr_name = self._get_flat_col_name(col)
|
||||
try:
|
||||
if not isna(d[col]):
|
||||
elem_row.attrib[attr_name] = str(d[col])
|
||||
except KeyError:
|
||||
raise KeyError(f"no valid column, {col}")
|
||||
return elem_row
|
||||
|
||||
@final
|
||||
def _get_flat_col_name(self, col: str | tuple) -> str:
|
||||
flat_col = col
|
||||
if isinstance(col, tuple):
|
||||
flat_col = (
|
||||
"".join([str(c) for c in col]).strip()
|
||||
if "" in col
|
||||
else "_".join([str(c) for c in col]).strip()
|
||||
)
|
||||
return f"{self.prefix_uri}{flat_col}"
|
||||
|
||||
@cache_readonly
|
||||
def _sub_element_cls(self):
|
||||
raise AbstractMethodError(self)
|
||||
|
||||
@final
|
||||
def _build_elems(self, d: dict[str, Any], elem_row: Any) -> None:
|
||||
"""
|
||||
Create child elements of row.
|
||||
|
||||
This method adds child elements using elem_cols to row element and
|
||||
works with tuples for multiindex or hierarchical columns.
|
||||
"""
|
||||
sub_element_cls = self._sub_element_cls
|
||||
|
||||
if not self.elem_cols:
|
||||
return
|
||||
|
||||
for col in self.elem_cols:
|
||||
elem_name = self._get_flat_col_name(col)
|
||||
try:
|
||||
val = None if isna(d[col]) or d[col] == "" else str(d[col])
|
||||
sub_element_cls(elem_row, elem_name).text = val
|
||||
except KeyError:
|
||||
raise KeyError(f"no valid column, {col}")
|
||||
|
||||
@final
|
||||
def write_output(self) -> str | None:
|
||||
xml_doc = self._build_tree()
|
||||
|
||||
if self.path_or_buffer is not None:
|
||||
with get_handle(
|
||||
self.path_or_buffer,
|
||||
"wb",
|
||||
compression=self.compression,
|
||||
storage_options=self.storage_options,
|
||||
is_text=False,
|
||||
) as handles:
|
||||
handles.handle.write(xml_doc)
|
||||
return None
|
||||
|
||||
else:
|
||||
return xml_doc.decode(self.encoding).rstrip()
|
||||
|
||||
|
||||
class EtreeXMLFormatter(_BaseXMLFormatter):
|
||||
"""
|
||||
Class for formatting data in xml using Python standard library
|
||||
modules: `xml.etree.ElementTree` and `xml.dom.minidom`.
|
||||
"""
|
||||
|
||||
def _build_tree(self) -> bytes:
|
||||
from xml.etree.ElementTree import (
|
||||
Element,
|
||||
SubElement,
|
||||
tostring,
|
||||
)
|
||||
|
||||
self.root = Element(
|
||||
f"{self.prefix_uri}{self.root_name}", attrib=self._other_namespaces()
|
||||
)
|
||||
|
||||
for d in self.frame_dicts.values():
|
||||
elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
|
||||
|
||||
if not self.attr_cols and not self.elem_cols:
|
||||
self.elem_cols = list(d.keys())
|
||||
self._build_elems(d, elem_row)
|
||||
|
||||
else:
|
||||
elem_row = self._build_attribs(d, elem_row)
|
||||
self._build_elems(d, elem_row)
|
||||
|
||||
self.out_xml = tostring(
|
||||
self.root,
|
||||
method="xml",
|
||||
encoding=self.encoding,
|
||||
xml_declaration=self.xml_declaration,
|
||||
)
|
||||
|
||||
if self.pretty_print:
|
||||
self.out_xml = self._prettify_tree()
|
||||
|
||||
if self.stylesheet is not None:
|
||||
raise ValueError(
|
||||
"To use stylesheet, you need lxml installed and selected as parser."
|
||||
)
|
||||
|
||||
return self.out_xml
|
||||
|
||||
def _get_prefix_uri(self) -> str:
|
||||
from xml.etree.ElementTree import register_namespace
|
||||
|
||||
uri = ""
|
||||
if self.namespaces:
|
||||
for p, n in self.namespaces.items():
|
||||
if isinstance(p, str) and isinstance(n, str):
|
||||
register_namespace(p, n)
|
||||
if self.prefix:
|
||||
try:
|
||||
uri = f"{{{self.namespaces[self.prefix]}}}"
|
||||
except KeyError:
|
||||
raise KeyError(f"{self.prefix} is not included in namespaces")
|
||||
elif "" in self.namespaces:
|
||||
uri = f'{{{self.namespaces[""]}}}'
|
||||
else:
|
||||
uri = ""
|
||||
|
||||
return uri
|
||||
|
||||
@cache_readonly
|
||||
def _sub_element_cls(self):
|
||||
from xml.etree.ElementTree import SubElement
|
||||
|
||||
return SubElement
|
||||
|
||||
def _prettify_tree(self) -> bytes:
|
||||
"""
|
||||
Output tree for pretty print format.
|
||||
|
||||
This method will pretty print xml with line breaks and indentation.
|
||||
"""
|
||||
|
||||
from xml.dom.minidom import parseString
|
||||
|
||||
dom = parseString(self.out_xml)
|
||||
|
||||
return dom.toprettyxml(indent=" ", encoding=self.encoding)
|
||||
|
||||
|
||||
class LxmlXMLFormatter(_BaseXMLFormatter):
|
||||
"""
|
||||
Class for formatting data in xml using the third-party `lxml` library.
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs) -> None:
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
self._convert_empty_str_key()
|
||||
|
||||
def _build_tree(self) -> bytes:
|
||||
"""
|
||||
Build tree from data.
|
||||
|
||||
This method initializes the root and builds attributes and elements
|
||||
with optional namespaces.
|
||||
"""
|
||||
from lxml.etree import (
|
||||
Element,
|
||||
SubElement,
|
||||
tostring,
|
||||
)
|
||||
|
||||
self.root = Element(f"{self.prefix_uri}{self.root_name}", nsmap=self.namespaces)
|
||||
|
||||
for d in self.frame_dicts.values():
|
||||
elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
|
||||
|
||||
if not self.attr_cols and not self.elem_cols:
|
||||
self.elem_cols = list(d.keys())
|
||||
self._build_elems(d, elem_row)
|
||||
|
||||
else:
|
||||
elem_row = self._build_attribs(d, elem_row)
|
||||
self._build_elems(d, elem_row)
|
||||
|
||||
self.out_xml = tostring(
|
||||
self.root,
|
||||
pretty_print=self.pretty_print,
|
||||
method="xml",
|
||||
encoding=self.encoding,
|
||||
xml_declaration=self.xml_declaration,
|
||||
)
|
||||
|
||||
if self.stylesheet is not None:
|
||||
self.out_xml = self._transform_doc()
|
||||
|
||||
return self.out_xml
|
||||
|
||||
def _convert_empty_str_key(self) -> None:
|
||||
"""
|
||||
Replace zero-length string in `namespaces`.
|
||||
|
||||
This method will replace '' with None to align to `lxml`
|
||||
requirement that empty string prefixes are not allowed.
|
||||
"""
|
||||
|
||||
if self.namespaces and "" in self.namespaces.keys():
|
||||
self.namespaces[None] = self.namespaces.pop("", "default")
|
||||
|
||||
def _get_prefix_uri(self) -> str:
|
||||
uri = ""
|
||||
if self.namespaces:
|
||||
if self.prefix:
|
||||
try:
|
||||
uri = f"{{{self.namespaces[self.prefix]}}}"
|
||||
except KeyError:
|
||||
raise KeyError(f"{self.prefix} is not included in namespaces")
|
||||
elif "" in self.namespaces:
|
||||
uri = f'{{{self.namespaces[""]}}}'
|
||||
else:
|
||||
uri = ""
|
||||
|
||||
return uri
|
||||
|
||||
@cache_readonly
|
||||
def _sub_element_cls(self):
|
||||
from lxml.etree import SubElement
|
||||
|
||||
return SubElement
|
||||
|
||||
def _transform_doc(self) -> bytes:
|
||||
"""
|
||||
Parse stylesheet from file or buffer and run it.
|
||||
|
||||
This method will parse stylesheet object into tree for parsing
|
||||
conditionally by its specific object type, then transforms
|
||||
original tree with XSLT script.
|
||||
"""
|
||||
from lxml.etree import (
|
||||
XSLT,
|
||||
XMLParser,
|
||||
fromstring,
|
||||
parse,
|
||||
)
|
||||
|
||||
style_doc = self.stylesheet
|
||||
assert style_doc is not None # is ensured by caller
|
||||
|
||||
handle_data = get_data_from_filepath(
|
||||
filepath_or_buffer=style_doc,
|
||||
encoding=self.encoding,
|
||||
compression=self.compression,
|
||||
storage_options=self.storage_options,
|
||||
)
|
||||
|
||||
with preprocess_data(handle_data) as xml_data:
|
||||
curr_parser = XMLParser(encoding=self.encoding)
|
||||
|
||||
if isinstance(xml_data, io.StringIO):
|
||||
xsl_doc = fromstring(
|
||||
xml_data.getvalue().encode(self.encoding), parser=curr_parser
|
||||
)
|
||||
else:
|
||||
xsl_doc = parse(xml_data, parser=curr_parser)
|
||||
|
||||
transformer = XSLT(xsl_doc)
|
||||
new_doc = transformer(self.root)
|
||||
|
||||
return bytes(new_doc)
|
||||
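To see the two formatter classes above exercised end to end, here is a hedged sketch through the public DataFrame.to_xml entry point (the frame and column names are invented); parser selects EtreeXMLFormatter or LxmlXMLFormatter as documented above:

import pandas as pd

df = pd.DataFrame({"shape": ["square", "circle"], "sides": [4.0, None]})

# parser="etree" uses the standard-library EtreeXMLFormatter; the default
# "lxml" uses LxmlXMLFormatter and additionally supports the stylesheet option.
print(
    df.to_xml(
        parser="etree",
        root_name="shapes",
        row_name="shape_row",
        attr_cols=["shape"],
        elem_cols=["sides"],
        na_rep="NA",
    )
)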
255
lib/python3.11/site-packages/pandas/io/gbq.py
Normal file
@ -0,0 +1,255 @@
|
||||
""" Google BigQuery support """
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
)
|
||||
import warnings
|
||||
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from google.auth.credentials import Credentials
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
def _try_import():
|
||||
# since pandas is a dependency of pandas-gbq
|
||||
# we need to import on first use
|
||||
msg = (
|
||||
"pandas-gbq is required to load data from Google BigQuery. "
|
||||
"See the docs: https://pandas-gbq.readthedocs.io."
|
||||
)
|
||||
pandas_gbq = import_optional_dependency("pandas_gbq", extra=msg)
|
||||
return pandas_gbq
|
||||
|
||||
|
||||
def read_gbq(
|
||||
query: str,
|
||||
project_id: str | None = None,
|
||||
index_col: str | None = None,
|
||||
col_order: list[str] | None = None,
|
||||
reauth: bool = False,
|
||||
auth_local_webserver: bool = True,
|
||||
dialect: str | None = None,
|
||||
location: str | None = None,
|
||||
configuration: dict[str, Any] | None = None,
|
||||
credentials: Credentials | None = None,
|
||||
use_bqstorage_api: bool | None = None,
|
||||
max_results: int | None = None,
|
||||
progress_bar_type: str | None = None,
|
||||
) -> DataFrame:
|
||||
"""
|
||||
Load data from Google BigQuery.
|
||||
|
||||
.. deprecated:: 2.2.0
|
||||
|
||||
Please use ``pandas_gbq.read_gbq`` instead.
|
||||
|
||||
This function requires the `pandas-gbq package
|
||||
<https://pandas-gbq.readthedocs.io>`__.
|
||||
|
||||
See the `How to authenticate with Google BigQuery
|
||||
<https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
|
||||
guide for authentication instructions.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
query : str
|
||||
SQL-Like Query to return data values.
|
||||
project_id : str, optional
|
||||
Google BigQuery Account project ID. Optional when available from
|
||||
the environment.
|
||||
index_col : str, optional
|
||||
Name of result column to use for index in results DataFrame.
|
||||
col_order : list(str), optional
|
||||
List of BigQuery column names in the desired order for results
|
||||
DataFrame.
|
||||
reauth : bool, default False
|
||||
Force Google BigQuery to re-authenticate the user. This is useful
|
||||
if multiple accounts are used.
|
||||
auth_local_webserver : bool, default True
|
||||
Use the `local webserver flow`_ instead of the `console flow`_
|
||||
when getting user credentials.
|
||||
|
||||
.. _local webserver flow:
|
||||
https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
|
||||
.. _console flow:
|
||||
https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
|
||||
|
||||
*New in version 0.2.0 of pandas-gbq*.
|
||||
|
||||
.. versionchanged:: 1.5.0
|
||||
Default value is changed to ``True``. Google has deprecated the
|
||||
``auth_local_webserver = False`` `"out of band" (copy-paste)
|
||||
flow
|
||||
<https://developers.googleblog.com/2022/02/making-oauth-flows-safer.html?m=1#disallowed-oob>`_.
|
||||
dialect : str, default 'legacy'
|
||||
Note: The default value is changing to 'standard' in a future version.
|
||||
|
||||
SQL syntax dialect to use. Value can be one of:
|
||||
|
||||
``'legacy'``
|
||||
Use BigQuery's legacy SQL dialect. For more information see
|
||||
`BigQuery Legacy SQL Reference
|
||||
<https://cloud.google.com/bigquery/docs/reference/legacy-sql>`__.
|
||||
``'standard'``
|
||||
Use BigQuery's standard SQL, which is
|
||||
compliant with the SQL 2011 standard. For more information
|
||||
see `BigQuery Standard SQL Reference
|
||||
<https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__.
|
||||
location : str, optional
|
||||
Location where the query job should run. See the `BigQuery locations
|
||||
documentation
|
||||
<https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
|
||||
list of available locations. The location must match that of any
|
||||
datasets used in the query.
|
||||
|
||||
*New in version 0.5.0 of pandas-gbq*.
|
||||
configuration : dict, optional
|
||||
Query config parameters for job processing.
|
||||
For example:
|
||||
|
||||
configuration = {'query': {'useQueryCache': False}}
|
||||
|
||||
For more information see `BigQuery REST API Reference
|
||||
<https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__.
|
||||
credentials : google.auth.credentials.Credentials, optional
|
||||
Credentials for accessing Google APIs. Use this parameter to override
|
||||
default credentials, such as to use Compute Engine
|
||||
:class:`google.auth.compute_engine.Credentials` or Service Account
|
||||
:class:`google.oauth2.service_account.Credentials` directly.
|
||||
|
||||
*New in version 0.8.0 of pandas-gbq*.
|
||||
use_bqstorage_api : bool, default False
|
||||
Use the `BigQuery Storage API
|
||||
<https://cloud.google.com/bigquery/docs/reference/storage/>`__ to
|
||||
download query results quickly, but at an increased cost. To use this
|
||||
API, first `enable it in the Cloud Console
|
||||
<https://console.cloud.google.com/apis/library/bigquerystorage.googleapis.com>`__.
|
||||
You must also have the `bigquery.readsessions.create
|
||||
<https://cloud.google.com/bigquery/docs/access-control#roles>`__
|
||||
permission on the project you are billing queries to.
|
||||
|
||||
This feature requires version 0.10.0 or later of the ``pandas-gbq``
|
||||
package. It also requires the ``google-cloud-bigquery-storage`` and
|
||||
``fastavro`` packages.
|
||||
|
||||
max_results : int, optional
|
||||
If set, limit the maximum number of rows to fetch from the query
|
||||
results.
|
||||
|
||||
progress_bar_type : str, optional
|
||||
If set, use the `tqdm <https://tqdm.github.io/>`__ library to
|
||||
display a progress bar while the data downloads. Install the
|
||||
``tqdm`` package to use this feature.
|
||||
|
||||
Possible values of ``progress_bar_type`` include:
|
||||
|
||||
``None``
|
||||
No progress bar.
|
||||
``'tqdm'``
|
||||
Use the :func:`tqdm.tqdm` function to print a progress bar
|
||||
to :data:`sys.stderr`.
|
||||
``'tqdm_notebook'``
|
||||
Use the :func:`tqdm.tqdm_notebook` function to display a
|
||||
progress bar as a Jupyter notebook widget.
|
||||
``'tqdm_gui'``
|
||||
Use the :func:`tqdm.tqdm_gui` function to display a
|
||||
progress bar as a graphical dialog box.
|
||||
|
||||
Returns
|
||||
-------
|
||||
df: DataFrame
|
||||
DataFrame representing results of query.
|
||||
|
||||
See Also
|
||||
--------
|
||||
pandas_gbq.read_gbq : This function in the pandas-gbq library.
|
||||
DataFrame.to_gbq : Write a DataFrame to Google BigQuery.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Example taken from `Google BigQuery documentation
|
||||
<https://cloud.google.com/bigquery/docs/pandas-gbq-migration>`_
|
||||
|
||||
>>> sql = "SELECT name FROM table_name WHERE state = 'TX' LIMIT 100;"
|
||||
>>> df = pd.read_gbq(sql, dialect="standard") # doctest: +SKIP
|
||||
>>> project_id = "your-project-id" # doctest: +SKIP
|
||||
>>> df = pd.read_gbq(sql,
|
||||
... project_id=project_id,
|
||||
... dialect="standard"
|
||||
... ) # doctest: +SKIP
|
||||
"""
|
||||
warnings.warn(
|
||||
"read_gbq is deprecated and will be removed in a future version. "
|
||||
"Please use pandas_gbq.read_gbq instead: "
|
||||
"https://pandas-gbq.readthedocs.io/en/latest/api.html#pandas_gbq.read_gbq",
|
||||
FutureWarning,
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
pandas_gbq = _try_import()
|
||||
|
||||
kwargs: dict[str, str | bool | int | None] = {}
|
||||
|
||||
# START: new kwargs. Don't populate unless explicitly set.
|
||||
if use_bqstorage_api is not None:
|
||||
kwargs["use_bqstorage_api"] = use_bqstorage_api
|
||||
if max_results is not None:
|
||||
kwargs["max_results"] = max_results
|
||||
|
||||
kwargs["progress_bar_type"] = progress_bar_type
|
||||
# END: new kwargs
|
||||
|
||||
return pandas_gbq.read_gbq(
|
||||
query,
|
||||
project_id=project_id,
|
||||
index_col=index_col,
|
||||
col_order=col_order,
|
||||
reauth=reauth,
|
||||
auth_local_webserver=auth_local_webserver,
|
||||
dialect=dialect,
|
||||
location=location,
|
||||
configuration=configuration,
|
||||
credentials=credentials,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
def to_gbq(
|
||||
dataframe: DataFrame,
|
||||
destination_table: str,
|
||||
project_id: str | None = None,
|
||||
chunksize: int | None = None,
|
||||
reauth: bool = False,
|
||||
if_exists: str = "fail",
|
||||
auth_local_webserver: bool = True,
|
||||
table_schema: list[dict[str, str]] | None = None,
|
||||
location: str | None = None,
|
||||
progress_bar: bool = True,
|
||||
credentials: Credentials | None = None,
|
||||
) -> None:
|
||||
warnings.warn(
|
||||
"to_gbq is deprecated and will be removed in a future version. "
|
||||
"Please use pandas_gbq.to_gbq instead: "
|
||||
"https://pandas-gbq.readthedocs.io/en/latest/api.html#pandas_gbq.to_gbq",
|
||||
FutureWarning,
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
pandas_gbq = _try_import()
|
||||
pandas_gbq.to_gbq(
|
||||
dataframe,
|
||||
destination_table,
|
||||
project_id=project_id,
|
||||
chunksize=chunksize,
|
||||
reauth=reauth,
|
||||
if_exists=if_exists,
|
||||
auth_local_webserver=auth_local_webserver,
|
||||
table_schema=table_schema,
|
||||
location=location,
|
||||
progress_bar=progress_bar,
|
||||
credentials=credentials,
|
||||
)
|
||||
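Both wrappers above are deprecated shims, so the intended migration is to call pandas-gbq directly. A hedged sketch (project id, dataset, and query are placeholders; assumes pandas-gbq is installed and authenticated):

import pandas_gbq

# Replacement for the deprecated pd.read_gbq wrapper above.
sql = "SELECT name FROM `bigquery-public-data.usa_names.usa_1910_2013` LIMIT 10"
df = pandas_gbq.read_gbq(sql, project_id="your-project-id", dialect="standard")

# Replacement for the deprecated pd.to_gbq wrapper.
pandas_gbq.to_gbq(
    df,
    destination_table="your_dataset.your_table",
    project_id="your-project-id",
    if_exists="replace",
)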
1259
lib/python3.11/site-packages/pandas/io/html.py
Normal file
File diff suppressed because it is too large
15
lib/python3.11/site-packages/pandas/io/json/__init__.py
Normal file
@ -0,0 +1,15 @@
|
||||
from pandas.io.json._json import (
|
||||
read_json,
|
||||
to_json,
|
||||
ujson_dumps,
|
||||
ujson_loads,
|
||||
)
|
||||
from pandas.io.json._table_schema import build_table_schema
|
||||
|
||||
__all__ = [
|
||||
"ujson_dumps",
|
||||
"ujson_loads",
|
||||
"read_json",
|
||||
"to_json",
|
||||
"build_table_schema",
|
||||
]
|
||||
1494
lib/python3.11/site-packages/pandas/io/json/_json.py
Normal file
File diff suppressed because it is too large
544
lib/python3.11/site-packages/pandas/io/json/_normalize.py
Normal file
@ -0,0 +1,544 @@
|
||||
# ---------------------------------------------------------------------
|
||||
# JSON normalization routines
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import (
|
||||
abc,
|
||||
defaultdict,
|
||||
)
|
||||
import copy
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
DefaultDict,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs.writers import convert_json_to_lines
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterable
|
||||
|
||||
from pandas._typing import (
|
||||
IgnoreRaise,
|
||||
Scalar,
|
||||
)
|
||||
|
||||
|
||||
def convert_to_line_delimits(s: str) -> str:
|
||||
"""
|
||||
Helper function that converts JSON lists to line delimited JSON.
|
||||
"""
|
||||
# Determine whether we have a JSON list to turn into lines; otherwise just
# return the JSON object as-is, since only lists can be converted
|
||||
if not s[0] == "[" and s[-1] == "]":
|
||||
return s
|
||||
s = s[1:-1]
|
||||
|
||||
return convert_json_to_lines(s)
|
||||
|
||||
|
||||
def nested_to_record(
|
||||
ds,
|
||||
prefix: str = "",
|
||||
sep: str = ".",
|
||||
level: int = 0,
|
||||
max_level: int | None = None,
|
||||
):
|
||||
"""
|
||||
A simplified json_normalize
|
||||
|
||||
Converts a nested dict into a flat dict ("record"); unlike json_normalize,
it does not attempt to extract a subset of the data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ds : dict or list of dicts
|
||||
prefix: the prefix, optional, default: ""
|
||||
sep : str, default '.'
|
||||
Nested records will generate names separated by sep,
|
||||
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
|
||||
level: int, optional, default: 0
|
||||
The number of levels in the json string.
|
||||
|
||||
max_level: int, optional, default: None
|
||||
The max depth to normalize.
|
||||
|
||||
Returns
|
||||
-------
|
||||
d - dict or list of dicts, matching `ds`
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> nested_to_record(
|
||||
... dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2))
|
||||
... )
|
||||
{\
|
||||
'flat1': 1, \
|
||||
'dict1.c': 1, \
|
||||
'dict1.d': 2, \
|
||||
'nested.e.c': 1, \
|
||||
'nested.e.d': 2, \
|
||||
'nested.d': 2\
|
||||
}
|
||||
"""
|
||||
singleton = False
|
||||
if isinstance(ds, dict):
|
||||
ds = [ds]
|
||||
singleton = True
|
||||
new_ds = []
|
||||
for d in ds:
|
||||
new_d = copy.deepcopy(d)
|
||||
for k, v in d.items():
|
||||
# each key gets renamed with prefix
|
||||
if not isinstance(k, str):
|
||||
k = str(k)
|
||||
if level == 0:
|
||||
newkey = k
|
||||
else:
|
||||
newkey = prefix + sep + k
|
||||
|
||||
# flatten if type is dict and
|
||||
# current dict level < maximum level provided and
|
||||
# only dicts gets recurse-flattened
|
||||
# only at level>1 do we rename the rest of the keys
|
||||
if not isinstance(v, dict) or (
|
||||
max_level is not None and level >= max_level
|
||||
):
|
||||
if level != 0: # so we skip copying for top level, common case
|
||||
v = new_d.pop(k)
|
||||
new_d[newkey] = v
|
||||
continue
|
||||
|
||||
v = new_d.pop(k)
|
||||
new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level))
|
||||
new_ds.append(new_d)
|
||||
|
||||
if singleton:
|
||||
return new_ds[0]
|
||||
return new_ds
|
||||
|
||||
|
||||
def _normalise_json(
|
||||
data: Any,
|
||||
key_string: str,
|
||||
normalized_dict: dict[str, Any],
|
||||
separator: str,
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Main recursive function
|
||||
Designed for the most basic use case of pd.json_normalize(data),
intended as a performance improvement; see #15621.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : Any
|
||||
Type dependent on types contained within nested Json
|
||||
key_string : str
|
||||
New key (with separator(s) in) for data
|
||||
normalized_dict : dict
|
||||
The new normalized/flattened Json dict
|
||||
separator : str, default '.'
|
||||
Nested records will generate names separated by sep,
|
||||
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
|
||||
"""
|
||||
if isinstance(data, dict):
|
||||
for key, value in data.items():
|
||||
new_key = f"{key_string}{separator}{key}"
|
||||
|
||||
if not key_string:
|
||||
new_key = new_key.removeprefix(separator)
|
||||
|
||||
_normalise_json(
|
||||
data=value,
|
||||
key_string=new_key,
|
||||
normalized_dict=normalized_dict,
|
||||
separator=separator,
|
||||
)
|
||||
else:
|
||||
normalized_dict[key_string] = data
|
||||
return normalized_dict
|
||||
|
||||
|
||||
def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]:
|
||||
"""
|
||||
Order the top level keys and then recursively go to depth
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : dict or list of dicts
|
||||
separator : str, default '.'
|
||||
Nested records will generate names separated by sep,
|
||||
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict or list of dicts, matching `normalised_json_object`
|
||||
"""
|
||||
top_dict_ = {k: v for k, v in data.items() if not isinstance(v, dict)}
|
||||
nested_dict_ = _normalise_json(
|
||||
data={k: v for k, v in data.items() if isinstance(v, dict)},
|
||||
key_string="",
|
||||
normalized_dict={},
|
||||
separator=separator,
|
||||
)
|
||||
return {**top_dict_, **nested_dict_}
|
||||
|
||||
|
||||
def _simple_json_normalize(
|
||||
ds: dict | list[dict],
|
||||
sep: str = ".",
|
||||
) -> dict | list[dict] | Any:
|
||||
"""
|
||||
An optimized basic json_normalize
|
||||
|
||||
Converts a nested dict into a flat dict ("record"); unlike
json_normalize and nested_to_record, it doesn't do anything clever.
|
||||
But for the most basic use cases it enhances performance.
|
||||
E.g. pd.json_normalize(data)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ds : dict or list of dicts
|
||||
sep : str, default '.'
|
||||
Nested records will generate names separated by sep,
|
||||
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
|
||||
|
||||
Returns
|
||||
-------
|
||||
frame : DataFrame
|
||||
d - dict or list of dicts, matching `normalised_json_object`
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> _simple_json_normalize(
|
||||
... {
|
||||
... "flat1": 1,
|
||||
... "dict1": {"c": 1, "d": 2},
|
||||
... "nested": {"e": {"c": 1, "d": 2}, "d": 2},
|
||||
... }
|
||||
... )
|
||||
{\
|
||||
'flat1': 1, \
|
||||
'dict1.c': 1, \
|
||||
'dict1.d': 2, \
|
||||
'nested.e.c': 1, \
|
||||
'nested.e.d': 2, \
|
||||
'nested.d': 2\
|
||||
}
|
||||
|
||||
"""
|
||||
normalised_json_object = {}
|
||||
# expect a dictionary, as most jsons are. However, lists are perfectly valid
|
||||
if isinstance(ds, dict):
|
||||
normalised_json_object = _normalise_json_ordered(data=ds, separator=sep)
|
||||
elif isinstance(ds, list):
|
||||
normalised_json_list = [_simple_json_normalize(row, sep=sep) for row in ds]
|
||||
return normalised_json_list
|
||||
return normalised_json_object
|
||||
|
||||
|
||||
def json_normalize(
|
||||
data: dict | list[dict],
|
||||
record_path: str | list | None = None,
|
||||
meta: str | list[str | list[str]] | None = None,
|
||||
meta_prefix: str | None = None,
|
||||
record_prefix: str | None = None,
|
||||
errors: IgnoreRaise = "raise",
|
||||
sep: str = ".",
|
||||
max_level: int | None = None,
|
||||
) -> DataFrame:
|
||||
"""
|
||||
Normalize semi-structured JSON data into a flat table.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : dict or list of dicts
|
||||
Unserialized JSON objects.
|
||||
record_path : str or list of str, default None
|
||||
Path in each object to list of records. If not passed, data will be
|
||||
assumed to be an array of records.
|
||||
meta : list of paths (str or list of str), default None
|
||||
Fields to use as metadata for each record in resulting table.
|
||||
meta_prefix : str, default None
|
||||
If not None, prefix records with dotted path, e.g. foo.bar.field if
meta is ['foo', 'bar'].
|
||||
record_prefix : str, default None
|
||||
If not None, prefix records with dotted path, e.g. foo.bar.field if
path to records is ['foo', 'bar'].
|
||||
errors : {'raise', 'ignore'}, default 'raise'
|
||||
Configures error handling.
|
||||
|
||||
* 'ignore' : will ignore KeyError if keys listed in meta are not
|
||||
always present.
|
||||
* 'raise' : will raise KeyError if keys listed in meta are not
|
||||
always present.
|
||||
sep : str, default '.'
|
||||
Nested records will generate names separated by sep.
|
||||
e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
|
||||
max_level : int, default None
|
||||
Max number of levels (depth of dict) to normalize.
If None, normalizes all levels.
|
||||
|
||||
Returns
|
||||
-------
|
||||
frame : DataFrame
|
||||
Normalize semi-structured JSON data into a flat table.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> data = [
|
||||
... {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
|
||||
... {"name": {"given": "Mark", "family": "Regner"}},
|
||||
... {"id": 2, "name": "Faye Raker"},
|
||||
... ]
|
||||
>>> pd.json_normalize(data)
|
||||
id name.first name.last name.given name.family name
|
||||
0 1.0 Coleen Volk NaN NaN NaN
|
||||
1 NaN NaN NaN Mark Regner NaN
|
||||
2 2.0 NaN NaN NaN NaN Faye Raker
|
||||
|
||||
>>> data = [
|
||||
... {
|
||||
... "id": 1,
|
||||
... "name": "Cole Volk",
|
||||
... "fitness": {"height": 130, "weight": 60},
|
||||
... },
|
||||
... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
|
||||
... {
|
||||
... "id": 2,
|
||||
... "name": "Faye Raker",
|
||||
... "fitness": {"height": 130, "weight": 60},
|
||||
... },
|
||||
... ]
|
||||
>>> pd.json_normalize(data, max_level=0)
|
||||
id name fitness
|
||||
0 1.0 Cole Volk {'height': 130, 'weight': 60}
|
||||
1 NaN Mark Reg {'height': 130, 'weight': 60}
|
||||
2 2.0 Faye Raker {'height': 130, 'weight': 60}
|
||||
|
||||
Normalizes nested data up to level 1.
|
||||
|
||||
>>> data = [
|
||||
... {
|
||||
... "id": 1,
|
||||
... "name": "Cole Volk",
|
||||
... "fitness": {"height": 130, "weight": 60},
|
||||
... },
|
||||
... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
|
||||
... {
|
||||
... "id": 2,
|
||||
... "name": "Faye Raker",
|
||||
... "fitness": {"height": 130, "weight": 60},
|
||||
... },
|
||||
... ]
|
||||
>>> pd.json_normalize(data, max_level=1)
|
||||
id name fitness.height fitness.weight
|
||||
0 1.0 Cole Volk 130 60
|
||||
1 NaN Mark Reg 130 60
|
||||
2 2.0 Faye Raker 130 60
|
||||
|
||||
>>> data = [
|
||||
... {
|
||||
... "state": "Florida",
|
||||
... "shortname": "FL",
|
||||
... "info": {"governor": "Rick Scott"},
|
||||
... "counties": [
|
||||
... {"name": "Dade", "population": 12345},
|
||||
... {"name": "Broward", "population": 40000},
|
||||
... {"name": "Palm Beach", "population": 60000},
|
||||
... ],
|
||||
... },
|
||||
... {
|
||||
... "state": "Ohio",
|
||||
... "shortname": "OH",
|
||||
... "info": {"governor": "John Kasich"},
|
||||
... "counties": [
|
||||
... {"name": "Summit", "population": 1234},
|
||||
... {"name": "Cuyahoga", "population": 1337},
|
||||
... ],
|
||||
... },
|
||||
... ]
|
||||
>>> result = pd.json_normalize(
|
||||
... data, "counties", ["state", "shortname", ["info", "governor"]]
|
||||
... )
|
||||
>>> result
|
||||
name population state shortname info.governor
|
||||
0 Dade 12345 Florida FL Rick Scott
|
||||
1 Broward 40000 Florida FL Rick Scott
|
||||
2 Palm Beach 60000 Florida FL Rick Scott
|
||||
3 Summit 1234 Ohio OH John Kasich
|
||||
4 Cuyahoga 1337 Ohio OH John Kasich
|
||||
|
||||
>>> data = {"A": [1, 2]}
|
||||
>>> pd.json_normalize(data, "A", record_prefix="Prefix.")
|
||||
Prefix.0
|
||||
0 1
|
||||
1 2
|
||||
|
||||
Returns normalized data with columns prefixed with the given string.
|
||||
"""
|
||||
|
||||
def _pull_field(
|
||||
js: dict[str, Any], spec: list | str, extract_record: bool = False
|
||||
) -> Scalar | Iterable:
|
||||
"""Internal function to pull field"""
|
||||
result = js
|
||||
try:
|
||||
if isinstance(spec, list):
|
||||
for field in spec:
|
||||
if result is None:
|
||||
raise KeyError(field)
|
||||
result = result[field]
|
||||
else:
|
||||
result = result[spec]
|
||||
except KeyError as e:
|
||||
if extract_record:
|
||||
raise KeyError(
|
||||
f"Key {e} not found. If specifying a record_path, all elements of "
|
||||
f"data should have the path."
|
||||
) from e
|
||||
if errors == "ignore":
|
||||
return np.nan
|
||||
else:
|
||||
raise KeyError(
|
||||
f"Key {e} not found. To replace missing values of {e} with "
|
||||
f"np.nan, pass in errors='ignore'"
|
||||
) from e
|
||||
|
||||
return result
|
||||
|
||||
def _pull_records(js: dict[str, Any], spec: list | str) -> list:
|
||||
"""
|
||||
Internal function to pull the field for records; similar to
_pull_field, but requires the result to be a list and raises a
TypeError if it is a non-null, non-list value.
|
||||
"""
|
||||
result = _pull_field(js, spec, extract_record=True)
|
||||
|
||||
# GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not
|
||||
# null, otherwise return an empty list
|
||||
if not isinstance(result, list):
|
||||
if pd.isnull(result):
|
||||
result = []
|
||||
else:
|
||||
raise TypeError(
|
||||
f"{js} has non list value {result} for path {spec}. "
|
||||
"Must be list or null."
|
||||
)
|
||||
return result
|
||||
|
||||
if isinstance(data, list) and not data:
|
||||
return DataFrame()
|
||||
elif isinstance(data, dict):
|
||||
# A bit of a hackjob
|
||||
data = [data]
|
||||
elif isinstance(data, abc.Iterable) and not isinstance(data, str):
|
||||
# GH35923 Fix pd.json_normalize to not skip the first element of a
|
||||
# generator input
|
||||
data = list(data)
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
# check to see if a simple recursive function is possible to
|
||||
# improve performance (see #15621) but only for cases such
|
||||
# as pd.json_normalize(data) or pd.json_normalize(data, sep=sep)
|
||||
if (
|
||||
record_path is None
|
||||
and meta is None
|
||||
and meta_prefix is None
|
||||
and record_prefix is None
|
||||
and max_level is None
|
||||
):
|
||||
return DataFrame(_simple_json_normalize(data, sep=sep))
|
||||
|
||||
if record_path is None:
|
||||
if any([isinstance(x, dict) for x in y.values()] for y in data):
|
||||
# naive normalization, this is idempotent for flat records
|
||||
# and potentially will inflate the data considerably for
|
||||
# deeply nested structures:
|
||||
# {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2}
|
||||
#
|
||||
# TODO: handle record value which are lists, at least error
|
||||
# reasonably
|
||||
data = nested_to_record(data, sep=sep, max_level=max_level)
|
||||
return DataFrame(data)
|
||||
elif not isinstance(record_path, list):
|
||||
record_path = [record_path]
|
||||
|
||||
if meta is None:
|
||||
meta = []
|
||||
elif not isinstance(meta, list):
|
||||
meta = [meta]
|
||||
|
||||
_meta = [m if isinstance(m, list) else [m] for m in meta]
|
||||
|
||||
# Disastrously inefficient for now
|
||||
records: list = []
|
||||
lengths = []
|
||||
|
||||
meta_vals: DefaultDict = defaultdict(list)
|
||||
meta_keys = [sep.join(val) for val in _meta]
|
||||
|
||||
def _recursive_extract(data, path, seen_meta, level: int = 0) -> None:
|
||||
if isinstance(data, dict):
|
||||
data = [data]
|
||||
if len(path) > 1:
|
||||
for obj in data:
|
||||
for val, key in zip(_meta, meta_keys):
|
||||
if level + 1 == len(val):
|
||||
seen_meta[key] = _pull_field(obj, val[-1])
|
||||
|
||||
_recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
|
||||
else:
|
||||
for obj in data:
|
||||
recs = _pull_records(obj, path[0])
|
||||
recs = [
|
||||
nested_to_record(r, sep=sep, max_level=max_level)
|
||||
if isinstance(r, dict)
|
||||
else r
|
||||
for r in recs
|
||||
]
|
||||
|
||||
# For repeating the metadata later
|
||||
lengths.append(len(recs))
|
||||
for val, key in zip(_meta, meta_keys):
|
||||
if level + 1 > len(val):
|
||||
meta_val = seen_meta[key]
|
||||
else:
|
||||
meta_val = _pull_field(obj, val[level:])
|
||||
meta_vals[key].append(meta_val)
|
||||
records.extend(recs)
|
||||
|
||||
_recursive_extract(data, record_path, {}, level=0)
|
||||
|
||||
result = DataFrame(records)
|
||||
|
||||
if record_prefix is not None:
|
||||
result = result.rename(columns=lambda x: f"{record_prefix}{x}")
|
||||
|
||||
# Data types, a problem
|
||||
for k, v in meta_vals.items():
|
||||
if meta_prefix is not None:
|
||||
k = meta_prefix + k
|
||||
|
||||
if k in result:
|
||||
raise ValueError(
|
||||
f"Conflicting metadata name {k}, need distinguishing prefix "
|
||||
)
|
||||
# GH 37782
|
||||
|
||||
values = np.array(v, dtype=object)
|
||||
|
||||
if values.ndim > 1:
|
||||
# GH 37782
|
||||
values = np.empty((len(v),), dtype=object)
|
||||
for i, v in enumerate(v):
|
||||
values[i] = v
|
||||
|
||||
result[k] = values.repeat(lengths)
|
||||
return result
|
||||
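One case the examples above do not show is errors="ignore", which fills missing meta fields with NaN instead of raising. A short hedged illustration (the data is invented):

import pandas as pd

data = [
    {"id": 1, "info": {"owner": "A"}, "items": [{"v": 10}, {"v": 20}]},
    {"id": 2, "items": [{"v": 30}]},  # "info.owner" is missing here
]

# With errors="ignore", the missing ["info", "owner"] meta value becomes NaN
# rather than raising a KeyError.
out = pd.json_normalize(
    data, record_path="items", meta=["id", ["info", "owner"]], errors="ignore"
)
print(out)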
387
lib/python3.11/site-packages/pandas/io/json/_table_schema.py
Normal file
@ -0,0 +1,387 @@
|
||||
"""
|
||||
Table Schema builders
|
||||
|
||||
https://specs.frictionlessdata.io/table-schema/
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
cast,
|
||||
)
|
||||
import warnings
|
||||
|
||||
from pandas._libs import lib
|
||||
from pandas._libs.json import ujson_loads
|
||||
from pandas._libs.tslibs import timezones
|
||||
from pandas._libs.tslibs.dtypes import freq_to_period_freqstr
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
from pandas.core.dtypes.base import _registry as registry
|
||||
from pandas.core.dtypes.common import (
|
||||
is_bool_dtype,
|
||||
is_integer_dtype,
|
||||
is_numeric_dtype,
|
||||
is_string_dtype,
|
||||
)
|
||||
from pandas.core.dtypes.dtypes import (
|
||||
CategoricalDtype,
|
||||
DatetimeTZDtype,
|
||||
ExtensionDtype,
|
||||
PeriodDtype,
|
||||
)
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas.core.common as com
|
||||
|
||||
from pandas.tseries.frequencies import to_offset
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
DtypeObj,
|
||||
JSONSerializable,
|
||||
)
|
||||
|
||||
from pandas import Series
|
||||
from pandas.core.indexes.multi import MultiIndex
|
||||
|
||||
|
||||
TABLE_SCHEMA_VERSION = "1.4.0"
|
||||
|
||||
|
||||
def as_json_table_type(x: DtypeObj) -> str:
|
||||
"""
|
||||
Convert a NumPy / pandas type to its corresponding json_table.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : np.dtype or ExtensionDtype
|
||||
|
||||
Returns
|
||||
-------
|
||||
str
|
||||
the Table Schema data types
|
||||
|
||||
Notes
|
||||
-----
|
||||
This table shows the relationship between NumPy / pandas dtypes,
|
||||
and Table Schema dtypes.
|
||||
|
||||
=============== =================
Pandas type     Table Schema type
=============== =================
int64           integer
float64         number
bool            boolean
datetime64[ns]  datetime
timedelta64[ns] duration
object          str
categorical     any
=============== =================
|
||||
"""
|
||||
if is_integer_dtype(x):
|
||||
return "integer"
|
||||
elif is_bool_dtype(x):
|
||||
return "boolean"
|
||||
elif is_numeric_dtype(x):
|
||||
return "number"
|
||||
elif lib.is_np_dtype(x, "M") or isinstance(x, (DatetimeTZDtype, PeriodDtype)):
|
||||
return "datetime"
|
||||
elif lib.is_np_dtype(x, "m"):
|
||||
return "duration"
|
||||
elif is_string_dtype(x):
|
||||
return "string"
|
||||
else:
|
||||
return "any"
|
||||
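A hedged illustration of the mapping implemented above. as_json_table_type is a private helper, so importing it from pandas.io.json._table_schema is an implementation detail rather than public API:

import numpy as np
import pandas as pd
from pandas.io.json._table_schema import as_json_table_type

print(as_json_table_type(np.dtype("int64")))            # integer
print(as_json_table_type(np.dtype("float64")))          # number
print(as_json_table_type(np.dtype("datetime64[ns]")))   # datetime
print(as_json_table_type(pd.CategoricalDtype(["a"])))   # any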
|
||||
|
||||
def set_default_names(data):
|
||||
"""Sets index names to 'index' for regular, or 'level_x' for Multi"""
|
||||
if com.all_not_none(*data.index.names):
|
||||
nms = data.index.names
|
||||
if len(nms) == 1 and data.index.name == "index":
|
||||
warnings.warn(
|
||||
"Index name of 'index' is not round-trippable.",
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
elif len(nms) > 1 and any(x.startswith("level_") for x in nms):
|
||||
warnings.warn(
|
||||
"Index names beginning with 'level_' are not round-trippable.",
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
return data
|
||||
|
||||
data = data.copy()
|
||||
if data.index.nlevels > 1:
|
||||
data.index.names = com.fill_missing_names(data.index.names)
|
||||
else:
|
||||
data.index.name = data.index.name or "index"
|
||||
return data
|
||||
|
||||
|
||||
def convert_pandas_type_to_json_field(arr) -> dict[str, JSONSerializable]:
|
||||
dtype = arr.dtype
|
||||
name: JSONSerializable
|
||||
if arr.name is None:
|
||||
name = "values"
|
||||
else:
|
||||
name = arr.name
|
||||
field: dict[str, JSONSerializable] = {
|
||||
"name": name,
|
||||
"type": as_json_table_type(dtype),
|
||||
}
|
||||
|
||||
if isinstance(dtype, CategoricalDtype):
|
||||
cats = dtype.categories
|
||||
ordered = dtype.ordered
|
||||
|
||||
field["constraints"] = {"enum": list(cats)}
|
||||
field["ordered"] = ordered
|
||||
elif isinstance(dtype, PeriodDtype):
|
||||
field["freq"] = dtype.freq.freqstr
|
||||
elif isinstance(dtype, DatetimeTZDtype):
|
||||
if timezones.is_utc(dtype.tz):
|
||||
# timezone.utc has no "zone" attr
|
||||
field["tz"] = "UTC"
|
||||
else:
|
||||
# error: "tzinfo" has no attribute "zone"
|
||||
field["tz"] = dtype.tz.zone # type: ignore[attr-defined]
|
||||
elif isinstance(dtype, ExtensionDtype):
|
||||
field["extDtype"] = dtype.name
|
||||
return field
|
||||
|
||||
|
||||
def convert_json_field_to_pandas_type(field) -> str | CategoricalDtype:
|
||||
"""
|
||||
Converts a JSON field descriptor into its corresponding NumPy / pandas type
|
||||
|
||||
Parameters
|
||||
----------
|
||||
field
|
||||
A JSON field descriptor
|
||||
|
||||
Returns
|
||||
-------
|
||||
dtype
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If the type of the provided field is unknown or currently unsupported
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> convert_json_field_to_pandas_type({"name": "an_int", "type": "integer"})
|
||||
'int64'
|
||||
|
||||
>>> convert_json_field_to_pandas_type(
|
||||
... {
|
||||
... "name": "a_categorical",
|
||||
... "type": "any",
|
||||
... "constraints": {"enum": ["a", "b", "c"]},
|
||||
... "ordered": True,
|
||||
... }
|
||||
... )
|
||||
CategoricalDtype(categories=['a', 'b', 'c'], ordered=True, categories_dtype=object)
|
||||
|
||||
>>> convert_json_field_to_pandas_type({"name": "a_datetime", "type": "datetime"})
|
||||
'datetime64[ns]'
|
||||
|
||||
>>> convert_json_field_to_pandas_type(
|
||||
... {"name": "a_datetime_with_tz", "type": "datetime", "tz": "US/Central"}
|
||||
... )
|
||||
'datetime64[ns, US/Central]'
|
||||
"""
|
||||
typ = field["type"]
|
||||
if typ == "string":
|
||||
return field.get("extDtype", None)
|
||||
elif typ == "integer":
|
||||
return field.get("extDtype", "int64")
|
||||
elif typ == "number":
|
||||
return field.get("extDtype", "float64")
|
||||
elif typ == "boolean":
|
||||
return field.get("extDtype", "bool")
|
||||
elif typ == "duration":
|
||||
return "timedelta64"
|
||||
elif typ == "datetime":
|
||||
if field.get("tz"):
|
||||
return f"datetime64[ns, {field['tz']}]"
|
||||
elif field.get("freq"):
|
||||
# GH#9586 rename frequency M to ME for offsets
|
||||
offset = to_offset(field["freq"])
|
||||
freq_n, freq_name = offset.n, offset.name
|
||||
freq = freq_to_period_freqstr(freq_n, freq_name)
|
||||
# GH#47747 using datetime over period to minimize the change surface
|
||||
return f"period[{freq}]"
|
||||
else:
|
||||
return "datetime64[ns]"
|
||||
elif typ == "any":
|
||||
if "constraints" in field and "ordered" in field:
|
||||
return CategoricalDtype(
|
||||
categories=field["constraints"]["enum"], ordered=field["ordered"]
|
||||
)
|
||||
elif "extDtype" in field:
|
||||
return registry.find(field["extDtype"])
|
||||
else:
|
||||
return "object"
|
||||
|
||||
raise ValueError(f"Unsupported or invalid field type: {typ}")
|
||||
|
||||
|
||||
def build_table_schema(
|
||||
data: DataFrame | Series,
|
||||
index: bool = True,
|
||||
primary_key: bool | None = None,
|
||||
version: bool = True,
|
||||
) -> dict[str, JSONSerializable]:
|
||||
"""
|
||||
Create a Table schema from ``data``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : Series, DataFrame
|
||||
index : bool, default True
|
||||
Whether to include ``data.index`` in the schema.
|
||||
primary_key : bool or None, default None
|
||||
Column names to designate as the primary key.
|
||||
The default `None` will set `'primaryKey'` to the index
|
||||
level or levels if the index is unique.
|
||||
version : bool, default True
|
||||
Whether to include a field `pandas_version` with the version
|
||||
of pandas that last revised the table schema. This version
|
||||
can be different from the installed pandas version.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict
|
||||
|
||||
Notes
|
||||
-----
|
||||
See `Table Schema
|
||||
<https://pandas.pydata.org/docs/user_guide/io.html#table-schema>`__ for
|
||||
conversion types.
|
||||
Timedeltas are converted to ISO 8601 duration format with
|
||||
9 decimal places after the seconds field for nanosecond precision.
|
||||
|
||||
Categoricals are converted to the `any` dtype, and use the `enum` field
|
||||
constraint to list the allowed values. The `ordered` attribute is included
|
||||
in an `ordered` field.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from pandas.io.json._table_schema import build_table_schema
|
||||
>>> df = pd.DataFrame(
|
||||
... {'A': [1, 2, 3],
|
||||
... 'B': ['a', 'b', 'c'],
|
||||
... 'C': pd.date_range('2016-01-01', freq='d', periods=3),
|
||||
... }, index=pd.Index(range(3), name='idx'))
|
||||
>>> build_table_schema(df)
|
||||
{'fields': \
|
||||
[{'name': 'idx', 'type': 'integer'}, \
|
||||
{'name': 'A', 'type': 'integer'}, \
|
||||
{'name': 'B', 'type': 'string'}, \
|
||||
{'name': 'C', 'type': 'datetime'}], \
|
||||
'primaryKey': ['idx'], \
|
||||
'pandas_version': '1.4.0'}
|
||||
"""
|
||||
if index is True:
|
||||
data = set_default_names(data)
|
||||
|
||||
schema: dict[str, Any] = {}
|
||||
fields = []
|
||||
|
||||
if index:
|
||||
if data.index.nlevels > 1:
|
||||
data.index = cast("MultiIndex", data.index)
|
||||
for level, name in zip(data.index.levels, data.index.names):
|
||||
new_field = convert_pandas_type_to_json_field(level)
|
||||
new_field["name"] = name
|
||||
fields.append(new_field)
|
||||
else:
|
||||
fields.append(convert_pandas_type_to_json_field(data.index))
|
||||
|
||||
if data.ndim > 1:
|
||||
for column, s in data.items():
|
||||
fields.append(convert_pandas_type_to_json_field(s))
|
||||
else:
|
||||
fields.append(convert_pandas_type_to_json_field(data))
|
||||
|
||||
schema["fields"] = fields
|
||||
if index and data.index.is_unique and primary_key is None:
|
||||
if data.index.nlevels == 1:
|
||||
schema["primaryKey"] = [data.index.name]
|
||||
else:
|
||||
schema["primaryKey"] = data.index.names
|
||||
elif primary_key is not None:
|
||||
schema["primaryKey"] = primary_key
|
||||
|
||||
if version:
|
||||
schema["pandas_version"] = TABLE_SCHEMA_VERSION
|
||||
return schema
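Illustrative usage, not part of the pandas source: a minimal sketch assuming only pandas itself, showing how the ``primaryKey`` inference above reacts to unique versus non-unique indexes when ``primary_key`` is left as ``None``.

# Illustration only -- not pandas source; exercises build_table_schema directly.
import pandas as pd
from pandas.io.json._table_schema import build_table_schema

unique = pd.DataFrame({"A": [1, 2]}, index=pd.Index([10, 20], name="idx"))
dupes = pd.DataFrame({"A": [1, 2]}, index=pd.Index([10, 10], name="idx"))

print(build_table_schema(unique)["primaryKey"])    # ['idx']
print("primaryKey" in build_table_schema(dupes))   # False: index is not unique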
|
||||
|
||||
|
||||
def parse_table_schema(json, precise_float: bool) -> DataFrame:
|
||||
"""
|
||||
Builds a DataFrame from a given schema
|
||||
|
||||
Parameters
|
||||
----------
|
||||
json :
|
||||
A JSON table schema
|
||||
precise_float : bool
|
||||
Flag controlling precision when decoding string to double values, as
|
||||
dictated by ``read_json``
|
||||
|
||||
Returns
|
||||
-------
|
||||
df : DataFrame
|
||||
|
||||
Raises
|
||||
------
|
||||
NotImplementedError
|
||||
If the JSON table schema contains either timezone or timedelta data
|
||||
|
||||
Notes
|
||||
-----
|
||||
Because :func:`DataFrame.to_json` uses the string 'index' to denote a
|
||||
name-less :class:`Index`, this function sets the name of the returned
|
||||
:class:`DataFrame` to ``None`` when said string is encountered with a
|
||||
normal :class:`Index`. For a :class:`MultiIndex`, the same limitation
|
||||
applies to any strings beginning with 'level_'. Therefore, an
|
||||
:class:`Index` name of 'index' and :class:`MultiIndex` names starting
|
||||
with 'level_' are not supported.
|
||||
|
||||
See Also
|
||||
--------
|
||||
build_table_schema : Inverse function.
|
||||
pandas.read_json
|
||||
"""
|
||||
table = ujson_loads(json, precise_float=precise_float)
|
||||
col_order = [field["name"] for field in table["schema"]["fields"]]
|
||||
df = DataFrame(table["data"], columns=col_order)[col_order]
|
||||
|
||||
dtypes = {
|
||||
field["name"]: convert_json_field_to_pandas_type(field)
|
||||
for field in table["schema"]["fields"]
|
||||
}
|
||||
|
||||
# No ISO constructor for Timedelta as of yet, so need to raise
|
||||
if "timedelta64" in dtypes.values():
|
||||
raise NotImplementedError(
|
||||
'table="orient" can not yet read ISO-formatted Timedelta data'
|
||||
)
|
||||
|
||||
df = df.astype(dtypes)
|
||||
|
||||
if "primaryKey" in table["schema"]:
|
||||
df = df.set_index(table["schema"]["primaryKey"])
|
||||
if len(df.index.names) == 1:
|
||||
if df.index.name == "index":
|
||||
df.index.name = None
|
||||
else:
|
||||
df.index.names = [
|
||||
None if x.startswith("level_") else x for x in df.index.names
|
||||
]
|
||||
|
||||
return df
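Illustrative round trip, not part of the pandas source: a sketch using the public ``to_json``/``read_json`` entry points, which is the path that reaches ``parse_table_schema`` when ``orient="table"``.

# Illustration only; parse_table_schema is reached via read_json(orient="table").
from io import StringIO
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]}, index=pd.Index([0, 1], name="idx"))
payload = df.to_json(orient="table")
roundtripped = pd.read_json(StringIO(payload), orient="table")
assert roundtripped.equals(df)  # values, dtypes and the named index survive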
|
||||
228
lib/python3.11/site-packages/pandas/io/orc.py
Normal file
@ -0,0 +1,228 @@
|
||||
""" orc compat """
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
from types import ModuleType
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Literal,
|
||||
)
|
||||
|
||||
from pandas._libs import lib
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.util._validators import check_dtype_backend
|
||||
|
||||
from pandas.core.indexes.api import default_index
|
||||
|
||||
from pandas.io._util import arrow_table_to_pandas
|
||||
from pandas.io.common import (
|
||||
get_handle,
|
||||
is_fsspec_url,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import fsspec
|
||||
import pyarrow.fs
|
||||
|
||||
from pandas._typing import (
|
||||
DtypeBackend,
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
WriteBuffer,
|
||||
)
|
||||
|
||||
from pandas.core.frame import DataFrame
|
||||
|
||||
|
||||
def read_orc(
|
||||
path: FilePath | ReadBuffer[bytes],
|
||||
columns: list[str] | None = None,
|
||||
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
|
||||
filesystem: pyarrow.fs.FileSystem | fsspec.spec.AbstractFileSystem | None = None,
|
||||
**kwargs: Any,
|
||||
) -> DataFrame:
|
||||
"""
|
||||
Load an ORC object from the file path, returning a DataFrame.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str, path object, or file-like object
|
||||
String, path object (implementing ``os.PathLike[str]``), or file-like
|
||||
object implementing a binary ``read()`` function. The string could be a URL.
|
||||
Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
|
||||
expected. A local file could be:
|
||||
``file://localhost/path/to/table.orc``.
|
||||
columns : list, default None
|
||||
If not None, only these columns will be read from the file.
|
||||
Output always follows the ordering of the file and not the columns list.
|
||||
This mirrors the original behaviour of
|
||||
:external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`.
|
||||
dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
|
||||
Back-end data type applied to the resultant :class:`DataFrame`
|
||||
(still experimental). Behaviour is as follows:
|
||||
|
||||
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
|
||||
(default).
|
||||
* ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
|
||||
DataFrame.
|
||||
|
||||
.. versionadded:: 2.0
|
||||
|
||||
filesystem : fsspec or pyarrow filesystem, default None
|
||||
Filesystem object to use when reading the ORC file.
|
||||
|
||||
.. versionadded:: 2.1.0
|
||||
|
||||
**kwargs
|
||||
Any additional kwargs are passed to pyarrow.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
|
||||
Notes
|
||||
-----
|
||||
Before using this function you should read the :ref:`user guide about ORC <io.orc>`
|
||||
and :ref:`install optional dependencies <install.warn_orc>`.
|
||||
|
||||
If ``path`` is a URI scheme pointing to a local or remote file (e.g. "s3://"),
pandas will attempt to use a ``pyarrow.fs`` filesystem to read the file. You can
also pass a pyarrow or fsspec filesystem object into the ``filesystem`` keyword to
override this behavior.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> result = pd.read_orc("example_pa.orc") # doctest: +SKIP
|
||||
"""
|
||||
# we require a newer version of pyarrow than we support for parquet
|
||||
|
||||
orc = import_optional_dependency("pyarrow.orc")
|
||||
|
||||
check_dtype_backend(dtype_backend)
|
||||
|
||||
with get_handle(path, "rb", is_text=False) as handles:
|
||||
source = handles.handle
|
||||
if is_fsspec_url(path) and filesystem is None:
|
||||
pa = import_optional_dependency("pyarrow")
|
||||
pa_fs = import_optional_dependency("pyarrow.fs")
|
||||
try:
|
||||
filesystem, source = pa_fs.FileSystem.from_uri(path)
|
||||
except (TypeError, pa.ArrowInvalid):
|
||||
pass
|
||||
|
||||
pa_table = orc.read_table(
|
||||
source=source, columns=columns, filesystem=filesystem, **kwargs
|
||||
)
|
||||
return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)
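Illustrative usage, not part of the pandas source: a minimal sketch assuming pyarrow with ORC support is installed; the file name is a hypothetical local path.

# Illustration only; "example.orc" is a hypothetical local path.
import pandas as pd

df = pd.DataFrame({"a": [1.5, None, 3.0], "b": ["x", "y", "z"]})
df.to_orc("example.orc")
back = pd.read_orc("example.orc", dtype_backend="numpy_nullable")
print(back.dtypes)  # nullable extension dtypes (e.g. Float64) instead of plain numpy dtypes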
|
||||
|
||||
|
||||
def to_orc(
|
||||
df: DataFrame,
|
||||
path: FilePath | WriteBuffer[bytes] | None = None,
|
||||
*,
|
||||
engine: Literal["pyarrow"] = "pyarrow",
|
||||
index: bool | None = None,
|
||||
engine_kwargs: dict[str, Any] | None = None,
|
||||
) -> bytes | None:
|
||||
"""
|
||||
Write a DataFrame to the ORC format.
|
||||
|
||||
.. versionadded:: 1.5.0
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : DataFrame
|
||||
The dataframe to be written to ORC. Raises NotImplementedError
if the dtype of one or more columns is category, unsigned integer,
interval, period or sparse.
|
||||
path : str, file-like object or None, default None
|
||||
If a string, it will be used as Root Directory path
|
||||
when writing a partitioned dataset. By file-like object,
|
||||
we refer to objects with a write() method, such as a file handle
|
||||
(e.g. via builtin open function). If path is None,
|
||||
a bytes object is returned.
|
||||
engine : str, default 'pyarrow'
|
||||
ORC library to use.
|
||||
index : bool, optional
|
||||
If ``True``, include the dataframe's index(es) in the file output. If
|
||||
``False``, they will not be written to the file.
|
||||
If ``None``, similar to ``infer``, the dataframe's index(es)
|
||||
will be saved. However, instead of being saved as values,
|
||||
the RangeIndex will be stored as a range in the metadata so it
|
||||
doesn't require much space and is faster. Other indexes will
|
||||
be included as columns in the file output.
|
||||
engine_kwargs : dict[str, Any] or None, default None
|
||||
Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bytes if no path argument is provided else None
|
||||
|
||||
Raises
|
||||
------
|
||||
NotImplementedError
|
||||
Dtype of one or more columns is category, unsigned integers, interval,
|
||||
period or sparse.
|
||||
ValueError
|
||||
engine is not pyarrow.
|
||||
|
||||
Notes
|
||||
-----
|
||||
* Before using this function you should read the
|
||||
:ref:`user guide about ORC <io.orc>` and
|
||||
:ref:`install optional dependencies <install.warn_orc>`.
|
||||
* This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
|
||||
library.
|
||||
* For supported dtypes please refer to `supported ORC features in Arrow
|
||||
<https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
|
||||
* Currently timezones in datetime columns are not preserved when a
|
||||
dataframe is converted into ORC files.
|
||||
"""
|
||||
if index is None:
|
||||
index = df.index.names[0] is not None
|
||||
if engine_kwargs is None:
|
||||
engine_kwargs = {}
|
||||
|
||||
# validate index
|
||||
# --------------
|
||||
|
||||
# validate that we have only a default index
|
||||
# raise on anything else as we don't serialize the index
|
||||
|
||||
if not df.index.equals(default_index(len(df))):
|
||||
raise ValueError(
|
||||
"orc does not support serializing a non-default index for the index; "
|
||||
"you can .reset_index() to make the index into column(s)"
|
||||
)
|
||||
|
||||
if df.index.name is not None:
|
||||
raise ValueError("orc does not serialize index meta-data on a default index")
|
||||
|
||||
if engine != "pyarrow":
|
||||
raise ValueError("engine must be 'pyarrow'")
|
||||
engine = import_optional_dependency(engine, min_version="10.0.1")
|
||||
pa = import_optional_dependency("pyarrow")
|
||||
orc = import_optional_dependency("pyarrow.orc")
|
||||
|
||||
was_none = path is None
|
||||
if was_none:
|
||||
path = io.BytesIO()
|
||||
assert path is not None # For mypy
|
||||
with get_handle(path, "wb", is_text=False) as handles:
|
||||
assert isinstance(engine, ModuleType) # For mypy
|
||||
try:
|
||||
orc.write_table(
|
||||
engine.Table.from_pandas(df, preserve_index=index),
|
||||
handles.handle,
|
||||
**engine_kwargs,
|
||||
)
|
||||
except (TypeError, pa.ArrowNotImplementedError) as e:
|
||||
raise NotImplementedError(
|
||||
"The dtype of one or more columns is not supported yet."
|
||||
) from e
|
||||
|
||||
if was_none:
|
||||
assert isinstance(path, io.BytesIO) # For mypy
|
||||
return path.getvalue()
|
||||
return None
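Illustrative usage, not part of the pandas source: because ``path=None`` goes through the ``BytesIO`` branch above, the serialized bytes can be round-tripped entirely in memory (assuming pyarrow's ORC writer is available).

# Illustration only; in-memory round trip through the path=None branch.
import io
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
payload = df.to_orc()             # bytes, because no path was given
back = pd.read_orc(io.BytesIO(payload))
assert back.equals(df)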
|
||||
678
lib/python3.11/site-packages/pandas/io/parquet.py
Normal file
@ -0,0 +1,678 @@
|
||||
""" parquet compat """
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Literal,
|
||||
)
|
||||
import warnings
|
||||
from warnings import (
|
||||
catch_warnings,
|
||||
filterwarnings,
|
||||
)
|
||||
|
||||
from pandas._config.config import _get_option
|
||||
|
||||
from pandas._libs import lib
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.errors import AbstractMethodError
|
||||
from pandas.util._decorators import doc
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
from pandas.util._validators import check_dtype_backend
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
get_option,
|
||||
)
|
||||
from pandas.core.shared_docs import _shared_docs
|
||||
|
||||
from pandas.io._util import arrow_table_to_pandas
|
||||
from pandas.io.common import (
|
||||
IOHandles,
|
||||
get_handle,
|
||||
is_fsspec_url,
|
||||
is_url,
|
||||
stringify_path,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
DtypeBackend,
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
StorageOptions,
|
||||
WriteBuffer,
|
||||
)
|
||||
|
||||
|
||||
def get_engine(engine: str) -> BaseImpl:
|
||||
"""return our implementation"""
|
||||
if engine == "auto":
|
||||
engine = get_option("io.parquet.engine")
|
||||
|
||||
if engine == "auto":
|
||||
# try engines in this order
|
||||
engine_classes = [PyArrowImpl, FastParquetImpl]
|
||||
|
||||
error_msgs = ""
|
||||
for engine_class in engine_classes:
|
||||
try:
|
||||
return engine_class()
|
||||
except ImportError as err:
|
||||
error_msgs += "\n - " + str(err)
|
||||
|
||||
raise ImportError(
|
||||
"Unable to find a usable engine; "
|
||||
"tried using: 'pyarrow', 'fastparquet'.\n"
|
||||
"A suitable version of "
|
||||
"pyarrow or fastparquet is required for parquet "
|
||||
"support.\n"
|
||||
"Trying to import the above resulted in these errors:"
|
||||
f"{error_msgs}"
|
||||
)
|
||||
|
||||
if engine == "pyarrow":
|
||||
return PyArrowImpl()
|
||||
elif engine == "fastparquet":
|
||||
return FastParquetImpl()
|
||||
|
||||
raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")
|
||||
|
||||
|
||||
def _get_path_or_handle(
|
||||
path: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes],
|
||||
fs: Any,
|
||||
storage_options: StorageOptions | None = None,
|
||||
mode: str = "rb",
|
||||
is_dir: bool = False,
|
||||
) -> tuple[
|
||||
FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], IOHandles[bytes] | None, Any
|
||||
]:
|
||||
"""File handling for PyArrow."""
|
||||
path_or_handle = stringify_path(path)
|
||||
if fs is not None:
|
||||
pa_fs = import_optional_dependency("pyarrow.fs", errors="ignore")
|
||||
fsspec = import_optional_dependency("fsspec", errors="ignore")
|
||||
if pa_fs is not None and isinstance(fs, pa_fs.FileSystem):
|
||||
if storage_options:
|
||||
raise NotImplementedError(
|
||||
"storage_options not supported with a pyarrow FileSystem."
|
||||
)
|
||||
elif fsspec is not None and isinstance(fs, fsspec.spec.AbstractFileSystem):
|
||||
pass
|
||||
else:
|
||||
raise ValueError(
|
||||
f"filesystem must be a pyarrow or fsspec FileSystem, "
|
||||
f"not a {type(fs).__name__}"
|
||||
)
|
||||
if is_fsspec_url(path_or_handle) and fs is None:
|
||||
if storage_options is None:
|
||||
pa = import_optional_dependency("pyarrow")
|
||||
pa_fs = import_optional_dependency("pyarrow.fs")
|
||||
|
||||
try:
|
||||
fs, path_or_handle = pa_fs.FileSystem.from_uri(path)
|
||||
except (TypeError, pa.ArrowInvalid):
|
||||
pass
|
||||
if fs is None:
|
||||
fsspec = import_optional_dependency("fsspec")
|
||||
fs, path_or_handle = fsspec.core.url_to_fs(
|
||||
path_or_handle, **(storage_options or {})
|
||||
)
|
||||
elif storage_options and (not is_url(path_or_handle) or mode != "rb"):
|
||||
# can't write to a remote url
|
||||
# without making use of fsspec at the moment
|
||||
raise ValueError("storage_options passed with buffer, or non-supported URL")
|
||||
|
||||
handles = None
|
||||
if (
|
||||
not fs
|
||||
and not is_dir
|
||||
and isinstance(path_or_handle, str)
|
||||
and not os.path.isdir(path_or_handle)
|
||||
):
|
||||
# use get_handle only when we are very certain that it is not a directory
|
||||
# fsspec resources can also point to directories
|
||||
# this branch is used for example when reading from non-fsspec URLs
|
||||
handles = get_handle(
|
||||
path_or_handle, mode, is_text=False, storage_options=storage_options
|
||||
)
|
||||
fs = None
|
||||
path_or_handle = handles.handle
|
||||
return path_or_handle, handles, fs
|
||||
|
||||
|
||||
class BaseImpl:
|
||||
@staticmethod
|
||||
def validate_dataframe(df: DataFrame) -> None:
|
||||
if not isinstance(df, DataFrame):
|
||||
raise ValueError("to_parquet only supports IO with DataFrames")
|
||||
|
||||
def write(self, df: DataFrame, path, compression, **kwargs):
|
||||
raise AbstractMethodError(self)
|
||||
|
||||
def read(self, path, columns=None, **kwargs) -> DataFrame:
|
||||
raise AbstractMethodError(self)
|
||||
|
||||
|
||||
class PyArrowImpl(BaseImpl):
|
||||
def __init__(self) -> None:
|
||||
import_optional_dependency(
|
||||
"pyarrow", extra="pyarrow is required for parquet support."
|
||||
)
|
||||
import pyarrow.parquet
|
||||
|
||||
# import utils to register the pyarrow extension types
|
||||
import pandas.core.arrays.arrow.extension_types # pyright: ignore[reportUnusedImport] # noqa: F401
|
||||
|
||||
self.api = pyarrow
|
||||
|
||||
def write(
|
||||
self,
|
||||
df: DataFrame,
|
||||
path: FilePath | WriteBuffer[bytes],
|
||||
compression: str | None = "snappy",
|
||||
index: bool | None = None,
|
||||
storage_options: StorageOptions | None = None,
|
||||
partition_cols: list[str] | None = None,
|
||||
filesystem=None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
self.validate_dataframe(df)
|
||||
|
||||
from_pandas_kwargs: dict[str, Any] = {"schema": kwargs.pop("schema", None)}
|
||||
if index is not None:
|
||||
from_pandas_kwargs["preserve_index"] = index
|
||||
|
||||
table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
|
||||
|
||||
if df.attrs:
|
||||
df_metadata = {"PANDAS_ATTRS": json.dumps(df.attrs)}
|
||||
existing_metadata = table.schema.metadata
|
||||
merged_metadata = {**existing_metadata, **df_metadata}
|
||||
table = table.replace_schema_metadata(merged_metadata)
|
||||
|
||||
path_or_handle, handles, filesystem = _get_path_or_handle(
|
||||
path,
|
||||
filesystem,
|
||||
storage_options=storage_options,
|
||||
mode="wb",
|
||||
is_dir=partition_cols is not None,
|
||||
)
|
||||
if (
|
||||
isinstance(path_or_handle, io.BufferedWriter)
|
||||
and hasattr(path_or_handle, "name")
|
||||
and isinstance(path_or_handle.name, (str, bytes))
|
||||
):
|
||||
if isinstance(path_or_handle.name, bytes):
|
||||
path_or_handle = path_or_handle.name.decode()
|
||||
else:
|
||||
path_or_handle = path_or_handle.name
|
||||
|
||||
try:
|
||||
if partition_cols is not None:
|
||||
# writes to multiple files under the given path
|
||||
self.api.parquet.write_to_dataset(
|
||||
table,
|
||||
path_or_handle,
|
||||
compression=compression,
|
||||
partition_cols=partition_cols,
|
||||
filesystem=filesystem,
|
||||
**kwargs,
|
||||
)
|
||||
else:
|
||||
# write to single output file
|
||||
self.api.parquet.write_table(
|
||||
table,
|
||||
path_or_handle,
|
||||
compression=compression,
|
||||
filesystem=filesystem,
|
||||
**kwargs,
|
||||
)
|
||||
finally:
|
||||
if handles is not None:
|
||||
handles.close()
|
||||
|
||||
def read(
|
||||
self,
|
||||
path,
|
||||
columns=None,
|
||||
filters=None,
|
||||
use_nullable_dtypes: bool = False,
|
||||
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
|
||||
storage_options: StorageOptions | None = None,
|
||||
filesystem=None,
|
||||
**kwargs,
|
||||
) -> DataFrame:
|
||||
kwargs["use_pandas_metadata"] = True
|
||||
|
||||
to_pandas_kwargs = {}
|
||||
|
||||
manager = _get_option("mode.data_manager", silent=True)
|
||||
if manager == "array":
|
||||
to_pandas_kwargs["split_blocks"] = True
|
||||
path_or_handle, handles, filesystem = _get_path_or_handle(
|
||||
path,
|
||||
filesystem,
|
||||
storage_options=storage_options,
|
||||
mode="rb",
|
||||
)
|
||||
try:
|
||||
pa_table = self.api.parquet.read_table(
|
||||
path_or_handle,
|
||||
columns=columns,
|
||||
filesystem=filesystem,
|
||||
filters=filters,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
with catch_warnings():
|
||||
filterwarnings(
|
||||
"ignore",
|
||||
"make_block is deprecated",
|
||||
DeprecationWarning,
|
||||
)
|
||||
result = arrow_table_to_pandas(
|
||||
pa_table,
|
||||
dtype_backend=dtype_backend,
|
||||
to_pandas_kwargs=to_pandas_kwargs,
|
||||
)
|
||||
|
||||
if manager == "array":
|
||||
result = result._as_manager("array", copy=False)
|
||||
|
||||
if pa_table.schema.metadata:
|
||||
if b"PANDAS_ATTRS" in pa_table.schema.metadata:
|
||||
df_metadata = pa_table.schema.metadata[b"PANDAS_ATTRS"]
|
||||
result.attrs = json.loads(df_metadata)
|
||||
return result
|
||||
finally:
|
||||
if handles is not None:
|
||||
handles.close()
|
||||
|
||||
|
||||
class FastParquetImpl(BaseImpl):
|
||||
def __init__(self) -> None:
|
||||
# since pandas is a dependency of fastparquet
|
||||
# we need to import on first use
|
||||
fastparquet = import_optional_dependency(
|
||||
"fastparquet", extra="fastparquet is required for parquet support."
|
||||
)
|
||||
self.api = fastparquet
|
||||
|
||||
def write(
|
||||
self,
|
||||
df: DataFrame,
|
||||
path,
|
||||
compression: Literal["snappy", "gzip", "brotli"] | None = "snappy",
|
||||
index=None,
|
||||
partition_cols=None,
|
||||
storage_options: StorageOptions | None = None,
|
||||
filesystem=None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
self.validate_dataframe(df)
|
||||
|
||||
if "partition_on" in kwargs and partition_cols is not None:
|
||||
raise ValueError(
|
||||
"Cannot use both partition_on and "
|
||||
"partition_cols. Use partition_cols for partitioning data"
|
||||
)
|
||||
if "partition_on" in kwargs:
|
||||
partition_cols = kwargs.pop("partition_on")
|
||||
|
||||
if partition_cols is not None:
|
||||
kwargs["file_scheme"] = "hive"
|
||||
|
||||
if filesystem is not None:
|
||||
raise NotImplementedError(
|
||||
"filesystem is not implemented for the fastparquet engine."
|
||||
)
|
||||
|
||||
# cannot use get_handle as write() does not accept file buffers
|
||||
path = stringify_path(path)
|
||||
if is_fsspec_url(path):
|
||||
fsspec = import_optional_dependency("fsspec")
|
||||
|
||||
# if filesystem is provided by fsspec, file must be opened in 'wb' mode.
|
||||
kwargs["open_with"] = lambda path, _: fsspec.open(
|
||||
path, "wb", **(storage_options or {})
|
||||
).open()
|
||||
elif storage_options:
|
||||
raise ValueError(
|
||||
"storage_options passed with file object or non-fsspec file path"
|
||||
)
|
||||
|
||||
with catch_warnings(record=True):
|
||||
self.api.write(
|
||||
path,
|
||||
df,
|
||||
compression=compression,
|
||||
write_index=index,
|
||||
partition_on=partition_cols,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def read(
|
||||
self,
|
||||
path,
|
||||
columns=None,
|
||||
filters=None,
|
||||
storage_options: StorageOptions | None = None,
|
||||
filesystem=None,
|
||||
**kwargs,
|
||||
) -> DataFrame:
|
||||
parquet_kwargs: dict[str, Any] = {}
|
||||
use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
|
||||
dtype_backend = kwargs.pop("dtype_backend", lib.no_default)
|
||||
# We are disabling nullable dtypes for fastparquet pending discussion
|
||||
parquet_kwargs["pandas_nulls"] = False
|
||||
if use_nullable_dtypes:
|
||||
raise ValueError(
|
||||
"The 'use_nullable_dtypes' argument is not supported for the "
|
||||
"fastparquet engine"
|
||||
)
|
||||
if dtype_backend is not lib.no_default:
|
||||
raise ValueError(
|
||||
"The 'dtype_backend' argument is not supported for the "
|
||||
"fastparquet engine"
|
||||
)
|
||||
if filesystem is not None:
|
||||
raise NotImplementedError(
|
||||
"filesystem is not implemented for the fastparquet engine."
|
||||
)
|
||||
path = stringify_path(path)
|
||||
handles = None
|
||||
if is_fsspec_url(path):
|
||||
fsspec = import_optional_dependency("fsspec")
|
||||
|
||||
parquet_kwargs["fs"] = fsspec.open(path, "rb", **(storage_options or {})).fs
|
||||
elif isinstance(path, str) and not os.path.isdir(path):
|
||||
# use get_handle only when we are very certain that it is not a directory
|
||||
# fsspec resources can also point to directories
|
||||
# this branch is used for example when reading from non-fsspec URLs
|
||||
handles = get_handle(
|
||||
path, "rb", is_text=False, storage_options=storage_options
|
||||
)
|
||||
path = handles.handle
|
||||
|
||||
try:
|
||||
parquet_file = self.api.ParquetFile(path, **parquet_kwargs)
|
||||
return parquet_file.to_pandas(columns=columns, filters=filters, **kwargs)
|
||||
finally:
|
||||
if handles is not None:
|
||||
handles.close()
|
||||
|
||||
|
||||
@doc(storage_options=_shared_docs["storage_options"])
|
||||
def to_parquet(
|
||||
df: DataFrame,
|
||||
path: FilePath | WriteBuffer[bytes] | None = None,
|
||||
engine: str = "auto",
|
||||
compression: str | None = "snappy",
|
||||
index: bool | None = None,
|
||||
storage_options: StorageOptions | None = None,
|
||||
partition_cols: list[str] | None = None,
|
||||
filesystem: Any = None,
|
||||
**kwargs,
|
||||
) -> bytes | None:
|
||||
"""
|
||||
Write a DataFrame to the parquet format.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : DataFrame
|
||||
path : str, path object, file-like object, or None, default None
|
||||
String, path object (implementing ``os.PathLike[str]``), or file-like
|
||||
object implementing a binary ``write()`` function. If None, the result is
|
||||
returned as bytes. If a string, it will be used as Root Directory path
|
||||
when writing a partitioned dataset. The engine fastparquet does not
|
||||
accept file-like objects.
|
||||
engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
|
||||
Parquet library to use. If 'auto', then the option
|
||||
``io.parquet.engine`` is used. The default ``io.parquet.engine``
|
||||
behavior is to try 'pyarrow', falling back to 'fastparquet' if
|
||||
'pyarrow' is unavailable.
|
||||
|
||||
When using the ``'pyarrow'`` engine and no storage options are provided
|
||||
and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec``
|
||||
(e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first.
|
||||
Use the filesystem keyword with an instantiated fsspec filesystem
|
||||
if you wish to use its implementation.
|
||||
compression : {{'snappy', 'gzip', 'brotli', 'lz4', 'zstd', None}},
|
||||
default 'snappy'. Name of the compression to use. Use ``None``
|
||||
for no compression.
|
||||
index : bool, default None
|
||||
If ``True``, include the dataframe's index(es) in the file output. If
|
||||
``False``, they will not be written to the file.
|
||||
If ``None``, similar to ``True`` the dataframe's index(es)
|
||||
will be saved. However, instead of being saved as values,
|
||||
the RangeIndex will be stored as a range in the metadata so it
|
||||
doesn't require much space and is faster. Other indexes will
|
||||
be included as columns in the file output.
|
||||
partition_cols : str or list, optional, default None
|
||||
Column names by which to partition the dataset.
|
||||
Columns are partitioned in the order they are given.
|
||||
Must be None if path is not a string.
|
||||
{storage_options}
|
||||
|
||||
filesystem : fsspec or pyarrow filesystem, default None
|
||||
Filesystem object to use when reading the parquet file. Only implemented
|
||||
for ``engine="pyarrow"``.
|
||||
|
||||
.. versionadded:: 2.1.0
|
||||
|
||||
kwargs
|
||||
Additional keyword arguments passed to the engine
|
||||
|
||||
Returns
|
||||
-------
|
||||
bytes if no path argument is provided else None
|
||||
"""
|
||||
if isinstance(partition_cols, str):
|
||||
partition_cols = [partition_cols]
|
||||
impl = get_engine(engine)
|
||||
|
||||
path_or_buf: FilePath | WriteBuffer[bytes] = io.BytesIO() if path is None else path
|
||||
|
||||
impl.write(
|
||||
df,
|
||||
path_or_buf,
|
||||
compression=compression,
|
||||
index=index,
|
||||
partition_cols=partition_cols,
|
||||
storage_options=storage_options,
|
||||
filesystem=filesystem,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if path is None:
|
||||
assert isinstance(path_or_buf, io.BytesIO)
|
||||
return path_or_buf.getvalue()
|
||||
else:
|
||||
return None
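Illustrative usage, not part of the pandas source: a sketch of ``partition_cols``, assuming pyarrow is installed; ``"dataset_root"`` is a hypothetical directory that receives one sub-directory per distinct ``year`` value.

# Illustration only; "dataset_root" is a hypothetical output directory.
import pandas as pd

df = pd.DataFrame({"year": [2022, 2022, 2023], "value": [1.0, 2.0, 3.0]})
df.to_parquet("dataset_root", partition_cols=["year"])
back = pd.read_parquet("dataset_root")
print(back.shape)  # (3, 2); "year" is reconstructed from the directory names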
|
||||
|
||||
|
||||
@doc(storage_options=_shared_docs["storage_options"])
|
||||
def read_parquet(
|
||||
path: FilePath | ReadBuffer[bytes],
|
||||
engine: str = "auto",
|
||||
columns: list[str] | None = None,
|
||||
storage_options: StorageOptions | None = None,
|
||||
use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
|
||||
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
|
||||
filesystem: Any = None,
|
||||
filters: list[tuple] | list[list[tuple]] | None = None,
|
||||
**kwargs,
|
||||
) -> DataFrame:
|
||||
"""
|
||||
Load a parquet object from the file path, returning a DataFrame.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str, path object or file-like object
|
||||
String, path object (implementing ``os.PathLike[str]``), or file-like
|
||||
object implementing a binary ``read()`` function.
|
||||
The string could be a URL. Valid URL schemes include http, ftp, s3,
|
||||
gs, and file. For file URLs, a host is expected. A local file could be:
|
||||
``file://localhost/path/to/table.parquet``.
|
||||
A file URL can also be a path to a directory that contains multiple
|
||||
partitioned parquet files. Both pyarrow and fastparquet support
|
||||
paths to directories as well as file URLs. A directory path could be:
|
||||
``file://localhost/path/to/tables`` or ``s3://bucket/partition_dir``.
|
||||
engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
|
||||
Parquet library to use. If 'auto', then the option
|
||||
``io.parquet.engine`` is used. The default ``io.parquet.engine``
|
||||
behavior is to try 'pyarrow', falling back to 'fastparquet' if
|
||||
'pyarrow' is unavailable.
|
||||
|
||||
When using the ``'pyarrow'`` engine and no storage options are provided
|
||||
and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec``
|
||||
(e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first.
|
||||
Use the filesystem keyword with an instantiated fsspec filesystem
|
||||
if you wish to use its implementation.
|
||||
columns : list, default None
|
||||
If not None, only these columns will be read from the file.
|
||||
{storage_options}
|
||||
|
||||
.. versionadded:: 1.3.0
|
||||
|
||||
use_nullable_dtypes : bool, default False
|
||||
If True, use dtypes that use ``pd.NA`` as missing value indicator
|
||||
for the resulting DataFrame. (only applicable for the ``pyarrow``
|
||||
engine)
|
||||
As new dtypes are added that support ``pd.NA`` in the future, the
|
||||
output with this option will change to use those dtypes.
|
||||
Note: this is an experimental option, and behaviour (e.g. additional
|
||||
support dtypes) may change without notice.
|
||||
|
||||
.. deprecated:: 2.0
|
||||
|
||||
dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
|
||||
Back-end data type applied to the resultant :class:`DataFrame`
|
||||
(still experimental). Behaviour is as follows:
|
||||
|
||||
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
|
||||
(default).
|
||||
* ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
|
||||
DataFrame.
|
||||
|
||||
.. versionadded:: 2.0
|
||||
|
||||
filesystem : fsspec or pyarrow filesystem, default None
|
||||
Filesystem object to use when reading the parquet file. Only implemented
|
||||
for ``engine="pyarrow"``.
|
||||
|
||||
.. versionadded:: 2.1.0
|
||||
|
||||
filters : List[Tuple] or List[List[Tuple]], default None
|
||||
To filter out data.
|
||||
Filter syntax: [[(column, op, val), ...],...]
|
||||
where op is [==, =, >, >=, <, <=, !=, in, not in]
|
||||
The innermost tuples are transposed into a set of filters applied
|
||||
through an `AND` operation.
|
||||
The outer list combines these sets of filters through an `OR`
|
||||
operation.
|
||||
A single list of tuples can also be used, meaning that no `OR`
|
||||
operation between set of filters is to be conducted.
|
||||
|
||||
Using this argument will NOT result in row-wise filtering of the final
|
||||
partitions unless ``engine="pyarrow"`` is also specified. For
|
||||
other engines, filtering is only performed at the partition level, that is,
|
||||
to prevent the loading of some row-groups and/or files.
|
||||
|
||||
.. versionadded:: 2.1.0
|
||||
|
||||
**kwargs
|
||||
Any additional kwargs are passed to the engine.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
|
||||
See Also
|
||||
--------
|
||||
DataFrame.to_parquet : Create a parquet object that serializes a DataFrame.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> original_df = pd.DataFrame(
|
||||
... {{"foo": range(5), "bar": range(5, 10)}}
|
||||
... )
|
||||
>>> original_df
|
||||
foo bar
|
||||
0 0 5
|
||||
1 1 6
|
||||
2 2 7
|
||||
3 3 8
|
||||
4 4 9
|
||||
>>> df_parquet_bytes = original_df.to_parquet()
|
||||
>>> from io import BytesIO
|
||||
>>> restored_df = pd.read_parquet(BytesIO(df_parquet_bytes))
|
||||
>>> restored_df
|
||||
foo bar
|
||||
0 0 5
|
||||
1 1 6
|
||||
2 2 7
|
||||
3 3 8
|
||||
4 4 9
|
||||
>>> restored_df.equals(original_df)
|
||||
True
|
||||
>>> restored_bar = pd.read_parquet(BytesIO(df_parquet_bytes), columns=["bar"])
|
||||
>>> restored_bar
|
||||
bar
|
||||
0 5
|
||||
1 6
|
||||
2 7
|
||||
3 8
|
||||
4 9
|
||||
>>> restored_bar.equals(original_df[['bar']])
|
||||
True
|
||||
|
||||
The function uses `kwargs` that are passed directly to the engine.
|
||||
In the following example, we use the `filters` argument of the pyarrow
|
||||
engine to filter the rows of the DataFrame.
|
||||
|
||||
Since `pyarrow` is the default engine, we can omit the `engine` argument.
|
||||
Note that the `filters` argument is implemented by the `pyarrow` engine,
|
||||
which can benefit from multithreading and also potentially be more
|
||||
economical in terms of memory.
|
||||
|
||||
>>> sel = [("foo", ">", 2)]
|
||||
>>> restored_part = pd.read_parquet(BytesIO(df_parquet_bytes), filters=sel)
|
||||
>>> restored_part
|
||||
foo bar
|
||||
0 3 8
|
||||
1 4 9
|
||||
"""
|
||||
|
||||
impl = get_engine(engine)
|
||||
|
||||
if use_nullable_dtypes is not lib.no_default:
|
||||
msg = (
|
||||
"The argument 'use_nullable_dtypes' is deprecated and will be removed "
|
||||
"in a future version."
|
||||
)
|
||||
if use_nullable_dtypes is True:
|
||||
msg += (
|
||||
"Use dtype_backend='numpy_nullable' instead of use_nullable_dtype=True."
|
||||
)
|
||||
warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
|
||||
else:
|
||||
use_nullable_dtypes = False
|
||||
check_dtype_backend(dtype_backend)
|
||||
|
||||
return impl.read(
|
||||
path,
|
||||
columns=columns,
|
||||
filters=filters,
|
||||
storage_options=storage_options,
|
||||
use_nullable_dtypes=use_nullable_dtypes,
|
||||
dtype_backend=dtype_backend,
|
||||
filesystem=filesystem,
|
||||
**kwargs,
|
||||
)
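Illustrative usage, not part of the pandas source: a sketch of the ``dtype_backend`` switch documented above, reading the same in-memory file with both backends (assuming pyarrow is installed).

# Illustration only; the same parquet payload materialized with two backends.
import io
import pandas as pd

buf = io.BytesIO(pd.DataFrame({"a": [1.0, None, 3.0]}).to_parquet())
nullable = pd.read_parquet(buf, dtype_backend="numpy_nullable")
buf.seek(0)
arrow_backed = pd.read_parquet(buf, dtype_backend="pyarrow")
print(nullable.dtypes["a"], arrow_backed.dtypes["a"])  # Float64  double[pyarrow]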
|
||||
@ -0,0 +1,9 @@
from pandas.io.parsers.readers import (
    TextFileReader,
    TextParser,
    read_csv,
    read_fwf,
    read_table,
)

__all__ = ["TextFileReader", "TextParser", "read_csv", "read_fwf", "read_table"]
@ -0,0 +1,295 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
import warnings
|
||||
|
||||
from pandas._libs import lib
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.errors import (
|
||||
ParserError,
|
||||
ParserWarning,
|
||||
)
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
from pandas.core.dtypes.common import pandas_dtype
|
||||
from pandas.core.dtypes.inference import is_integer
|
||||
|
||||
from pandas.io._util import arrow_table_to_pandas
|
||||
from pandas.io.parsers.base_parser import ParserBase
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import ReadBuffer
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
class ArrowParserWrapper(ParserBase):
|
||||
"""
|
||||
Wrapper for the pyarrow engine for read_csv()
|
||||
"""
|
||||
|
||||
def __init__(self, src: ReadBuffer[bytes], **kwds) -> None:
|
||||
super().__init__(kwds)
|
||||
self.kwds = kwds
|
||||
self.src = src
|
||||
|
||||
self._parse_kwds()
|
||||
|
||||
def _parse_kwds(self) -> None:
|
||||
"""
|
||||
Validates keywords before passing to pyarrow.
|
||||
"""
|
||||
encoding: str | None = self.kwds.get("encoding")
|
||||
self.encoding = "utf-8" if encoding is None else encoding
|
||||
|
||||
na_values = self.kwds["na_values"]
|
||||
if isinstance(na_values, dict):
|
||||
raise ValueError(
|
||||
"The pyarrow engine doesn't support passing a dict for na_values"
|
||||
)
|
||||
self.na_values = list(self.kwds["na_values"])
|
||||
|
||||
def _get_pyarrow_options(self) -> None:
|
||||
"""
|
||||
Rename some arguments to pass to pyarrow
|
||||
"""
|
||||
mapping = {
|
||||
"usecols": "include_columns",
|
||||
"na_values": "null_values",
|
||||
"escapechar": "escape_char",
|
||||
"skip_blank_lines": "ignore_empty_lines",
|
||||
"decimal": "decimal_point",
|
||||
"quotechar": "quote_char",
|
||||
}
|
||||
for pandas_name, pyarrow_name in mapping.items():
|
||||
if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None:
|
||||
self.kwds[pyarrow_name] = self.kwds.pop(pandas_name)
|
||||
|
||||
# Date format handling
|
||||
# If we get a string, we need to convert it into a list for pyarrow
|
||||
# If we get a dict, we want to parse those separately
|
||||
date_format = self.date_format
|
||||
if isinstance(date_format, str):
|
||||
date_format = [date_format]
|
||||
else:
|
||||
# In case of dict, we don't want to propagate through, so
|
||||
# just set to pyarrow default of None
|
||||
|
||||
# Ideally, in future we disable pyarrow dtype inference (read in as string)
|
||||
# to prevent misreads.
|
||||
date_format = None
|
||||
self.kwds["timestamp_parsers"] = date_format
|
||||
|
||||
self.parse_options = {
|
||||
option_name: option_value
|
||||
for option_name, option_value in self.kwds.items()
|
||||
if option_value is not None
|
||||
and option_name
|
||||
in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines")
|
||||
}
|
||||
|
||||
on_bad_lines = self.kwds.get("on_bad_lines")
|
||||
if on_bad_lines is not None:
|
||||
if callable(on_bad_lines):
|
||||
self.parse_options["invalid_row_handler"] = on_bad_lines
|
||||
elif on_bad_lines == ParserBase.BadLineHandleMethod.ERROR:
|
||||
self.parse_options[
|
||||
"invalid_row_handler"
|
||||
] = None # PyArrow raises an exception by default
|
||||
elif on_bad_lines == ParserBase.BadLineHandleMethod.WARN:
|
||||
|
||||
def handle_warning(invalid_row) -> str:
|
||||
warnings.warn(
|
||||
f"Expected {invalid_row.expected_columns} columns, but found "
|
||||
f"{invalid_row.actual_columns}: {invalid_row.text}",
|
||||
ParserWarning,
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
return "skip"
|
||||
|
||||
self.parse_options["invalid_row_handler"] = handle_warning
|
||||
elif on_bad_lines == ParserBase.BadLineHandleMethod.SKIP:
|
||||
self.parse_options["invalid_row_handler"] = lambda _: "skip"
|
||||
|
||||
self.convert_options = {
|
||||
option_name: option_value
|
||||
for option_name, option_value in self.kwds.items()
|
||||
if option_value is not None
|
||||
and option_name
|
||||
in (
|
||||
"include_columns",
|
||||
"null_values",
|
||||
"true_values",
|
||||
"false_values",
|
||||
"decimal_point",
|
||||
"timestamp_parsers",
|
||||
)
|
||||
}
|
||||
self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"]
|
||||
# autogenerated column names are prefixed with 'f' in pyarrow.csv
|
||||
if self.header is None and "include_columns" in self.convert_options:
|
||||
self.convert_options["include_columns"] = [
|
||||
f"f{n}" for n in self.convert_options["include_columns"]
|
||||
]
|
||||
|
||||
self.read_options = {
|
||||
"autogenerate_column_names": self.header is None,
|
||||
"skip_rows": self.header
|
||||
if self.header is not None
|
||||
else self.kwds["skiprows"],
|
||||
"encoding": self.encoding,
|
||||
}
|
||||
|
||||
def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
|
||||
"""
|
||||
Processes data read in based on kwargs.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
frame: DataFrame
|
||||
The DataFrame to process.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
The processed DataFrame.
|
||||
"""
|
||||
num_cols = len(frame.columns)
|
||||
multi_index_named = True
|
||||
if self.header is None:
|
||||
if self.names is None:
|
||||
if self.header is None:
|
||||
self.names = range(num_cols)
|
||||
if len(self.names) != num_cols:
|
||||
# usecols is passed through to pyarrow, we only handle index col here
|
||||
# The only way self.names is not the same length as number of cols is
|
||||
# if we have int index_col. We should just pad the names(they will get
|
||||
# removed anyways) to expected length then.
|
||||
columns_prefix = [str(x) for x in range(num_cols - len(self.names))]
|
||||
self.names = columns_prefix + self.names
|
||||
multi_index_named = False
|
||||
frame.columns = self.names
|
||||
# we only need the frame not the names
|
||||
_, frame = self._do_date_conversions(frame.columns, frame)
|
||||
if self.index_col is not None:
|
||||
index_to_set = self.index_col.copy()
|
||||
for i, item in enumerate(self.index_col):
|
||||
if is_integer(item):
|
||||
index_to_set[i] = frame.columns[item]
|
||||
# String case
|
||||
elif item not in frame.columns:
|
||||
raise ValueError(f"Index {item} invalid")
|
||||
|
||||
# Process dtype for index_col and drop from dtypes
|
||||
if self.dtype is not None:
|
||||
key, new_dtype = (
|
||||
(item, self.dtype.get(item))
|
||||
if self.dtype.get(item) is not None
|
||||
else (frame.columns[item], self.dtype.get(frame.columns[item]))
|
||||
)
|
||||
if new_dtype is not None:
|
||||
frame[key] = frame[key].astype(new_dtype)
|
||||
del self.dtype[key]
|
||||
|
||||
frame.set_index(index_to_set, drop=True, inplace=True)
|
||||
# Clear names if headerless and no name given
|
||||
if self.header is None and not multi_index_named:
|
||||
frame.index.names = [None] * len(frame.index.names)
|
||||
|
||||
if self.dtype is not None:
|
||||
# Ignore non-existent columns from dtype mapping
|
||||
# like other parsers do
|
||||
if isinstance(self.dtype, dict):
|
||||
self.dtype = {
|
||||
k: pandas_dtype(v)
|
||||
for k, v in self.dtype.items()
|
||||
if k in frame.columns
|
||||
}
|
||||
else:
|
||||
self.dtype = pandas_dtype(self.dtype)
|
||||
try:
|
||||
frame = frame.astype(self.dtype)
|
||||
except TypeError as e:
|
||||
# GH#44901 reraise to keep api consistent
|
||||
raise ValueError(e)
|
||||
return frame
|
||||
|
||||
def _validate_usecols(self, usecols) -> None:
|
||||
if lib.is_list_like(usecols) and not all(isinstance(x, str) for x in usecols):
|
||||
raise ValueError(
|
||||
"The pyarrow engine does not allow 'usecols' to be integer "
|
||||
"column positions. Pass a list of string column names instead."
|
||||
)
|
||||
elif callable(usecols):
|
||||
raise ValueError(
|
||||
"The pyarrow engine does not allow 'usecols' to be a callable."
|
||||
)
|
||||
|
||||
def read(self) -> DataFrame:
|
||||
"""
|
||||
Reads the contents of a CSV file into a DataFrame and
|
||||
processes it according to the kwargs passed in the
|
||||
constructor.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
The DataFrame created from the CSV file.
|
||||
"""
|
||||
pa = import_optional_dependency("pyarrow")
|
||||
pyarrow_csv = import_optional_dependency("pyarrow.csv")
|
||||
self._get_pyarrow_options()
|
||||
|
||||
try:
|
||||
convert_options = pyarrow_csv.ConvertOptions(**self.convert_options)
|
||||
except TypeError:
|
||||
include = self.convert_options.get("include_columns", None)
|
||||
if include is not None:
|
||||
self._validate_usecols(include)
|
||||
|
||||
nulls = self.convert_options.get("null_values", set())
|
||||
if not lib.is_list_like(nulls) or not all(
|
||||
isinstance(x, str) for x in nulls
|
||||
):
|
||||
raise TypeError(
|
||||
"The 'pyarrow' engine requires all na_values to be strings"
|
||||
)
|
||||
|
||||
raise
|
||||
|
||||
try:
|
||||
table = pyarrow_csv.read_csv(
|
||||
self.src,
|
||||
read_options=pyarrow_csv.ReadOptions(**self.read_options),
|
||||
parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
|
||||
convert_options=convert_options,
|
||||
)
|
||||
except pa.ArrowInvalid as e:
|
||||
raise ParserError(e) from e
|
||||
|
||||
dtype_backend = self.kwds["dtype_backend"]
|
||||
|
||||
# Convert all pa.null() cols -> float64 (non nullable)
|
||||
# else Int64 (nullable case, see below)
|
||||
if dtype_backend is lib.no_default:
|
||||
new_schema = table.schema
|
||||
new_type = pa.float64()
|
||||
for i, arrow_type in enumerate(table.schema.types):
|
||||
if pa.types.is_null(arrow_type):
|
||||
new_schema = new_schema.set(
|
||||
i, new_schema.field(i).with_type(new_type)
|
||||
)
|
||||
|
||||
table = table.cast(new_schema)
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings(
|
||||
"ignore",
|
||||
"make_block is deprecated",
|
||||
DeprecationWarning,
|
||||
)
|
||||
frame = arrow_table_to_pandas(
|
||||
table, dtype_backend=dtype_backend, null_to_int64=True
|
||||
)
|
||||
|
||||
return self._finalize_pandas_output(frame)
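Illustrative usage, not part of the pandas source: the wrapper above is reached through ``read_csv(engine="pyarrow")``; a sketch, assuming pyarrow is installed, that also respects the restrictions enforced in ``_parse_kwds`` and ``_validate_usecols`` (``na_values`` as a list of strings, ``usecols`` as column names).

# Illustration only; exercises ArrowParserWrapper via the public read_csv API.
import io
import pandas as pd

csv_data = io.StringIO("a,b,c\n1,x,NA\n2,y,3\n")
df = pd.read_csv(csv_data, engine="pyarrow", usecols=["a", "c"], na_values=["NA"])
print(df)  # usecols must be column names here; integer positions raise ValueError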
|
||||
1462
lib/python3.11/site-packages/pandas/io/parsers/base_parser.py
Normal file
File diff suppressed because it is too large
@ -0,0 +1,410 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import defaultdict
|
||||
from typing import TYPE_CHECKING
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import (
|
||||
lib,
|
||||
parsers,
|
||||
)
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.errors import DtypeWarning
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
from pandas.core.dtypes.common import pandas_dtype
|
||||
from pandas.core.dtypes.concat import (
|
||||
concat_compat,
|
||||
union_categoricals,
|
||||
)
|
||||
from pandas.core.dtypes.dtypes import CategoricalDtype
|
||||
|
||||
from pandas.core.indexes.api import ensure_index_from_sequences
|
||||
|
||||
from pandas.io.common import (
|
||||
dedup_names,
|
||||
is_potential_multi_index,
|
||||
)
|
||||
from pandas.io.parsers.base_parser import (
|
||||
ParserBase,
|
||||
ParserError,
|
||||
is_index_col,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import (
|
||||
Hashable,
|
||||
Mapping,
|
||||
Sequence,
|
||||
)
|
||||
|
||||
from pandas._typing import (
|
||||
ArrayLike,
|
||||
DtypeArg,
|
||||
DtypeObj,
|
||||
ReadCsvBuffer,
|
||||
)
|
||||
|
||||
from pandas import (
|
||||
Index,
|
||||
MultiIndex,
|
||||
)
|
||||
|
||||
|
||||
class CParserWrapper(ParserBase):
|
||||
low_memory: bool
|
||||
_reader: parsers.TextReader
|
||||
|
||||
def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
|
||||
super().__init__(kwds)
|
||||
self.kwds = kwds
|
||||
kwds = kwds.copy()
|
||||
|
||||
self.low_memory = kwds.pop("low_memory", False)
|
||||
|
||||
# #2442
|
||||
# error: Cannot determine type of 'index_col'
|
||||
kwds["allow_leading_cols"] = (
|
||||
self.index_col is not False # type: ignore[has-type]
|
||||
)
|
||||
|
||||
# GH20529, validate usecol arg before TextReader
|
||||
kwds["usecols"] = self.usecols
|
||||
|
||||
# Have to pass int, would break tests using TextReader directly otherwise :(
|
||||
kwds["on_bad_lines"] = self.on_bad_lines.value
|
||||
|
||||
for key in (
|
||||
"storage_options",
|
||||
"encoding",
|
||||
"memory_map",
|
||||
"compression",
|
||||
):
|
||||
kwds.pop(key, None)
|
||||
|
||||
kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
|
||||
if "dtype_backend" not in kwds or kwds["dtype_backend"] is lib.no_default:
|
||||
kwds["dtype_backend"] = "numpy"
|
||||
if kwds["dtype_backend"] == "pyarrow":
|
||||
# Fail here loudly instead of in cython after reading
|
||||
import_optional_dependency("pyarrow")
|
||||
self._reader = parsers.TextReader(src, **kwds)
|
||||
|
||||
self.unnamed_cols = self._reader.unnamed_cols
|
||||
|
||||
# error: Cannot determine type of 'names'
|
||||
passed_names = self.names is None # type: ignore[has-type]
|
||||
|
||||
if self._reader.header is None:
|
||||
self.names = None
|
||||
else:
|
||||
# error: Cannot determine type of 'names'
|
||||
# error: Cannot determine type of 'index_names'
|
||||
(
|
||||
self.names, # type: ignore[has-type]
|
||||
self.index_names,
|
||||
self.col_names,
|
||||
passed_names,
|
||||
) = self._extract_multi_indexer_columns(
|
||||
self._reader.header,
|
||||
self.index_names, # type: ignore[has-type]
|
||||
passed_names,
|
||||
)
|
||||
|
||||
# error: Cannot determine type of 'names'
|
||||
if self.names is None: # type: ignore[has-type]
|
||||
self.names = list(range(self._reader.table_width))
|
||||
|
||||
# gh-9755
|
||||
#
|
||||
# need to set orig_names here first
|
||||
# so that proper indexing can be done
|
||||
# with _set_noconvert_columns
|
||||
#
|
||||
# once names has been filtered, we will
|
||||
# then set orig_names again to names
|
||||
# error: Cannot determine type of 'names'
|
||||
self.orig_names = self.names[:] # type: ignore[has-type]
|
||||
|
||||
if self.usecols:
|
||||
usecols = self._evaluate_usecols(self.usecols, self.orig_names)
|
||||
|
||||
# GH 14671
|
||||
# assert for mypy, orig_names is List or None, None would error in issubset
|
||||
assert self.orig_names is not None
|
||||
if self.usecols_dtype == "string" and not set(usecols).issubset(
|
||||
self.orig_names
|
||||
):
|
||||
self._validate_usecols_names(usecols, self.orig_names)
|
||||
|
||||
# error: Cannot determine type of 'names'
|
||||
if len(self.names) > len(usecols): # type: ignore[has-type]
|
||||
# error: Cannot determine type of 'names'
|
||||
self.names = [ # type: ignore[has-type]
|
||||
n
|
||||
# error: Cannot determine type of 'names'
|
||||
for i, n in enumerate(self.names) # type: ignore[has-type]
|
||||
if (i in usecols or n in usecols)
|
||||
]
|
||||
|
||||
# error: Cannot determine type of 'names'
|
||||
if len(self.names) < len(usecols): # type: ignore[has-type]
|
||||
# error: Cannot determine type of 'names'
|
||||
self._validate_usecols_names(
|
||||
usecols,
|
||||
self.names, # type: ignore[has-type]
|
||||
)
|
||||
|
||||
# error: Cannot determine type of 'names'
|
||||
self._validate_parse_dates_presence(self.names) # type: ignore[has-type]
|
||||
self._set_noconvert_columns()
|
||||
|
||||
# error: Cannot determine type of 'names'
|
||||
self.orig_names = self.names # type: ignore[has-type]
|
||||
|
||||
if not self._has_complex_date_col:
|
||||
# error: Cannot determine type of 'index_col'
|
||||
if self._reader.leading_cols == 0 and is_index_col(
|
||||
self.index_col # type: ignore[has-type]
|
||||
):
|
||||
self._name_processed = True
|
||||
(
|
||||
index_names,
|
||||
# error: Cannot determine type of 'names'
|
||||
self.names, # type: ignore[has-type]
|
||||
self.index_col,
|
||||
) = self._clean_index_names(
|
||||
# error: Cannot determine type of 'names'
|
||||
self.names, # type: ignore[has-type]
|
||||
# error: Cannot determine type of 'index_col'
|
||||
self.index_col, # type: ignore[has-type]
|
||||
)
|
||||
|
||||
if self.index_names is None:
|
||||
self.index_names = index_names
|
||||
|
||||
if self._reader.header is None and not passed_names:
|
||||
assert self.index_names is not None
|
||||
self.index_names = [None] * len(self.index_names)
|
||||
|
||||
self._implicit_index = self._reader.leading_cols > 0
|
||||
|
||||
def close(self) -> None:
|
||||
# close handles opened by C parser
|
||||
try:
|
||||
self._reader.close()
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
def _set_noconvert_columns(self) -> None:
|
||||
"""
|
||||
Set the columns that should not undergo dtype conversions.
|
||||
|
||||
Currently, any column that is involved with date parsing will not
|
||||
undergo such conversions.
|
||||
"""
|
||||
assert self.orig_names is not None
|
||||
# error: Cannot determine type of 'names'
|
||||
|
||||
# much faster than using orig_names.index(x) xref GH#44106
|
||||
names_dict = {x: i for i, x in enumerate(self.orig_names)}
|
||||
col_indices = [names_dict[x] for x in self.names] # type: ignore[has-type]
|
||||
# error: Cannot determine type of 'names'
|
||||
noconvert_columns = self._set_noconvert_dtype_columns(
|
||||
col_indices,
|
||||
self.names, # type: ignore[has-type]
|
||||
)
|
||||
for col in noconvert_columns:
|
||||
self._reader.set_noconvert(col)
|
||||
|
||||
def read(
|
||||
self,
|
||||
nrows: int | None = None,
|
||||
) -> tuple[
|
||||
Index | MultiIndex | None,
|
||||
Sequence[Hashable] | MultiIndex,
|
||||
Mapping[Hashable, ArrayLike],
|
||||
]:
|
||||
index: Index | MultiIndex | None
|
||||
column_names: Sequence[Hashable] | MultiIndex
|
||||
try:
|
||||
if self.low_memory:
|
||||
chunks = self._reader.read_low_memory(nrows)
|
||||
# destructive to chunks
|
||||
data = _concatenate_chunks(chunks)
|
||||
|
||||
else:
|
||||
data = self._reader.read(nrows)
|
||||
except StopIteration:
|
||||
if self._first_chunk:
|
||||
self._first_chunk = False
|
||||
names = dedup_names(
|
||||
self.orig_names,
|
||||
is_potential_multi_index(self.orig_names, self.index_col),
|
||||
)
|
||||
index, columns, col_dict = self._get_empty_meta(
|
||||
names,
|
||||
dtype=self.dtype,
|
||||
)
|
||||
columns = self._maybe_make_multi_index_columns(columns, self.col_names)
|
||||
|
||||
if self.usecols is not None:
|
||||
columns = self._filter_usecols(columns)
|
||||
|
||||
col_dict = {k: v for k, v in col_dict.items() if k in columns}
|
||||
|
||||
return index, columns, col_dict
|
||||
|
||||
else:
|
||||
self.close()
|
||||
raise
|
||||
|
||||
# Done with first read, next time raise StopIteration
|
||||
self._first_chunk = False
|
||||
|
||||
# error: Cannot determine type of 'names'
|
||||
names = self.names # type: ignore[has-type]
|
||||
|
||||
if self._reader.leading_cols:
|
||||
if self._has_complex_date_col:
|
||||
raise NotImplementedError("file structure not yet supported")
|
||||
|
||||
# implicit index, no index names
|
||||
arrays = []
|
||||
|
||||
if self.index_col and self._reader.leading_cols != len(self.index_col):
|
||||
raise ParserError(
|
||||
"Could not construct index. Requested to use "
|
||||
f"{len(self.index_col)} number of columns, but "
|
||||
f"{self._reader.leading_cols} left to parse."
|
||||
)
|
||||
|
||||
for i in range(self._reader.leading_cols):
|
||||
if self.index_col is None:
|
||||
values = data.pop(i)
|
||||
else:
|
||||
values = data.pop(self.index_col[i])
|
||||
|
||||
values = self._maybe_parse_dates(values, i, try_parse_dates=True)
|
||||
arrays.append(values)
|
||||
|
||||
index = ensure_index_from_sequences(arrays)
|
||||
|
||||
if self.usecols is not None:
|
||||
names = self._filter_usecols(names)
|
||||
|
||||
names = dedup_names(names, is_potential_multi_index(names, self.index_col))
|
||||
|
||||
# rename dict keys
|
||||
data_tups = sorted(data.items())
|
||||
data = {k: v for k, (i, v) in zip(names, data_tups)}
|
||||
|
||||
column_names, date_data = self._do_date_conversions(names, data)
|
||||
|
||||
# maybe create a mi on the columns
|
||||
column_names = self._maybe_make_multi_index_columns(
|
||||
column_names, self.col_names
|
||||
)
|
||||
|
||||
else:
|
||||
# rename dict keys
|
||||
data_tups = sorted(data.items())
|
||||
|
||||
# ugh, mutation
|
||||
|
||||
# assert for mypy, orig_names is List or None, None would error in list(...)
|
||||
assert self.orig_names is not None
|
||||
names = list(self.orig_names)
|
||||
names = dedup_names(names, is_potential_multi_index(names, self.index_col))
|
||||
|
||||
if self.usecols is not None:
|
||||
names = self._filter_usecols(names)
|
||||
|
||||
# columns as list
|
||||
alldata = [x[1] for x in data_tups]
|
||||
if self.usecols is None:
|
||||
self._check_data_length(names, alldata)
|
||||
|
||||
data = {k: v for k, (i, v) in zip(names, data_tups)}
|
||||
|
||||
names, date_data = self._do_date_conversions(names, data)
|
||||
index, column_names = self._make_index(date_data, alldata, names)
|
||||
|
||||
return index, column_names, date_data
|
||||
|
||||
def _filter_usecols(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
|
||||
# hackish
|
||||
usecols = self._evaluate_usecols(self.usecols, names)
|
||||
if usecols is not None and len(names) != len(usecols):
|
||||
names = [
|
||||
name for i, name in enumerate(names) if i in usecols or name in usecols
|
||||
]
|
||||
return names
|
||||
|
||||
def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True):
|
||||
if try_parse_dates and self._should_parse_dates(index):
|
||||
values = self._date_conv(
|
||||
values,
|
||||
col=self.index_names[index] if self.index_names is not None else None,
|
||||
)
|
||||
return values
|
||||
|
||||
|
||||
def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict:
|
||||
"""
|
||||
Concatenate chunks of data read with low_memory=True.
|
||||
|
||||
The tricky part is handling Categoricals, where different chunks
|
||||
may have different inferred categories.
|
||||
"""
|
||||
names = list(chunks[0].keys())
|
||||
warning_columns = []
|
||||
|
||||
result: dict = {}
|
||||
for name in names:
|
||||
arrs = [chunk.pop(name) for chunk in chunks]
|
||||
# Check each arr for consistent types.
|
||||
dtypes = {a.dtype for a in arrs}
|
||||
non_cat_dtypes = {x for x in dtypes if not isinstance(x, CategoricalDtype)}
|
||||
|
||||
dtype = dtypes.pop()
|
||||
if isinstance(dtype, CategoricalDtype):
|
||||
result[name] = union_categoricals(arrs, sort_categories=False)
|
||||
else:
|
||||
result[name] = concat_compat(arrs)
|
||||
if len(non_cat_dtypes) > 1 and result[name].dtype == np.dtype(object):
|
||||
warning_columns.append(str(name))
|
||||
|
||||
if warning_columns:
|
||||
warning_names = ",".join(warning_columns)
|
||||
warning_message = " ".join(
|
||||
[
|
||||
f"Columns ({warning_names}) have mixed types. "
|
||||
f"Specify dtype option on import or set low_memory=False."
|
||||
]
|
||||
)
|
||||
warnings.warn(warning_message, DtypeWarning, stacklevel=find_stack_level())
|
||||
return result
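To see concretely what the Categorical branch above guards against, here is a minimal standalone sketch (not part of this module; the toy chunks are invented): two chunks of the same column, parsed separately, end up with different inferred categories, and union_categoricals merges the category sets instead of falling back to object dtype.

import pandas as pd
from pandas.api.types import union_categoricals

# two chunks of one CSV column, parsed independently, so each chunk
# carries its own inferred categories
chunk1 = pd.Categorical(["a", "b"])
chunk2 = pd.Categorical(["c"])

# merge the category sets while keeping the per-chunk codes consistent
merged = union_categoricals([chunk1, chunk2], sort_categories=False)
print(merged.categories.tolist())  # ['a', 'b', 'c']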
|
||||
|
||||
|
||||
def ensure_dtype_objs(
|
||||
dtype: DtypeArg | dict[Hashable, DtypeArg] | None
|
||||
) -> DtypeObj | dict[Hashable, DtypeObj] | None:
|
||||
"""
|
||||
Ensure we have either None, a dtype object, or a dictionary mapping to
|
||||
dtype objects.
|
||||
"""
|
||||
if isinstance(dtype, defaultdict):
|
||||
# "None" not callable [misc]
|
||||
default_dtype = pandas_dtype(dtype.default_factory()) # type: ignore[misc]
|
||||
dtype_converted: defaultdict = defaultdict(lambda: default_dtype)
|
||||
for key in dtype.keys():
|
||||
dtype_converted[key] = pandas_dtype(dtype[key])
|
||||
return dtype_converted
|
||||
elif isinstance(dtype, dict):
|
||||
return {k: pandas_dtype(dtype[k]) for k in dtype}
|
||||
elif dtype is not None:
|
||||
return pandas_dtype(dtype)
|
||||
return dtype
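As a rough illustration of the normalization performed here (the column names are invented), pandas_dtype turns strings, Python types, and extension-dtype names into proper dtype objects, which is what the dict branch above does key by key:

from pandas.api.types import pandas_dtype

print(pandas_dtype("int64"))    # int64
print(pandas_dtype(float))      # float64

spec = {"a": "int64", "b": "category"}
converted = {col: pandas_dtype(d) for col, d in spec.items()}
print(converted["b"])           # category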
|
||||
1387
lib/python3.11/site-packages/pandas/io/parsers/python_parser.py
Normal file
File diff suppressed because it is too large
2383
lib/python3.11/site-packages/pandas/io/parsers/readers.py
Normal file
File diff suppressed because it is too large
210
lib/python3.11/site-packages/pandas/io/pickle.py
Normal file
@ -0,0 +1,210 @@
|
||||
""" pickle compat """
|
||||
from __future__ import annotations
|
||||
|
||||
import pickle
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
)
|
||||
import warnings
|
||||
|
||||
from pandas.compat import pickle_compat as pc
|
||||
from pandas.util._decorators import doc
|
||||
|
||||
from pandas.core.shared_docs import _shared_docs
|
||||
|
||||
from pandas.io.common import get_handle
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
CompressionOptions,
|
||||
FilePath,
|
||||
ReadPickleBuffer,
|
||||
StorageOptions,
|
||||
WriteBuffer,
|
||||
)
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
|
||||
|
||||
@doc(
|
||||
storage_options=_shared_docs["storage_options"],
|
||||
compression_options=_shared_docs["compression_options"] % "filepath_or_buffer",
|
||||
)
|
||||
def to_pickle(
|
||||
obj: Any,
|
||||
filepath_or_buffer: FilePath | WriteBuffer[bytes],
|
||||
compression: CompressionOptions = "infer",
|
||||
protocol: int = pickle.HIGHEST_PROTOCOL,
|
||||
storage_options: StorageOptions | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Pickle (serialize) object to file.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
obj : any object
|
||||
Any python object.
|
||||
filepath_or_buffer : str, path object, or file-like object
|
||||
String, path object (implementing ``os.PathLike[str]``), or file-like
|
||||
object implementing a binary ``write()`` function.
|
||||
Also accepts a URL. The URL has to point to an S3 or GCS location.
|
||||
{compression_options}
|
||||
|
||||
.. versionchanged:: 1.4.0 Zstandard support.
|
||||
|
||||
protocol : int
|
||||
Int which indicates which protocol should be used by the pickler,
|
||||
default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
|
||||
values for this parameter depend on the version of Python. For Python
|
||||
2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value.
|
||||
For Python >= 3.4, 4 is a valid value. A negative value for the
|
||||
protocol parameter is equivalent to setting its value to
|
||||
HIGHEST_PROTOCOL.
|
||||
|
||||
{storage_options}
|
||||
|
||||
.. [1] https://docs.python.org/3/library/pickle.html
|
||||
|
||||
See Also
|
||||
--------
|
||||
read_pickle : Load pickled pandas object (or any object) from file.
|
||||
DataFrame.to_hdf : Write DataFrame to an HDF5 file.
|
||||
DataFrame.to_sql : Write DataFrame to a SQL database.
|
||||
DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) # doctest: +SKIP
|
||||
>>> original_df # doctest: +SKIP
|
||||
foo bar
|
||||
0 0 5
|
||||
1 1 6
|
||||
2 2 7
|
||||
3 3 8
|
||||
4 4 9
|
||||
>>> pd.to_pickle(original_df, "./dummy.pkl") # doctest: +SKIP
|
||||
|
||||
>>> unpickled_df = pd.read_pickle("./dummy.pkl") # doctest: +SKIP
|
||||
>>> unpickled_df # doctest: +SKIP
|
||||
foo bar
|
||||
0 0 5
|
||||
1 1 6
|
||||
2 2 7
|
||||
3 3 8
|
||||
4 4 9
|
||||
""" # noqa: E501
|
||||
if protocol < 0:
|
||||
protocol = pickle.HIGHEST_PROTOCOL
|
||||
|
||||
with get_handle(
|
||||
filepath_or_buffer,
|
||||
"wb",
|
||||
compression=compression,
|
||||
is_text=False,
|
||||
storage_options=storage_options,
|
||||
) as handles:
|
||||
# letting pickle write directly to the buffer is more memory-efficient
|
||||
pickle.dump(obj, handles.handle, protocol=protocol)
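A short usage sketch for the compression path (the file name is hypothetical, and writing ".zst" requires the optional zstandard package): with the default compression="infer", get_handle picks the compressor from the file extension before pickle.dump writes into it.

import pandas as pd

df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
# ".zst" is inferred as zstandard compression
pd.to_pickle(df, "frame.pkl.zst")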
|
||||
|
||||
|
||||
@doc(
|
||||
storage_options=_shared_docs["storage_options"],
|
||||
decompression_options=_shared_docs["decompression_options"] % "filepath_or_buffer",
|
||||
)
|
||||
def read_pickle(
|
||||
filepath_or_buffer: FilePath | ReadPickleBuffer,
|
||||
compression: CompressionOptions = "infer",
|
||||
storage_options: StorageOptions | None = None,
|
||||
) -> DataFrame | Series:
|
||||
"""
|
||||
Load pickled pandas object (or any object) from file.
|
||||
|
||||
.. warning::
|
||||
|
||||
Loading pickled data received from untrusted sources can be
|
||||
unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer : str, path object, or file-like object
|
||||
String, path object (implementing ``os.PathLike[str]``), or file-like
|
||||
object implementing a binary ``readlines()`` function.
|
||||
Also accepts a URL. The URL is not limited to S3 and GCS.
|
||||
|
||||
{decompression_options}
|
||||
|
||||
.. versionchanged:: 1.4.0 Zstandard support.
|
||||
|
||||
{storage_options}
|
||||
|
||||
Returns
|
||||
-------
|
||||
same type as object stored in file
|
||||
|
||||
See Also
|
||||
--------
|
||||
DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
|
||||
Series.to_pickle : Pickle (serialize) Series object to file.
|
||||
read_hdf : Read HDF5 file into a DataFrame.
|
||||
read_sql : Read SQL query or database table into a DataFrame.
|
||||
read_parquet : Load a parquet object, returning a DataFrame.
|
||||
|
||||
Notes
|
||||
-----
|
||||
read_pickle is only guaranteed to be backwards compatible to pandas 0.20.3
|
||||
provided the object was serialized with to_pickle.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> original_df = pd.DataFrame(
|
||||
... {{"foo": range(5), "bar": range(5, 10)}}
|
||||
... ) # doctest: +SKIP
|
||||
>>> original_df # doctest: +SKIP
|
||||
foo bar
|
||||
0 0 5
|
||||
1 1 6
|
||||
2 2 7
|
||||
3 3 8
|
||||
4 4 9
|
||||
>>> pd.to_pickle(original_df, "./dummy.pkl") # doctest: +SKIP
|
||||
|
||||
>>> unpickled_df = pd.read_pickle("./dummy.pkl") # doctest: +SKIP
|
||||
>>> unpickled_df # doctest: +SKIP
|
||||
foo bar
|
||||
0 0 5
|
||||
1 1 6
|
||||
2 2 7
|
||||
3 3 8
|
||||
4 4 9
|
||||
"""
|
||||
excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError, TypeError)
|
||||
with get_handle(
|
||||
filepath_or_buffer,
|
||||
"rb",
|
||||
compression=compression,
|
||||
is_text=False,
|
||||
storage_options=storage_options,
|
||||
) as handles:
|
||||
# 1) try standard library Pickle
|
||||
# 2) try pickle_compat (older pandas version) to handle subclass changes
|
||||
# 3) try pickle_compat with latin-1 encoding upon a UnicodeDecodeError
|
||||
|
||||
try:
|
||||
# TypeError for Cython complaints about object.__new__ vs Tick.__new__
|
||||
try:
|
||||
with warnings.catch_warnings(record=True):
|
||||
# We want to silence any warnings about, e.g. moved modules.
|
||||
warnings.simplefilter("ignore", Warning)
|
||||
return pickle.load(handles.handle)
|
||||
except excs_to_catch:
|
||||
# e.g.
|
||||
# "No module named 'pandas.core.sparse.series'"
|
||||
# "Can't get attribute '__nat_unpickle' on <module 'pandas._libs.tslib"
|
||||
return pc.load(handles.handle, encoding=None)
|
||||
except UnicodeDecodeError:
|
||||
# e.g. can occur for files written in py27; see GH#28645 and GH#31988
|
||||
return pc.load(handles.handle, encoding="latin-1")
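The matching read side, continuing the hypothetical frame.pkl.zst from the to_pickle sketch above; decompression is again inferred from the extension, and the fallback chain in the try/except blocks only comes into play for pickles written by much older pandas versions.

import pandas as pd

df = pd.read_pickle("frame.pkl.zst")
print(df.shape)  # (5, 2)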
|
||||
5532
lib/python3.11/site-packages/pandas/io/pytables.py
Normal file
File diff suppressed because it is too large
3
lib/python3.11/site-packages/pandas/io/sas/__init__.py
Normal file
@ -0,0 +1,3 @@
|
||||
from pandas.io.sas.sasreader import read_sas
|
||||
|
||||
__all__ = ["read_sas"]
|
||||
762
lib/python3.11/site-packages/pandas/io/sas/sas7bdat.py
Normal file
@ -0,0 +1,762 @@
|
||||
"""
|
||||
Read SAS7BDAT files
|
||||
|
||||
Based on code written by Jared Hobbs:
|
||||
https://bitbucket.org/jaredhobbs/sas7bdat
|
||||
|
||||
See also:
|
||||
https://github.com/BioStatMatt/sas7bdat
|
||||
|
||||
Partial documentation of the file format:
|
||||
https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf
|
||||
|
||||
Reference for binary data compression:
|
||||
http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import abc
|
||||
from datetime import (
|
||||
datetime,
|
||||
timedelta,
|
||||
)
|
||||
import sys
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._config import get_option
|
||||
|
||||
from pandas._libs.byteswap import (
|
||||
read_double_with_byteswap,
|
||||
read_float_with_byteswap,
|
||||
read_uint16_with_byteswap,
|
||||
read_uint32_with_byteswap,
|
||||
read_uint64_with_byteswap,
|
||||
)
|
||||
from pandas._libs.sas import (
|
||||
Parser,
|
||||
get_subheader_index,
|
||||
)
|
||||
from pandas._libs.tslibs.conversion import cast_from_unit_vectorized
|
||||
from pandas.errors import EmptyDataError
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Timestamp,
|
||||
isna,
|
||||
)
|
||||
|
||||
from pandas.io.common import get_handle
|
||||
import pandas.io.sas.sas_constants as const
|
||||
from pandas.io.sas.sasreader import ReaderBase
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
CompressionOptions,
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
)
|
||||
|
||||
|
||||
_unix_origin = Timestamp("1970-01-01")
|
||||
_sas_origin = Timestamp("1960-01-01")
|
||||
|
||||
|
||||
def _parse_datetime(sas_datetime: float, unit: str):
|
||||
if isna(sas_datetime):
|
||||
return pd.NaT
|
||||
|
||||
if unit == "s":
|
||||
return datetime(1960, 1, 1) + timedelta(seconds=sas_datetime)
|
||||
|
||||
elif unit == "d":
|
||||
return datetime(1960, 1, 1) + timedelta(days=sas_datetime)
|
||||
|
||||
else:
|
||||
raise ValueError("unit must be 'd' or 's'")
|
||||
|
||||
|
||||
def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
|
||||
"""
|
||||
Convert to Timestamp if possible, otherwise to datetime.datetime.
|
||||
SAS float64 lacks precision for more than ms resolution so the fit
|
||||
to datetime.datetime is ok.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sas_datetimes : {Series, Sequence[float]}
|
||||
Dates or datetimes in SAS
|
||||
unit : {'d', 's'}
|
||||
"d" if the floats represent dates, "s" for datetimes
|
||||
|
||||
Returns
|
||||
-------
|
||||
Series
|
||||
Series of datetime64 dtype or datetime.datetime.
|
||||
"""
|
||||
td = (_sas_origin - _unix_origin).as_unit("s")
|
||||
if unit == "s":
|
||||
millis = cast_from_unit_vectorized(
|
||||
sas_datetimes._values, unit="s", out_unit="ms"
|
||||
)
|
||||
dt64ms = millis.view("M8[ms]") + td
|
||||
return pd.Series(dt64ms, index=sas_datetimes.index, copy=False)
|
||||
else:
|
||||
vals = np.array(sas_datetimes, dtype="M8[D]") + td
|
||||
return pd.Series(vals, dtype="M8[s]", index=sas_datetimes.index, copy=False)
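The same 1960-to-1970 epoch shift can be expressed with public pandas API; a small sketch (the values are invented) equivalent in spirit to the "s" branch above:

import pandas as pd

sas_seconds = pd.Series([0.0, 86400.0])  # seconds since 1960-01-01, as SAS stores datetimes
converted = pd.to_datetime(sas_seconds, unit="s", origin="1960-01-01")
print(converted.iloc[0])  # 1960-01-01 00:00:00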
|
||||
|
||||
|
||||
class _Column:
|
||||
col_id: int
|
||||
name: str | bytes
|
||||
label: str | bytes
|
||||
format: str | bytes
|
||||
ctype: bytes
|
||||
length: int
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
col_id: int,
|
||||
# These can be bytes when convert_header_text is False
|
||||
name: str | bytes,
|
||||
label: str | bytes,
|
||||
format: str | bytes,
|
||||
ctype: bytes,
|
||||
length: int,
|
||||
) -> None:
|
||||
self.col_id = col_id
|
||||
self.name = name
|
||||
self.label = label
|
||||
self.format = format
|
||||
self.ctype = ctype
|
||||
self.length = length
|
||||
|
||||
|
||||
# SAS7BDAT represents a SAS data file in SAS7BDAT format.
|
||||
class SAS7BDATReader(ReaderBase, abc.Iterator):
|
||||
"""
|
||||
Read SAS files in SAS7BDAT format.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path_or_buf : path name or buffer
|
||||
Name of SAS file or file-like object pointing to SAS file
|
||||
contents.
|
||||
index : column identifier, defaults to None
|
||||
Column to use as index.
|
||||
convert_dates : bool, defaults to True
|
||||
Attempt to convert dates to Pandas datetime values. Note that
|
||||
some rarely used SAS date formats may be unsupported.
|
||||
blank_missing : bool, defaults to True
|
||||
Convert empty strings to missing values (SAS uses blanks to
|
||||
indicate missing character variables).
|
||||
chunksize : int, defaults to None
|
||||
Return a SAS7BDATReader object for iteration; returns chunks
|
||||
with the given number of rows.
|
||||
encoding : str, 'infer', defaults to None
|
||||
String encoding according to the Python standard encodings;
|
||||
encoding='infer' tries to detect the encoding from the file header,
|
||||
encoding=None will leave the data in binary format.
|
||||
convert_text : bool, defaults to True
|
||||
If False, text variables are left as raw bytes.
|
||||
convert_header_text : bool, defaults to True
|
||||
If False, header text, including column names, is left as raw
|
||||
bytes.
|
||||
"""
|
||||
|
||||
_int_length: int
|
||||
_cached_page: bytes | None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path_or_buf: FilePath | ReadBuffer[bytes],
|
||||
index=None,
|
||||
convert_dates: bool = True,
|
||||
blank_missing: bool = True,
|
||||
chunksize: int | None = None,
|
||||
encoding: str | None = None,
|
||||
convert_text: bool = True,
|
||||
convert_header_text: bool = True,
|
||||
compression: CompressionOptions = "infer",
|
||||
) -> None:
|
||||
self.index = index
|
||||
self.convert_dates = convert_dates
|
||||
self.blank_missing = blank_missing
|
||||
self.chunksize = chunksize
|
||||
self.encoding = encoding
|
||||
self.convert_text = convert_text
|
||||
self.convert_header_text = convert_header_text
|
||||
|
||||
self.default_encoding = "latin-1"
|
||||
self.compression = b""
|
||||
self.column_names_raw: list[bytes] = []
|
||||
self.column_names: list[str | bytes] = []
|
||||
self.column_formats: list[str | bytes] = []
|
||||
self.columns: list[_Column] = []
|
||||
|
||||
self._current_page_data_subheader_pointers: list[tuple[int, int]] = []
|
||||
self._cached_page = None
|
||||
self._column_data_lengths: list[int] = []
|
||||
self._column_data_offsets: list[int] = []
|
||||
self._column_types: list[bytes] = []
|
||||
|
||||
self._current_row_in_file_index = 0
|
||||
self._current_row_on_page_index = 0
|
||||
self._current_row_in_file_index = 0
|
||||
|
||||
self.handles = get_handle(
|
||||
path_or_buf, "rb", is_text=False, compression=compression
|
||||
)
|
||||
|
||||
self._path_or_buf = self.handles.handle
|
||||
|
||||
# Same order as const.SASIndex
|
||||
self._subheader_processors = [
|
||||
self._process_rowsize_subheader,
|
||||
self._process_columnsize_subheader,
|
||||
self._process_subheader_counts,
|
||||
self._process_columntext_subheader,
|
||||
self._process_columnname_subheader,
|
||||
self._process_columnattributes_subheader,
|
||||
self._process_format_subheader,
|
||||
self._process_columnlist_subheader,
|
||||
None, # Data
|
||||
]
|
||||
|
||||
try:
|
||||
self._get_properties()
|
||||
self._parse_metadata()
|
||||
except Exception:
|
||||
self.close()
|
||||
raise
|
||||
|
||||
def column_data_lengths(self) -> np.ndarray:
|
||||
"""Return a numpy int64 array of the column data lengths"""
|
||||
return np.asarray(self._column_data_lengths, dtype=np.int64)
|
||||
|
||||
def column_data_offsets(self) -> np.ndarray:
|
||||
"""Return a numpy int64 array of the column offsets"""
|
||||
return np.asarray(self._column_data_offsets, dtype=np.int64)
|
||||
|
||||
def column_types(self) -> np.ndarray:
|
||||
"""
|
||||
Returns a numpy character array of the column types:
|
||||
s (string) or d (double)
|
||||
"""
|
||||
return np.asarray(self._column_types, dtype=np.dtype("S1"))
|
||||
|
||||
def close(self) -> None:
|
||||
self.handles.close()
|
||||
|
||||
def _get_properties(self) -> None:
|
||||
# Check magic number
|
||||
self._path_or_buf.seek(0)
|
||||
self._cached_page = self._path_or_buf.read(288)
|
||||
if self._cached_page[0 : len(const.magic)] != const.magic:
|
||||
raise ValueError("magic number mismatch (not a SAS file?)")
|
||||
|
||||
# Get alignment information
|
||||
buf = self._read_bytes(const.align_1_offset, const.align_1_length)
|
||||
if buf == const.u64_byte_checker_value:
|
||||
self.U64 = True
|
||||
self._int_length = 8
|
||||
self._page_bit_offset = const.page_bit_offset_x64
|
||||
self._subheader_pointer_length = const.subheader_pointer_length_x64
|
||||
else:
|
||||
self.U64 = False
|
||||
self._page_bit_offset = const.page_bit_offset_x86
|
||||
self._subheader_pointer_length = const.subheader_pointer_length_x86
|
||||
self._int_length = 4
|
||||
buf = self._read_bytes(const.align_2_offset, const.align_2_length)
|
||||
if buf == const.align_1_checker_value:
|
||||
align1 = const.align_2_value
|
||||
else:
|
||||
align1 = 0
|
||||
|
||||
# Get endianness information
|
||||
buf = self._read_bytes(const.endianness_offset, const.endianness_length)
|
||||
if buf == b"\x01":
|
||||
self.byte_order = "<"
|
||||
self.need_byteswap = sys.byteorder == "big"
|
||||
else:
|
||||
self.byte_order = ">"
|
||||
self.need_byteswap = sys.byteorder == "little"
|
||||
|
||||
# Get encoding information
|
||||
buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
|
||||
if buf in const.encoding_names:
|
||||
self.inferred_encoding = const.encoding_names[buf]
|
||||
if self.encoding == "infer":
|
||||
self.encoding = self.inferred_encoding
|
||||
else:
|
||||
self.inferred_encoding = f"unknown (code={buf})"
|
||||
|
||||
# Timestamp is epoch 01/01/1960
|
||||
epoch = datetime(1960, 1, 1)
|
||||
x = self._read_float(
|
||||
const.date_created_offset + align1, const.date_created_length
|
||||
)
|
||||
self.date_created = epoch + pd.to_timedelta(x, unit="s")
|
||||
x = self._read_float(
|
||||
const.date_modified_offset + align1, const.date_modified_length
|
||||
)
|
||||
self.date_modified = epoch + pd.to_timedelta(x, unit="s")
|
||||
|
||||
self.header_length = self._read_uint(
|
||||
const.header_size_offset + align1, const.header_size_length
|
||||
)
|
||||
|
||||
# Read the rest of the header into cached_page.
|
||||
buf = self._path_or_buf.read(self.header_length - 288)
|
||||
self._cached_page += buf
|
||||
# error: Argument 1 to "len" has incompatible type "Optional[bytes]";
|
||||
# expected "Sized"
|
||||
if len(self._cached_page) != self.header_length: # type: ignore[arg-type]
|
||||
raise ValueError("The SAS7BDAT file appears to be truncated.")
|
||||
|
||||
self._page_length = self._read_uint(
|
||||
const.page_size_offset + align1, const.page_size_length
|
||||
)
|
||||
|
||||
def __next__(self) -> DataFrame:
|
||||
da = self.read(nrows=self.chunksize or 1)
|
||||
if da.empty:
|
||||
self.close()
|
||||
raise StopIteration
|
||||
return da
|
||||
|
||||
# Read a single float of the given width (4 or 8).
|
||||
def _read_float(self, offset: int, width: int):
|
||||
assert self._cached_page is not None
|
||||
if width == 4:
|
||||
return read_float_with_byteswap(
|
||||
self._cached_page, offset, self.need_byteswap
|
||||
)
|
||||
elif width == 8:
|
||||
return read_double_with_byteswap(
|
||||
self._cached_page, offset, self.need_byteswap
|
||||
)
|
||||
else:
|
||||
self.close()
|
||||
raise ValueError("invalid float width")
|
||||
|
||||
# Read a single unsigned integer of the given width (1, 2, 4 or 8).
|
||||
def _read_uint(self, offset: int, width: int) -> int:
|
||||
assert self._cached_page is not None
|
||||
if width == 1:
|
||||
return self._read_bytes(offset, 1)[0]
|
||||
elif width == 2:
|
||||
return read_uint16_with_byteswap(
|
||||
self._cached_page, offset, self.need_byteswap
|
||||
)
|
||||
elif width == 4:
|
||||
return read_uint32_with_byteswap(
|
||||
self._cached_page, offset, self.need_byteswap
|
||||
)
|
||||
elif width == 8:
|
||||
return read_uint64_with_byteswap(
|
||||
self._cached_page, offset, self.need_byteswap
|
||||
)
|
||||
else:
|
||||
self.close()
|
||||
raise ValueError("invalid int width")
|
||||
|
||||
def _read_bytes(self, offset: int, length: int):
|
||||
assert self._cached_page is not None
|
||||
if offset + length > len(self._cached_page):
|
||||
self.close()
|
||||
raise ValueError("The cached page is too small.")
|
||||
return self._cached_page[offset : offset + length]
|
||||
|
||||
def _read_and_convert_header_text(self, offset: int, length: int) -> str | bytes:
|
||||
return self._convert_header_text(
|
||||
self._read_bytes(offset, length).rstrip(b"\x00 ")
|
||||
)
|
||||
|
||||
def _parse_metadata(self) -> None:
|
||||
done = False
|
||||
while not done:
|
||||
self._cached_page = self._path_or_buf.read(self._page_length)
|
||||
if len(self._cached_page) <= 0:
|
||||
break
|
||||
if len(self._cached_page) != self._page_length:
|
||||
raise ValueError("Failed to read a meta data page from the SAS file.")
|
||||
done = self._process_page_meta()
|
||||
|
||||
def _process_page_meta(self) -> bool:
|
||||
self._read_page_header()
|
||||
pt = const.page_meta_types + [const.page_amd_type, const.page_mix_type]
|
||||
if self._current_page_type in pt:
|
||||
self._process_page_metadata()
|
||||
is_data_page = self._current_page_type == const.page_data_type
|
||||
is_mix_page = self._current_page_type == const.page_mix_type
|
||||
return bool(
|
||||
is_data_page
|
||||
or is_mix_page
|
||||
or self._current_page_data_subheader_pointers != []
|
||||
)
|
||||
|
||||
def _read_page_header(self) -> None:
|
||||
bit_offset = self._page_bit_offset
|
||||
tx = const.page_type_offset + bit_offset
|
||||
self._current_page_type = (
|
||||
self._read_uint(tx, const.page_type_length) & const.page_type_mask2
|
||||
)
|
||||
tx = const.block_count_offset + bit_offset
|
||||
self._current_page_block_count = self._read_uint(tx, const.block_count_length)
|
||||
tx = const.subheader_count_offset + bit_offset
|
||||
self._current_page_subheaders_count = self._read_uint(
|
||||
tx, const.subheader_count_length
|
||||
)
|
||||
|
||||
def _process_page_metadata(self) -> None:
|
||||
bit_offset = self._page_bit_offset
|
||||
|
||||
for i in range(self._current_page_subheaders_count):
|
||||
offset = const.subheader_pointers_offset + bit_offset
|
||||
total_offset = offset + self._subheader_pointer_length * i
|
||||
|
||||
subheader_offset = self._read_uint(total_offset, self._int_length)
|
||||
total_offset += self._int_length
|
||||
|
||||
subheader_length = self._read_uint(total_offset, self._int_length)
|
||||
total_offset += self._int_length
|
||||
|
||||
subheader_compression = self._read_uint(total_offset, 1)
|
||||
total_offset += 1
|
||||
|
||||
subheader_type = self._read_uint(total_offset, 1)
|
||||
|
||||
if (
|
||||
subheader_length == 0
|
||||
or subheader_compression == const.truncated_subheader_id
|
||||
):
|
||||
continue
|
||||
|
||||
subheader_signature = self._read_bytes(subheader_offset, self._int_length)
|
||||
subheader_index = get_subheader_index(subheader_signature)
|
||||
subheader_processor = self._subheader_processors[subheader_index]
|
||||
|
||||
if subheader_processor is None:
|
||||
f1 = subheader_compression in (const.compressed_subheader_id, 0)
|
||||
f2 = subheader_type == const.compressed_subheader_type
|
||||
if self.compression and f1 and f2:
|
||||
self._current_page_data_subheader_pointers.append(
|
||||
(subheader_offset, subheader_length)
|
||||
)
|
||||
else:
|
||||
self.close()
|
||||
raise ValueError(
|
||||
f"Unknown subheader signature {subheader_signature}"
|
||||
)
|
||||
else:
|
||||
subheader_processor(subheader_offset, subheader_length)
|
||||
|
||||
def _process_rowsize_subheader(self, offset: int, length: int) -> None:
|
||||
int_len = self._int_length
|
||||
lcs_offset = offset
|
||||
lcp_offset = offset
|
||||
if self.U64:
|
||||
lcs_offset += 682
|
||||
lcp_offset += 706
|
||||
else:
|
||||
lcs_offset += 354
|
||||
lcp_offset += 378
|
||||
|
||||
self.row_length = self._read_uint(
|
||||
offset + const.row_length_offset_multiplier * int_len,
|
||||
int_len,
|
||||
)
|
||||
self.row_count = self._read_uint(
|
||||
offset + const.row_count_offset_multiplier * int_len,
|
||||
int_len,
|
||||
)
|
||||
self.col_count_p1 = self._read_uint(
|
||||
offset + const.col_count_p1_multiplier * int_len, int_len
|
||||
)
|
||||
self.col_count_p2 = self._read_uint(
|
||||
offset + const.col_count_p2_multiplier * int_len, int_len
|
||||
)
|
||||
mx = const.row_count_on_mix_page_offset_multiplier * int_len
|
||||
self._mix_page_row_count = self._read_uint(offset + mx, int_len)
|
||||
self._lcs = self._read_uint(lcs_offset, 2)
|
||||
self._lcp = self._read_uint(lcp_offset, 2)
|
||||
|
||||
def _process_columnsize_subheader(self, offset: int, length: int) -> None:
|
||||
int_len = self._int_length
|
||||
offset += int_len
|
||||
self.column_count = self._read_uint(offset, int_len)
|
||||
if self.col_count_p1 + self.col_count_p2 != self.column_count:
|
||||
print(
|
||||
f"Warning: column count mismatch ({self.col_count_p1} + "
|
||||
f"{self.col_count_p2} != {self.column_count})\n"
|
||||
)
|
||||
|
||||
# Unknown purpose
|
||||
def _process_subheader_counts(self, offset: int, length: int) -> None:
|
||||
pass
|
||||
|
||||
def _process_columntext_subheader(self, offset: int, length: int) -> None:
|
||||
offset += self._int_length
|
||||
text_block_size = self._read_uint(offset, const.text_block_size_length)
|
||||
|
||||
buf = self._read_bytes(offset, text_block_size)
|
||||
cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
|
||||
self.column_names_raw.append(cname_raw)
|
||||
|
||||
if len(self.column_names_raw) == 1:
|
||||
compression_literal = b""
|
||||
for cl in const.compression_literals:
|
||||
if cl in cname_raw:
|
||||
compression_literal = cl
|
||||
self.compression = compression_literal
|
||||
offset -= self._int_length
|
||||
|
||||
offset1 = offset + 16
|
||||
if self.U64:
|
||||
offset1 += 4
|
||||
|
||||
buf = self._read_bytes(offset1, self._lcp)
|
||||
compression_literal = buf.rstrip(b"\x00")
|
||||
if compression_literal == b"":
|
||||
self._lcs = 0
|
||||
offset1 = offset + 32
|
||||
if self.U64:
|
||||
offset1 += 4
|
||||
buf = self._read_bytes(offset1, self._lcp)
|
||||
self.creator_proc = buf[0 : self._lcp]
|
||||
elif compression_literal == const.rle_compression:
|
||||
offset1 = offset + 40
|
||||
if self.U64:
|
||||
offset1 += 4
|
||||
buf = self._read_bytes(offset1, self._lcp)
|
||||
self.creator_proc = buf[0 : self._lcp]
|
||||
elif self._lcs > 0:
|
||||
self._lcp = 0
|
||||
offset1 = offset + 16
|
||||
if self.U64:
|
||||
offset1 += 4
|
||||
buf = self._read_bytes(offset1, self._lcs)
|
||||
self.creator_proc = buf[0 : self._lcp]
|
||||
if hasattr(self, "creator_proc"):
|
||||
self.creator_proc = self._convert_header_text(self.creator_proc)
|
||||
|
||||
def _process_columnname_subheader(self, offset: int, length: int) -> None:
|
||||
int_len = self._int_length
|
||||
offset += int_len
|
||||
column_name_pointers_count = (length - 2 * int_len - 12) // 8
|
||||
for i in range(column_name_pointers_count):
|
||||
text_subheader = (
|
||||
offset
|
||||
+ const.column_name_pointer_length * (i + 1)
|
||||
+ const.column_name_text_subheader_offset
|
||||
)
|
||||
col_name_offset = (
|
||||
offset
|
||||
+ const.column_name_pointer_length * (i + 1)
|
||||
+ const.column_name_offset_offset
|
||||
)
|
||||
col_name_length = (
|
||||
offset
|
||||
+ const.column_name_pointer_length * (i + 1)
|
||||
+ const.column_name_length_offset
|
||||
)
|
||||
|
||||
idx = self._read_uint(
|
||||
text_subheader, const.column_name_text_subheader_length
|
||||
)
|
||||
col_offset = self._read_uint(
|
||||
col_name_offset, const.column_name_offset_length
|
||||
)
|
||||
col_len = self._read_uint(col_name_length, const.column_name_length_length)
|
||||
|
||||
name_raw = self.column_names_raw[idx]
|
||||
cname = name_raw[col_offset : col_offset + col_len]
|
||||
self.column_names.append(self._convert_header_text(cname))
|
||||
|
||||
def _process_columnattributes_subheader(self, offset: int, length: int) -> None:
|
||||
int_len = self._int_length
|
||||
column_attributes_vectors_count = (length - 2 * int_len - 12) // (int_len + 8)
|
||||
for i in range(column_attributes_vectors_count):
|
||||
col_data_offset = (
|
||||
offset + int_len + const.column_data_offset_offset + i * (int_len + 8)
|
||||
)
|
||||
col_data_len = (
|
||||
offset
|
||||
+ 2 * int_len
|
||||
+ const.column_data_length_offset
|
||||
+ i * (int_len + 8)
|
||||
)
|
||||
col_types = (
|
||||
offset + 2 * int_len + const.column_type_offset + i * (int_len + 8)
|
||||
)
|
||||
|
||||
x = self._read_uint(col_data_offset, int_len)
|
||||
self._column_data_offsets.append(x)
|
||||
|
||||
x = self._read_uint(col_data_len, const.column_data_length_length)
|
||||
self._column_data_lengths.append(x)
|
||||
|
||||
x = self._read_uint(col_types, const.column_type_length)
|
||||
self._column_types.append(b"d" if x == 1 else b"s")
|
||||
|
||||
def _process_columnlist_subheader(self, offset: int, length: int) -> None:
|
||||
# unknown purpose
|
||||
pass
|
||||
|
||||
def _process_format_subheader(self, offset: int, length: int) -> None:
|
||||
int_len = self._int_length
|
||||
text_subheader_format = (
|
||||
offset + const.column_format_text_subheader_index_offset + 3 * int_len
|
||||
)
|
||||
col_format_offset = offset + const.column_format_offset_offset + 3 * int_len
|
||||
col_format_len = offset + const.column_format_length_offset + 3 * int_len
|
||||
text_subheader_label = (
|
||||
offset + const.column_label_text_subheader_index_offset + 3 * int_len
|
||||
)
|
||||
col_label_offset = offset + const.column_label_offset_offset + 3 * int_len
|
||||
col_label_len = offset + const.column_label_length_offset + 3 * int_len
|
||||
|
||||
x = self._read_uint(
|
||||
text_subheader_format, const.column_format_text_subheader_index_length
|
||||
)
|
||||
format_idx = min(x, len(self.column_names_raw) - 1)
|
||||
|
||||
format_start = self._read_uint(
|
||||
col_format_offset, const.column_format_offset_length
|
||||
)
|
||||
format_len = self._read_uint(col_format_len, const.column_format_length_length)
|
||||
|
||||
label_idx = self._read_uint(
|
||||
text_subheader_label, const.column_label_text_subheader_index_length
|
||||
)
|
||||
label_idx = min(label_idx, len(self.column_names_raw) - 1)
|
||||
|
||||
label_start = self._read_uint(
|
||||
col_label_offset, const.column_label_offset_length
|
||||
)
|
||||
label_len = self._read_uint(col_label_len, const.column_label_length_length)
|
||||
|
||||
label_names = self.column_names_raw[label_idx]
|
||||
column_label = self._convert_header_text(
|
||||
label_names[label_start : label_start + label_len]
|
||||
)
|
||||
format_names = self.column_names_raw[format_idx]
|
||||
column_format = self._convert_header_text(
|
||||
format_names[format_start : format_start + format_len]
|
||||
)
|
||||
current_column_number = len(self.columns)
|
||||
|
||||
col = _Column(
|
||||
current_column_number,
|
||||
self.column_names[current_column_number],
|
||||
column_label,
|
||||
column_format,
|
||||
self._column_types[current_column_number],
|
||||
self._column_data_lengths[current_column_number],
|
||||
)
|
||||
|
||||
self.column_formats.append(column_format)
|
||||
self.columns.append(col)
|
||||
|
||||
def read(self, nrows: int | None = None) -> DataFrame:
|
||||
if (nrows is None) and (self.chunksize is not None):
|
||||
nrows = self.chunksize
|
||||
elif nrows is None:
|
||||
nrows = self.row_count
|
||||
|
||||
if len(self._column_types) == 0:
|
||||
self.close()
|
||||
raise EmptyDataError("No columns to parse from file")
|
||||
|
||||
if nrows > 0 and self._current_row_in_file_index >= self.row_count:
|
||||
return DataFrame()
|
||||
|
||||
nrows = min(nrows, self.row_count - self._current_row_in_file_index)
|
||||
|
||||
nd = self._column_types.count(b"d")
|
||||
ns = self._column_types.count(b"s")
|
||||
|
||||
self._string_chunk = np.empty((ns, nrows), dtype=object)
|
||||
self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8)
|
||||
|
||||
self._current_row_in_chunk_index = 0
|
||||
p = Parser(self)
|
||||
p.read(nrows)
|
||||
|
||||
rslt = self._chunk_to_dataframe()
|
||||
if self.index is not None:
|
||||
rslt = rslt.set_index(self.index)
|
||||
|
||||
return rslt
|
||||
|
||||
def _read_next_page(self):
|
||||
self._current_page_data_subheader_pointers = []
|
||||
self._cached_page = self._path_or_buf.read(self._page_length)
|
||||
if len(self._cached_page) <= 0:
|
||||
return True
|
||||
elif len(self._cached_page) != self._page_length:
|
||||
self.close()
|
||||
msg = (
|
||||
"failed to read complete page from file (read "
|
||||
f"{len(self._cached_page):d} of {self._page_length:d} bytes)"
|
||||
)
|
||||
raise ValueError(msg)
|
||||
|
||||
self._read_page_header()
|
||||
if self._current_page_type in const.page_meta_types:
|
||||
self._process_page_metadata()
|
||||
|
||||
if self._current_page_type not in const.page_meta_types + [
|
||||
const.page_data_type,
|
||||
const.page_mix_type,
|
||||
]:
|
||||
return self._read_next_page()
|
||||
|
||||
return False
|
||||
|
||||
def _chunk_to_dataframe(self) -> DataFrame:
|
||||
n = self._current_row_in_chunk_index
|
||||
m = self._current_row_in_file_index
|
||||
ix = range(m - n, m)
|
||||
rslt = {}
|
||||
|
||||
js, jb = 0, 0
|
||||
infer_string = get_option("future.infer_string")
|
||||
for j in range(self.column_count):
|
||||
name = self.column_names[j]
|
||||
|
||||
if self._column_types[j] == b"d":
|
||||
col_arr = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d")
|
||||
rslt[name] = pd.Series(col_arr, dtype=np.float64, index=ix, copy=False)
|
||||
if self.convert_dates:
|
||||
if self.column_formats[j] in const.sas_date_formats:
|
||||
rslt[name] = _convert_datetimes(rslt[name], "d")
|
||||
elif self.column_formats[j] in const.sas_datetime_formats:
|
||||
rslt[name] = _convert_datetimes(rslt[name], "s")
|
||||
jb += 1
|
||||
elif self._column_types[j] == b"s":
|
||||
rslt[name] = pd.Series(self._string_chunk[js, :], index=ix, copy=False)
|
||||
if self.convert_text and (self.encoding is not None):
|
||||
rslt[name] = self._decode_string(rslt[name].str)
|
||||
if infer_string:
|
||||
rslt[name] = rslt[name].astype("str")
|
||||
|
||||
js += 1
|
||||
else:
|
||||
self.close()
|
||||
raise ValueError(f"unknown column type {repr(self._column_types[j])}")
|
||||
|
||||
df = DataFrame(rslt, columns=self.column_names, index=ix, copy=False)
|
||||
return df
|
||||
|
||||
def _decode_string(self, b):
|
||||
return b.decode(self.encoding or self.default_encoding)
|
||||
|
||||
def _convert_header_text(self, b: bytes) -> str | bytes:
|
||||
if self.convert_header_text:
|
||||
return self._decode_string(b)
|
||||
else:
|
||||
return b
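For completeness, a hedged usage sketch of the usual entry point (the file name is hypothetical); pd.read_sas dispatches to this reader for .sas7bdat files, and with a chunksize the reader closes itself once iteration is exhausted, as __next__ above shows.

import pandas as pd

# read in bounded chunks; each chunk is an ordinary DataFrame
reader = pd.read_sas("example.sas7bdat", chunksize=10_000)
for chunk in reader:
    print(chunk.shape)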
|
||||
310
lib/python3.11/site-packages/pandas/io/sas/sas_constants.py
Normal file
@ -0,0 +1,310 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Final
|
||||
|
||||
magic: Final = (
|
||||
b"\x00\x00\x00\x00\x00\x00\x00\x00"
|
||||
b"\x00\x00\x00\x00\xc2\xea\x81\x60"
|
||||
b"\xb3\x14\x11\xcf\xbd\x92\x08\x00"
|
||||
b"\x09\xc7\x31\x8c\x18\x1f\x10\x11"
|
||||
)
|
||||
|
||||
align_1_checker_value: Final = b"3"
|
||||
align_1_offset: Final = 32
|
||||
align_1_length: Final = 1
|
||||
align_1_value: Final = 4
|
||||
u64_byte_checker_value: Final = b"3"
|
||||
align_2_offset: Final = 35
|
||||
align_2_length: Final = 1
|
||||
align_2_value: Final = 4
|
||||
endianness_offset: Final = 37
|
||||
endianness_length: Final = 1
|
||||
platform_offset: Final = 39
|
||||
platform_length: Final = 1
|
||||
encoding_offset: Final = 70
|
||||
encoding_length: Final = 1
|
||||
dataset_offset: Final = 92
|
||||
dataset_length: Final = 64
|
||||
file_type_offset: Final = 156
|
||||
file_type_length: Final = 8
|
||||
date_created_offset: Final = 164
|
||||
date_created_length: Final = 8
|
||||
date_modified_offset: Final = 172
|
||||
date_modified_length: Final = 8
|
||||
header_size_offset: Final = 196
|
||||
header_size_length: Final = 4
|
||||
page_size_offset: Final = 200
|
||||
page_size_length: Final = 4
|
||||
page_count_offset: Final = 204
|
||||
page_count_length: Final = 4
|
||||
sas_release_offset: Final = 216
|
||||
sas_release_length: Final = 8
|
||||
sas_server_type_offset: Final = 224
|
||||
sas_server_type_length: Final = 16
|
||||
os_version_number_offset: Final = 240
|
||||
os_version_number_length: Final = 16
|
||||
os_maker_offset: Final = 256
|
||||
os_maker_length: Final = 16
|
||||
os_name_offset: Final = 272
|
||||
os_name_length: Final = 16
|
||||
page_bit_offset_x86: Final = 16
|
||||
page_bit_offset_x64: Final = 32
|
||||
subheader_pointer_length_x86: Final = 12
|
||||
subheader_pointer_length_x64: Final = 24
|
||||
page_type_offset: Final = 0
|
||||
page_type_length: Final = 2
|
||||
block_count_offset: Final = 2
|
||||
block_count_length: Final = 2
|
||||
subheader_count_offset: Final = 4
|
||||
subheader_count_length: Final = 2
|
||||
page_type_mask: Final = 0x0F00
|
||||
# Keep "page_comp_type" bits
|
||||
page_type_mask2: Final = 0xF000 | page_type_mask
|
||||
page_meta_type: Final = 0x0000
|
||||
page_data_type: Final = 0x0100
|
||||
page_mix_type: Final = 0x0200
|
||||
page_amd_type: Final = 0x0400
|
||||
page_meta2_type: Final = 0x4000
|
||||
page_comp_type: Final = 0x9000
|
||||
page_meta_types: Final = [page_meta_type, page_meta2_type]
|
||||
subheader_pointers_offset: Final = 8
|
||||
truncated_subheader_id: Final = 1
|
||||
compressed_subheader_id: Final = 4
|
||||
compressed_subheader_type: Final = 1
|
||||
text_block_size_length: Final = 2
|
||||
row_length_offset_multiplier: Final = 5
|
||||
row_count_offset_multiplier: Final = 6
|
||||
col_count_p1_multiplier: Final = 9
|
||||
col_count_p2_multiplier: Final = 10
|
||||
row_count_on_mix_page_offset_multiplier: Final = 15
|
||||
column_name_pointer_length: Final = 8
|
||||
column_name_text_subheader_offset: Final = 0
|
||||
column_name_text_subheader_length: Final = 2
|
||||
column_name_offset_offset: Final = 2
|
||||
column_name_offset_length: Final = 2
|
||||
column_name_length_offset: Final = 4
|
||||
column_name_length_length: Final = 2
|
||||
column_data_offset_offset: Final = 8
|
||||
column_data_length_offset: Final = 8
|
||||
column_data_length_length: Final = 4
|
||||
column_type_offset: Final = 14
|
||||
column_type_length: Final = 1
|
||||
column_format_text_subheader_index_offset: Final = 22
|
||||
column_format_text_subheader_index_length: Final = 2
|
||||
column_format_offset_offset: Final = 24
|
||||
column_format_offset_length: Final = 2
|
||||
column_format_length_offset: Final = 26
|
||||
column_format_length_length: Final = 2
|
||||
column_label_text_subheader_index_offset: Final = 28
|
||||
column_label_text_subheader_index_length: Final = 2
|
||||
column_label_offset_offset: Final = 30
|
||||
column_label_offset_length: Final = 2
|
||||
column_label_length_offset: Final = 32
|
||||
column_label_length_length: Final = 2
|
||||
rle_compression: Final = b"SASYZCRL"
|
||||
rdc_compression: Final = b"SASYZCR2"
|
||||
|
||||
compression_literals: Final = [rle_compression, rdc_compression]
|
||||
|
||||
# Incomplete list of encodings, using SAS nomenclature:
|
||||
# https://support.sas.com/documentation/onlinedoc/dfdmstudio/2.6/dmpdmsug/Content/dfU_Encodings_SAS.html
|
||||
# corresponding to the Python documentation of standard encodings
|
||||
# https://docs.python.org/3/library/codecs.html#standard-encodings
|
||||
encoding_names: Final = {
|
||||
20: "utf-8",
|
||||
29: "latin1",
|
||||
30: "latin2",
|
||||
31: "latin3",
|
||||
32: "latin4",
|
||||
33: "cyrillic",
|
||||
34: "arabic",
|
||||
35: "greek",
|
||||
36: "hebrew",
|
||||
37: "latin5",
|
||||
38: "latin6",
|
||||
39: "cp874",
|
||||
40: "latin9",
|
||||
41: "cp437",
|
||||
42: "cp850",
|
||||
43: "cp852",
|
||||
44: "cp857",
|
||||
45: "cp858",
|
||||
46: "cp862",
|
||||
47: "cp864",
|
||||
48: "cp865",
|
||||
49: "cp866",
|
||||
50: "cp869",
|
||||
51: "cp874",
|
||||
# 52: "", # not found
|
||||
# 53: "", # not found
|
||||
# 54: "", # not found
|
||||
55: "cp720",
|
||||
56: "cp737",
|
||||
57: "cp775",
|
||||
58: "cp860",
|
||||
59: "cp863",
|
||||
60: "cp1250",
|
||||
61: "cp1251",
|
||||
62: "cp1252",
|
||||
63: "cp1253",
|
||||
64: "cp1254",
|
||||
65: "cp1255",
|
||||
66: "cp1256",
|
||||
67: "cp1257",
|
||||
68: "cp1258",
|
||||
118: "cp950",
|
||||
# 119: "", # not found
|
||||
123: "big5",
|
||||
125: "gb2312",
|
||||
126: "cp936",
|
||||
134: "euc_jp",
|
||||
136: "cp932",
|
||||
138: "shift_jis",
|
||||
140: "euc-kr",
|
||||
141: "cp949",
|
||||
227: "latin8",
|
||||
# 228: "", # not found
|
||||
# 229: "" # not found
|
||||
}
|
||||
|
||||
|
||||
class SASIndex:
|
||||
row_size_index: Final = 0
|
||||
column_size_index: Final = 1
|
||||
subheader_counts_index: Final = 2
|
||||
column_text_index: Final = 3
|
||||
column_name_index: Final = 4
|
||||
column_attributes_index: Final = 5
|
||||
format_and_label_index: Final = 6
|
||||
column_list_index: Final = 7
|
||||
data_subheader_index: Final = 8
|
||||
|
||||
|
||||
subheader_signature_to_index: Final = {
|
||||
b"\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
|
||||
b"\x00\x00\x00\x00\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
|
||||
b"\xF7\xF7\xF7\xF7\x00\x00\x00\x00": SASIndex.row_size_index,
|
||||
b"\xF7\xF7\xF7\xF7\xFF\xFF\xFB\xFE": SASIndex.row_size_index,
|
||||
b"\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
|
||||
b"\x00\x00\x00\x00\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
|
||||
b"\xF6\xF6\xF6\xF6\x00\x00\x00\x00": SASIndex.column_size_index,
|
||||
b"\xF6\xF6\xF6\xF6\xFF\xFF\xFB\xFE": SASIndex.column_size_index,
|
||||
b"\x00\xFC\xFF\xFF": SASIndex.subheader_counts_index,
|
||||
b"\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
|
||||
b"\x00\xFC\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.subheader_counts_index,
|
||||
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
|
||||
b"\xFD\xFF\xFF\xFF": SASIndex.column_text_index,
|
||||
b"\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
|
||||
b"\xFD\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_text_index,
|
||||
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
|
||||
b"\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
|
||||
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
|
||||
b"\xFC\xFF\xFF\xFF": SASIndex.column_attributes_index,
|
||||
b"\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
|
||||
b"\xFC\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_attributes_index,
|
||||
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
|
||||
b"\xFE\xFB\xFF\xFF": SASIndex.format_and_label_index,
|
||||
b"\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
|
||||
b"\xFE\xFB\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.format_and_label_index,
|
||||
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
|
||||
b"\xFE\xFF\xFF\xFF": SASIndex.column_list_index,
|
||||
b"\xFF\xFF\xFF\xFE": SASIndex.column_list_index,
|
||||
b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_list_index,
|
||||
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": SASIndex.column_list_index,
|
||||
}
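For orientation, the lookup that the Cython helper get_subheader_index performs over this table can be approximated in pure Python as below; this is a sketch only, and the shipped implementation in pandas._libs.sas may differ in detail.

def _lookup_subheader_index(signature: bytes) -> int:
    # known metadata subheaders are identified by their signature bytes;
    # anything unrecognized is treated as a data subheader
    if signature in subheader_signature_to_index:
        return subheader_signature_to_index[signature]
    return SASIndex.data_subheader_index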
|
||||
|
||||
|
||||
# List of frequently used SAS date and datetime formats
|
||||
# http://support.sas.com/documentation/cdl/en/etsug/60372/HTML/default/viewer.htm#etsug_intervals_sect009.htm
|
||||
# https://github.com/epam/parso/blob/master/src/main/java/com/epam/parso/impl/SasFileConstants.java
|
||||
sas_date_formats: Final = (
|
||||
"DATE",
|
||||
"DAY",
|
||||
"DDMMYY",
|
||||
"DOWNAME",
|
||||
"JULDAY",
|
||||
"JULIAN",
|
||||
"MMDDYY",
|
||||
"MMYY",
|
||||
"MMYYC",
|
||||
"MMYYD",
|
||||
"MMYYP",
|
||||
"MMYYS",
|
||||
"MMYYN",
|
||||
"MONNAME",
|
||||
"MONTH",
|
||||
"MONYY",
|
||||
"QTR",
|
||||
"QTRR",
|
||||
"NENGO",
|
||||
"WEEKDATE",
|
||||
"WEEKDATX",
|
||||
"WEEKDAY",
|
||||
"WEEKV",
|
||||
"WORDDATE",
|
||||
"WORDDATX",
|
||||
"YEAR",
|
||||
"YYMM",
|
||||
"YYMMC",
|
||||
"YYMMD",
|
||||
"YYMMP",
|
||||
"YYMMS",
|
||||
"YYMMN",
|
||||
"YYMON",
|
||||
"YYMMDD",
|
||||
"YYQ",
|
||||
"YYQC",
|
||||
"YYQD",
|
||||
"YYQP",
|
||||
"YYQS",
|
||||
"YYQN",
|
||||
"YYQR",
|
||||
"YYQRC",
|
||||
"YYQRD",
|
||||
"YYQRP",
|
||||
"YYQRS",
|
||||
"YYQRN",
|
||||
"YYMMDDP",
|
||||
"YYMMDDC",
|
||||
"E8601DA",
|
||||
"YYMMDDN",
|
||||
"MMDDYYC",
|
||||
"MMDDYYS",
|
||||
"MMDDYYD",
|
||||
"YYMMDDS",
|
||||
"B8601DA",
|
||||
"DDMMYYN",
|
||||
"YYMMDDD",
|
||||
"DDMMYYB",
|
||||
"DDMMYYP",
|
||||
"MMDDYYP",
|
||||
"YYMMDDB",
|
||||
"MMDDYYN",
|
||||
"DDMMYYC",
|
||||
"DDMMYYD",
|
||||
"DDMMYYS",
|
||||
"MINGUO",
|
||||
)
|
||||
|
||||
sas_datetime_formats: Final = (
|
||||
"DATETIME",
|
||||
"DTWKDATX",
|
||||
"B8601DN",
|
||||
"B8601DT",
|
||||
"B8601DX",
|
||||
"B8601DZ",
|
||||
"B8601LX",
|
||||
"E8601DN",
|
||||
"E8601DT",
|
||||
"E8601DX",
|
||||
"E8601DZ",
|
||||
"E8601LX",
|
||||
"DATEAMPM",
|
||||
"DTDATE",
|
||||
"DTMONYY",
|
||||
"DTMONYY",
|
||||
"DTWKDATX",
|
||||
"DTYEAR",
|
||||
"TOD",
|
||||
"MDYAMPM",
|
||||
)
|
||||
508
lib/python3.11/site-packages/pandas/io/sas/sas_xport.py
Normal file
@ -0,0 +1,508 @@
|
||||
"""
|
||||
Read a SAS XPort format file into a Pandas DataFrame.
|
||||
|
||||
Based on code from Jack Cushman (github.com/jcushman/xport).
|
||||
|
||||
The file format is defined here:
|
||||
|
||||
https://support.sas.com/content/dam/SAS/support/en/technical-papers/record-layout-of-a-sas-version-5-or-6-data-set-in-sas-transport-xport-format.pdf
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import abc
|
||||
from datetime import datetime
|
||||
import struct
|
||||
from typing import TYPE_CHECKING
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.util._decorators import Appender
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from pandas.io.common import get_handle
|
||||
from pandas.io.sas.sasreader import ReaderBase
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas._typing import (
|
||||
CompressionOptions,
|
||||
DatetimeNaTType,
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
)
|
||||
_correct_line1 = (
|
||||
"HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!"
|
||||
"000000000000000000000000000000 "
|
||||
)
|
||||
_correct_header1 = (
|
||||
"HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!000000000000000001600000000"
|
||||
)
|
||||
_correct_header2 = (
|
||||
"HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!"
|
||||
"000000000000000000000000000000 "
|
||||
)
|
||||
_correct_obs_header = (
|
||||
"HEADER RECORD*******OBS HEADER RECORD!!!!!!!"
|
||||
"000000000000000000000000000000 "
|
||||
)
|
||||
_fieldkeys = [
|
||||
"ntype",
|
||||
"nhfun",
|
||||
"field_length",
|
||||
"nvar0",
|
||||
"name",
|
||||
"label",
|
||||
"nform",
|
||||
"nfl",
|
||||
"num_decimals",
|
||||
"nfj",
|
||||
"nfill",
|
||||
"niform",
|
||||
"nifl",
|
||||
"nifd",
|
||||
"npos",
|
||||
"_",
|
||||
]
|
||||
|
||||
|
||||
_base_params_doc = """\
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer : str or file-like object
|
||||
Path to SAS file or object implementing binary read method."""
|
||||
|
||||
_params2_doc = """\
|
||||
index : identifier of index column
|
||||
Identifier of column that should be used as index of the DataFrame.
|
||||
encoding : str
|
||||
Encoding for text data.
|
||||
chunksize : int
|
||||
Read file `chunksize` lines at a time, returns iterator."""
|
||||
|
||||
_format_params_doc = """\
|
||||
format : str
|
||||
File format, only `xport` is currently supported."""
|
||||
|
||||
_iterator_doc = """\
|
||||
iterator : bool, default False
|
||||
Return XportReader object for reading file incrementally."""
|
||||
|
||||
|
||||
_read_sas_doc = f"""Read a SAS file into a DataFrame.
|
||||
|
||||
{_base_params_doc}
|
||||
{_format_params_doc}
|
||||
{_params2_doc}
|
||||
{_iterator_doc}
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame or XportReader
|
||||
|
||||
Examples
|
||||
--------
|
||||
Read a SAS Xport file:
|
||||
|
||||
>>> df = pd.read_sas('filename.XPT')
|
||||
|
||||
Read a Xport file in 10,000 line chunks:
|
||||
|
||||
>>> itr = pd.read_sas('filename.XPT', chunksize=10000)
|
||||
>>> for chunk in itr:
|
||||
>>> do_something(chunk)
|
||||
|
||||
"""
|
||||
|
||||
_xport_reader_doc = f"""\
|
||||
Class for reading SAS Xport files.
|
||||
|
||||
{_base_params_doc}
|
||||
{_params2_doc}
|
||||
|
||||
Attributes
|
||||
----------
|
||||
member_info : list
|
||||
Contains information about the file
|
||||
fields : list
|
||||
Contains information about the variables in the file
|
||||
"""
|
||||
|
||||
_read_method_doc = """\
|
||||
Read observations from SAS Xport file, returning as data frame.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
nrows : int
|
||||
Number of rows to read from data file; if None, read whole
|
||||
file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
A DataFrame.
|
||||
"""
|
||||
|
||||
|
||||
def _parse_date(datestr: str) -> DatetimeNaTType:
|
||||
"""Given a date in xport format, return Python date."""
|
||||
try:
|
||||
# e.g. "16FEB11:10:07:55"
|
||||
return datetime.strptime(datestr, "%d%b%y:%H:%M:%S")
|
||||
except ValueError:
|
||||
return pd.NaT
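For example (the first stamp is the one quoted in the comment above, the second is deliberately malformed):

print(_parse_date("16FEB11:10:07:55"))  # 2011-02-16 10:07:55
print(_parse_date("not a date"))        # NaT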
|
||||
|
||||
|
||||
def _split_line(s: str, parts):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
s: str
|
||||
Fixed-length string to split
|
||||
parts: list of (name, length) pairs
|
||||
Used to break up string, name '_' will be filtered from output.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Dict of name:contents of string at given location.
|
||||
"""
|
||||
out = {}
|
||||
start = 0
|
||||
for name, length in parts:
|
||||
out[name] = s[start : start + length].strip()
|
||||
start += length
|
||||
del out["_"]
|
||||
return out
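A tiny invented example of how _split_line carves up one fixed-width header record; the field names and widths here are arbitrary, and the filler field named "_" is dropped from the result.

row = "AIRLINE " + "9.4     " + "        "  # three 8-byte fields
parts = [["dataset", 8], ["version", 8], ["_", 8]]
print(_split_line(row, parts))  # {'dataset': 'AIRLINE', 'version': '9.4'}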
|
||||
|
||||
|
||||
def _handle_truncated_float_vec(vec, nbytes):
|
||||
# This feature is not well documented, but some SAS XPORT files
|
||||
# have 2-7 byte "truncated" floats. To read these truncated
|
||||
# floats, pad them with zeros on the right to make 8 byte floats.
|
||||
#
|
||||
# References:
|
||||
# https://github.com/jcushman/xport/pull/3
|
||||
# The R "foreign" library
|
||||
|
||||
if nbytes != 8:
|
||||
vec1 = np.zeros(len(vec), np.dtype("S8"))
|
||||
dtype = np.dtype(f"S{nbytes},S{8 - nbytes}")
|
||||
vec2 = vec1.view(dtype=dtype)
|
||||
vec2["f0"] = vec
|
||||
return vec2
|
||||
|
||||
return vec
|
||||
|
||||
|
||||
def _parse_float_vec(vec):
|
||||
"""
|
||||
Parse a vector of float values representing IBM 8 byte floats into
|
||||
native 8 byte floats.
|
||||
"""
|
||||
dtype = np.dtype(">u4,>u4")
|
||||
vec1 = vec.view(dtype=dtype)
|
||||
xport1 = vec1["f0"]
|
||||
xport2 = vec1["f1"]
|
||||
|
||||
# Start by setting first half of ieee number to first half of IBM
|
||||
# number sans exponent
|
||||
ieee1 = xport1 & 0x00FFFFFF
|
||||
|
||||
# The fraction bit to the left of the binary point in the ieee
|
||||
# format was set and the number was shifted 0, 1, 2, or 3
|
||||
# places. This will tell us how to adjust the ibm exponent to be a
|
||||
# power of 2 ieee exponent and how to shift the fraction bits to
|
||||
# restore the correct magnitude.
|
||||
shift = np.zeros(len(vec), dtype=np.uint8)
|
||||
shift[np.where(xport1 & 0x00200000)] = 1
|
||||
shift[np.where(xport1 & 0x00400000)] = 2
|
||||
shift[np.where(xport1 & 0x00800000)] = 3
|
||||
|
||||
# shift the ieee number down the correct number of places then
|
||||
# set the second half of the ieee number to be the second half
|
||||
# of the ibm number shifted appropriately, ored with the bits
|
||||
# from the first half that would have been shifted in if we
|
||||
# could shift a double. All we are worried about are the low
|
||||
# order 3 bits of the first half since we're only shifting by
|
||||
# 1, 2, or 3.
|
||||
ieee1 >>= shift
|
||||
ieee2 = (xport2 >> shift) | ((xport1 & 0x00000007) << (29 + (3 - shift)))
|
||||
|
||||
# clear the 1 bit to the left of the binary point
|
||||
ieee1 &= 0xFFEFFFFF
|
||||
|
||||
# set the exponent of the ieee number to be the actual exponent
|
||||
# plus the shift count + 1023. Or this into the first half of the
|
||||
# ieee number. The ibm exponent is excess 64 but is adjusted by 65
|
||||
# since during conversion to ibm format the exponent is
|
||||
# incremented by 1 and the fraction bits left 4 positions to the
|
||||
# right of the radix point. (had to add >> 24 because C treats &
|
||||
# 0x7f as 0x7f000000 and Python doesn't)
|
||||
ieee1 |= ((((((xport1 >> 24) & 0x7F) - 65) << 2) + shift + 1023) << 20) | (
|
||||
xport1 & 0x80000000
|
||||
)
|
||||
|
||||
ieee = np.empty((len(ieee1),), dtype=">u4,>u4")
|
||||
ieee["f0"] = ieee1
|
||||
ieee["f1"] = ieee2
|
||||
ieee = ieee.view(dtype=">f8")
|
||||
ieee = ieee.astype("f8")
|
||||
|
||||
return ieee
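As a sanity check on the bit twiddling above: the IBM hex-float encoding of 1.0 is 0x4110000000000000 (exponent byte 0x41 means 16**(65-64), and the fraction 0x10000000000000 means 1/16, so the value is 16 * 1/16 = 1.0). A small sketch, assuming one 8-byte big-endian record per value as in the XPORT layout:

import numpy as np

ibm_one = np.array([b"\x41\x10\x00\x00\x00\x00\x00\x00"], dtype="S8")
print(_parse_float_vec(ibm_one))  # [1.]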


class XportReader(ReaderBase, abc.Iterator):
    __doc__ = _xport_reader_doc

    def __init__(
        self,
        filepath_or_buffer: FilePath | ReadBuffer[bytes],
        index=None,
        encoding: str | None = "ISO-8859-1",
        chunksize: int | None = None,
        compression: CompressionOptions = "infer",
    ) -> None:
        self._encoding = encoding
        self._lines_read = 0
        self._index = index
        self._chunksize = chunksize

        self.handles = get_handle(
            filepath_or_buffer,
            "rb",
            encoding=encoding,
            is_text=False,
            compression=compression,
        )
        self.filepath_or_buffer = self.handles.handle

        try:
            self._read_header()
        except Exception:
            self.close()
            raise

    def close(self) -> None:
        self.handles.close()

    def _get_row(self):
        return self.filepath_or_buffer.read(80).decode()

    def _read_header(self) -> None:
        self.filepath_or_buffer.seek(0)

        # read file header
        line1 = self._get_row()
        if line1 != _correct_line1:
            if "**COMPRESSED**" in line1:
                # this was created with the PROC CPORT method and can't be read
                # https://documentation.sas.com/doc/en/pgmsascdc/9.4_3.5/movefile/p1bm6aqp3fw4uin1hucwh718f6kp.htm
                raise ValueError(
                    "Header record indicates a CPORT file, which is not readable."
                )
            raise ValueError("Header record is not an XPORT file.")

        line2 = self._get_row()
        fif = [["prefix", 24], ["version", 8], ["OS", 8], ["_", 24], ["created", 16]]
        file_info = _split_line(line2, fif)
        if file_info["prefix"] != "SAS     SAS     SASLIB":
            raise ValueError("Header record has invalid prefix.")
        file_info["created"] = _parse_date(file_info["created"])
        self.file_info = file_info

        line3 = self._get_row()
        file_info["modified"] = _parse_date(line3[:16])

        # read member header
        header1 = self._get_row()
        header2 = self._get_row()
        headflag1 = header1.startswith(_correct_header1)
        headflag2 = header2 == _correct_header2
        if not (headflag1 and headflag2):
            raise ValueError("Member header not found")
        # usually 140, could be 135
        fieldnamelength = int(header1[-5:-2])

        # member info
        mem = [
            ["prefix", 8],
            ["set_name", 8],
            ["sasdata", 8],
            ["version", 8],
            ["OS", 8],
            ["_", 24],
            ["created", 16],
        ]
        member_info = _split_line(self._get_row(), mem)
        mem = [["modified", 16], ["_", 16], ["label", 40], ["type", 8]]
        member_info.update(_split_line(self._get_row(), mem))
        member_info["modified"] = _parse_date(member_info["modified"])
        member_info["created"] = _parse_date(member_info["created"])
        self.member_info = member_info

        # read field names
        types = {1: "numeric", 2: "char"}
        fieldcount = int(self._get_row()[54:58])
        datalength = fieldnamelength * fieldcount
        # round up to nearest 80
        if datalength % 80:
            datalength += 80 - datalength % 80
        fielddata = self.filepath_or_buffer.read(datalength)
        fields = []
        obs_length = 0
        while len(fielddata) >= fieldnamelength:
            # pull data for one field
            fieldbytes, fielddata = (
                fielddata[:fieldnamelength],
                fielddata[fieldnamelength:],
            )

            # rest at end gets ignored, so if field is short, pad out
            # to match struct pattern below
            fieldbytes = fieldbytes.ljust(140)

            fieldstruct = struct.unpack(">hhhh8s40s8shhh2s8shhl52s", fieldbytes)
            field = dict(zip(_fieldkeys, fieldstruct))
            del field["_"]
            field["ntype"] = types[field["ntype"]]
            fl = field["field_length"]
            if field["ntype"] == "numeric" and ((fl < 2) or (fl > 8)):
                msg = f"Floating field width {fl} is not between 2 and 8."
                raise TypeError(msg)

            for k, v in field.items():
                try:
                    field[k] = v.strip()
                except AttributeError:
                    pass

            obs_length += field["field_length"]
            fields += [field]

        header = self._get_row()
        if not header == _correct_obs_header:
            raise ValueError("Observation header not found.")

        self.fields = fields
        self.record_length = obs_length
        self.record_start = self.filepath_or_buffer.tell()

        self.nobs = self._record_count()
        self.columns = [x["name"].decode() for x in self.fields]

        # Setup the dtype.
        dtypel = [
            ("s" + str(i), "S" + str(field["field_length"]))
            for i, field in enumerate(self.fields)
        ]
        dtype = np.dtype(dtypel)
        self._dtype = dtype

    def __next__(self) -> pd.DataFrame:
        return self.read(nrows=self._chunksize or 1)

    def _record_count(self) -> int:
        """
        Get number of records in file.

        This is maybe suboptimal because we have to seek to the end of
        the file.

        Side effect: returns file position to record_start.
        """
        self.filepath_or_buffer.seek(0, 2)
        total_records_length = self.filepath_or_buffer.tell() - self.record_start

        if total_records_length % 80 != 0:
            warnings.warn(
                "xport file may be corrupted.",
                stacklevel=find_stack_level(),
            )

        if self.record_length > 80:
            self.filepath_or_buffer.seek(self.record_start)
            return total_records_length // self.record_length

        self.filepath_or_buffer.seek(-80, 2)
        last_card_bytes = self.filepath_or_buffer.read(80)
        last_card = np.frombuffer(last_card_bytes, dtype=np.uint64)

        # 8 byte blank
        ix = np.flatnonzero(last_card == 2314885530818453536)

        if len(ix) == 0:
            tail_pad = 0
        else:
            tail_pad = 8 * len(ix)

        self.filepath_or_buffer.seek(self.record_start)

        return (total_records_length - tail_pad) // self.record_length

    def get_chunk(self, size: int | None = None) -> pd.DataFrame:
        """
        Reads lines from Xport file and returns as dataframe

        Parameters
        ----------
        size : int, defaults to None
            Number of lines to read. If None, reads whole file.

        Returns
        -------
        DataFrame
        """
        if size is None:
            size = self._chunksize
        return self.read(nrows=size)

    def _missing_double(self, vec):
        v = vec.view(dtype="u1,u1,u2,u4")
        miss = (v["f1"] == 0) & (v["f2"] == 0) & (v["f3"] == 0)
        miss1 = (
            ((v["f0"] >= 0x41) & (v["f0"] <= 0x5A))
            | (v["f0"] == 0x5F)
            | (v["f0"] == 0x2E)
        )
        miss &= miss1
        return miss
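
    # Note on the masks above: a numeric XPORT value is treated as missing when
    # bytes 2-8 are zero and the first byte is an ASCII code -- 0x2E (".") for
    # the ordinary missing value, or 0x41-0x5A ("A"-"Z") and 0x5F ("_") for the
    # SAS special missing values.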

    @Appender(_read_method_doc)
    def read(self, nrows: int | None = None) -> pd.DataFrame:
        if nrows is None:
            nrows = self.nobs

        read_lines = min(nrows, self.nobs - self._lines_read)
        read_len = read_lines * self.record_length
        if read_len <= 0:
            self.close()
            raise StopIteration
        raw = self.filepath_or_buffer.read(read_len)
        data = np.frombuffer(raw, dtype=self._dtype, count=read_lines)

        df_data = {}
        for j, x in enumerate(self.columns):
            vec = data["s" + str(j)]
            ntype = self.fields[j]["ntype"]
            if ntype == "numeric":
                vec = _handle_truncated_float_vec(vec, self.fields[j]["field_length"])
                miss = self._missing_double(vec)
                v = _parse_float_vec(vec)
                v[miss] = np.nan
            elif self.fields[j]["ntype"] == "char":
                v = [y.rstrip() for y in vec]

                if self._encoding is not None:
                    v = [y.decode(self._encoding) for y in v]

            df_data.update({x: v})
        df = pd.DataFrame(df_data)

        if self._index is None:
            df.index = pd.Index(range(self._lines_read, self._lines_read + read_lines))
        else:
            df = df.set_index(self._index)

        self._lines_read += read_lines

        return df
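
A minimal usage sketch for the reader above (the path "demo.xpt" is hypothetical; any SAS transport file would do):

from pandas.io.sas.sas_xport import XportReader

# Stream a transport file 500 observations at a time; ReaderBase makes the
# reader a context manager, so the underlying handle is closed on exit.
with XportReader("demo.xpt", encoding="ISO-8859-1", chunksize=500) as rdr:
    for chunk in rdr:
        print(chunk.shape)
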
178
lib/python3.11/site-packages/pandas/io/sas/sasreader.py
Normal file
178
lib/python3.11/site-packages/pandas/io/sas/sasreader.py
Normal file
@@ -0,0 +1,178 @@
"""
|
||||
Read SAS sas7bdat or xport files.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import (
|
||||
ABC,
|
||||
abstractmethod,
|
||||
)
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
overload,
|
||||
)
|
||||
|
||||
from pandas.util._decorators import doc
|
||||
|
||||
from pandas.core.shared_docs import _shared_docs
|
||||
|
||||
from pandas.io.common import stringify_path
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Hashable
|
||||
from types import TracebackType
|
||||
|
||||
from pandas._typing import (
|
||||
CompressionOptions,
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
Self,
|
||||
)
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
class ReaderBase(ABC):
|
||||
"""
|
||||
Protocol for XportReader and SAS7BDATReader classes.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def read(self, nrows: int | None = None) -> DataFrame:
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def close(self) -> None:
|
||||
...
|
||||
|
||||
def __enter__(self) -> Self:
|
||||
return self
|
||||
|
||||
def __exit__(
|
||||
self,
|
||||
exc_type: type[BaseException] | None,
|
||||
exc_value: BaseException | None,
|
||||
traceback: TracebackType | None,
|
||||
) -> None:
|
||||
self.close()
|
||||
|
||||
|
||||
@overload
|
||||
def read_sas(
|
||||
filepath_or_buffer: FilePath | ReadBuffer[bytes],
|
||||
*,
|
||||
format: str | None = ...,
|
||||
index: Hashable | None = ...,
|
||||
encoding: str | None = ...,
|
||||
chunksize: int = ...,
|
||||
iterator: bool = ...,
|
||||
compression: CompressionOptions = ...,
|
||||
) -> ReaderBase:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def read_sas(
|
||||
filepath_or_buffer: FilePath | ReadBuffer[bytes],
|
||||
*,
|
||||
format: str | None = ...,
|
||||
index: Hashable | None = ...,
|
||||
encoding: str | None = ...,
|
||||
chunksize: None = ...,
|
||||
iterator: bool = ...,
|
||||
compression: CompressionOptions = ...,
|
||||
) -> DataFrame | ReaderBase:
|
||||
...
|
||||
|
||||
|
||||
@doc(decompression_options=_shared_docs["decompression_options"] % "filepath_or_buffer")
|
||||
def read_sas(
|
||||
filepath_or_buffer: FilePath | ReadBuffer[bytes],
|
||||
*,
|
||||
format: str | None = None,
|
||||
index: Hashable | None = None,
|
||||
encoding: str | None = None,
|
||||
chunksize: int | None = None,
|
||||
iterator: bool = False,
|
||||
compression: CompressionOptions = "infer",
|
||||
) -> DataFrame | ReaderBase:
|
||||
"""
|
||||
Read SAS files stored as either XPORT or SAS7BDAT format files.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer : str, path object, or file-like object
|
||||
String, path object (implementing ``os.PathLike[str]``), or file-like
|
||||
object implementing a binary ``read()`` function. The string could be a URL.
|
||||
Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
|
||||
expected. A local file could be:
|
||||
``file://localhost/path/to/table.sas7bdat``.
|
||||
format : str {{'xport', 'sas7bdat'}} or None
|
||||
If None, file format is inferred from file extension. If 'xport' or
|
||||
'sas7bdat', uses the corresponding format.
|
||||
index : identifier of index column, defaults to None
|
||||
Identifier of column that should be used as index of the DataFrame.
|
||||
encoding : str, default is None
|
||||
Encoding for text data. If None, text data are stored as raw bytes.
|
||||
chunksize : int
|
||||
Read file `chunksize` lines at a time, returns iterator.
|
||||
iterator : bool, defaults to False
|
||||
If True, returns an iterator for reading the file incrementally.
|
||||
{decompression_options}
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
|
||||
or XportReader
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = pd.read_sas("sas_data.sas7bdat") # doctest: +SKIP
|
||||
"""
|
||||
if format is None:
|
||||
buffer_error_msg = (
|
||||
"If this is a buffer object rather "
|
||||
"than a string name, you must specify a format string"
|
||||
)
|
||||
filepath_or_buffer = stringify_path(filepath_or_buffer)
|
||||
if not isinstance(filepath_or_buffer, str):
|
||||
raise ValueError(buffer_error_msg)
|
||||
fname = filepath_or_buffer.lower()
|
||||
if ".xpt" in fname:
|
||||
format = "xport"
|
||||
elif ".sas7bdat" in fname:
|
||||
format = "sas7bdat"
|
||||
else:
|
||||
raise ValueError(
|
||||
f"unable to infer format of SAS file from filename: {repr(fname)}"
|
||||
)
|
||||
|
||||
reader: ReaderBase
|
||||
if format.lower() == "xport":
|
||||
from pandas.io.sas.sas_xport import XportReader
|
||||
|
||||
reader = XportReader(
|
||||
filepath_or_buffer,
|
||||
index=index,
|
||||
encoding=encoding,
|
||||
chunksize=chunksize,
|
||||
compression=compression,
|
||||
)
|
||||
elif format.lower() == "sas7bdat":
|
||||
from pandas.io.sas.sas7bdat import SAS7BDATReader
|
||||
|
||||
reader = SAS7BDATReader(
|
||||
filepath_or_buffer,
|
||||
index=index,
|
||||
encoding=encoding,
|
||||
chunksize=chunksize,
|
||||
compression=compression,
|
||||
)
|
||||
else:
|
||||
raise ValueError("unknown SAS format")
|
||||
|
||||
if iterator or chunksize:
|
||||
return reader
|
||||
|
||||
with reader:
|
||||
return reader.read()
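
A short sketch of how this entry point behaves (file names and the process() callback are hypothetical):

import pandas as pd

# Format inferred from the extension; returns a DataFrame.
df = pd.read_sas("survey.sas7bdat")

# With chunksize (or iterator=True) the reader itself is returned, so the
# file can be consumed incrementally and closed via the context manager.
with pd.read_sas("survey.xpt", chunksize=10_000) as reader:
    for chunk in reader:
        process(chunk)  # hypothetical per-chunk callback
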
72
lib/python3.11/site-packages/pandas/io/spss.py
Normal file
72
lib/python3.11/site-packages/pandas/io/spss.py
Normal file
@@ -0,0 +1,72 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from pandas._libs import lib
from pandas.compat._optional import import_optional_dependency
from pandas.util._validators import check_dtype_backend

from pandas.core.dtypes.inference import is_list_like

from pandas.io.common import stringify_path

if TYPE_CHECKING:
    from collections.abc import Sequence
    from pathlib import Path

    from pandas._typing import DtypeBackend

    from pandas import DataFrame


def read_spss(
    path: str | Path,
    usecols: Sequence[str] | None = None,
    convert_categoricals: bool = True,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame:
    """
    Load an SPSS file from the file path, returning a DataFrame.

    Parameters
    ----------
    path : str or Path
        File path.
    usecols : list-like, optional
        Return a subset of the columns. If None, return all columns.
    convert_categoricals : bool, default is True
        Convert categorical columns into pd.Categorical.
    dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    Returns
    -------
    DataFrame

    Examples
    --------
    >>> df = pd.read_spss("spss_data.sav")  # doctest: +SKIP
    """
    pyreadstat = import_optional_dependency("pyreadstat")
    check_dtype_backend(dtype_backend)

    if usecols is not None:
        if not is_list_like(usecols):
            raise TypeError("usecols must be list-like.")
        usecols = list(usecols)  # pyreadstat requires a list

    df, metadata = pyreadstat.read_sav(
        stringify_path(path), usecols=usecols, apply_value_formats=convert_categoricals
    )
    df.attrs = metadata.__dict__
    if dtype_backend is not lib.no_default:
        df = df.convert_dtypes(dtype_backend=dtype_backend)
    return df
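
A minimal usage sketch (the file name and column names are hypothetical; pyreadstat must be installed):

import pandas as pd

# Read two columns from an SPSS .sav file with the nullable-dtype backend.
df = pd.read_spss(
    "responses.sav",
    usecols=["id", "score"],
    convert_categoricals=True,
    dtype_backend="numpy_nullable",
)
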
2916
lib/python3.11/site-packages/pandas/io/sql.py
Normal file
2916
lib/python3.11/site-packages/pandas/io/sql.py
Normal file
File diff suppressed because it is too large
3768
lib/python3.11/site-packages/pandas/io/stata.py
Normal file
3768
lib/python3.11/site-packages/pandas/io/stata.py
Normal file
File diff suppressed because it is too large
1177
lib/python3.11/site-packages/pandas/io/xml.py
Normal file
1177
lib/python3.11/site-packages/pandas/io/xml.py
Normal file
File diff suppressed because it is too large