@@ -0,0 +1,9 @@
import pytest


@pytest.fixture(params=["split", "records", "index", "columns", "values"])
def orient(request):
    """
    Fixture for orients excluding the table format.
    """
    return request.param
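
Since the fixture above is parametrized, pytest runs any test that requests
`orient` once per value. A minimal sketch of a consumer (a hypothetical test,
not part of this commit):

    def test_orient_is_parametrized(orient):
        # Executed five times, once per fixture param:
        # "split", "records", "index", "columns", "values".
        assert orient in {"split", "records", "index", "columns", "values"}
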
@@ -0,0 +1,130 @@
from io import (
    BytesIO,
    StringIO,
)

import pytest

import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm


def test_compression_roundtrip(compression):
    df = pd.DataFrame(
        [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
        index=["A", "B"],
        columns=["X", "Y", "Z"],
    )

    with tm.ensure_clean() as path:
        df.to_json(path, compression=compression)
        tm.assert_frame_equal(df, pd.read_json(path, compression=compression))

        # explicitly ensure file was compressed.
        with tm.decompress_file(path, compression) as fh:
            result = fh.read().decode("utf8")
            data = StringIO(result)
        tm.assert_frame_equal(df, pd.read_json(data))


def test_read_zipped_json(datapath):
    uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json")
    uncompressed_df = pd.read_json(uncompressed_path)

    compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip")
    compressed_df = pd.read_json(compressed_path, compression="zip")

    tm.assert_frame_equal(uncompressed_df, compressed_df)


@td.skip_if_not_us_locale
@pytest.mark.single_cpu
def test_with_s3_url(compression, s3_public_bucket, s3so):
    # Bucket created in tests/io/conftest.py
    df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))

    with tm.ensure_clean() as path:
        df.to_json(path, compression=compression)
        with open(path, "rb") as f:
            s3_public_bucket.put_object(Key="test-1", Body=f)

    roundtripped_df = pd.read_json(
        f"s3://{s3_public_bucket.name}/test-1",
        compression=compression,
        storage_options=s3so,
    )
    tm.assert_frame_equal(df, roundtripped_df)


def test_lines_with_compression(compression):
    with tm.ensure_clean() as path:
        df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
        df.to_json(path, orient="records", lines=True, compression=compression)
        roundtripped_df = pd.read_json(path, lines=True, compression=compression)
        tm.assert_frame_equal(df, roundtripped_df)


def test_chunksize_with_compression(compression):
    with tm.ensure_clean() as path:
        df = pd.read_json(StringIO('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}'))
        df.to_json(path, orient="records", lines=True, compression=compression)

        with pd.read_json(
            path, lines=True, chunksize=1, compression=compression
        ) as res:
            roundtripped_df = pd.concat(res)
        tm.assert_frame_equal(df, roundtripped_df)


def test_write_unsupported_compression_type():
    df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
    with tm.ensure_clean() as path:
        msg = "Unrecognized compression type: unsupported"
        with pytest.raises(ValueError, match=msg):
            df.to_json(path, compression="unsupported")


def test_read_unsupported_compression_type():
    with tm.ensure_clean() as path:
        msg = "Unrecognized compression type: unsupported"
        with pytest.raises(ValueError, match=msg):
            pd.read_json(path, compression="unsupported")


@pytest.mark.parametrize(
    "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
)
@pytest.mark.parametrize("to_infer", [True, False])
@pytest.mark.parametrize("read_infer", [True, False])
def test_to_json_compression(
    compression_only, read_infer, to_infer, compression_to_extension, infer_string
):
    with pd.option_context("future.infer_string", infer_string):
        # see gh-15008
        compression = compression_only

        # The file extension is appended below.
        filename = "test."
        filename += compression_to_extension[compression]

        df = pd.DataFrame({"A": [1]})

        to_compression = "infer" if to_infer else compression
        read_compression = "infer" if read_infer else compression

        with tm.ensure_clean(filename) as path:
            df.to_json(path, compression=to_compression)
            result = pd.read_json(path, compression=read_compression)
            tm.assert_frame_equal(result, df)


def test_to_json_compression_mode(compression):
    # GH 39985 (read_json does not support user-provided binary files)
    expected = pd.DataFrame({"A": [1]})

    with BytesIO() as buffer:
        expected.to_json(buffer, compression=compression)
        # df = pd.read_json(buffer, compression=compression)
        # tm.assert_frame_equal(expected, df)
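
As the commented-out lines note, read_json cannot consume the compressed
binary buffer directly. One way to verify the payload anyway is to rewind the
buffer and decompress it by hand; a sketch assuming gzip compression
(illustrative, not part of this commit):

    import gzip
    from io import BytesIO, StringIO

    import pandas as pd
    import pandas._testing as tm

    expected = pd.DataFrame({"A": [1]})
    buffer = BytesIO()
    expected.to_json(buffer, compression="gzip")
    buffer.seek(0)  # rewind before reading the compressed bytes back
    text = gzip.decompress(buffer.read()).decode("utf-8")
    tm.assert_frame_equal(pd.read_json(StringIO(text)), expected)
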
@@ -0,0 +1,21 @@
"""
Tests for the deprecated keyword arguments for `read_json`.
"""
from io import StringIO

import pandas as pd
import pandas._testing as tm

from pandas.io.json import read_json


def test_good_kwargs():
    df = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2])

    with tm.assert_produces_warning(None):
        data1 = StringIO(df.to_json(orient="split"))
        tm.assert_frame_equal(df, read_json(data1, orient="split"))
        data2 = StringIO(df.to_json(orient="columns"))
        tm.assert_frame_equal(df, read_json(data2, orient="columns"))
        data3 = StringIO(df.to_json(orient="index"))
        tm.assert_frame_equal(df, read_json(data3, orient="index"))
@@ -0,0 +1,873 @@
"""Tests for Table Schema integration."""
from collections import OrderedDict
from io import StringIO
import json

import numpy as np
import pytest

from pandas.core.dtypes.dtypes import (
    CategoricalDtype,
    DatetimeTZDtype,
    PeriodDtype,
)

import pandas as pd
from pandas import DataFrame
import pandas._testing as tm

from pandas.io.json._table_schema import (
    as_json_table_type,
    build_table_schema,
    convert_json_field_to_pandas_type,
    convert_pandas_type_to_json_field,
    set_default_names,
)


@pytest.fixture
def df_schema():
    return DataFrame(
        {
            "A": [1, 2, 3, 4],
            "B": ["a", "b", "c", "c"],
            "C": pd.date_range("2016-01-01", freq="d", periods=4),
            "D": pd.timedelta_range("1h", periods=4, freq="min"),
        },
        index=pd.Index(range(4), name="idx"),
    )


@pytest.fixture
def df_table():
    return DataFrame(
        {
            "A": [1, 2, 3, 4],
            "B": ["a", "b", "c", "c"],
            "C": pd.date_range("2016-01-01", freq="d", periods=4),
            "D": pd.timedelta_range("1h", periods=4, freq="min"),
            "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])),
            "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)),
            "G": [1.0, 2.0, 3, 4.0],
            "H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"),
        },
        index=pd.Index(range(4), name="idx"),
    )


class TestBuildSchema:
    def test_build_table_schema(self, df_schema, using_infer_string):
        result = build_table_schema(df_schema, version=False)
        expected = {
            "fields": [
                {"name": "idx", "type": "integer"},
                {"name": "A", "type": "integer"},
                {"name": "B", "type": "string"},
                {"name": "C", "type": "datetime"},
                {"name": "D", "type": "duration"},
            ],
            "primaryKey": ["idx"],
        }
        if using_infer_string:
            expected["fields"][2] = {"name": "B", "type": "string", "extDtype": "str"}
        assert result == expected
        result = build_table_schema(df_schema)
        assert "pandas_version" in result

    def test_series(self):
        s = pd.Series([1, 2, 3], name="foo")
        result = build_table_schema(s, version=False)
        expected = {
            "fields": [
                {"name": "index", "type": "integer"},
                {"name": "foo", "type": "integer"},
            ],
            "primaryKey": ["index"],
        }
        assert result == expected
        result = build_table_schema(s)
        assert "pandas_version" in result

    def test_series_unnamed(self):
        result = build_table_schema(pd.Series([1, 2, 3]), version=False)
        expected = {
            "fields": [
                {"name": "index", "type": "integer"},
                {"name": "values", "type": "integer"},
            ],
            "primaryKey": ["index"],
        }
        assert result == expected

    def test_multiindex(self, df_schema, using_infer_string):
        df = df_schema
        idx = pd.MultiIndex.from_product([("a", "b"), (1, 2)])
        df.index = idx

        result = build_table_schema(df, version=False)
        expected = {
            "fields": [
                {"name": "level_0", "type": "string"},
                {"name": "level_1", "type": "integer"},
                {"name": "A", "type": "integer"},
                {"name": "B", "type": "string"},
                {"name": "C", "type": "datetime"},
                {"name": "D", "type": "duration"},
            ],
            "primaryKey": ["level_0", "level_1"],
        }
        if using_infer_string:
            expected["fields"][0] = {
                "name": "level_0",
                "type": "string",
                "extDtype": "str",
            }
            expected["fields"][3] = {"name": "B", "type": "string", "extDtype": "str"}
        assert result == expected

        df.index.names = ["idx0", None]
        expected["fields"][0]["name"] = "idx0"
        expected["primaryKey"] = ["idx0", "level_1"]
        result = build_table_schema(df, version=False)
        assert result == expected


class TestTableSchemaType:
    @pytest.mark.parametrize("int_type", [int, np.int16, np.int32, np.int64])
    def test_as_json_table_type_int_data(self, int_type):
        int_data = [1, 2, 3]
        assert as_json_table_type(np.array(int_data, dtype=int_type).dtype) == "integer"

    @pytest.mark.parametrize("float_type", [float, np.float16, np.float32, np.float64])
    def test_as_json_table_type_float_data(self, float_type):
        float_data = [1.0, 2.0, 3.0]
        assert (
            as_json_table_type(np.array(float_data, dtype=float_type).dtype) == "number"
        )

    @pytest.mark.parametrize("bool_type", [bool, np.bool_])
    def test_as_json_table_type_bool_data(self, bool_type):
        bool_data = [True, False]
        assert (
            as_json_table_type(np.array(bool_data, dtype=bool_type).dtype) == "boolean"
        )

    @pytest.mark.parametrize(
        "date_data",
        [
            pd.to_datetime(["2016"]),
            pd.to_datetime(["2016"], utc=True),
            pd.Series(pd.to_datetime(["2016"])),
            pd.Series(pd.to_datetime(["2016"], utc=True)),
            pd.period_range("2016", freq="Y", periods=3),
        ],
    )
    def test_as_json_table_type_date_data(self, date_data):
        assert as_json_table_type(date_data.dtype) == "datetime"

    @pytest.mark.parametrize(
        "str_data",
        [pd.Series(["a", "b"], dtype=object), pd.Index(["a", "b"], dtype=object)],
    )
    def test_as_json_table_type_string_data(self, str_data):
        assert as_json_table_type(str_data.dtype) == "string"

    @pytest.mark.parametrize(
        "cat_data",
        [
            pd.Categorical(["a"]),
            pd.Categorical([1]),
            pd.Series(pd.Categorical([1])),
            pd.CategoricalIndex([1]),
        ],
    )
    def test_as_json_table_type_categorical_data(self, cat_data):
        assert as_json_table_type(cat_data.dtype) == "any"

    # ------
    # dtypes
    # ------
    @pytest.mark.parametrize("int_dtype", [int, np.int16, np.int32, np.int64])
    def test_as_json_table_type_int_dtypes(self, int_dtype):
        assert as_json_table_type(int_dtype) == "integer"

    @pytest.mark.parametrize("float_dtype", [float, np.float16, np.float32, np.float64])
    def test_as_json_table_type_float_dtypes(self, float_dtype):
        assert as_json_table_type(float_dtype) == "number"

    @pytest.mark.parametrize("bool_dtype", [bool, np.bool_])
    def test_as_json_table_type_bool_dtypes(self, bool_dtype):
        assert as_json_table_type(bool_dtype) == "boolean"

    @pytest.mark.parametrize(
        "date_dtype",
        [
            np.dtype("<M8[ns]"),
            PeriodDtype("D"),
            DatetimeTZDtype("ns", "US/Central"),
        ],
    )
    def test_as_json_table_type_date_dtypes(self, date_dtype):
        # TODO: datetime.date? datetime.time?
        assert as_json_table_type(date_dtype) == "datetime"

    @pytest.mark.parametrize("td_dtype", [np.dtype("<m8[ns]")])
    def test_as_json_table_type_timedelta_dtypes(self, td_dtype):
        assert as_json_table_type(td_dtype) == "duration"

    @pytest.mark.parametrize("str_dtype", [object])  # TODO(GH#14904) flesh out dtypes?
    def test_as_json_table_type_string_dtypes(self, str_dtype):
        assert as_json_table_type(str_dtype) == "string"

    def test_as_json_table_type_categorical_dtypes(self):
        assert as_json_table_type(pd.Categorical(["a"]).dtype) == "any"
        assert as_json_table_type(CategoricalDtype()) == "any"


class TestTableOrient:
    def test_build_series(self):
        s = pd.Series([1, 2], name="a")
        s.index.name = "id"
        result = s.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)

        assert "pandas_version" in result["schema"]
        result["schema"].pop("pandas_version")

        fields = [{"name": "id", "type": "integer"}, {"name": "a", "type": "integer"}]

        schema = {"fields": fields, "primaryKey": ["id"]}

        expected = OrderedDict(
            [
                ("schema", schema),
                (
                    "data",
                    [
                        OrderedDict([("id", 0), ("a", 1)]),
                        OrderedDict([("id", 1), ("a", 2)]),
                    ],
                ),
            ]
        )

        assert result == expected

    def test_read_json_from_to_json_results(self):
        # GH32383
        df = DataFrame(
            {
                "_id": {"row_0": 0},
                "category": {"row_0": "Goods"},
                "recommender_id": {"row_0": 3},
                "recommender_name_jp": {"row_0": "浦田"},
                "recommender_name_en": {"row_0": "Urata"},
                "name_jp": {"row_0": "博多人形(松尾吉将まつお よしまさ)"},
                "name_en": {"row_0": "Hakata Dolls Matsuo"},
            }
        )

        result1 = pd.read_json(StringIO(df.to_json()))
        result2 = DataFrame.from_dict(json.loads(df.to_json()))
        tm.assert_frame_equal(result1, df)
        tm.assert_frame_equal(result2, df)

    def test_to_json(self, df_table, using_infer_string):
        df = df_table
        df.index.name = "idx"
        result = df.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)

        assert "pandas_version" in result["schema"]
        result["schema"].pop("pandas_version")

        fields = [
            {"name": "idx", "type": "integer"},
            {"name": "A", "type": "integer"},
            {"name": "B", "type": "string"},
            {"name": "C", "type": "datetime"},
            {"name": "D", "type": "duration"},
            {
                "constraints": {"enum": ["a", "b", "c"]},
                "name": "E",
                "ordered": False,
                "type": "any",
            },
            {
                "constraints": {"enum": ["a", "b", "c"]},
                "name": "F",
                "ordered": True,
                "type": "any",
            },
            {"name": "G", "type": "number"},
            {"name": "H", "type": "datetime", "tz": "US/Central"},
        ]

        if using_infer_string:
            fields[2] = {"name": "B", "type": "string", "extDtype": "str"}

        schema = {"fields": fields, "primaryKey": ["idx"]}
        data = [
            OrderedDict(
                [
                    ("idx", 0),
                    ("A", 1),
                    ("B", "a"),
                    ("C", "2016-01-01T00:00:00.000"),
                    ("D", "P0DT1H0M0S"),
                    ("E", "a"),
                    ("F", "a"),
                    ("G", 1.0),
                    ("H", "2016-01-01T06:00:00.000Z"),
                ]
            ),
            OrderedDict(
                [
                    ("idx", 1),
                    ("A", 2),
                    ("B", "b"),
                    ("C", "2016-01-02T00:00:00.000"),
                    ("D", "P0DT1H1M0S"),
                    ("E", "b"),
                    ("F", "b"),
                    ("G", 2.0),
                    ("H", "2016-01-02T06:00:00.000Z"),
                ]
            ),
            OrderedDict(
                [
                    ("idx", 2),
                    ("A", 3),
                    ("B", "c"),
                    ("C", "2016-01-03T00:00:00.000"),
                    ("D", "P0DT1H2M0S"),
                    ("E", "c"),
                    ("F", "c"),
                    ("G", 3.0),
                    ("H", "2016-01-03T06:00:00.000Z"),
                ]
            ),
            OrderedDict(
                [
                    ("idx", 3),
                    ("A", 4),
                    ("B", "c"),
                    ("C", "2016-01-04T00:00:00.000"),
                    ("D", "P0DT1H3M0S"),
                    ("E", "c"),
                    ("F", "c"),
                    ("G", 4.0),
                    ("H", "2016-01-04T06:00:00.000Z"),
                ]
            ),
        ]
        expected = OrderedDict([("schema", schema), ("data", data)])

        assert result == expected

    def test_to_json_float_index(self):
        data = pd.Series(1, index=[1.0, 2.0])
        result = data.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result["schema"].pop("pandas_version")

        expected = OrderedDict(
            [
                (
                    "schema",
                    {
                        "fields": [
                            {"name": "index", "type": "number"},
                            {"name": "values", "type": "integer"},
                        ],
                        "primaryKey": ["index"],
                    },
                ),
                (
                    "data",
                    [
                        OrderedDict([("index", 1.0), ("values", 1)]),
                        OrderedDict([("index", 2.0), ("values", 1)]),
                    ],
                ),
            ]
        )

        assert result == expected

    def test_to_json_period_index(self):
        idx = pd.period_range("2016", freq="Q-JAN", periods=2)
        data = pd.Series(1, idx)
        result = data.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result["schema"].pop("pandas_version")

        fields = [
            {"freq": "QE-JAN", "name": "index", "type": "datetime"},
            {"name": "values", "type": "integer"},
        ]

        schema = {"fields": fields, "primaryKey": ["index"]}
        data = [
            OrderedDict([("index", "2015-11-01T00:00:00.000"), ("values", 1)]),
            OrderedDict([("index", "2016-02-01T00:00:00.000"), ("values", 1)]),
        ]
        expected = OrderedDict([("schema", schema), ("data", data)])

        assert result == expected

    def test_to_json_categorical_index(self):
        data = pd.Series(1, pd.CategoricalIndex(["a", "b"]))
        result = data.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result["schema"].pop("pandas_version")

        expected = OrderedDict(
            [
                (
                    "schema",
                    {
                        "fields": [
                            {
                                "name": "index",
                                "type": "any",
                                "constraints": {"enum": ["a", "b"]},
                                "ordered": False,
                            },
                            {"name": "values", "type": "integer"},
                        ],
                        "primaryKey": ["index"],
                    },
                ),
                (
                    "data",
                    [
                        OrderedDict([("index", "a"), ("values", 1)]),
                        OrderedDict([("index", "b"), ("values", 1)]),
                    ],
                ),
            ]
        )

        assert result == expected

    def test_date_format_raises(self, df_table):
        msg = (
            "Trying to write with `orient='table'` and `date_format='epoch'`. Table "
            "Schema requires dates to be formatted with `date_format='iso'`"
        )
        with pytest.raises(ValueError, match=msg):
            df_table.to_json(orient="table", date_format="epoch")

        # others work
        df_table.to_json(orient="table", date_format="iso")
        df_table.to_json(orient="table")

    def test_convert_pandas_type_to_json_field_int(self, index_or_series):
        kind = index_or_series
        data = [1, 2, 3]
        result = convert_pandas_type_to_json_field(kind(data, name="name"))
        expected = {"name": "name", "type": "integer"}
        assert result == expected

    def test_convert_pandas_type_to_json_field_float(self, index_or_series):
        kind = index_or_series
        data = [1.0, 2.0, 3.0]
        result = convert_pandas_type_to_json_field(kind(data, name="name"))
        expected = {"name": "name", "type": "number"}
        assert result == expected

    @pytest.mark.parametrize(
        "dt_args,extra_exp", [({}, {}), ({"utc": True}, {"tz": "UTC"})]
    )
    @pytest.mark.parametrize("wrapper", [None, pd.Series])
    def test_convert_pandas_type_to_json_field_datetime(
        self, dt_args, extra_exp, wrapper
    ):
        data = [1.0, 2.0, 3.0]
        data = pd.to_datetime(data, **dt_args)
        if wrapper is pd.Series:
            data = pd.Series(data, name="values")
        result = convert_pandas_type_to_json_field(data)
        expected = {"name": "values", "type": "datetime"}
        expected.update(extra_exp)
        assert result == expected

    def test_convert_pandas_type_to_json_period_range(self):
        arr = pd.period_range("2016", freq="Y-DEC", periods=4)
        result = convert_pandas_type_to_json_field(arr)
        expected = {"name": "values", "type": "datetime", "freq": "YE-DEC"}
        assert result == expected

    @pytest.mark.parametrize("kind", [pd.Categorical, pd.CategoricalIndex])
    @pytest.mark.parametrize("ordered", [True, False])
    def test_convert_pandas_type_to_json_field_categorical(self, kind, ordered):
        data = ["a", "b", "c"]
        if kind is pd.Categorical:
            arr = pd.Series(kind(data, ordered=ordered), name="cats")
        elif kind is pd.CategoricalIndex:
            arr = kind(data, ordered=ordered, name="cats")

        result = convert_pandas_type_to_json_field(arr)
        expected = {
            "name": "cats",
            "type": "any",
            "constraints": {"enum": data},
            "ordered": ordered,
        }
        assert result == expected

    @pytest.mark.parametrize(
        "inp,exp",
        [
            ({"type": "integer"}, "int64"),
            ({"type": "number"}, "float64"),
            ({"type": "boolean"}, "bool"),
            ({"type": "duration"}, "timedelta64"),
            ({"type": "datetime"}, "datetime64[ns]"),
            ({"type": "datetime", "tz": "US/Hawaii"}, "datetime64[ns, US/Hawaii]"),
            ({"type": "any"}, "object"),
            (
                {
                    "type": "any",
                    "constraints": {"enum": ["a", "b", "c"]},
                    "ordered": False,
                },
                CategoricalDtype(categories=["a", "b", "c"], ordered=False),
            ),
            (
                {
                    "type": "any",
                    "constraints": {"enum": ["a", "b", "c"]},
                    "ordered": True,
                },
                CategoricalDtype(categories=["a", "b", "c"], ordered=True),
            ),
            ({"type": "string"}, None),
        ],
    )
    def test_convert_json_field_to_pandas_type(self, inp, exp):
        field = {"name": "foo"}
        field.update(inp)
        assert convert_json_field_to_pandas_type(field) == exp

    @pytest.mark.parametrize("inp", ["geopoint", "geojson", "fake_type"])
    def test_convert_json_field_to_pandas_type_raises(self, inp):
        field = {"type": inp}
        with pytest.raises(
            ValueError, match=f"Unsupported or invalid field type: {inp}"
        ):
            convert_json_field_to_pandas_type(field)

    def test_categorical(self):
        s = pd.Series(pd.Categorical(["a", "b", "a"]))
        s.index.name = "idx"
        result = s.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result["schema"].pop("pandas_version")

        fields = [
            {"name": "idx", "type": "integer"},
            {
                "constraints": {"enum": ["a", "b"]},
                "name": "values",
                "ordered": False,
                "type": "any",
            },
        ]

        expected = OrderedDict(
            [
                ("schema", {"fields": fields, "primaryKey": ["idx"]}),
                (
                    "data",
                    [
                        OrderedDict([("idx", 0), ("values", "a")]),
                        OrderedDict([("idx", 1), ("values", "b")]),
                        OrderedDict([("idx", 2), ("values", "a")]),
                    ],
                ),
            ]
        )

        assert result == expected

    @pytest.mark.parametrize(
        "idx,nm,prop",
        [
            (pd.Index([1]), "index", "name"),
            (pd.Index([1], name="myname"), "myname", "name"),
            (
                pd.MultiIndex.from_product([("a", "b"), ("c", "d")]),
                ["level_0", "level_1"],
                "names",
            ),
            (
                pd.MultiIndex.from_product(
                    [("a", "b"), ("c", "d")], names=["n1", "n2"]
                ),
                ["n1", "n2"],
                "names",
            ),
            (
                pd.MultiIndex.from_product(
                    [("a", "b"), ("c", "d")], names=["n1", None]
                ),
                ["n1", "level_1"],
                "names",
            ),
        ],
    )
    def test_set_names_unset(self, idx, nm, prop):
        data = pd.Series(1, idx)
        result = set_default_names(data)
        assert getattr(result.index, prop) == nm

    @pytest.mark.parametrize(
        "idx",
        [
            pd.Index([], name="index"),
            pd.MultiIndex.from_arrays([["foo"], ["bar"]], names=("level_0", "level_1")),
            pd.MultiIndex.from_arrays([["foo"], ["bar"]], names=("foo", "level_1")),
        ],
    )
    def test_warns_non_roundtrippable_names(self, idx):
        # GH 19130
        df = DataFrame(index=idx)
        df.index.name = "index"
        with tm.assert_produces_warning():
            set_default_names(df)

    def test_timestamp_in_columns(self):
        df = DataFrame(
            [[1, 2]], columns=[pd.Timestamp("2016"), pd.Timedelta(10, unit="s")]
        )
        result = df.to_json(orient="table")
        js = json.loads(result)
        assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000"
        assert js["schema"]["fields"][2]["name"] == "P0DT0H0M10S"

    @pytest.mark.parametrize(
        "case",
        [
            pd.Series([1], index=pd.Index([1], name="a"), name="a"),
            DataFrame({"A": [1]}, index=pd.Index([1], name="A")),
            DataFrame(
                {"A": [1]},
                index=pd.MultiIndex.from_arrays([["a"], [1]], names=["A", "a"]),
            ),
        ],
    )
    def test_overlapping_names(self, case):
        with pytest.raises(ValueError, match="Overlapping"):
            case.to_json(orient="table")

    def test_mi_falsey_name(self):
        # GH 16203
        df = DataFrame(
            np.random.default_rng(2).standard_normal((4, 4)),
            index=pd.MultiIndex.from_product([("A", "B"), ("a", "b")]),
        )
        result = [x["name"] for x in build_table_schema(df)["fields"]]
        assert result == ["level_0", "level_1", 0, 1, 2, 3]


class TestTableOrientReader:
    @pytest.mark.parametrize(
        "index_nm",
        [None, "idx", pytest.param("index", marks=pytest.mark.xfail), "level_0"],
    )
    @pytest.mark.parametrize(
        "vals",
        [
            {"ints": [1, 2, 3, 4]},
            {"objects": ["a", "b", "c", "d"]},
            {"objects": ["1", "2", "3", "4"]},
            {"date_ranges": pd.date_range("2016-01-01", freq="d", periods=4)},
            {"categoricals": pd.Series(pd.Categorical(["a", "b", "c", "c"]))},
            {
                "ordered_cats": pd.Series(
                    pd.Categorical(["a", "b", "c", "c"], ordered=True)
                )
            },
            {"floats": [1.0, 2.0, 3.0, 4.0]},
            {"floats": [1.1, 2.2, 3.3, 4.4]},
            {"bools": [True, False, False, True]},
            {
                "timezones": pd.date_range(
                    "2016-01-01", freq="d", periods=4, tz="US/Central"
                )  # added in GH 35973
            },
        ],
    )
    def test_read_json_table_orient(self, index_nm, vals, recwarn):
        df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
        out = df.to_json(orient="table")
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)

    @pytest.mark.parametrize("index_nm", [None, "idx", "index"])
    @pytest.mark.parametrize(
        "vals",
        [{"timedeltas": pd.timedelta_range("1h", periods=4, freq="min")}],
    )
    def test_read_json_table_orient_raises(self, index_nm, vals, recwarn):
        df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
        out = df.to_json(orient="table")
        with pytest.raises(NotImplementedError, match="can not yet read "):
            pd.read_json(out, orient="table")

    @pytest.mark.parametrize(
        "index_nm",
        [None, "idx", pytest.param("index", marks=pytest.mark.xfail), "level_0"],
    )
    @pytest.mark.parametrize(
        "vals",
        [
            {"ints": [1, 2, 3, 4]},
            {"objects": ["a", "b", "c", "d"]},
            {"objects": ["1", "2", "3", "4"]},
            {"date_ranges": pd.date_range("2016-01-01", freq="d", periods=4)},
            {"categoricals": pd.Series(pd.Categorical(["a", "b", "c", "c"]))},
            {
                "ordered_cats": pd.Series(
                    pd.Categorical(["a", "b", "c", "c"], ordered=True)
                )
            },
            {"floats": [1.0, 2.0, 3.0, 4.0]},
            {"floats": [1.1, 2.2, 3.3, 4.4]},
            {"bools": [True, False, False, True]},
            {
                "timezones": pd.date_range(
                    "2016-01-01", freq="d", periods=4, tz="US/Central"
                )  # added in GH 35973
            },
        ],
    )
    def test_read_json_table_period_orient(self, index_nm, vals, recwarn):
        df = DataFrame(
            vals,
            index=pd.Index(
                (pd.Period(f"2022Q{q}") for q in range(1, 5)), name=index_nm
            ),
        )
        out = df.to_json(orient="table")
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)

    @pytest.mark.parametrize(
        "idx",
        [
            pd.Index(range(4)),
            pd.date_range(
                "2020-08-30",
                freq="d",
                periods=4,
            )._with_freq(None),
            pd.date_range(
                "2020-08-30", freq="d", periods=4, tz="US/Central"
            )._with_freq(None),
            pd.MultiIndex.from_product(
                [
                    pd.date_range("2020-08-30", freq="d", periods=2, tz="US/Central"),
                    ["x", "y"],
                ],
            ),
        ],
    )
    @pytest.mark.parametrize(
        "vals",
        [
            {"floats": [1.1, 2.2, 3.3, 4.4]},
            {"dates": pd.date_range("2020-08-30", freq="d", periods=4)},
            {
                "timezones": pd.date_range(
                    "2020-08-30", freq="d", periods=4, tz="Europe/London"
                )
            },
        ],
    )
    def test_read_json_table_timezones_orient(self, idx, vals, recwarn):
        # GH 35973
        df = DataFrame(vals, index=idx)
        out = df.to_json(orient="table")
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)

    def test_comprehensive(self):
        df = DataFrame(
            {
                "A": [1, 2, 3, 4],
                "B": ["a", "b", "c", "c"],
                "C": pd.date_range("2016-01-01", freq="d", periods=4),
                # 'D': pd.timedelta_range('1h', periods=4, freq='min'),
                "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])),
                "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)),
                "G": [1.1, 2.2, 3.3, 4.4],
                "H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"),
                "I": [True, False, False, True],
            },
            index=pd.Index(range(4), name="idx"),
        )

        out = StringIO(df.to_json(orient="table"))
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)

    @pytest.mark.parametrize(
        "index_names",
        [[None, None], ["foo", "bar"], ["foo", None], [None, "foo"], ["index", "foo"]],
    )
    def test_multiindex(self, index_names):
        # GH 18912
        df = DataFrame(
            [["Arr", "alpha", [1, 2, 3, 4]], ["Bee", "Beta", [10, 20, 30, 40]]],
            index=[["A", "B"], ["Null", "Eins"]],
            columns=["Aussprache", "Griechisch", "Args"],
        )
        df.index.names = index_names
        out = StringIO(df.to_json(orient="table"))
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)

    def test_empty_frame_roundtrip(self):
        # GH 21287
        df = DataFrame(columns=["a", "b", "c"])
        expected = df.copy()
        out = StringIO(df.to_json(orient="table"))
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(expected, result)

    def test_read_json_orient_table_old_schema_version(self):
        df_json = """
        {
            "schema":{
                "fields":[
                    {"name":"index","type":"integer"},
                    {"name":"a","type":"string"}
                ],
                "primaryKey":["index"],
                "pandas_version":"0.20.0"
            },
            "data":[
                {"index":0,"a":1},
                {"index":1,"a":2.0},
                {"index":2,"a":"s"}
            ]
        }
        """
        expected = DataFrame({"a": [1, 2.0, "s"]})
        result = pd.read_json(StringIO(df_json), orient="table")
        tm.assert_frame_equal(expected, result)

    @pytest.mark.parametrize("freq", ["M", "2M", "Q", "2Q", "Y", "2Y"])
    def test_read_json_table_orient_period_depr_freq(self, freq, recwarn):
        # GH#9586
        df = DataFrame(
            {"ints": [1, 2]},
            index=pd.PeriodIndex(["2020-01", "2021-06"], freq=freq),
        )
        out = df.to_json(orient="table")
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)
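
The round-trips above work because orient="table" embeds a Table Schema next
to the data, letting read_json restore dtypes instead of guessing them. A
condensed sketch (illustrative, not part of this commit):

    from io import StringIO

    import pandas as pd

    df = pd.DataFrame({"a": pd.Categorical(["x", "y"])})
    payload = df.to_json(orient="table")
    restored = pd.read_json(StringIO(payload), orient="table")
    assert restored["a"].dtype == df["a"].dtype  # CategoricalDtype survives
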
| @ -0,0 +1,317 @@ | ||||
| """Tests for ExtensionDtype Table Schema integration.""" | ||||
|  | ||||
| from collections import OrderedDict | ||||
| import datetime as dt | ||||
| import decimal | ||||
| from io import StringIO | ||||
| import json | ||||
|  | ||||
| import pytest | ||||
|  | ||||
| from pandas import ( | ||||
|     NA, | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     array, | ||||
|     read_json, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
| from pandas.core.arrays.integer import Int64Dtype | ||||
| from pandas.core.arrays.string_ import StringDtype | ||||
| from pandas.core.series import Series | ||||
| from pandas.tests.extension.date import ( | ||||
|     DateArray, | ||||
|     DateDtype, | ||||
| ) | ||||
| from pandas.tests.extension.decimal.array import ( | ||||
|     DecimalArray, | ||||
|     DecimalDtype, | ||||
| ) | ||||
|  | ||||
| from pandas.io.json._table_schema import ( | ||||
|     as_json_table_type, | ||||
|     build_table_schema, | ||||
| ) | ||||
|  | ||||
|  | ||||
| class TestBuildSchema: | ||||
|     def test_build_table_schema(self): | ||||
|         df = DataFrame( | ||||
|             { | ||||
|                 "A": DateArray([dt.date(2021, 10, 10)]), | ||||
|                 "B": DecimalArray([decimal.Decimal(10)]), | ||||
|                 "C": array(["pandas"], dtype="string"), | ||||
|                 "D": array([10], dtype="Int64"), | ||||
|             } | ||||
|         ) | ||||
|         result = build_table_schema(df, version=False) | ||||
|         expected = { | ||||
|             "fields": [ | ||||
|                 {"name": "index", "type": "integer"}, | ||||
|                 {"name": "A", "type": "any", "extDtype": "DateDtype"}, | ||||
|                 {"name": "B", "type": "number", "extDtype": "decimal"}, | ||||
|                 {"name": "C", "type": "string", "extDtype": "string"}, | ||||
|                 {"name": "D", "type": "integer", "extDtype": "Int64"}, | ||||
|             ], | ||||
|             "primaryKey": ["index"], | ||||
|         } | ||||
|         assert result == expected | ||||
|         result = build_table_schema(df) | ||||
|         assert "pandas_version" in result | ||||
|  | ||||
|  | ||||
| class TestTableSchemaType: | ||||
|     @pytest.mark.parametrize( | ||||
|         "date_data", | ||||
|         [ | ||||
|             DateArray([dt.date(2021, 10, 10)]), | ||||
|             DateArray(dt.date(2021, 10, 10)), | ||||
|             Series(DateArray(dt.date(2021, 10, 10))), | ||||
|         ], | ||||
|     ) | ||||
|     def test_as_json_table_type_ext_date_array_dtype(self, date_data): | ||||
|         assert as_json_table_type(date_data.dtype) == "any" | ||||
|  | ||||
|     def test_as_json_table_type_ext_date_dtype(self): | ||||
|         assert as_json_table_type(DateDtype()) == "any" | ||||
|  | ||||
|     @pytest.mark.parametrize( | ||||
|         "decimal_data", | ||||
|         [ | ||||
|             DecimalArray([decimal.Decimal(10)]), | ||||
|             Series(DecimalArray([decimal.Decimal(10)])), | ||||
|         ], | ||||
|     ) | ||||
|     def test_as_json_table_type_ext_decimal_array_dtype(self, decimal_data): | ||||
|         assert as_json_table_type(decimal_data.dtype) == "number" | ||||
|  | ||||
|     def test_as_json_table_type_ext_decimal_dtype(self): | ||||
|         assert as_json_table_type(DecimalDtype()) == "number" | ||||
|  | ||||
|     @pytest.mark.parametrize( | ||||
|         "string_data", | ||||
|         [ | ||||
|             array(["pandas"], dtype="string"), | ||||
|             Series(array(["pandas"], dtype="string")), | ||||
|         ], | ||||
|     ) | ||||
|     def test_as_json_table_type_ext_string_array_dtype(self, string_data): | ||||
|         assert as_json_table_type(string_data.dtype) == "string" | ||||
|  | ||||
|     def test_as_json_table_type_ext_string_dtype(self): | ||||
|         assert as_json_table_type(StringDtype()) == "string" | ||||
|  | ||||
|     @pytest.mark.parametrize( | ||||
|         "integer_data", | ||||
|         [ | ||||
|             array([10], dtype="Int64"), | ||||
|             Series(array([10], dtype="Int64")), | ||||
|         ], | ||||
|     ) | ||||
|     def test_as_json_table_type_ext_integer_array_dtype(self, integer_data): | ||||
|         assert as_json_table_type(integer_data.dtype) == "integer" | ||||
|  | ||||
|     def test_as_json_table_type_ext_integer_dtype(self): | ||||
|         assert as_json_table_type(Int64Dtype()) == "integer" | ||||
|  | ||||
|  | ||||
| class TestTableOrient: | ||||
|     @pytest.fixture | ||||
|     def da(self): | ||||
|         return DateArray([dt.date(2021, 10, 10)]) | ||||
|  | ||||
|     @pytest.fixture | ||||
|     def dc(self): | ||||
|         return DecimalArray([decimal.Decimal(10)]) | ||||
|  | ||||
|     @pytest.fixture | ||||
|     def sa(self): | ||||
|         return array(["pandas"], dtype="string") | ||||
|  | ||||
|     @pytest.fixture | ||||
|     def ia(self): | ||||
|         return array([10], dtype="Int64") | ||||
|  | ||||
|     @pytest.fixture | ||||
|     def df(self, da, dc, sa, ia): | ||||
|         return DataFrame( | ||||
|             { | ||||
|                 "A": da, | ||||
|                 "B": dc, | ||||
|                 "C": sa, | ||||
|                 "D": ia, | ||||
|             } | ||||
|         ) | ||||
|  | ||||
|     def test_build_date_series(self, da): | ||||
|         s = Series(da, name="a") | ||||
|         s.index.name = "id" | ||||
|         result = s.to_json(orient="table", date_format="iso") | ||||
|         result = json.loads(result, object_pairs_hook=OrderedDict) | ||||
|  | ||||
|         assert "pandas_version" in result["schema"] | ||||
|         result["schema"].pop("pandas_version") | ||||
|  | ||||
|         fields = [ | ||||
|             {"name": "id", "type": "integer"}, | ||||
|             {"name": "a", "type": "any", "extDtype": "DateDtype"}, | ||||
|         ] | ||||
|  | ||||
|         schema = {"fields": fields, "primaryKey": ["id"]} | ||||
|  | ||||
|         expected = OrderedDict( | ||||
|             [ | ||||
|                 ("schema", schema), | ||||
|                 ("data", [OrderedDict([("id", 0), ("a", "2021-10-10T00:00:00.000")])]), | ||||
|             ] | ||||
|         ) | ||||
|  | ||||
|         assert result == expected | ||||
|  | ||||
|     def test_build_decimal_series(self, dc): | ||||
|         s = Series(dc, name="a") | ||||
|         s.index.name = "id" | ||||
|         result = s.to_json(orient="table", date_format="iso") | ||||
|         result = json.loads(result, object_pairs_hook=OrderedDict) | ||||
|  | ||||
|         assert "pandas_version" in result["schema"] | ||||
|         result["schema"].pop("pandas_version") | ||||
|  | ||||
|         fields = [ | ||||
|             {"name": "id", "type": "integer"}, | ||||
|             {"name": "a", "type": "number", "extDtype": "decimal"}, | ||||
|         ] | ||||
|  | ||||
|         schema = {"fields": fields, "primaryKey": ["id"]} | ||||
|  | ||||
|         expected = OrderedDict( | ||||
|             [ | ||||
|                 ("schema", schema), | ||||
|                 ("data", [OrderedDict([("id", 0), ("a", 10.0)])]), | ||||
|             ] | ||||
|         ) | ||||
|  | ||||
|         assert result == expected | ||||
|  | ||||
|     def test_build_string_series(self, sa): | ||||
|         s = Series(sa, name="a") | ||||
|         s.index.name = "id" | ||||
|         result = s.to_json(orient="table", date_format="iso") | ||||
|         result = json.loads(result, object_pairs_hook=OrderedDict) | ||||
|  | ||||
|         assert "pandas_version" in result["schema"] | ||||
|         result["schema"].pop("pandas_version") | ||||
|  | ||||
|         fields = [ | ||||
|             {"name": "id", "type": "integer"}, | ||||
|             {"name": "a", "type": "string", "extDtype": "string"}, | ||||
|         ] | ||||
|  | ||||
|         schema = {"fields": fields, "primaryKey": ["id"]} | ||||
|  | ||||
|         expected = OrderedDict( | ||||
|             [ | ||||
|                 ("schema", schema), | ||||
|                 ("data", [OrderedDict([("id", 0), ("a", "pandas")])]), | ||||
|             ] | ||||
|         ) | ||||
|  | ||||
|         assert result == expected | ||||
|  | ||||
|     def test_build_int64_series(self, ia): | ||||
|         s = Series(ia, name="a") | ||||
|         s.index.name = "id" | ||||
|         result = s.to_json(orient="table", date_format="iso") | ||||
|         result = json.loads(result, object_pairs_hook=OrderedDict) | ||||
|  | ||||
|         assert "pandas_version" in result["schema"] | ||||
|         result["schema"].pop("pandas_version") | ||||
|  | ||||
|         fields = [ | ||||
|             {"name": "id", "type": "integer"}, | ||||
|             {"name": "a", "type": "integer", "extDtype": "Int64"}, | ||||
|         ] | ||||
|  | ||||
|         schema = {"fields": fields, "primaryKey": ["id"]} | ||||
|  | ||||
|         expected = OrderedDict( | ||||
|             [ | ||||
|                 ("schema", schema), | ||||
|                 ("data", [OrderedDict([("id", 0), ("a", 10)])]), | ||||
|             ] | ||||
|         ) | ||||
|  | ||||
|         assert result == expected | ||||
|  | ||||
|     def test_to_json(self, df): | ||||
|         df = df.copy() | ||||
|         df.index.name = "idx" | ||||
|         result = df.to_json(orient="table", date_format="iso") | ||||
|         result = json.loads(result, object_pairs_hook=OrderedDict) | ||||
|  | ||||
|         assert "pandas_version" in result["schema"] | ||||
|         result["schema"].pop("pandas_version") | ||||
|  | ||||
|         fields = [ | ||||
|             OrderedDict({"name": "idx", "type": "integer"}), | ||||
|             OrderedDict({"name": "A", "type": "any", "extDtype": "DateDtype"}), | ||||
|             OrderedDict({"name": "B", "type": "number", "extDtype": "decimal"}), | ||||
|             OrderedDict({"name": "C", "type": "string", "extDtype": "string"}), | ||||
|             OrderedDict({"name": "D", "type": "integer", "extDtype": "Int64"}), | ||||
|         ] | ||||
|  | ||||
|         schema = OrderedDict({"fields": fields, "primaryKey": ["idx"]}) | ||||
|         data = [ | ||||
|             OrderedDict( | ||||
|                 [ | ||||
|                     ("idx", 0), | ||||
|                     ("A", "2021-10-10T00:00:00.000"), | ||||
|                     ("B", 10.0), | ||||
|                     ("C", "pandas"), | ||||
|                     ("D", 10), | ||||
|                 ] | ||||
|             ) | ||||
|         ] | ||||
|         expected = OrderedDict([("schema", schema), ("data", data)]) | ||||
|  | ||||
|         assert result == expected | ||||
|  | ||||
|     def test_json_ext_dtype_reading_roundtrip(self): | ||||
|         # GH#40255 | ||||
|         df = DataFrame( | ||||
|             { | ||||
|                 "a": Series([2, NA], dtype="Int64"), | ||||
|                 "b": Series([1.5, NA], dtype="Float64"), | ||||
|                 "c": Series([True, NA], dtype="boolean"), | ||||
|             }, | ||||
|             index=Index([1, NA], dtype="Int64"), | ||||
|         ) | ||||
|         expected = df.copy() | ||||
|         data_json = df.to_json(orient="table", indent=4) | ||||
|         result = read_json(StringIO(data_json), orient="table") | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
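|     # Note (descriptive): orient="table" records each column's "extDtype" in | ||||
|     # the schema, which is what lets read_json rebuild Int64/Float64/boolean | ||||
|     # columns instead of falling back to float64/object on the way back in. | ||||
|  | ||||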
|     def test_json_ext_dtype_reading(self): | ||||
|         # GH#40255 | ||||
|         data_json = """{ | ||||
|             "schema":{ | ||||
|                 "fields":[ | ||||
|                     { | ||||
|                         "name":"a", | ||||
|                         "type":"integer", | ||||
|                         "extDtype":"Int64" | ||||
|                     } | ||||
|                 ] | ||||
|             }, | ||||
|             "data":[ | ||||
|                 { | ||||
|                     "a":2 | ||||
|                 }, | ||||
|                 { | ||||
|                     "a":null | ||||
|                 } | ||||
|             ] | ||||
|         }""" | ||||
|         result = read_json(StringIO(data_json), orient="table") | ||||
|         expected = DataFrame({"a": Series([2, NA], dtype="Int64")}) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
| @ -0,0 +1,907 @@ | ||||
| import json | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     Index, | ||||
|     Series, | ||||
|     json_normalize, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
| from pandas.io.json._normalize import nested_to_record | ||||
|  | ||||
|  | ||||
| @pytest.fixture | ||||
| def deep_nested(): | ||||
|     # deeply nested data | ||||
|     return [ | ||||
|         { | ||||
|             "country": "USA", | ||||
|             "states": [ | ||||
|                 { | ||||
|                     "name": "California", | ||||
|                     "cities": [ | ||||
|                         {"name": "San Francisco", "pop": 12345}, | ||||
|                         {"name": "Los Angeles", "pop": 12346}, | ||||
|                     ], | ||||
|                 }, | ||||
|                 { | ||||
|                     "name": "Ohio", | ||||
|                     "cities": [ | ||||
|                         {"name": "Columbus", "pop": 1234}, | ||||
|                         {"name": "Cleveland", "pop": 1236}, | ||||
|                     ], | ||||
|                 }, | ||||
|             ], | ||||
|         }, | ||||
|         { | ||||
|             "country": "Germany", | ||||
|             "states": [ | ||||
|                 {"name": "Bayern", "cities": [{"name": "Munich", "pop": 12347}]}, | ||||
|                 { | ||||
|                     "name": "Nordrhein-Westfalen", | ||||
|                     "cities": [ | ||||
|                         {"name": "Duesseldorf", "pop": 1238}, | ||||
|                         {"name": "Koeln", "pop": 1239}, | ||||
|                     ], | ||||
|                 }, | ||||
|             ], | ||||
|         }, | ||||
|     ] | ||||
|  | ||||
|  | ||||
| @pytest.fixture | ||||
| def state_data(): | ||||
|     return [ | ||||
|         { | ||||
|             "counties": [ | ||||
|                 {"name": "Dade", "population": 12345}, | ||||
|                 {"name": "Broward", "population": 40000}, | ||||
|                 {"name": "Palm Beach", "population": 60000}, | ||||
|             ], | ||||
|             "info": {"governor": "Rick Scott"}, | ||||
|             "shortname": "FL", | ||||
|             "state": "Florida", | ||||
|         }, | ||||
|         { | ||||
|             "counties": [ | ||||
|                 {"name": "Summit", "population": 1234}, | ||||
|                 {"name": "Cuyahoga", "population": 1337}, | ||||
|             ], | ||||
|             "info": {"governor": "John Kasich"}, | ||||
|             "shortname": "OH", | ||||
|             "state": "Ohio", | ||||
|         }, | ||||
|     ] | ||||
|  | ||||
|  | ||||
| @pytest.fixture | ||||
| def author_missing_data(): | ||||
|     return [ | ||||
|         {"info": None}, | ||||
|         { | ||||
|             "info": {"created_at": "11/08/1993", "last_updated": "26/05/2012"}, | ||||
|             "author_name": {"first": "Jane", "last_name": "Doe"}, | ||||
|         }, | ||||
|     ] | ||||
|  | ||||
|  | ||||
| @pytest.fixture | ||||
| def missing_metadata(): | ||||
|     return [ | ||||
|         { | ||||
|             "name": "Alice", | ||||
|             "addresses": [ | ||||
|                 { | ||||
|                     "number": 9562, | ||||
|                     "street": "Morris St.", | ||||
|                     "city": "Massillon", | ||||
|                     "state": "OH", | ||||
|                     "zip": 44646, | ||||
|                 } | ||||
|             ], | ||||
|             "previous_residences": {"cities": [{"city_name": "Foo York City"}]}, | ||||
|         }, | ||||
|         { | ||||
|             "addresses": [ | ||||
|                 { | ||||
|                     "number": 8449, | ||||
|                     "street": "Spring St.", | ||||
|                     "city": "Elizabethton", | ||||
|                     "state": "TN", | ||||
|                     "zip": 37643, | ||||
|                 } | ||||
|             ], | ||||
|             "previous_residences": {"cities": [{"city_name": "Barmingham"}]}, | ||||
|         }, | ||||
|     ] | ||||
|  | ||||
|  | ||||
| @pytest.fixture | ||||
| def max_level_test_input_data(): | ||||
|     """ | ||||
|     input data to test json_normalize with max_level param | ||||
|     """ | ||||
|     return [ | ||||
|         { | ||||
|             "CreatedBy": {"Name": "User001"}, | ||||
|             "Lookup": { | ||||
|                 "TextField": "Some text", | ||||
|                 "UserField": {"Id": "ID001", "Name": "Name001"}, | ||||
|             }, | ||||
|             "Image": {"a": "b"}, | ||||
|         } | ||||
|     ] | ||||
|  | ||||
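| # Informal illustration of max_level: with max_level=1, flattening stops one | ||||
| # level down, so {"Lookup": {"UserField": {"Id": "ID001", ...}}} becomes a | ||||
| # "Lookup.UserField" key holding the still-nested user dict. | ||||
|  | ||||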
|  | ||||
| class TestJSONNormalize: | ||||
|     def test_simple_records(self): | ||||
|         recs = [ | ||||
|             {"a": 1, "b": 2, "c": 3}, | ||||
|             {"a": 4, "b": 5, "c": 6}, | ||||
|             {"a": 7, "b": 8, "c": 9}, | ||||
|             {"a": 10, "b": 11, "c": 12}, | ||||
|         ] | ||||
|  | ||||
|         result = json_normalize(recs) | ||||
|         expected = DataFrame(recs) | ||||
|  | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     def test_simple_normalize(self, state_data): | ||||
|         result = json_normalize(state_data[0], "counties") | ||||
|         expected = DataFrame(state_data[0]["counties"]) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|         result = json_normalize(state_data, "counties") | ||||
|  | ||||
|         expected = [] | ||||
|         for rec in state_data: | ||||
|             expected.extend(rec["counties"]) | ||||
|         expected = DataFrame(expected) | ||||
|  | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|         result = json_normalize(state_data, "counties", meta="state") | ||||
|         expected["state"] = np.array(["Florida", "Ohio"]).repeat([3, 2]) | ||||
|  | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
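|     # For orientation (informal): with meta="state", the result gains a | ||||
|     # "state" column whose parent value repeats across its counties, i.e. | ||||
|     # three "Florida" rows followed by two "Ohio" rows. | ||||
|  | ||||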
|     def test_fields_list_type_normalize(self): | ||||
|         parse_metadata_fields_list_type = [ | ||||
|             {"values": [1, 2, 3], "metadata": {"listdata": [1, 2]}} | ||||
|         ] | ||||
|         result = json_normalize( | ||||
|             parse_metadata_fields_list_type, | ||||
|             record_path=["values"], | ||||
|             meta=[["metadata", "listdata"]], | ||||
|         ) | ||||
|         expected = DataFrame( | ||||
|             {0: [1, 2, 3], "metadata.listdata": [[1, 2], [1, 2], [1, 2]]} | ||||
|         ) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     def test_empty_array(self): | ||||
|         result = json_normalize([]) | ||||
|         expected = DataFrame() | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     @pytest.mark.parametrize( | ||||
|         "data, record_path, exception_type", | ||||
|         [ | ||||
|             ([{"a": 0}, {"a": 1}], None, None), | ||||
|             ({"a": [{"a": 0}, {"a": 1}]}, "a", None), | ||||
|             ('{"a": [{"a": 0}, {"a": 1}]}', None, NotImplementedError), | ||||
|             (None, None, NotImplementedError), | ||||
|         ], | ||||
|     ) | ||||
|     def test_accepted_input(self, data, record_path, exception_type): | ||||
|         if exception_type is not None: | ||||
|             with pytest.raises(exception_type, match=""): | ||||
|                 json_normalize(data, record_path=record_path) | ||||
|         else: | ||||
|             result = json_normalize(data, record_path=record_path) | ||||
|             expected = DataFrame([0, 1], columns=["a"]) | ||||
|             tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     def test_simple_normalize_with_separator(self, deep_nested): | ||||
|         # GH 14883 | ||||
|         result = json_normalize({"A": {"A": 1, "B": 2}}) | ||||
|         expected = DataFrame([[1, 2]], columns=["A.A", "A.B"]) | ||||
|         tm.assert_frame_equal(result.reindex_like(expected), expected) | ||||
|  | ||||
|         result = json_normalize({"A": {"A": 1, "B": 2}}, sep="_") | ||||
|         expected = DataFrame([[1, 2]], columns=["A_A", "A_B"]) | ||||
|         tm.assert_frame_equal(result.reindex_like(expected), expected) | ||||
|  | ||||
|         result = json_normalize({"A": {"A": 1, "B": 2}}, sep="\u03c3") | ||||
|         expected = DataFrame([[1, 2]], columns=["A\u03c3A", "A\u03c3B"]) | ||||
|         tm.assert_frame_equal(result.reindex_like(expected), expected) | ||||
|  | ||||
|         result = json_normalize( | ||||
|             deep_nested, | ||||
|             ["states", "cities"], | ||||
|             meta=["country", ["states", "name"]], | ||||
|             sep="_", | ||||
|         ) | ||||
|         expected = Index(["name", "pop", "country", "states_name"]).sort_values() | ||||
|         assert result.columns.sort_values().equals(expected) | ||||
|  | ||||
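|     # Note (descriptive): sep applies both to flattened dict keys ("A_A") | ||||
|     # and to nested meta paths (["states", "name"] -> "states_name"). | ||||
|  | ||||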
|     def test_normalize_with_multichar_separator(self): | ||||
|         # GH #43831 | ||||
|         data = {"a": [1, 2], "b": {"b_1": 2, "b_2": (3, 4)}} | ||||
|         result = json_normalize(data, sep="__") | ||||
|         expected = DataFrame([[[1, 2], 2, (3, 4)]], columns=["a", "b__b_1", "b__b_2"]) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     def test_value_array_record_prefix(self): | ||||
|         # GH 21536 | ||||
|         result = json_normalize({"A": [1, 2]}, "A", record_prefix="Prefix.") | ||||
|         expected = DataFrame([[1], [2]], columns=["Prefix.0"]) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     def test_nested_object_record_path(self): | ||||
|         # GH 22706 | ||||
|         data = { | ||||
|             "state": "Florida", | ||||
|             "info": { | ||||
|                 "governor": "Rick Scott", | ||||
|                 "counties": [ | ||||
|                     {"name": "Dade", "population": 12345}, | ||||
|                     {"name": "Broward", "population": 40000}, | ||||
|                     {"name": "Palm Beach", "population": 60000}, | ||||
|                 ], | ||||
|             }, | ||||
|         } | ||||
|         result = json_normalize(data, record_path=["info", "counties"]) | ||||
|         expected = DataFrame( | ||||
|             [["Dade", 12345], ["Broward", 40000], ["Palm Beach", 60000]], | ||||
|             columns=["name", "population"], | ||||
|         ) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     def test_more_deeply_nested(self, deep_nested): | ||||
|         result = json_normalize( | ||||
|             deep_nested, ["states", "cities"], meta=["country", ["states", "name"]] | ||||
|         ) | ||||
|         ex_data = { | ||||
|             "country": ["USA"] * 4 + ["Germany"] * 3, | ||||
|             "states.name": [ | ||||
|                 "California", | ||||
|                 "California", | ||||
|                 "Ohio", | ||||
|                 "Ohio", | ||||
|                 "Bayern", | ||||
|                 "Nordrhein-Westfalen", | ||||
|                 "Nordrhein-Westfalen", | ||||
|             ], | ||||
|             "name": [ | ||||
|                 "San Francisco", | ||||
|                 "Los Angeles", | ||||
|                 "Columbus", | ||||
|                 "Cleveland", | ||||
|                 "Munich", | ||||
|                 "Duesseldorf", | ||||
|                 "Koeln", | ||||
|             ], | ||||
|             "pop": [12345, 12346, 1234, 1236, 12347, 1238, 1239], | ||||
|         } | ||||
|  | ||||
|         expected = DataFrame(ex_data, columns=result.columns) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     def test_shallow_nested(self): | ||||
|         data = [ | ||||
|             { | ||||
|                 "state": "Florida", | ||||
|                 "shortname": "FL", | ||||
|                 "info": {"governor": "Rick Scott"}, | ||||
|                 "counties": [ | ||||
|                     {"name": "Dade", "population": 12345}, | ||||
|                     {"name": "Broward", "population": 40000}, | ||||
|                     {"name": "Palm Beach", "population": 60000}, | ||||
|                 ], | ||||
|             }, | ||||
|             { | ||||
|                 "state": "Ohio", | ||||
|                 "shortname": "OH", | ||||
|                 "info": {"governor": "John Kasich"}, | ||||
|                 "counties": [ | ||||
|                     {"name": "Summit", "population": 1234}, | ||||
|                     {"name": "Cuyahoga", "population": 1337}, | ||||
|                 ], | ||||
|             }, | ||||
|         ] | ||||
|  | ||||
|         result = json_normalize( | ||||
|             data, "counties", ["state", "shortname", ["info", "governor"]] | ||||
|         ) | ||||
|         ex_data = { | ||||
|             "name": ["Dade", "Broward", "Palm Beach", "Summit", "Cuyahoga"], | ||||
|             "state": ["Florida"] * 3 + ["Ohio"] * 2, | ||||
|             "shortname": ["FL", "FL", "FL", "OH", "OH"], | ||||
|             "info.governor": ["Rick Scott"] * 3 + ["John Kasich"] * 2, | ||||
|             "population": [12345, 40000, 60000, 1234, 1337], | ||||
|         } | ||||
|         expected = DataFrame(ex_data, columns=result.columns) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     def test_nested_meta_path_with_nested_record_path(self, state_data): | ||||
|         # GH 27220 | ||||
|         result = json_normalize( | ||||
|             data=state_data, | ||||
|             record_path=["counties"], | ||||
|             meta=["state", "shortname", ["info", "governor"]], | ||||
|             errors="ignore", | ||||
|         ) | ||||
|  | ||||
|         ex_data = { | ||||
|             "name": ["Dade", "Broward", "Palm Beach", "Summit", "Cuyahoga"], | ||||
|             "population": [12345, 40000, 60000, 1234, 1337], | ||||
|             "state": ["Florida"] * 3 + ["Ohio"] * 2, | ||||
|             "shortname": ["FL"] * 3 + ["OH"] * 2, | ||||
|             "info.governor": ["Rick Scott"] * 3 + ["John Kasich"] * 2, | ||||
|         } | ||||
|  | ||||
|         expected = DataFrame(ex_data) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     def test_meta_name_conflict(self): | ||||
|         data = [ | ||||
|             { | ||||
|                 "foo": "hello", | ||||
|                 "bar": "there", | ||||
|                 "data": [ | ||||
|                     {"foo": "something", "bar": "else"}, | ||||
|                     {"foo": "something2", "bar": "else2"}, | ||||
|                 ], | ||||
|             } | ||||
|         ] | ||||
|  | ||||
|         msg = r"Conflicting metadata name (foo|bar), need distinguishing prefix" | ||||
|         with pytest.raises(ValueError, match=msg): | ||||
|             json_normalize(data, "data", meta=["foo", "bar"]) | ||||
|  | ||||
|         result = json_normalize(data, "data", meta=["foo", "bar"], meta_prefix="meta") | ||||
|  | ||||
|         for val in ["metafoo", "metabar", "foo", "bar"]: | ||||
|             assert val in result | ||||
|  | ||||
|     def test_meta_parameter_not_modified(self): | ||||
|         # GH 18610 | ||||
|         data = [ | ||||
|             { | ||||
|                 "foo": "hello", | ||||
|                 "bar": "there", | ||||
|                 "data": [ | ||||
|                     {"foo": "something", "bar": "else"}, | ||||
|                     {"foo": "something2", "bar": "else2"}, | ||||
|                 ], | ||||
|             } | ||||
|         ] | ||||
|  | ||||
|         COLUMNS = ["foo", "bar"] | ||||
|         result = json_normalize(data, "data", meta=COLUMNS, meta_prefix="meta") | ||||
|  | ||||
|         assert COLUMNS == ["foo", "bar"] | ||||
|         for val in ["metafoo", "metabar", "foo", "bar"]: | ||||
|             assert val in result | ||||
|  | ||||
|     def test_record_prefix(self, state_data): | ||||
|         result = json_normalize(state_data[0], "counties") | ||||
|         expected = DataFrame(state_data[0]["counties"]) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|         result = json_normalize( | ||||
|             state_data, "counties", meta="state", record_prefix="county_" | ||||
|         ) | ||||
|  | ||||
|         expected = [] | ||||
|         for rec in state_data: | ||||
|             expected.extend(rec["counties"]) | ||||
|         expected = DataFrame(expected) | ||||
|         expected = expected.rename(columns=lambda x: "county_" + x) | ||||
|         expected["state"] = np.array(["Florida", "Ohio"]).repeat([3, 2]) | ||||
|  | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     def test_non_ascii_key(self): | ||||
|         testjson = ( | ||||
|             b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},' | ||||
|             b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]' | ||||
|         ).decode("utf8") | ||||
|  | ||||
|         testdata = { | ||||
|             b"\xc3\x9cnic\xc3\xb8de".decode("utf8"): [0, 1], | ||||
|             "sub.A": [1, 3], | ||||
|             "sub.B": [2, 4], | ||||
|         } | ||||
|         expected = DataFrame(testdata) | ||||
|  | ||||
|         result = json_normalize(json.loads(testjson)) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     def test_missing_field(self, author_missing_data): | ||||
|         # GH20030: | ||||
|         result = json_normalize(author_missing_data) | ||||
|         ex_data = [ | ||||
|             { | ||||
|                 "info": np.nan, | ||||
|                 "info.created_at": np.nan, | ||||
|                 "info.last_updated": np.nan, | ||||
|                 "author_name.first": np.nan, | ||||
|                 "author_name.last_name": np.nan, | ||||
|             }, | ||||
|             { | ||||
|                 "info": None, | ||||
|                 "info.created_at": "11/08/1993", | ||||
|                 "info.last_updated": "26/05/2012", | ||||
|                 "author_name.first": "Jane", | ||||
|                 "author_name.last_name": "Doe", | ||||
|             }, | ||||
|         ] | ||||
|         expected = DataFrame(ex_data) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     @pytest.mark.parametrize( | ||||
|         "max_level,expected", | ||||
|         [ | ||||
|             ( | ||||
|                 0, | ||||
|                 [ | ||||
|                     { | ||||
|                         "TextField": "Some text", | ||||
|                         "UserField": {"Id": "ID001", "Name": "Name001"}, | ||||
|                         "CreatedBy": {"Name": "User001"}, | ||||
|                         "Image": {"a": "b"}, | ||||
|                     }, | ||||
|                     { | ||||
|                         "TextField": "Some text", | ||||
|                         "UserField": {"Id": "ID001", "Name": "Name001"}, | ||||
|                         "CreatedBy": {"Name": "User001"}, | ||||
|                         "Image": {"a": "b"}, | ||||
|                     }, | ||||
|                 ], | ||||
|             ), | ||||
|             ( | ||||
|                 1, | ||||
|                 [ | ||||
|                     { | ||||
|                         "TextField": "Some text", | ||||
|                         "UserField.Id": "ID001", | ||||
|                         "UserField.Name": "Name001", | ||||
|                         "CreatedBy": {"Name": "User001"}, | ||||
|                         "Image": {"a": "b"}, | ||||
|                     }, | ||||
|                     { | ||||
|                         "TextField": "Some text", | ||||
|                         "UserField.Id": "ID001", | ||||
|                         "UserField.Name": "Name001", | ||||
|                         "CreatedBy": {"Name": "User001"}, | ||||
|                         "Image": {"a": "b"}, | ||||
|                     }, | ||||
|                 ], | ||||
|             ), | ||||
|         ], | ||||
|     ) | ||||
|     def test_max_level_with_records_path(self, max_level, expected): | ||||
|         # GH23843: Enhanced JSON normalize | ||||
|         test_input = [ | ||||
|             { | ||||
|                 "CreatedBy": {"Name": "User001"}, | ||||
|                 "Lookup": [ | ||||
|                     { | ||||
|                         "TextField": "Some text", | ||||
|                         "UserField": {"Id": "ID001", "Name": "Name001"}, | ||||
|                     }, | ||||
|                     { | ||||
|                         "TextField": "Some text", | ||||
|                         "UserField": {"Id": "ID001", "Name": "Name001"}, | ||||
|                     }, | ||||
|                 ], | ||||
|                 "Image": {"a": "b"}, | ||||
|                 "tags": [ | ||||
|                     {"foo": "something", "bar": "else"}, | ||||
|                     {"foo": "something2", "bar": "else2"}, | ||||
|                 ], | ||||
|             } | ||||
|         ] | ||||
|  | ||||
|         result = json_normalize( | ||||
|             test_input, | ||||
|             record_path=["Lookup"], | ||||
|             meta=[["CreatedBy"], ["Image"]], | ||||
|             max_level=max_level, | ||||
|         ) | ||||
|         expected_df = DataFrame(data=expected, columns=result.columns.values) | ||||
|         tm.assert_equal(expected_df, result) | ||||
|  | ||||
|     def test_nested_flattening_consistent(self): | ||||
|         # see gh-21537 | ||||
|         df1 = json_normalize([{"A": {"B": 1}}]) | ||||
|         df2 = json_normalize({"dummy": [{"A": {"B": 1}}]}, "dummy") | ||||
|  | ||||
|         # They should be the same. | ||||
|         tm.assert_frame_equal(df1, df2) | ||||
|  | ||||
|     def test_nonetype_record_path(self, nulls_fixture): | ||||
|         # see gh-30148 | ||||
|         # should not raise TypeError | ||||
|         result = json_normalize( | ||||
|             [ | ||||
|                 {"state": "Texas", "info": nulls_fixture}, | ||||
|                 {"state": "Florida", "info": [{"i": 2}]}, | ||||
|             ], | ||||
|             record_path=["info"], | ||||
|         ) | ||||
|         expected = DataFrame({"i": 2}, index=[0]) | ||||
|         tm.assert_equal(result, expected) | ||||
|  | ||||
|     @pytest.mark.parametrize("value", ["false", "true", "{}", "1", '"text"']) | ||||
|     def test_non_list_record_path_errors(self, value): | ||||
|         # see gh-30148, GH 26284 | ||||
|         parsed_value = json.loads(value) | ||||
|         test_input = {"state": "Texas", "info": parsed_value} | ||||
|         test_path = "info" | ||||
|         msg = ( | ||||
|             f"{test_input} has non list value {parsed_value} for path {test_path}. " | ||||
|             "Must be list or null." | ||||
|         ) | ||||
|         with pytest.raises(TypeError, match=msg): | ||||
|             json_normalize([test_input], record_path=[test_path]) | ||||
|  | ||||
|     def test_meta_non_iterable(self): | ||||
|         # GH 31507 | ||||
|         data = """[{"id": 99, "data": [{"one": 1, "two": 2}]}]""" | ||||
|  | ||||
|         result = json_normalize(json.loads(data), record_path=["data"], meta=["id"]) | ||||
|         expected = DataFrame( | ||||
|             {"one": [1], "two": [2], "id": np.array([99], dtype=object)} | ||||
|         ) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     def test_generator(self, state_data): | ||||
|         # GH35923 Fix pd.json_normalize to not skip the first element of a | ||||
|         # generator input | ||||
|         def generator_data(): | ||||
|             yield from state_data[0]["counties"] | ||||
|  | ||||
|         result = json_normalize(generator_data()) | ||||
|         expected = DataFrame(state_data[0]["counties"]) | ||||
|  | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     def test_top_column_with_leading_underscore(self): | ||||
|         # GH 49861 | ||||
|         data = {"_id": {"a1": 10, "l2": {"l3": 0}}, "gg": 4} | ||||
|         result = json_normalize(data, sep="_") | ||||
|         expected = DataFrame([[4, 10, 0]], columns=["gg", "_id_a1", "_id_l2_l3"]) | ||||
|  | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| class TestNestedToRecord: | ||||
|     def test_flat_stays_flat(self): | ||||
|         recs = [{"flat1": 1, "flat2": 2}, {"flat3": 3, "flat2": 4}] | ||||
|         result = nested_to_record(recs) | ||||
|         expected = recs | ||||
|         assert result == expected | ||||
|  | ||||
|     def test_one_level_deep_flattens(self): | ||||
|         data = {"flat1": 1, "dict1": {"c": 1, "d": 2}} | ||||
|  | ||||
|         result = nested_to_record(data) | ||||
|         expected = {"dict1.c": 1, "dict1.d": 2, "flat1": 1} | ||||
|  | ||||
|         assert result == expected | ||||
|  | ||||
|     def test_nested_flattens(self): | ||||
|         data = { | ||||
|             "flat1": 1, | ||||
|             "dict1": {"c": 1, "d": 2}, | ||||
|             "nested": {"e": {"c": 1, "d": 2}, "d": 2}, | ||||
|         } | ||||
|  | ||||
|         result = nested_to_record(data) | ||||
|         expected = { | ||||
|             "dict1.c": 1, | ||||
|             "dict1.d": 2, | ||||
|             "flat1": 1, | ||||
|             "nested.d": 2, | ||||
|             "nested.e.c": 1, | ||||
|             "nested.e.d": 2, | ||||
|         } | ||||
|  | ||||
|         assert result == expected | ||||
|  | ||||
|     def test_json_normalize_errors(self, missing_metadata): | ||||
|         # GH14583: | ||||
|         # If meta keys are not always present a new option to set | ||||
|         # errors='ignore' has been implemented | ||||
|  | ||||
|         msg = ( | ||||
|             "Key 'name' not found. To replace missing values of " | ||||
|             "'name' with np.nan, pass in errors='ignore'" | ||||
|         ) | ||||
|         with pytest.raises(KeyError, match=msg): | ||||
|             json_normalize( | ||||
|                 data=missing_metadata, | ||||
|                 record_path="addresses", | ||||
|                 meta="name", | ||||
|                 errors="raise", | ||||
|             ) | ||||
|  | ||||
|     def test_missing_meta(self, missing_metadata): | ||||
|         # GH25468 | ||||
|         # If metadata is nullable with errors set to ignore, the null values | ||||
|         # should be numpy.nan values | ||||
|         result = json_normalize( | ||||
|             data=missing_metadata, record_path="addresses", meta="name", errors="ignore" | ||||
|         ) | ||||
|         ex_data = [ | ||||
|             [9562, "Morris St.", "Massillon", "OH", 44646, "Alice"], | ||||
|             [8449, "Spring St.", "Elizabethton", "TN", 37643, np.nan], | ||||
|         ] | ||||
|         columns = ["number", "street", "city", "state", "zip", "name"] | ||||
|         expected = DataFrame(ex_data, columns=columns) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     def test_missing_nested_meta(self): | ||||
|         # GH44312 | ||||
|         # If errors="ignore" and nested metadata is null, we should return nan | ||||
|         data = {"meta": "foo", "nested_meta": None, "value": [{"rec": 1}, {"rec": 2}]} | ||||
|         result = json_normalize( | ||||
|             data, | ||||
|             record_path="value", | ||||
|             meta=["meta", ["nested_meta", "leaf"]], | ||||
|             errors="ignore", | ||||
|         ) | ||||
|         ex_data = [[1, "foo", np.nan], [2, "foo", np.nan]] | ||||
|         columns = ["rec", "meta", "nested_meta.leaf"] | ||||
|         expected = DataFrame(ex_data, columns=columns).astype( | ||||
|             {"nested_meta.leaf": object} | ||||
|         ) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|         # If errors="raise" and nested metadata is null, we should raise with the | ||||
|         # key of the first missing level | ||||
|         with pytest.raises(KeyError, match="'leaf' not found"): | ||||
|             json_normalize( | ||||
|                 data, | ||||
|                 record_path="value", | ||||
|                 meta=["meta", ["nested_meta", "leaf"]], | ||||
|                 errors="raise", | ||||
|             ) | ||||
|  | ||||
|     def test_missing_meta_multilevel_record_path_errors_raise(self, missing_metadata): | ||||
|         # GH41876 | ||||
|         # Ensure errors='raise' works as intended even when a record_path of length | ||||
|         # greater than one is passed in | ||||
|         msg = ( | ||||
|             "Key 'name' not found. To replace missing values of " | ||||
|             "'name' with np.nan, pass in errors='ignore'" | ||||
|         ) | ||||
|         with pytest.raises(KeyError, match=msg): | ||||
|             json_normalize( | ||||
|                 data=missing_metadata, | ||||
|                 record_path=["previous_residences", "cities"], | ||||
|                 meta="name", | ||||
|                 errors="raise", | ||||
|             ) | ||||
|  | ||||
|     def test_missing_meta_multilevel_record_path_errors_ignore(self, missing_metadata): | ||||
|         # GH41876 | ||||
|         # Ensure errors='ignore' works as intended even when a record_path of length | ||||
|         # greater than one is passed in | ||||
|         result = json_normalize( | ||||
|             data=missing_metadata, | ||||
|             record_path=["previous_residences", "cities"], | ||||
|             meta="name", | ||||
|             errors="ignore", | ||||
|         ) | ||||
|         ex_data = [ | ||||
|             ["Foo York City", "Alice"], | ||||
|             ["Barmingham", np.nan], | ||||
|         ] | ||||
|         columns = ["city_name", "name"] | ||||
|         expected = DataFrame(ex_data, columns=columns) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     def test_donot_drop_nonevalues(self): | ||||
|         # GH21356 | ||||
|         data = [ | ||||
|             {"info": None, "author_name": {"first": "Smith", "last_name": "Appleseed"}}, | ||||
|             { | ||||
|                 "info": {"created_at": "11/08/1993", "last_updated": "26/05/2012"}, | ||||
|                 "author_name": {"first": "Jane", "last_name": "Doe"}, | ||||
|             }, | ||||
|         ] | ||||
|         result = nested_to_record(data) | ||||
|         expected = [ | ||||
|             { | ||||
|                 "info": None, | ||||
|                 "author_name.first": "Smith", | ||||
|                 "author_name.last_name": "Appleseed", | ||||
|             }, | ||||
|             { | ||||
|                 "author_name.first": "Jane", | ||||
|                 "author_name.last_name": "Doe", | ||||
|                 "info.created_at": "11/08/1993", | ||||
|                 "info.last_updated": "26/05/2012", | ||||
|             }, | ||||
|         ] | ||||
|  | ||||
|         assert result == expected | ||||
|  | ||||
|     def test_nonetype_top_level_bottom_level(self): | ||||
|         # GH21158: If inner level json has a key with a null value | ||||
|         # make sure it does not do a new_d.pop twice and except | ||||
|         data = { | ||||
|             "id": None, | ||||
|             "location": { | ||||
|                 "country": { | ||||
|                     "state": { | ||||
|                         "id": None, | ||||
|                         "town.info": { | ||||
|                             "id": None, | ||||
|                             "region": None, | ||||
|                             "x": 49.151580810546875, | ||||
|                             "y": -33.148521423339844, | ||||
|                             "z": 27.572303771972656, | ||||
|                         }, | ||||
|                     } | ||||
|                 } | ||||
|             }, | ||||
|         } | ||||
|         result = nested_to_record(data) | ||||
|         expected = { | ||||
|             "id": None, | ||||
|             "location.country.state.id": None, | ||||
|             "location.country.state.town.info.id": None, | ||||
|             "location.country.state.town.info.region": None, | ||||
|             "location.country.state.town.info.x": 49.151580810546875, | ||||
|             "location.country.state.town.info.y": -33.148521423339844, | ||||
|             "location.country.state.town.info.z": 27.572303771972656, | ||||
|         } | ||||
|         assert result == expected | ||||
|  | ||||
|     def test_nonetype_multiple_levels(self): | ||||
|         # GH21158: If inner level json has a key with a null value | ||||
|         # make sure it does not do a new_d.pop twice and except | ||||
|         data = { | ||||
|             "id": None, | ||||
|             "location": { | ||||
|                 "id": None, | ||||
|                 "country": { | ||||
|                     "id": None, | ||||
|                     "state": { | ||||
|                         "id": None, | ||||
|                         "town.info": { | ||||
|                             "region": None, | ||||
|                             "x": 49.151580810546875, | ||||
|                             "y": -33.148521423339844, | ||||
|                             "z": 27.572303771972656, | ||||
|                         }, | ||||
|                     }, | ||||
|                 }, | ||||
|             }, | ||||
|         } | ||||
|         result = nested_to_record(data) | ||||
|         expected = { | ||||
|             "id": None, | ||||
|             "location.id": None, | ||||
|             "location.country.id": None, | ||||
|             "location.country.state.id": None, | ||||
|             "location.country.state.town.info.region": None, | ||||
|             "location.country.state.town.info.x": 49.151580810546875, | ||||
|             "location.country.state.town.info.y": -33.148521423339844, | ||||
|             "location.country.state.town.info.z": 27.572303771972656, | ||||
|         } | ||||
|         assert result == expected | ||||
|  | ||||
|     @pytest.mark.parametrize( | ||||
|         "max_level, expected", | ||||
|         [ | ||||
|             ( | ||||
|                 None, | ||||
|                 [ | ||||
|                     { | ||||
|                         "CreatedBy.Name": "User001", | ||||
|                         "Lookup.TextField": "Some text", | ||||
|                         "Lookup.UserField.Id": "ID001", | ||||
|                         "Lookup.UserField.Name": "Name001", | ||||
|                         "Image.a": "b", | ||||
|                     } | ||||
|                 ], | ||||
|             ), | ||||
|             ( | ||||
|                 0, | ||||
|                 [ | ||||
|                     { | ||||
|                         "CreatedBy": {"Name": "User001"}, | ||||
|                         "Lookup": { | ||||
|                             "TextField": "Some text", | ||||
|                             "UserField": {"Id": "ID001", "Name": "Name001"}, | ||||
|                         }, | ||||
|                         "Image": {"a": "b"}, | ||||
|                     } | ||||
|                 ], | ||||
|             ), | ||||
|             ( | ||||
|                 1, | ||||
|                 [ | ||||
|                     { | ||||
|                         "CreatedBy.Name": "User001", | ||||
|                         "Lookup.TextField": "Some text", | ||||
|                         "Lookup.UserField": {"Id": "ID001", "Name": "Name001"}, | ||||
|                         "Image.a": "b", | ||||
|                     } | ||||
|                 ], | ||||
|             ), | ||||
|         ], | ||||
|     ) | ||||
|     def test_with_max_level(self, max_level, expected, max_level_test_input_data): | ||||
|         # GH23843: Enhanced JSON normalize | ||||
|         output = nested_to_record(max_level_test_input_data, max_level=max_level) | ||||
|         assert output == expected | ||||
|  | ||||
|     def test_with_large_max_level(self): | ||||
|         # GH23843: Enhanced JSON normalize | ||||
|         max_level = 100 | ||||
|         input_data = [ | ||||
|             { | ||||
|                 "CreatedBy": { | ||||
|                     "user": { | ||||
|                         "name": {"firstname": "Leo", "LastName": "Thomson"}, | ||||
|                         "family_tree": { | ||||
|                             "father": { | ||||
|                                 "name": "Father001", | ||||
|                                 "father": { | ||||
|                                     "Name": "Father002", | ||||
|                                     "father": { | ||||
|                                         "name": "Father003", | ||||
|                                         "father": {"Name": "Father004"}, | ||||
|                                     }, | ||||
|                                 }, | ||||
|                             } | ||||
|                         }, | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         ] | ||||
|         expected = [ | ||||
|             { | ||||
|                 "CreatedBy.user.name.firstname": "Leo", | ||||
|                 "CreatedBy.user.name.LastName": "Thomson", | ||||
|                 "CreatedBy.user.family_tree.father.name": "Father001", | ||||
|                 "CreatedBy.user.family_tree.father.father.Name": "Father002", | ||||
|                 "CreatedBy.user.family_tree.father.father.father.name": "Father003", | ||||
|                 "CreatedBy.user.family_tree.father.father.father.father.Name": "Father004",  # noqa: E501 | ||||
|             } | ||||
|         ] | ||||
|         output = nested_to_record(input_data, max_level=max_level) | ||||
|         assert output == expected | ||||
|  | ||||
|     def test_series_non_zero_index(self): | ||||
|         # GH 19020 | ||||
|         data = { | ||||
|             0: {"id": 1, "name": "Foo", "elements": {"a": 1}}, | ||||
|             1: {"id": 2, "name": "Bar", "elements": {"b": 2}}, | ||||
|             2: {"id": 3, "name": "Baz", "elements": {"c": 3}}, | ||||
|         } | ||||
|         s = Series(data) | ||||
|         s.index = [1, 2, 3] | ||||
|         result = json_normalize(s) | ||||
|         expected = DataFrame( | ||||
|             { | ||||
|                 "id": [1, 2, 3], | ||||
|                 "name": ["Foo", "Bar", "Baz"], | ||||
|                 "elements.a": [1.0, np.nan, np.nan], | ||||
|                 "elements.b": [np.nan, 2.0, np.nan], | ||||
|                 "elements.c": [np.nan, np.nan, 3.0], | ||||
|             } | ||||
|         ) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
| @ -0,0 +1,2188 @@ lib/python3.11/site-packages/pandas/tests/io/json/test_pandas.py (new file; diff suppressed because it is too large) | ||||
							| @ -0,0 +1,543 @@ | ||||
| from collections.abc import Iterator | ||||
| from io import StringIO | ||||
| from pathlib import Path | ||||
|  | ||||
| import numpy as np | ||||
| import pytest | ||||
|  | ||||
| import pandas as pd | ||||
| from pandas import ( | ||||
|     DataFrame, | ||||
|     read_json, | ||||
| ) | ||||
| import pandas._testing as tm | ||||
|  | ||||
| from pandas.io.json._json import JsonReader | ||||
|  | ||||
| pytestmark = pytest.mark.filterwarnings( | ||||
|     "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" | ||||
| ) | ||||
|  | ||||
|  | ||||
| @pytest.fixture | ||||
| def lines_json_df(): | ||||
|     df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) | ||||
|     return df.to_json(lines=True, orient="records") | ||||
|  | ||||
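| # With the trailing newline, the fixture above should serialize to (sketch): | ||||
| #   {"A":1,"B":4} | ||||
| #   {"A":2,"B":5} | ||||
| #   {"A":3,"B":6} | ||||
|  | ||||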
|  | ||||
| @pytest.fixture(params=["ujson", "pyarrow"]) | ||||
| def engine(request): | ||||
|     if request.param == "pyarrow": | ||||
|         pytest.importorskip("pyarrow.json") | ||||
|     return request.param | ||||
|  | ||||
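| # The "pyarrow" param is skipped automatically when the optional pyarrow.json | ||||
| # module is unavailable, so the rest of the suite still runs on ujson alone. | ||||
|  | ||||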
|  | ||||
| def test_read_jsonl(): | ||||
|     # GH9180 | ||||
|     result = read_json(StringIO('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n'), lines=True) | ||||
|     expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_read_jsonl_engine_pyarrow(datapath, engine): | ||||
|     result = read_json( | ||||
|         datapath("io", "json", "data", "line_delimited.json"), | ||||
|         lines=True, | ||||
|         engine=engine, | ||||
|     ) | ||||
|     expected = DataFrame({"a": [1, 3, 5], "b": [2, 4, 6]}) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
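| # Hedged usage note: the pyarrow engine only accepts a path-like input with | ||||
| # line-delimited JSON, e.g. read_json("data.jsonl", lines=True, | ||||
| # engine="pyarrow"); most non-path call shapes below are xfailed for this. | ||||
|  | ||||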
|  | ||||
| def test_read_datetime(request, engine): | ||||
|     # GH33787 | ||||
|     if engine == "pyarrow": | ||||
|         # GH 48893 | ||||
|         reason = "Pyarrow only supports a file path as an input and line delimited json" | ||||
|         request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError)) | ||||
|  | ||||
|     df = DataFrame( | ||||
|         [([1, 2], ["2020-03-05", "2020-04-08T09:58:49+00:00"], "hector")], | ||||
|         columns=["accounts", "date", "name"], | ||||
|     ) | ||||
|     json_line = df.to_json(lines=True, orient="records") | ||||
|  | ||||
|     if engine == "pyarrow": | ||||
|         result = read_json(StringIO(json_line), engine=engine) | ||||
|     else: | ||||
|         result = read_json(StringIO(json_line), engine=engine) | ||||
|     expected = DataFrame( | ||||
|         [[1, "2020-03-05", "hector"], [2, "2020-04-08T09:58:49+00:00", "hector"]], | ||||
|         columns=["accounts", "date", "name"], | ||||
|     ) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_read_jsonl_unicode_chars(): | ||||
|     # GH15132: non-ascii unicode characters | ||||
|     # \u201d == RIGHT DOUBLE QUOTATION MARK | ||||
|  | ||||
|     # simulate file handle | ||||
|     json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' | ||||
|     json = StringIO(json) | ||||
|     result = read_json(json, lines=True) | ||||
|     expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"]) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|     # simulate string | ||||
|     json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' | ||||
|     result = read_json(StringIO(json), lines=True) | ||||
|     expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"]) | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_to_jsonl(): | ||||
|     # GH9180 | ||||
|     df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) | ||||
|     result = df.to_json(orient="records", lines=True) | ||||
|     expected = '{"a":1,"b":2}\n{"a":1,"b":2}\n' | ||||
|     assert result == expected | ||||
|  | ||||
|     df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=["a", "b"]) | ||||
|     result = df.to_json(orient="records", lines=True) | ||||
|     expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n' | ||||
|     assert result == expected | ||||
|     tm.assert_frame_equal(read_json(StringIO(result), lines=True), df) | ||||
|  | ||||
|     # GH15096: escaped characters in columns and data | ||||
|     df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"]) | ||||
|     result = df.to_json(orient="records", lines=True) | ||||
|     expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n' | ||||
|     assert result == expected | ||||
|     tm.assert_frame_equal(read_json(StringIO(result), lines=True), df) | ||||
|  | ||||
|  | ||||
| def test_to_jsonl_count_new_lines(): | ||||
|     # GH36888 | ||||
|     df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) | ||||
|     actual_new_lines_count = df.to_json(orient="records", lines=True).count("\n") | ||||
|     expected_new_lines_count = 2 | ||||
|     assert actual_new_lines_count == expected_new_lines_count | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("chunksize", [1, 1.0]) | ||||
| def test_readjson_chunks(request, lines_json_df, chunksize, engine): | ||||
|     # Basic test that read_json(chunks=True) gives the same result as | ||||
|     # read_json(chunks=False) | ||||
|     # GH17048: memory usage when lines=True | ||||
|  | ||||
|     if engine == "pyarrow": | ||||
|         # GH 48893 | ||||
|         reason = ( | ||||
|             "Pyarrow only supports a file path as an input and line delimited json" | ||||
|             "and doesn't support chunksize parameter." | ||||
|         ) | ||||
|         request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError)) | ||||
|  | ||||
|     unchunked = read_json(StringIO(lines_json_df), lines=True) | ||||
|     with read_json( | ||||
|         StringIO(lines_json_df), lines=True, chunksize=chunksize, engine=engine | ||||
|     ) as reader: | ||||
|         chunked = pd.concat(reader) | ||||
|  | ||||
|     tm.assert_frame_equal(chunked, unchunked) | ||||
|  | ||||
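| # Usage sketch (illustrative, not part of the tests): with chunksize set, | ||||
| # read_json returns a JsonReader context manager yielding DataFrames lazily: | ||||
| # | ||||
| #   with read_json(StringIO(lines_json_df), lines=True, chunksize=1) as reader: | ||||
| #       for chunk in reader: | ||||
| #           ...  # each chunk here is a one-row DataFrame | ||||
|  | ||||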
|  | ||||
| def test_readjson_chunksize_requires_lines(lines_json_df, engine): | ||||
|     msg = "chunksize can only be passed if lines=True" | ||||
|     with pytest.raises(ValueError, match=msg): | ||||
|         with read_json( | ||||
|             StringIO(lines_json_df), lines=False, chunksize=2, engine=engine | ||||
|         ) as _: | ||||
|             pass | ||||
|  | ||||
|  | ||||
| def test_readjson_chunks_series(request, engine): | ||||
|     if engine == "pyarrow": | ||||
|         # GH 48893 | ||||
|         reason = ( | ||||
|             "Pyarrow only supports a file path as an input and line delimited json" | ||||
|             "and doesn't support chunksize parameter." | ||||
|         ) | ||||
|         request.applymarker(pytest.mark.xfail(reason=reason)) | ||||
|  | ||||
|     # Test reading line-format JSON to Series with chunksize param | ||||
|     s = pd.Series({"A": 1, "B": 2}) | ||||
|  | ||||
|     strio = StringIO(s.to_json(lines=True, orient="records")) | ||||
|     unchunked = read_json(strio, lines=True, typ="Series", engine=engine) | ||||
|  | ||||
|     strio = StringIO(s.to_json(lines=True, orient="records")) | ||||
|     with read_json( | ||||
|         strio, lines=True, typ="Series", chunksize=1, engine=engine | ||||
|     ) as reader: | ||||
|         chunked = pd.concat(reader) | ||||
|  | ||||
|     tm.assert_series_equal(chunked, unchunked) | ||||
|  | ||||
|  | ||||
| def test_readjson_each_chunk(request, lines_json_df, engine): | ||||
|     if engine == "pyarrow": | ||||
|         # GH 48893 | ||||
|         reason = ( | ||||
|             "Pyarrow only supports a file path as an input and line delimited json" | ||||
|             "and doesn't support chunksize parameter." | ||||
|         ) | ||||
|         request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError)) | ||||
|  | ||||
|     # Other tests check that the final result of read_json(chunksize=True) | ||||
|     # is correct. This checks the intermediate chunks. | ||||
|     with read_json( | ||||
|         StringIO(lines_json_df), lines=True, chunksize=2, engine=engine | ||||
|     ) as reader: | ||||
|         chunks = list(reader) | ||||
|     assert chunks[0].shape == (2, 2) | ||||
|     assert chunks[1].shape == (1, 2) | ||||
|  | ||||
|  | ||||
| def test_readjson_chunks_from_file(request, engine): | ||||
|     if engine == "pyarrow": | ||||
|         # GH 48893 | ||||
|         reason = ( | ||||
|             "Pyarrow only supports a file path as an input and line delimited json" | ||||
|             "and doesn't support chunksize parameter." | ||||
|         ) | ||||
|         request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError)) | ||||
|  | ||||
|     with tm.ensure_clean("test.json") as path: | ||||
|         df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) | ||||
|         df.to_json(path, lines=True, orient="records") | ||||
|         with read_json(path, lines=True, chunksize=1, engine=engine) as reader: | ||||
|             chunked = pd.concat(reader) | ||||
|         unchunked = read_json(path, lines=True, engine=engine) | ||||
|         tm.assert_frame_equal(unchunked, chunked) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("chunksize", [None, 1]) | ||||
| def test_readjson_chunks_closes(chunksize): | ||||
|     with tm.ensure_clean("test.json") as path: | ||||
|         df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) | ||||
|         df.to_json(path, lines=True, orient="records") | ||||
|         reader = JsonReader( | ||||
|             path, | ||||
|             orient=None, | ||||
|             typ="frame", | ||||
|             dtype=True, | ||||
|             convert_axes=True, | ||||
|             convert_dates=True, | ||||
|             keep_default_dates=True, | ||||
|             precise_float=False, | ||||
|             date_unit=None, | ||||
|             encoding=None, | ||||
|             lines=True, | ||||
|             chunksize=chunksize, | ||||
|             compression=None, | ||||
|             nrows=None, | ||||
|         ) | ||||
|         with reader: | ||||
|             reader.read() | ||||
|         assert ( | ||||
|             reader.handles.handle.closed | ||||
|         ), f"didn't close stream with chunksize = {chunksize}" | ||||
|  | ||||
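| # Note: JsonReader is constructed directly above only to reach its private | ||||
| # handles attribute; ordinary callers should go through read_json instead. | ||||
|  | ||||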
|  | ||||
| @pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"]) | ||||
| def test_readjson_invalid_chunksize(lines_json_df, chunksize, engine): | ||||
|     msg = r"'chunksize' must be an integer >=1" | ||||
|  | ||||
|     with pytest.raises(ValueError, match=msg): | ||||
|         with read_json( | ||||
|             StringIO(lines_json_df), lines=True, chunksize=chunksize, engine=engine | ||||
|         ) as _: | ||||
|             pass | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("chunksize", [None, 1, 2]) | ||||
| def test_readjson_chunks_multiple_empty_lines(chunksize): | ||||
|     j = """ | ||||
|  | ||||
|     {"A":1,"B":4} | ||||
|  | ||||
|  | ||||
|  | ||||
|     {"A":2,"B":5} | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|     {"A":3,"B":6} | ||||
|     """ | ||||
|     orig = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) | ||||
|     test = read_json(StringIO(j), lines=True, chunksize=chunksize) | ||||
|     if chunksize is not None: | ||||
|         with test: | ||||
|             test = pd.concat(test) | ||||
|     tm.assert_frame_equal(orig, test, obj=f"chunksize: {chunksize}") | ||||
|  | ||||
|  | ||||
| def test_readjson_unicode(request, monkeypatch, engine): | ||||
|     if engine == "pyarrow": | ||||
|         # GH 48893 | ||||
|         reason = ( | ||||
|             "Pyarrow only supports a file path as an input and line delimited json" | ||||
|             "and doesn't support chunksize parameter." | ||||
|         ) | ||||
|         request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError)) | ||||
|  | ||||
|     with tm.ensure_clean("test.json") as path: | ||||
|         monkeypatch.setattr("locale.getpreferredencoding", lambda do_setlocale: "cp949") | ||||
|         with open(path, "w", encoding="utf-8") as f: | ||||
|             f.write('{"£©µÀÆÖÞßéöÿ":["АБВГДабвгд가"]}') | ||||
|  | ||||
|         result = read_json(path, engine=engine) | ||||
|         expected = DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]}) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("nrows", [1, 2]) | ||||
| def test_readjson_nrows(nrows, engine): | ||||
|     # GH 33916 | ||||
|     # Test reading line-format JSON to Series with nrows param | ||||
|     jsonl = """{"a": 1, "b": 2} | ||||
|         {"a": 3, "b": 4} | ||||
|         {"a": 5, "b": 6} | ||||
|         {"a": 7, "b": 8}""" | ||||
|     result = read_json(StringIO(jsonl), lines=True, nrows=nrows) | ||||
|     expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows] | ||||
|     tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)]) | ||||
| def test_readjson_nrows_chunks(request, nrows, chunksize, engine): | ||||
|     # GH 33916 | ||||
|     # Test reading line-format JSON to Series with nrows and chunksize param | ||||
|     if engine == "pyarrow": | ||||
|         # GH 48893 | ||||
|         reason = ( | ||||
|             "Pyarrow only supports a file path as an input and line delimited json" | ||||
|             "and doesn't support chunksize parameter." | ||||
|         ) | ||||
|         request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError)) | ||||
|  | ||||
|     jsonl = """{"a": 1, "b": 2} | ||||
|         {"a": 3, "b": 4} | ||||
|         {"a": 5, "b": 6} | ||||
|         {"a": 7, "b": 8}""" | ||||
|  | ||||
|     if engine != "pyarrow": | ||||
|         with read_json( | ||||
|             StringIO(jsonl), lines=True, nrows=nrows, chunksize=chunksize, engine=engine | ||||
|         ) as reader: | ||||
|             chunked = pd.concat(reader) | ||||
|     else: | ||||
|         with read_json( | ||||
|             jsonl, lines=True, nrows=nrows, chunksize=chunksize, engine=engine | ||||
|         ) as reader: | ||||
|             chunked = pd.concat(reader) | ||||
|     expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows] | ||||
|     tm.assert_frame_equal(chunked, expected) | ||||
|  | ||||
|  | ||||
| def test_readjson_nrows_requires_lines(engine): | ||||
|     # GH 33916 | ||||
|     # Test ValueError raised if nrows is set without setting lines in read_json | ||||
|     jsonl = """{"a": 1, "b": 2} | ||||
|         {"a": 3, "b": 4} | ||||
|         {"a": 5, "b": 6} | ||||
|         {"a": 7, "b": 8}""" | ||||
|     msg = "nrows can only be passed if lines=True" | ||||
|     with pytest.raises(ValueError, match=msg): | ||||
|         read_json(jsonl, lines=False, nrows=2, engine=engine) | ||||
|  | ||||
|  | ||||
| def test_readjson_lines_chunks_fileurl(request, datapath, engine): | ||||
|     # GH 27135 | ||||
|     # Test reading line-format JSON from file url | ||||
|     if engine == "pyarrow": | ||||
|         # GH 48893 | ||||
|         reason = ( | ||||
|             "Pyarrow only supports a file path as an input and line delimited json" | ||||
|             "and doesn't support chunksize parameter." | ||||
|         ) | ||||
|         request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError)) | ||||
|  | ||||
|     df_list_expected = [ | ||||
|         DataFrame([[1, 2]], columns=["a", "b"], index=[0]), | ||||
|         DataFrame([[3, 4]], columns=["a", "b"], index=[1]), | ||||
|         DataFrame([[5, 6]], columns=["a", "b"], index=[2]), | ||||
|     ] | ||||
|     os_path = datapath("io", "json", "data", "line_delimited.json") | ||||
|     file_url = Path(os_path).as_uri() | ||||
|     with read_json(file_url, lines=True, chunksize=1, engine=engine) as url_reader: | ||||
|         for index, chunk in enumerate(url_reader): | ||||
|             tm.assert_frame_equal(chunk, df_list_expected[index]) | ||||
|  | ||||
|  | ||||
| def test_chunksize_is_incremental(): | ||||
|     # See https://github.com/pandas-dev/pandas/issues/34548 | ||||
|     jsonl = ( | ||||
|         """{"a": 1, "b": 2} | ||||
|         {"a": 3, "b": 4} | ||||
|         {"a": 5, "b": 6} | ||||
|         {"a": 7, "b": 8}\n""" | ||||
|         * 1000 | ||||
|     ) | ||||
|  | ||||
|     class MyReader: | ||||
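|         # File-like wrapper that counts how often pandas touches the | ||||
|         # underlying stream, whether via read() or via iteration. | ||||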
|         def __init__(self, contents) -> None: | ||||
|             self.read_count = 0 | ||||
|             self.stringio = StringIO(contents) | ||||
|  | ||||
|         def read(self, *args): | ||||
|             self.read_count += 1 | ||||
|             return self.stringio.read(*args) | ||||
|  | ||||
|         def __iter__(self) -> Iterator: | ||||
|             self.read_count += 1 | ||||
|             return iter(self.stringio) | ||||
|  | ||||
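|     # 4000 input lines with chunksize=100 should yield ~40 chunks; the | ||||
|     # asserts below check that the stream is consumed through many small | ||||
|     # read() calls rather than a single up-front slurp. | ||||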
|     reader = MyReader(jsonl) | ||||
|     assert len(list(read_json(reader, lines=True, chunksize=100))) > 1 | ||||
|     assert reader.read_count > 10 | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("orient_", ["split", "index", "table"]) | ||||
| def test_to_json_append_orient(orient_): | ||||
|     # GH 35849 | ||||
|     # Test ValueError when orient is not 'records' | ||||
|     df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) | ||||
|     msg = ( | ||||
|         r"mode='a' \(append\) is only supported when " | ||||
|         "lines is True and orient is 'records'" | ||||
|     ) | ||||
|     with pytest.raises(ValueError, match=msg): | ||||
|         df.to_json(mode="a", orient=orient_) | ||||
|  | ||||
|  | ||||
| def test_to_json_append_lines(): | ||||
|     # GH 35849 | ||||
|     # Test ValueError when lines is not True | ||||
|     df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) | ||||
|     msg = ( | ||||
|         r"mode='a' \(append\) is only supported when " | ||||
|         "lines is True and orient is 'records'" | ||||
|     ) | ||||
|     with pytest.raises(ValueError, match=msg): | ||||
|         df.to_json(mode="a", lines=False, orient="records") | ||||
|  | ||||
|  | ||||
| @pytest.mark.parametrize("mode_", ["r", "x"]) | ||||
| def test_to_json_append_mode(mode_): | ||||
|     # GH 35849 | ||||
|     # Test ValueError when mode is not supported option | ||||
|     df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) | ||||
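|     # Note: no space after the period; the concatenated pattern must | ||||
|     # mirror the raised message verbatim for the regex match to succeed. | ||||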
|     msg = ( | ||||
|         f"mode={mode_} is not a valid option." | ||||
|         "Only 'w' and 'a' are currently supported." | ||||
|     ) | ||||
|     with pytest.raises(ValueError, match=msg): | ||||
|         df.to_json(mode=mode_, lines=False, orient="records") | ||||
|  | ||||
|  | ||||
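| # The only append form to_json supports is line-delimited records, i.e. | ||||
| # df.to_json(path, mode="a", lines=True, orient="records"); the tests | ||||
| # below round-trip files built that way. | ||||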
| def test_to_json_append_output_consistent_columns(): | ||||
|     # GH 35849 | ||||
|     # Testing that resulting output reads in as expected. | ||||
|     # Testing same columns, new rows | ||||
|     df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) | ||||
|     df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]}) | ||||
|  | ||||
|     expected = DataFrame({"col1": [1, 2, 3, 4], "col2": ["a", "b", "c", "d"]}) | ||||
|     with tm.ensure_clean("test.json") as path: | ||||
|         # Save dataframes to the same file | ||||
|         df1.to_json(path, lines=True, orient="records") | ||||
|         df2.to_json(path, mode="a", lines=True, orient="records") | ||||
|  | ||||
|         # Read path file | ||||
|         result = read_json(path, lines=True) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_to_json_append_output_inconsistent_columns(): | ||||
|     # GH 35849 | ||||
|     # Testing that resulting output reads in as expected. | ||||
|     # Testing one new column, one old column, new rows | ||||
|     df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) | ||||
|     df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]}) | ||||
|  | ||||
|     expected = DataFrame( | ||||
|         { | ||||
|             "col1": [1, 2, None, None], | ||||
|             "col2": ["a", "b", "e", "f"], | ||||
|             "col3": [np.nan, np.nan, "!", "#"], | ||||
|         } | ||||
|     ) | ||||
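|     # Rows from a frame that lacked a column read back with nulls in that | ||||
|     # column; with missing values present, int col1 is promoted to float. | ||||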
|     with tm.ensure_clean("test.json") as path: | ||||
|         # Save dataframes to the same file | ||||
|         df1.to_json(path, mode="a", lines=True, orient="records") | ||||
|         df3.to_json(path, mode="a", lines=True, orient="records") | ||||
|  | ||||
|         # Read path file | ||||
|         result = read_json(path, lines=True) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_to_json_append_output_different_columns(): | ||||
|     # GH 35849 | ||||
|     # Testing that resulting output reads in as expected. | ||||
|     # Testing same, differing and new columns | ||||
|     df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) | ||||
|     df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]}) | ||||
|     df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]}) | ||||
|     df4 = DataFrame({"col4": [True, False]}) | ||||
|  | ||||
|     expected = DataFrame( | ||||
|         { | ||||
|             "col1": [1, 2, 3, 4, None, None, None, None], | ||||
|             "col2": ["a", "b", "c", "d", "e", "f", np.nan, np.nan], | ||||
|             "col3": [np.nan, np.nan, np.nan, np.nan, "!", "#", np.nan, np.nan], | ||||
|             "col4": [None, None, None, None, None, None, True, False], | ||||
|         } | ||||
|     ).astype({"col4": "float"}) | ||||
|     with tm.ensure_clean("test.json") as path: | ||||
|         # Save dataframes to the same file | ||||
|         df1.to_json(path, mode="a", lines=True, orient="records") | ||||
|         df2.to_json(path, mode="a", lines=True, orient="records") | ||||
|         df3.to_json(path, mode="a", lines=True, orient="records") | ||||
|         df4.to_json(path, mode="a", lines=True, orient="records") | ||||
|  | ||||
|         # Read path file | ||||
|         result = read_json(path, lines=True) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
|  | ||||
|  | ||||
| def test_to_json_append_output_different_columns_reordered(): | ||||
|     # GH 35849 | ||||
|     # Testing that resulting output reads in as expected. | ||||
|     # Testing specific result column order. | ||||
|     df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) | ||||
|     df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]}) | ||||
|     df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]}) | ||||
|     df4 = DataFrame({"col4": [True, False]}) | ||||
|  | ||||
|     # df4, df3, df2, df1 (in that order) | ||||
|     expected = DataFrame( | ||||
|         { | ||||
|             "col4": [True, False, None, None, None, None, None, None], | ||||
|             "col2": [np.nan, np.nan, "e", "f", "c", "d", "a", "b"], | ||||
|             "col3": [np.nan, np.nan, "!", "#", np.nan, np.nan, np.nan, np.nan], | ||||
|             "col1": [None, None, None, None, 3, 4, 1, 2], | ||||
|         } | ||||
|     ).astype({"col4": "float"}) | ||||
|     with tm.ensure_clean("test.json") as path: | ||||
|         # Save dataframes to the same file | ||||
|         df4.to_json(path, mode="a", lines=True, orient="records") | ||||
|         df3.to_json(path, mode="a", lines=True, orient="records") | ||||
|         df2.to_json(path, mode="a", lines=True, orient="records") | ||||
|         df1.to_json(path, mode="a", lines=True, orient="records") | ||||
|  | ||||
|         # Read path file | ||||
|         result = read_json(path, lines=True) | ||||
|         tm.assert_frame_equal(result, expected) | ||||
							
								
								
									
lib/python3.11/site-packages/pandas/tests/io/json/test_ujson.py (new file, 1087 lines)
File diff suppressed because it is too large.