
Fix JSON orient='table' issues with numeric column names #25488


Closed
wants to merge 12 commits
19 changes: 12 additions & 7 deletions pandas/io/json/json.py
@@ -162,24 +162,24 @@ def _write(self, obj, orient, double_precision, ensure_ascii,


class JSONTableWriter(FrameWriter):
_default_orient = 'records'
_default_orient = 'values'

def __init__(self, obj, orient, date_format, double_precision,
ensure_ascii, date_unit, index, default_handler=None):
"""
Adds a `schema` attribute with the Table Schema, resets
the index (can't do in caller, because the schema inference needs
to know what the index is, forces orient to records, and forces
to know what the index is, forces orient to values, and forces
date_format to 'iso'.
"""
super(JSONTableWriter, self).__init__(
obj, orient, date_format, double_precision, ensure_ascii,
date_unit, index, default_handler=default_handler)

if date_format != 'iso':
msg = ("Trying to write with `orient='table'` and "
"`date_format='{fmt}'`. Table Schema requires dates "
"to be formatted with `date_format='iso'`"
msg = ("Trying to write with orient='table' and "
"date_format='{fmt}'. Table Schema requires dates "
"to be formatted with date_format='iso'"
.format(fmt=date_format))
raise ValueError(msg)

@@ -211,7 +211,7 @@ def __init__(self, obj, orient, date_format, double_precision,
else:
self.obj = obj.reset_index(drop=False)
self.date_format = 'iso'
self.orient = 'records'
self.orient = 'values'
self.index = index

def _write(self, obj, orient, double_precision, ensure_ascii,
@@ -221,7 +221,12 @@ def _write(self, obj, orient, double_precision, ensure_ascii,
ensure_ascii, date_unit,
iso_dates,
default_handler)
serialized = '{{"schema": {schema}, "data": {data}}}'.format(
# add column names
column_names = dumps(obj.columns)
if len(data) > 2:
column_names = column_names + ','
data = data[0] + column_names + data[1:]
serialized = '{{"schema":{schema},"data":{data}}}'.format(
schema=dumps(self.schema), data=data)
return serialized

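For context, the new `_write` serialises the frame as with `orient='values'` and then splices a header row of column names into the front of that JSON array. A minimal stand-alone sketch of that string surgery, using the standard-library `json` module in place of pandas' own serializer:

```python
from json import dumps

# Stand-ins: `data` is what the parent writer produces for orient='values',
# `columns` are the frame's column labels (numeric labels stay JSON numbers).
columns = [0, 1]
data = dumps([[10, 20], [30, 40]])        # '[[10, 20], [30, 40]]'

column_names = dumps(columns)             # '[0, 1]'
if len(data) > 2:                         # more than the empty array '[]' -> needs a comma
    column_names = column_names + ','
data = data[0] + column_names + data[1:]  # splice the header row after the opening '['

print(data)                               # [[0, 1],[10, 20], [30, 40]]
# The writer then wraps this as '{"schema": <table schema>, "data": <data>}'.
```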
23 changes: 16 additions & 7 deletions pandas/io/json/table_schema.py
@@ -255,7 +255,7 @@ def build_table_schema(data, index=True, primary_key=None, version=True):
schema['primaryKey'] = primary_key

if version:
schema['pandas_version'] = '0.20.0'
schema['pandas_version'] = '0.25.0'
return schema


@@ -296,21 +296,28 @@ def parse_table_schema(json, precise_float):
pandas.read_json
"""
table = loads(json, precise_float=precise_float)
col_order = [field['name'] for field in table['schema']['fields']]
df = DataFrame(table['data'], columns=col_order)[col_order]
version = table['schema']['pandas_version']
if version == '0.20.0':
# Each table row is represented by a dict
col_order = [field['name'] for field in table['schema']['fields']]
df = DataFrame(table['data'], columns=col_order)[col_order]
elif version == '0.25.0':
# Each table row is represented by a list
col_order = table['data'][0]
df = DataFrame(table['data'][1:], columns=col_order)[col_order]

dtypes = {field['name']: convert_json_field_to_pandas_type(field)
for field in table['schema']['fields']}

# Cannot directly use as_type with timezone data on object; raise for now
if any(str(x).startswith('datetime64[ns, ') for x in dtypes.values()):
raise NotImplementedError('table="orient" can not yet read timezone '
'data')
raise NotImplementedError("orient='table' can not yet read timezone "
"data")

# No ISO constructor for Timedelta as of yet, so need to raise
if 'timedelta64' in dtypes.values():
raise NotImplementedError('table="orient" can not yet read '
'ISO-formatted Timedelta data')
raise NotImplementedError("orient='table' can not yet read "
"ISO-formatted Timedelta data")

df = df.astype(dtypes)

@@ -322,5 +329,7 @@ def parse_table_schema(json, precise_float):
else:
df.index.names = [None if x.startswith('level_') else x for x in
df.index.names]
# Reset columns dtype
df.columns = df.columns.values.tolist()

return df
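To make the version dispatch above concrete, here is a small sketch (simplified payloads, plain pandas calls rather than the reader internals) of the two `data` layouts `parse_table_schema` now has to accept:

```python
import pandas as pd

# pandas_version '0.20.0': every row is an object keyed by column name
old_data = [{"index": 0, "a": 1}, {"index": 1, "a": 2}]
col_order = ["index", "a"]                       # taken from schema['fields'] in the real reader
df_old = pd.DataFrame(old_data, columns=col_order)[col_order]

# pandas_version '0.25.0': a header row of column names followed by list rows
new_data = [["index", "a"], [0, 1], [1, 2]]
col_order = new_data[0]                          # taken from the leading header row
df_new = pd.DataFrame(new_data[1:], columns=col_order)[col_order]

assert df_old.equals(df_new)                     # both layouts decode to the same frame
```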
65 changes: 20 additions & 45 deletions pandas/tests/io/json/test_json_table_schema.py
@@ -208,8 +208,8 @@ def test_build_series(self):

expected = OrderedDict([
('schema', schema),
('data', [OrderedDict([('id', 0), ('a', 1)]),
OrderedDict([('id', 1), ('a', 2)])])])
('data', [['id', 'a'], [0, 1], [1, 2]])
])
assert result == expected

def test_to_json(self):
@@ -243,32 +243,15 @@ def test_to_json(self):
'fields': fields,
'primaryKey': ['idx'],
}
data = [
OrderedDict([('idx', 0), ('A', 1), ('B', 'a'),
('C', '2016-01-01T00:00:00.000Z'),
('D', 'P0DT1H0M0S'),
('E', 'a'), ('F', 'a'), ('G', 1.),
('H', '2016-01-01T06:00:00.000Z')
]),
OrderedDict([('idx', 1), ('A', 2), ('B', 'b'),
('C', '2016-01-02T00:00:00.000Z'),
('D', 'P0DT1H1M0S'),
('E', 'b'), ('F', 'b'), ('G', 2.),
('H', '2016-01-02T06:00:00.000Z')
]),
OrderedDict([('idx', 2), ('A', 3), ('B', 'c'),
('C', '2016-01-03T00:00:00.000Z'),
('D', 'P0DT1H2M0S'),
('E', 'c'), ('F', 'c'), ('G', 3.),
('H', '2016-01-03T06:00:00.000Z')
]),
OrderedDict([('idx', 3), ('A', 4), ('B', 'c'),
('C', '2016-01-04T00:00:00.000Z'),
('D', 'P0DT1H3M0S'),
('E', 'c'), ('F', 'c'), ('G', 4.),
('H', '2016-01-04T06:00:00.000Z')
]),
]
data = [['idx', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'],
[0, 1, 'a', '2016-01-01T00:00:00.000Z', 'P0DT1H0M0S', 'a', 'a',
1., '2016-01-01T06:00:00.000Z'],
[1, 2, 'b', '2016-01-02T00:00:00.000Z', 'P0DT1H1M0S', 'b', 'b',
2., '2016-01-02T06:00:00.000Z'],
[2, 3, 'c', '2016-01-03T00:00:00.000Z', 'P0DT1H2M0S', 'c', 'c',
3., '2016-01-03T06:00:00.000Z'],
[3, 4, 'c', '2016-01-04T00:00:00.000Z', 'P0DT1H3M0S', 'c', 'c',
4., '2016-01-04T06:00:00.000Z']]
expected = OrderedDict([('schema', schema), ('data', data)])
assert result == expected

@@ -277,16 +260,14 @@ def test_to_json_float_index(self):
result = data.to_json(orient='table', date_format='iso')
result = json.loads(result, object_pairs_hook=OrderedDict)
result['schema'].pop('pandas_version')

expected = (
OrderedDict([('schema', {
expected = (OrderedDict([
('schema', {
'fields': [{'name': 'index', 'type': 'number'},
{'name': 'values', 'type': 'integer'}],
'primaryKey': ['index']
}),
('data', [OrderedDict([('index', 1.0), ('values', 1)]),
OrderedDict([('index', 2.0), ('values', 1)])])])
)
('data', [['index', 'values'], [1.0, 1], [2.0, 1]])
]))
assert result == expected

def test_to_json_period_index(self):
@@ -300,10 +281,9 @@ def test_to_json_period_index(self):
{'name': 'values', 'type': 'integer'}]

schema = {'fields': fields, 'primaryKey': ['index']}
data = [OrderedDict([('index', '2015-11-01T00:00:00.000Z'),
('values', 1)]),
OrderedDict([('index', '2016-02-01T00:00:00.000Z'),
('values', 1)])]
data = [['index', 'values'],
['2015-11-01T00:00:00.000Z', 1],
['2016-02-01T00:00:00.000Z', 1]]
expected = OrderedDict([('schema', schema), ('data', data)])
assert result == expected

@@ -320,10 +300,7 @@ def test_to_json_categorical_index(self):
'ordered': False},
{'name': 'values', 'type': 'integer'}],
'primaryKey': ['index']}),
('data', [
OrderedDict([('index', 'a'),
('values', 1)]),
OrderedDict([('index', 'b'), ('values', 1)])])])
('data', [['index', 'values'], ['a', 1], ['b', 1]])])
)
assert result == expected

@@ -428,9 +405,7 @@ def test_categorical(self):
expected = OrderedDict([
('schema', {'fields': fields,
'primaryKey': ['idx']}),
('data', [OrderedDict([('idx', 0), ('values', 'a')]),
OrderedDict([('idx', 1), ('values', 'b')]),
OrderedDict([('idx', 2), ('values', 'a')])])])
('data', [['idx', 'values'], [0, 'a'], [1, 'b'], [2, 'a']])])
assert result == expected

@pytest.mark.parametrize('idx,nm,prop', [
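Taken together, the updated expectations describe a lossless round trip for numeric labels. A hedged usage sketch, assuming a pandas build with this patch applied (released pandas at the time did not preserve such labels through `orient='table'`):

```python
import pandas as pd
from pandas.util.testing import assert_frame_equal

expected = pd.DataFrame([[1, 2], [3, 4]], index=[1.0, 2.0], columns=[1, 2])

dfjson = expected.to_json(orient='table')   # header row keeps 1 and 2 as JSON numbers
result = pd.read_json(dfjson, orient='table')

assert_frame_equal(result, expected)        # labels come back numeric, not as '1'/'2'
```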
58 changes: 48 additions & 10 deletions pandas/tests/io/json/test_pandas.py
@@ -1197,9 +1197,10 @@ def test_data_frame_size_after_to_json(self):

@pytest.mark.parametrize('index', [None, [1, 2], [1., 2.], ['a', 'b'],
['1', '2'], ['1.', '2.']])
@pytest.mark.parametrize('columns', [['a', 'b'], ['1', '2'], ['1.', '2.']])
@pytest.mark.parametrize('columns', [None, [1, 2], [1., 2.], ['a', 'b'],
Member

So I don't know that we want to do this. Is it valid JSON in the table spec to have column names that are non-string?

Understood you have gotten this to round trip but if it violates the Table spec for JSON then I'd rather raise as commented previously

Contributor Author

@WillAyd as I already commented in #19129, after having tried to start a discussion before doing the PR:

I was suggesting not just "raising a more descriptive ValueError" (sic), but changing the implementation of the JSON serialization for orient='table'.

Could you please tell me where the JSON table spec claims that column names MUST be strings?

I am going to make a longer comment to further justify my PR.

Member

> Could you please tell me where the JSON table spec claims that column names MUST be strings?

Not specific to the table spec as much as just JSON itself. See the description of an object here:

https://json.org

Contributor Author (@albertvillanova, Mar 1, 2019)

@WillAyd you are talking about different things:

  • JSON spec imposes that in a JSON object (composed of key-value pairs), its keys must be strings
  • but we are talking about column names, not JSON object keys

And my question is: where the JSON table spec claims that COLUMN NAMES (not JSON object keys) must be strings?

Contributor

@pwalsh can you comment on this? Is the name field in a Field Descriptor expected to be a string?
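For reference, the distinction being debated can be reproduced with the standard-library `json` module alone: keys of a JSON object are coerced to strings, while elements of a JSON array are not, which is why a header row can carry numeric column names where per-row objects cannot:

```python
import json

print(json.dumps({1: 'x', 2: 'y'}))      # {"1": "x", "2": "y"}  -> object keys become strings
print(json.dumps([[1, 2], ['x', 'y']]))  # [[1, 2], ["x", "y"]]  -> numbers in arrays survive
```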

['1', '2'], ['1.', '2.']])
def test_from_json_to_json_table_index_and_columns(self, index, columns):
# GH25433 GH25435
# GH19129 GH25433 GH25435
expected = DataFrame([[1, 2], [3, 4]], index=index, columns=columns)
dfjson = expected.to_json(orient='table')
result = pd.read_json(dfjson, orient='table')
@@ -1229,6 +1230,50 @@ def test_read_json_table_convert_axes_raises(self):
with pytest.raises(ValueError, match=msg):
pd.read_json(dfjson, orient='table', convert_axes=True)

@pytest.mark.parametrize('index, dfjson', [
(None,
'{"schema":{"fields":[{"name":"index","type":"integer"},{"name":"a",'
'"type":"integer"},{"name":"b","type":"number"},{"name":"c",'
'"type":"string"}],"primaryKey":["index"],"pandas_version":"0.20.0"},'
'"data":[{"index":0,"a":1,"b":3.0,"c":"5"},{"index":1,"a":2,"b":4.0,'
'"c":"6"}]}'),
([1, 2],
'{"schema":{"fields":[{"name":"index","type":"integer"},{"name":"a",'
'"type":"integer"},{"name":"b","type":"number"},{"name":"c",'
'"type":"string"}],"primaryKey":["index"],"pandas_version":"0.20.0"},'
'"data":[{"index":1,"a":1,"b":3.0,"c":"5"},{"index":2,"a":2,"b":4.0,'
'"c":"6"}]}'),
([1., 2.],
'{"schema":{"fields":[{"name":"index","type":"number"},{"name":"a",'
'"type":"integer"},{"name":"b","type":"number"},{"name":"c",'
'"type":"string"}],"primaryKey":["index"],"pandas_version":"0.20.0"},'
'"data":[{"index":1.0,"a":1,"b":3.0,"c":"5"},{"index":2.0,"a":2,'
'"b":4.0,"c":"6"}]}'),
(['a', 'b'],
'{"schema":{"fields":[{"name":"index","type":"string"},{"name":"a",'
'"type":"integer"},{"name":"b","type":"number"},{"name":"c",'
'"type":"string"}],"primaryKey":["index"],"pandas_version":"0.20.0"},'
'"data":[{"index":"a","a":1,"b":3.0,"c":"5"},{"index":"b","a":2,'
'"b":4.0,"c":"6"}]}'),
(['1', '2'],
'{"schema":{"fields":[{"name":"index","type":"string"},{"name":"a",'
'"type":"integer"},{"name":"b","type":"number"},{"name":"c",'
'"type":"string"}],"primaryKey":["index"],"pandas_version":"0.20.0"},'
'"data":[{"index":"1","a":1,"b":3.0,"c":"5"},{"index":"2","a":2,'
'"b":4.0,"c":"6"}]}'),
(['1.', '2.'],
'{"schema":{"fields":[{"name":"index","type":"string"},{"name":"a",'
'"type":"integer"},{"name":"b","type":"number"},{"name":"c",'
'"type":"string"}],"primaryKey":["index"],"pandas_version":"0.20.0"},'
'"data":[{"index":"1.","a":1,"b":3.0,"c":"5"},{"index":"2.","a":2,'
'"b":4.0,"c":"6"}]}')
])
def test_read_json_table_version_0_20_0(self, index, dfjson):
expected = pd.DataFrame([[1, 3., '5'], [2, 4., '6']],
index=index, columns=['a', 'b', 'c'])
result = pd.read_json(dfjson, orient='table')
assert_frame_equal(result, expected)

@pytest.mark.parametrize('data, expected', [
(DataFrame([[1, 2], [4, 5]], columns=['a', 'b']),
{'columns': ['a', 'b'], 'data': [[1, 2], [4, 5]]}),
@@ -1265,16 +1310,9 @@ def test_index_false_to_json_split(self, data, expected):
def test_index_false_to_json_table(self, data):
# GH 17394
# Testing index=False in to_json with orient='table'

result = data.to_json(orient='table', index=False)
result = json.loads(result)

expected = {
'schema': pd.io.json.build_table_schema(data, index=False),
'data': DataFrame(data).to_dict(orient='records')
}

assert result == expected
assert 'primaryKey' not in result['schema']

@pytest.mark.parametrize('orient', [
'records', 'index', 'columns', 'values'