Skip to content

Fix JSON orient='table' issues with numeric column names #25488

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 12 commits into from
7 changes: 6 additions & 1 deletion pandas/io/json/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ def __init__(self, obj, orient, date_format, double_precision,
else:
self.obj = obj.reset_index(drop=False)
self.date_format = 'iso'
self.orient = 'records'
self.orient = 'values'
self.index = index

def _write(self, obj, orient, double_precision, ensure_ascii,
Expand All @@ -221,6 +221,11 @@ def _write(self, obj, orient, double_precision, ensure_ascii,
ensure_ascii, date_unit,
iso_dates,
default_handler)
# add column names
column_names = dumps(obj.columns)
if len(data) > 2:
column_names = column_names + ','
data = data[0] + column_names + data[1:]
serialized = '{{"schema": {schema}, "data": {data}}}'.format(
schema=dumps(self.schema), data=data)
return serialized
Expand Down
5 changes: 3 additions & 2 deletions pandas/io/json/table_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,8 +296,8 @@ def parse_table_schema(json, precise_float):
pandas.read_json
"""
table = loads(json, precise_float=precise_float)
col_order = [field['name'] for field in table['schema']['fields']]
df = DataFrame(table['data'], columns=col_order)[col_order]
col_order = table['data'][0]
df = DataFrame(table['data'][1:], columns=col_order)[col_order]

dtypes = {field['name']: convert_json_field_to_pandas_type(field)
for field in table['schema']['fields']}
Expand All @@ -322,5 +322,6 @@ def parse_table_schema(json, precise_float):
else:
df.index.names = [None if x.startswith('level_') else x for x in
df.index.names]
df.columns = df.columns.values.tolist()

return df
65 changes: 20 additions & 45 deletions pandas/tests/io/json/test_json_table_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,8 +208,8 @@ def test_build_series(self):

expected = OrderedDict([
('schema', schema),
('data', [OrderedDict([('id', 0), ('a', 1)]),
OrderedDict([('id', 1), ('a', 2)])])])
('data', [['id', 'a'], [0, 1], [1, 2]])
])
assert result == expected

def test_to_json(self):
Expand Down Expand Up @@ -243,32 +243,15 @@ def test_to_json(self):
'fields': fields,
'primaryKey': ['idx'],
}
data = [
OrderedDict([('idx', 0), ('A', 1), ('B', 'a'),
('C', '2016-01-01T00:00:00.000Z'),
('D', 'P0DT1H0M0S'),
('E', 'a'), ('F', 'a'), ('G', 1.),
('H', '2016-01-01T06:00:00.000Z')
]),
OrderedDict([('idx', 1), ('A', 2), ('B', 'b'),
('C', '2016-01-02T00:00:00.000Z'),
('D', 'P0DT1H1M0S'),
('E', 'b'), ('F', 'b'), ('G', 2.),
('H', '2016-01-02T06:00:00.000Z')
]),
OrderedDict([('idx', 2), ('A', 3), ('B', 'c'),
('C', '2016-01-03T00:00:00.000Z'),
('D', 'P0DT1H2M0S'),
('E', 'c'), ('F', 'c'), ('G', 3.),
('H', '2016-01-03T06:00:00.000Z')
]),
OrderedDict([('idx', 3), ('A', 4), ('B', 'c'),
('C', '2016-01-04T00:00:00.000Z'),
('D', 'P0DT1H3M0S'),
('E', 'c'), ('F', 'c'), ('G', 4.),
('H', '2016-01-04T06:00:00.000Z')
]),
]
data = [['idx', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'],
[0, 1, 'a', '2016-01-01T00:00:00.000Z', 'P0DT1H0M0S', 'a', 'a',
1., '2016-01-01T06:00:00.000Z'],
[1, 2, 'b', '2016-01-02T00:00:00.000Z', 'P0DT1H1M0S', 'b', 'b',
2., '2016-01-02T06:00:00.000Z'],
[2, 3, 'c', '2016-01-03T00:00:00.000Z', 'P0DT1H2M0S', 'c', 'c',
3., '2016-01-03T06:00:00.000Z'],
[3, 4, 'c', '2016-01-04T00:00:00.000Z', 'P0DT1H3M0S', 'c', 'c',
4., '2016-01-04T06:00:00.000Z']]
expected = OrderedDict([('schema', schema), ('data', data)])
assert result == expected

Expand All @@ -277,16 +260,14 @@ def test_to_json_float_index(self):
result = data.to_json(orient='table', date_format='iso')
result = json.loads(result, object_pairs_hook=OrderedDict)
result['schema'].pop('pandas_version')

expected = (
OrderedDict([('schema', {
expected = (OrderedDict([
('schema', {
'fields': [{'name': 'index', 'type': 'number'},
{'name': 'values', 'type': 'integer'}],
'primaryKey': ['index']
}),
('data', [OrderedDict([('index', 1.0), ('values', 1)]),
OrderedDict([('index', 2.0), ('values', 1)])])])
)
('data', [['index', 'values'], [1.0, 1], [2.0, 1]])
]))
assert result == expected

def test_to_json_period_index(self):
Expand All @@ -300,10 +281,9 @@ def test_to_json_period_index(self):
{'name': 'values', 'type': 'integer'}]

schema = {'fields': fields, 'primaryKey': ['index']}
data = [OrderedDict([('index', '2015-11-01T00:00:00.000Z'),
('values', 1)]),
OrderedDict([('index', '2016-02-01T00:00:00.000Z'),
('values', 1)])]
data = [['index', 'values'],
['2015-11-01T00:00:00.000Z', 1],
['2016-02-01T00:00:00.000Z', 1]]
expected = OrderedDict([('schema', schema), ('data', data)])
assert result == expected

Expand All @@ -320,10 +300,7 @@ def test_to_json_categorical_index(self):
'ordered': False},
{'name': 'values', 'type': 'integer'}],
'primaryKey': ['index']}),
('data', [
OrderedDict([('index', 'a'),
('values', 1)]),
OrderedDict([('index', 'b'), ('values', 1)])])])
('data', [['index', 'values'], ['a', 1], ['b', 1]])])
)
assert result == expected

Expand Down Expand Up @@ -428,9 +405,7 @@ def test_categorical(self):
expected = OrderedDict([
('schema', {'fields': fields,
'primaryKey': ['idx']}),
('data', [OrderedDict([('idx', 0), ('values', 'a')]),
OrderedDict([('idx', 1), ('values', 'b')]),
OrderedDict([('idx', 2), ('values', 'a')])])])
('data', [['idx', 'values'], [0, 'a'], [1, 'b'], [2, 'a']])])
assert result == expected

@pytest.mark.parametrize('idx,nm,prop', [
Expand Down
14 changes: 4 additions & 10 deletions pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -1204,9 +1204,10 @@ def test_data_frame_size_after_to_json(self):

@pytest.mark.parametrize('index', [None, [1, 2], [1., 2.], ['a', 'b'],
['1', '2'], ['1.', '2.']])
@pytest.mark.parametrize('columns', [['a', 'b'], ['1', '2'], ['1.', '2.']])
@pytest.mark.parametrize('columns', [None, [1, 2], [1., 2.], ['a', 'b'],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I don't know that we want to do this. Is it valid JSON in the table spec to have column names that are non-string?

Understood that you have gotten this to round-trip, but if it violates the Table spec for JSON then I'd rather raise, as commented previously.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@WillAyd as I already commented in #19129, after having tried to start a discussion before doing the PR:

I was suggesting not just "raising a more descriptive ValueError" (sic), but changing the implementation of the JSON serialization for orient='table'.

Could you please tell me where the JSON table spec claims that column names MUST be strings?

I am going to make a longer comment to further justify my PR.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please tell me where the JSON table spec claims that column names MUST be strings?

Not specific to the table spec as much as just JSON itself. See the description of an object here:

https://json.org

Copy link
Contributor Author

@albertvillanova albertvillanova Mar 1, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@WillAyd you are talking about different things:

  • The JSON spec requires that the keys of a JSON object (composed of key-value pairs) be strings
  • but we are talking about column names, not JSON object keys

And my question is: where does the JSON table spec claim that COLUMN NAMES (not JSON object keys) must be strings?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pwalsh can you comment on this? Is the name field in a Field Descriptor expected to be a string?

['1', '2'], ['1.', '2.']])
def test_from_json_to_json_table_index_and_columns(self, index, columns):
# GH25433 GH25435
# GH19129 GH25433 GH25435
expected = DataFrame([[1, 2], [3, 4]], index=index, columns=columns)
dfjson = expected.to_json(orient='table')
result = pd.read_json(dfjson, orient='table')
Expand Down Expand Up @@ -1272,16 +1273,9 @@ def test_index_false_to_json_split(self, data, expected):
def test_index_false_to_json_table(self, data):
# GH 17394
# Testing index=False in to_json with orient='table'

result = data.to_json(orient='table', index=False)
result = json.loads(result)

expected = {
'schema': pd.io.json.build_table_schema(data, index=False),
'data': DataFrame(data).to_dict(orient='records')
}

assert result == expected
assert 'primaryKey' not in result['schema']

@pytest.mark.parametrize('orient', [
'records', 'index', 'columns', 'values'
Expand Down