Skip to content

Commit 6b9e93a

Browse files
authored
ENH: Enabling parsing ulonglong from json (#44770)
1 parent 325e4b6 commit 6b9e93a

File tree

6 files changed: +44 additions, -37 deletions

doc/source/whatsnew/v1.4.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -229,6 +229,7 @@ Other enhancements
229229
- :meth:`Series.info` has been added, for compatibility with :meth:`DataFrame.info` (:issue:`5167`)
230230
- Implemented :meth:`IntervalArray.min`, :meth:`IntervalArray.max`, as a result of which ``min`` and ``max`` now work for :class:`IntervalIndex`, :class:`Series` and :class:`DataFrame` with ``IntervalDtype`` (:issue:`44746`)
231231
- :meth:`UInt64Index.map` now retains ``dtype`` where possible (:issue:`44609`)
232+
- :meth:`read_json` can now parse unsigned long long integers (:issue:`26068`)
232233
-
233234

234235

pandas/_libs/src/ujson/lib/ultrajson.h

+1
Original file line number | Diff line number | Diff line change
@@ -297,6 +297,7 @@ typedef struct __JSONObjectDecoder {
297297
JSOBJ (*endArray)(void *prv, JSOBJ obj);
298298
JSOBJ (*newInt)(void *prv, JSINT32 value);
299299
JSOBJ (*newLong)(void *prv, JSINT64 value);
300+
JSOBJ (*newUnsignedLong)(void *prv, JSUINT64 value);
300301
JSOBJ (*newDouble)(void *prv, double value);
301302
void (*releaseObject)(void *prv, JSOBJ obj, void *decoder);
302303
JSPFN_MALLOC malloc;

pandas/_libs/src/ujson/lib/ultrajsondec.c

+15-15
Original file line number | Diff line number | Diff line change
@@ -116,8 +116,8 @@ JSOBJ FASTCALL_MSVC decodePreciseFloat(struct DecoderState *ds) {
116116

117117
JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) {
118118
int intNeg = 1;
119-
int mantSize = 0;
120119
JSUINT64 intValue;
120+
JSUINT64 prevIntValue;
121121
int chr;
122122
int decimalCount = 0;
123123
double frcValue = 0.0;
@@ -134,10 +134,10 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) {
134134
} else if (*(offset) == '-') {
135135
offset++;
136136
intNeg = -1;
137+
overflowLimit = LLONG_MIN;
137138
if (*(offset) == 'I') {
138139
goto DECODE_INF;
139140
}
140-
overflowLimit = LLONG_MIN;
141141
}
142142

143143
// Scan integer part
@@ -157,19 +157,18 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) {
157157
case '7':
158158
case '8':
159159
case '9': {
160-
// FIXME: Check for arithmetic overflow here
161-
// PERF: Don't do 64-bit arithmetic here unless we know we have
162-
// to
163-
intValue = intValue * 10ULL + (JSLONG)(chr - 48);
164-
165-
if (intValue > overflowLimit) {
166-
return SetError(ds, -1, overflowLimit == LLONG_MAX
167-
? "Value is too big"
168-
: "Value is too small");
160+
// PERF: Don't do 64-bit arithmetic here unless we have to
161+
prevIntValue = intValue;
162+
intValue = intValue * 10ULL + (JSLONG) (chr - 48);
163+
164+
if (intNeg == 1 && prevIntValue > intValue) {
165+
return SetError(ds, -1, "Value is too big!");
166+
} else if (intNeg == -1 && intValue > overflowLimit) {
167+
return SetError(ds, -1, overflowLimit == LLONG_MAX ?
168+
"Value is too big!" : "Value is too small");
169169
}
170170

171171
offset++;
172-
mantSize++;
173172
break;
174173
}
175174
case '.': {
@@ -196,11 +195,12 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) {
196195
ds->lastType = JT_INT;
197196
ds->start = offset;
198197

199-
if ((intValue >> 31)) {
198+
if (intNeg == 1 && (intValue & 0x8000000000000000ULL) != 0)
199+
return ds->dec->newUnsignedLong(ds->prv, intValue);
200+
else if ((intValue >> 31))
200201
return ds->dec->newLong(ds->prv, (JSINT64)(intValue * (JSINT64)intNeg));
201-
} else {
202+
else
202203
return ds->dec->newInt(ds->prv, (JSINT32)(intValue * intNeg));
203-
}
204204

205205
DECODE_FRACTION:
206206

pandas/_libs/src/ujson/python/JSONtoObj.c

+6-1
Original file line number | Diff line number | Diff line change
@@ -479,6 +479,10 @@ JSOBJ Object_newLong(void *prv, JSINT64 value) {
479479
return PyLong_FromLongLong(value);
480480
}
481481

482+
JSOBJ Object_newUnsignedLong(void *prv, JSUINT64 value) {
483+
return PyLong_FromUnsignedLongLong(value);
484+
}
485+
482486
JSOBJ Object_newDouble(void *prv, double value) {
483487
return PyFloat_FromDouble(value);
484488
}
@@ -508,7 +512,8 @@ PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) {
508512
Object_newTrue, Object_newFalse, Object_newNull,
509513
Object_newPosInf, Object_newNegInf, Object_newObject,
510514
Object_endObject, Object_newArray, Object_endArray,
511-
Object_newInteger, Object_newLong, Object_newDouble,
515+
Object_newInteger, Object_newLong, Object_newUnsignedLong,
516+
Object_newDouble,
512517
Object_releaseObject, PyObject_Malloc, PyObject_Free,
513518
PyObject_Realloc};
514519

pandas/tests/io/json/test_pandas.py

+2-5
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,6 @@
2222
DatetimeIndex,
2323
Series,
2424
Timestamp,
25-
compat,
2625
read_json,
2726
)
2827
import pandas._testing as tm
@@ -1275,11 +1274,9 @@ def test_to_json_large_numbers(self, bigNum):
12751274
expected = '{"0":{"articleId":' + str(bigNum) + "}}"
12761275
assert json == expected
12771276

1278-
@pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)])
1279-
@pytest.mark.skipif(not compat.IS64, reason="GH-35279")
1277+
@pytest.mark.parametrize("bigNum", [-(2 ** 63) - 1, 2 ** 64])
12801278
def test_read_json_large_numbers(self, bigNum):
1281-
# GH20599
1282-
1279+
# GH20599, 26068
12831280
json = StringIO('{"articleId":' + str(bigNum) + "}")
12841281
msg = r"Value is too small|Value is too big"
12851282
with pytest.raises(ValueError, match=msg):

pandas/tests/io/json/test_ujson.py

+19-16
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@
55
import locale
66
import math
77
import re
8-
import sys
98
import time
109

1110
import dateutil
@@ -599,24 +598,23 @@ def test_encode_list_long_conversion(self):
599598
np.array(long_input), ujson.decode(output, numpy=True, dtype=np.int64)
600599
)
601600

602-
def test_encode_long_conversion(self):
603-
long_input = 9223372036854775807
601+
@pytest.mark.parametrize("long_input", [9223372036854775807, 18446744073709551615])
602+
def test_encode_long_conversion(self, long_input):
604603
output = ujson.encode(long_input)
605604

606605
assert long_input == json.loads(output)
607606
assert output == json.dumps(long_input)
608607
assert long_input == ujson.decode(output)
609608

610-
@pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)])
611-
@pytest.mark.xfail(not IS64, reason="GH-35288")
609+
@pytest.mark.parametrize("bigNum", [2 ** 64, -(2 ** 63) - 1])
612610
def test_dumps_ints_larger_than_maxsize(self, bigNum):
613-
# GH34395
614-
bigNum = sys.maxsize + 1
615611
encoding = ujson.encode(bigNum)
616612
assert str(bigNum) == encoding
617613

618-
# GH20599
619-
with pytest.raises(ValueError, match="Value is too big"):
614+
with pytest.raises(
615+
ValueError,
616+
match="Value is too big|Value is too small",
617+
):
620618
assert ujson.loads(encoding) == bigNum
621619

622620
@pytest.mark.parametrize(
@@ -1162,11 +1160,12 @@ def test_decode_array(self, arr):
11621160
def test_decode_extreme_numbers(self, extreme_num):
11631161
assert extreme_num == ujson.decode(str(extreme_num))
11641162

1165-
@pytest.mark.parametrize(
1166-
"too_extreme_num", ["9223372036854775808", "-90223372036854775809"]
1167-
)
1163+
@pytest.mark.parametrize("too_extreme_num", [f"{2**64}", f"{-2**63-1}"])
11681164
def test_decode_too_extreme_numbers(self, too_extreme_num):
1169-
with pytest.raises(ValueError, match="Value is too big|Value is too small"):
1165+
with pytest.raises(
1166+
ValueError,
1167+
match="Value is too big|Value is too small",
1168+
):
11701169
ujson.decode(too_extreme_num)
11711170

11721171
def test_decode_with_trailing_whitespaces(self):
@@ -1176,9 +1175,13 @@ def test_decode_with_trailing_non_whitespaces(self):
11761175
with pytest.raises(ValueError, match="Trailing data"):
11771176
ujson.decode("{}\n\t a")
11781177

1179-
def test_decode_array_with_big_int(self):
1180-
with pytest.raises(ValueError, match="Value is too big"):
1181-
ujson.loads("[18446098363113800555]")
1178+
@pytest.mark.parametrize("value", [f"{2**64}", f"{-2**63-1}"])
1179+
def test_decode_array_with_big_int(self, value):
1180+
with pytest.raises(
1181+
ValueError,
1182+
match="Value is too big|Value is too small",
1183+
):
1184+
ujson.loads(value)
11821185

11831186
@pytest.mark.parametrize(
11841187
"float_number",

0 commit comments

Comments
 (0)