diff --git a/ext/json/ext/json.h b/ext/json/ext/json.h index cf9420d4..a29d37be 100644 --- a/ext/json/ext/json.h +++ b/ext/json/ext/json.h @@ -11,6 +11,9 @@ #if defined(RUBY_DEBUG) && RUBY_DEBUG # define JSON_ASSERT RUBY_ASSERT +# ifndef JSON_DEBUG +# define JSON_DEBUG 1 +# endif #else # ifdef JSON_DEBUG # include @@ -20,8 +23,18 @@ # endif #endif +#ifdef JSON_DEBUG +# define JSON_UNREACHABLE_RETURN(val) rb_bug("Unreachable") +#else +# define JSON_UNREACHABLE_RETURN UNREACHABLE_RETURN +#endif + /* shims */ +#ifndef UNDEF_P +#define UNDEF_P(val) ((val) == Qundef) /* argument parenthesized so compound expressions compare as a whole */ +#endif + #if SIZEOF_UINT64_T == SIZEOF_LONG_LONG # define INT64T2NUM(x) LL2NUM(x) # define UINT64T2NUM(x) ULL2NUM(x) diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index c0631728..44b581e6 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -206,7 +206,7 @@ static rvalue_stack *rvalue_stack_spill(rvalue_stack *old_stack, VALUE *handle, static rvalue_stack *rvalue_stack_grow(rvalue_stack *stack, VALUE *handle, rvalue_stack **stack_ref) { - long required = stack->capa * 2; + long required = stack->capa ? 
stack->capa * 2 : RVALUE_STACK_INITIAL_CAPA; if (stack->type == RVALUE_STACK_STACK_ALLOCATED) { stack = rvalue_stack_spill(stack, handle, stack_ref); @@ -222,8 +222,10 @@ static VALUE rvalue_stack_push(rvalue_stack *stack, VALUE value, VALUE *handle, if (RB_UNLIKELY(stack->head >= stack->capa)) { stack = rvalue_stack_grow(stack, handle, stack_ref); } + stack->ptr[stack->head] = value; stack->head++; + return value; } @@ -372,7 +374,7 @@ typedef struct json_frame_struct { enum json_frame_type type; enum json_frame_phase phase; long value_stack_head; // rvalue_stack->head when this container opened - const char *start_cursor; // object frames only (the '{'); NULL otherwise + size_t start_offset; // object frames only: offset of the '{' from state->start; unused otherwise } json_frame; typedef struct json_frame_stack_struct { @@ -420,7 +422,7 @@ static json_frame_stack *json_frame_stack_spill(json_frame_stack *old_stack, VAL static json_frame_stack *json_frame_stack_grow(json_frame_stack *stack, VALUE *handle, json_frame_stack **stack_ref) { - long required = stack->capa * 2; + long required = stack->capa ? 
stack->capa * 2 : JSON_FRAME_STACK_INITIAL_CAPA; if (stack->type == RVALUE_STACK_STACK_ALLOCATED) { stack = json_frame_stack_spill(stack, handle, stack_ref); @@ -488,7 +490,7 @@ static const rb_data_type_t JSON_Parser_frame_stack_type = { .dfree = json_frame_stack_free, .dsize = json_frame_stack_memsize, }, - .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE, + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE, }; static json_frame_stack *json_frame_stack_spill(json_frame_stack *old_stack, VALUE *handle, json_frame_stack **stack_ref) @@ -640,26 +642,29 @@ static VALUE build_parse_error_message(const char *format, JSON_ParserState *sta return message; } -static VALUE parse_error_new(VALUE message, long line, long column) +static VALUE parse_error_new(VALUE message, long line, long column, bool eos) { VALUE exc = rb_exc_new_str(rb_path2class("JSON::ParserError"), message); rb_ivar_set(exc, rb_intern("@line"), LONG2NUM(line)); rb_ivar_set(exc, rb_intern("@column"), LONG2NUM(column)); + if (eos) { + rb_ivar_set(exc, rb_intern("@eos"), Qtrue); + } return exc; } -NORETURN(static) void raise_parse_error(const char *format, JSON_ParserState *state) +NORETURN(static) void raise_parse_error(const char *format, JSON_ParserState *state, bool eos) { long line, column; cursor_position(state, &line, &column); VALUE message = build_parse_error_message(format, state, line, column); - rb_exc_raise(parse_error_new(message, line, column)); + rb_exc_raise(parse_error_new(message, line, column, eos)); } -NORETURN(static) void raise_parse_error_at(const char *format, JSON_ParserState *state, const char *at) +NORETURN(static) void raise_parse_error_at(const char *format, JSON_ParserState *state, const char *at, bool eos) { state->cursor = at; - raise_parse_error(format, state); + raise_parse_error(format, state, eos); } /* unicode */ @@ -684,7 +689,7 @@ static const signed char digit_values[256] = { static uint32_t 
unescape_unicode(JSON_ParserState *state, const char *sp, const char *spe) { if (RB_UNLIKELY(sp > spe - 4)) { - raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2); + raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2, true); } const unsigned char *p = (const unsigned char *)sp; @@ -695,7 +700,7 @@ static uint32_t unescape_unicode(JSON_ParserState *state, const char *sp, const const signed char b3 = digit_values[p[3]]; if (RB_UNLIKELY((signed char)(b0 | b1 | b2 | b3) < 0)) { - raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2); + raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2, false); } return ((uint32_t)b0 << 12) | ((uint32_t)b1 << 8) | ((uint32_t)b2 << 4) | (uint32_t)b3; @@ -729,7 +734,7 @@ json_eat_comments(JSON_ParserState *state) while (true) { const char *next_match = memchr(state->cursor, '*', state->end - state->cursor); if (!next_match) { - raise_parse_error_at("unterminated comment, expected closing '*/'", state, start); + raise_parse_error_at("unterminated comment, expected closing '*/'", state, start, true); } state->cursor = next_match + 1; @@ -741,7 +746,7 @@ json_eat_comments(JSON_ParserState *state) break; } default: - raise_parse_error_at("unexpected token %s", state, start); + raise_parse_error_at("unexpected token %s", state, start, eos(state)); break; } } @@ -932,13 +937,13 @@ NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_Parser uint32_t sur = unescape_unicode(state, pe + 2, stringEnd); if (RB_UNLIKELY((sur & 0xFC00) != 0xDC00)) { - raise_parse_error_at("invalid surrogate pair at %s", state, p); + raise_parse_error_at("invalid surrogate pair at %s", state, p, false); } ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16) | (sur & 0x3FF)); pe += 5; } else { - raise_parse_error_at("incomplete surrogate pair at %s", state, p); + 
raise_parse_error_at("incomplete surrogate pair at %s", state, p, false); break; } } @@ -948,20 +953,22 @@ NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_Parser p = ++pe; break; } + case 0: + return Qundef; default: if ((unsigned char)*pe < 0x20) { if (!config->allow_control_characters) { if (*pe == '\n') { - raise_parse_error_at("Invalid unescaped newline character (\\n) in string: %s", state, pe - 1); + raise_parse_error_at("Invalid unescaped newline character (\\n) in string: %s", state, pe - 1, false); } - raise_parse_error_at("invalid ASCII control character in string: %s", state, pe - 1); + raise_parse_error_at("invalid ASCII control character in string: %s", state, pe - 1, false); } } if (config->allow_invalid_escape) { APPEND_CHAR(*pe); } else { - raise_parse_error_at("invalid escape character in string: %s", state, pe - 1); + raise_parse_error_at("invalid escape character in string: %s", state, pe - 1, false); } break; } @@ -1116,7 +1123,7 @@ NORETURN(static) void raise_duplicate_key_error(JSON_ParserState *state, VALUE d long line, column; cursor_position(state, &line, &column); rb_str_concat(message, build_parse_error_message("", state, line, column)) ; - rb_exc_raise(parse_error_new(message, line, column)); + rb_exc_raise(parse_error_new(message, line, column, false)); } NOINLINE(static) void json_on_duplicate_key(JSON_ParserState *state, JSON_ParserConfig *config, size_t count, const VALUE *pairs) @@ -1252,7 +1259,7 @@ static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfi } default: if (!config->allow_control_characters) { - raise_parse_error("invalid ASCII control character in string: %s", state); + raise_parse_error("invalid ASCII control character in string: %s", state, false); } break; } @@ -1260,8 +1267,7 @@ static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfi state->cursor++; } while (string_scan(state)); - raise_parse_error("unexpected end of input, expected 
closing \"", state); - return Qfalse; + return Qundef; } ALWAYS_INLINE(static) VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name) @@ -1270,7 +1276,7 @@ ALWAYS_INLINE(static) VALUE json_parse_string(JSON_ParserState *state, JSON_Pars const char *start = state->cursor; if (RB_UNLIKELY(!string_scan(state))) { - raise_parse_error("unexpected end of input, expected closing \"", state); + return Qundef; } VALUE string; @@ -1369,7 +1375,7 @@ static inline VALUE json_parse_number(JSON_ParserState *state, JSON_ParserConfig int mantissa_digits = json_parse_digits(state, &mantissa); if (RB_UNLIKELY((first_digit == '0' && mantissa_digits > 1) || (negative && mantissa_digits == 0))) { - raise_parse_error_at("invalid number: %s", state, start); + return Qundef; } // Parse fractional part @@ -1382,7 +1388,7 @@ static inline VALUE json_parse_number(JSON_ParserState *state, JSON_ParserConfig mantissa_digits += fractional_digits; if (RB_UNLIKELY(!fractional_digits)) { - raise_parse_error_at("invalid number: %s", state, start); + return Qundef; } } @@ -1402,7 +1408,7 @@ static inline VALUE json_parse_number(JSON_ParserState *state, JSON_ParserConfig int exponent_digits = json_parse_digits(state, &abs_exponent); if (RB_UNLIKELY(!exponent_digits)) { - raise_parse_error_at("invalid number: %s", state, start); + return Qundef; } if (RB_UNLIKELY(exponent_digits >= 20 || abs_exponent > (uint64_t)INT64_MAX)) { @@ -1447,16 +1453,16 @@ static inline long json_frame_entry_count(const json_frame *frame, const rvalue_ // moves on to expecting a ',' or its closing bracket. The caller passes the // frame it already has in hand -- the one that was expecting the value -- which // after a container close is the freshly re-exposed parent. 
-static inline void json_value_completed(json_frame *frame) +static inline enum json_frame_phase json_value_completed(json_frame *frame) { JSON_ASSERT((int)JSON_PHASE_DONE == (int)JSON_FRAME_ROOT); JSON_ASSERT((int)JSON_PHASE_ARRAY_COMMA == (int)JSON_FRAME_ARRAY); JSON_ASSERT((int)JSON_PHASE_OBJECT_COMMA == (int)JSON_FRAME_OBJECT); - frame->phase = (enum json_frame_phase) frame->type; + return frame->phase = (enum json_frame_phase) frame->type; } -ALWAYS_INLINE(static) bool json_match_keyword(JSON_ParserState *state, const char *keyword, size_t offset) +ALWAYS_INLINE(static) void json_match_keyword(JSON_ParserState *state, bool resumable, const char *keyword, size_t offset) { // It is assumed that since `keyword` is always a literal, the compiler is able to constantize this // `strlen` and several other computations in that routine, such as eliminating the `if (resumable)` branch. @@ -1467,9 +1473,11 @@ ALWAYS_INLINE(static) bool json_match_keyword(JSON_ParserState *state, const cha // That's why we sometime compare starting from the first byte and sometimes from the second. if (rest(state) >= len && (memcmp(state->cursor + offset, keyword + offset, len - offset) == 0)) { state->cursor += len; - return true; + return; } - return false; + + bool eos = rest(state) < len && memcmp(state->cursor, keyword, rest(state)) == 0; + raise_parse_error("unexpected token %s", state, eos); } // Parse an arbitrary JSON value iteratively. This is a state machine driven @@ -1477,7 +1485,7 @@ ALWAYS_INLINE(static) bool json_match_keyword(JSON_ParserState *state, const cha // resume purely from the frame stack. A JSON_FRAME_ROOT frame sits at the // bottom of the stack, so the stack is never empty mid-parse and the document // itself is just another frame whose value, once parsed, leaves its phase DONE. 
-static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) +ALWAYS_INLINE(static) bool json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config, bool resumable) { json_frame *frame = json_frame_stack_peek(state->frames); @@ -1489,12 +1497,12 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) case JSON_PHASE_OBJECT_KEY: goto JSON_PHASE_OBJECT_KEY; case JSON_PHASE_OBJECT_COLON: goto JSON_PHASE_OBJECT_COLON; } - UNREACHABLE_RETURN(Qundef); + JSON_UNREACHABLE_RETURN(false); JSON_PHASE_DONE: { // The root document value is parsed; it is the lone survivor on // the rvalue stack. - return *rvalue_stack_peek(state->value_stack, 1); + return true; } JSON_PHASE_VALUE: { @@ -1503,68 +1511,110 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) VALUE value; switch (peek(state)) { case 'n': - if (json_match_keyword(state, "null", 0)) { - value = Qnil; - break; - } - raise_parse_error("unexpected token %s", state); + json_match_keyword(state, resumable, "null", 0); + value = Qnil; + break; case 't': - if (json_match_keyword(state, "true", 0)) { - value = Qtrue; - break; - } - raise_parse_error("unexpected token %s", state); + json_match_keyword(state, resumable, "true", 0); + value = Qtrue; + break; case 'f': - if (json_match_keyword(state, "false", 1)) { - value = Qfalse; - break; - } - raise_parse_error("unexpected token %s", state); + json_match_keyword(state, resumable, "false", 1); + value = Qfalse; + break; case 'N': - // Note: memcmp with a small power of two compile to an integer comparison - if (config->allow_nan && json_match_keyword(state, "NaN", 1)) { - value = CNaN; - break; + if (!config->allow_nan) { + raise_parse_error("unexpected token %s", state, false); } - raise_parse_error("unexpected token %s", state); + + json_match_keyword(state, resumable, "NaN", 1); + value = CNaN; + break; case 'I': - if (config->allow_nan && json_match_keyword(state, "Infinity", 0)) { - 
value = CInfinity; - break; + if (!config->allow_nan) { + raise_parse_error("unexpected token %s", state, false); } - raise_parse_error("unexpected token %s", state); + + json_match_keyword(state, resumable, "Infinity", 0); + value = CInfinity; + break; case '-': { + const char *start = state->cursor; state->cursor++; - if (config->allow_nan && json_match_keyword(state, "Infinity", 0)) { - value = CMinusInfinity; - } else { - value = json_parse_negative_number(state, config); + + value = json_parse_negative_number(state, config); + + if (RB_UNLIKELY(UNDEF_P(value))) { + if (config->allow_nan && peek(state) == 'I') { + state->cursor = start; + json_match_keyword(state, resumable, "-Infinity", 1); + value = CMinusInfinity; + break; + } + } + + // Top level numbers are ambiguous when parsing streams, we can't + // know if we parsed all the digits if we hit EOS. + if (resumable && eos(state)) { + state->cursor = start; + return false; + } + + if (RB_UNLIKELY(UNDEF_P(value))) { + raise_parse_error_at("invalid number: %s", state, start, false); } break; } - case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { + const char *start = state->cursor; value = json_parse_positive_number(state, config); + + // Top level numbers are ambiguous when parsing streams, we can't + // know if we parsed all the digits if we hit EOS. 
+ if (resumable && eos(state)) { + state->cursor = start; + return false; // need more input, same as the negative-number path + } + + if (RB_UNLIKELY(UNDEF_P(value))) { + raise_parse_error_at("invalid number: %s", state, start, false); + } break; + } - case '"': + case '"': { // %r{\A"[^"\\\t\n\x00]*(?:\\[bfnrtu\\/"][^"\\]*)*"} + const char *start = state->cursor; value = json_parse_string(state, config, false); + + if (RB_UNLIKELY(UNDEF_P(value))) { + bool is_eos = eos(state); + if (resumable) { + state->cursor = start; + } + raise_parse_error("unexpected end of input, expected closing \"", state, is_eos); + } break; + } case '[': { - state->cursor++; + const char *start = state->cursor++; json_eat_whitespace(state); - if (peek(state) == ']') { + const char next = peek(state); + if (next == ']') { state->cursor++; value = json_decode_array(state, config, 0); break; + } else if (resumable && eos(state)) { + state->cursor = start; + return false; } state->current_nesting++; @@ -1581,6 +1631,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) }); goto JSON_PHASE_VALUE; } + case '{': { const char *object_start_cursor = state->cursor; @@ -1591,6 +1642,9 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) state->cursor++; value = json_decode_object(state, config, 0); break; + } else if (resumable && eos(state)) { + state->cursor = object_start_cursor; + return false; } state->current_nesting++; @@ -1603,16 +1657,16 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) .type = JSON_FRAME_OBJECT, .phase = JSON_PHASE_OBJECT_KEY, .value_stack_head = state->value_stack->head, - .start_cursor = object_start_cursor, + .start_offset = object_start_cursor - state->start, }); goto JSON_PHASE_OBJECT_KEY; } case 0: - raise_parse_error("unexpected end of input", state); + raise_parse_error("unexpected end of input", state, eos(state)); default: - raise_parse_error("unexpected character: %s", state); + 
raise_parse_error("unexpected character: %s", state, false); } json_push_value(state, config, value); @@ -1623,10 +1677,10 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) case JSON_PHASE_ARRAY_COMMA: goto JSON_PHASE_ARRAY_COMMA; case JSON_PHASE_OBJECT_COMMA: goto JSON_PHASE_OBJECT_COMMA; case JSON_PHASE_VALUE: goto JSON_PHASE_VALUE; - case JSON_PHASE_OBJECT_KEY: UNREACHABLE_RETURN(Qundef); + case JSON_PHASE_OBJECT_KEY: JSON_UNREACHABLE_RETURN(false); case JSON_PHASE_OBJECT_COLON: goto JSON_PHASE_OBJECT_COLON; } - UNREACHABLE_RETURN(Qundef); + JSON_UNREACHABLE_RETURN(false); } JSON_PHASE_OBJECT_KEY: { @@ -1634,21 +1688,34 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) json_eat_whitespace(state); + const char *start = state->cursor; + if (RB_LIKELY(peek(state) == '"')) { - json_push_value(state, config, json_parse_string(state, config, true)); + VALUE string = json_parse_string(state, config, true); + if (UNDEF_P(string)) { + if (resumable) { + state->cursor = start; + return false; + } else { + raise_parse_error("unexpected end of input, expected closing \"", state, false); + } + } + json_push_value(state, config, string); frame->phase = JSON_PHASE_OBJECT_COLON; goto JSON_PHASE_OBJECT_COLON; + } else if (resumable && eos(state)) { + return false; } else { // The message differs for the first key vs. a key after a // ',': the first is the only one reached with nothing pushed // for this object yet. 
if (json_frame_entry_count(frame, state->value_stack) == 0) { - raise_parse_error("expected object key, got %s", state); + raise_parse_error("expected object key, got %s", state, false); } else { - raise_parse_error("expected object key, got: %s", state); + raise_parse_error("expected object key, got: %s", state, false); } } - UNREACHABLE_RETURN(Qundef); + JSON_UNREACHABLE_RETURN(false); } JSON_PHASE_OBJECT_COLON: { @@ -1660,16 +1727,18 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) state->cursor++; frame->phase = JSON_PHASE_VALUE; goto JSON_PHASE_VALUE; + } else if (resumable && eos(state)) { + return false; } else { // First colon (only the first pair's key is pushed, nothing // else) vs. a later one. if (json_frame_entry_count(frame, state->value_stack) == 1) { - raise_parse_error("expected ':' after object key", state); + raise_parse_error("expected ':' after object key", state, false); } else { - raise_parse_error("expected ':' after object key, got: %s", state); + raise_parse_error("expected ':' after object key, got: %s", state, false); } } - UNREACHABLE_RETURN(Qundef); + JSON_UNREACHABLE_RETURN(false); } JSON_PHASE_ARRAY_COMMA: { @@ -1695,6 +1764,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) long count = json_frame_entry_count(frame, state->value_stack); state->current_nesting--; state->in_array--; + json_frame_stack_pop(state->frames); json_push_value(state, config, json_decode_array(state, config, count)); frame = json_frame_stack_peek(state->frames); @@ -1705,13 +1775,15 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) case JSON_PHASE_ARRAY_COMMA: goto JSON_PHASE_ARRAY_COMMA; case JSON_PHASE_OBJECT_COMMA: goto JSON_PHASE_OBJECT_COMMA; case JSON_PHASE_VALUE: goto JSON_PHASE_VALUE; - case JSON_PHASE_OBJECT_KEY: UNREACHABLE_RETURN(Qundef); + case JSON_PHASE_OBJECT_KEY: JSON_UNREACHABLE_RETURN(false); case JSON_PHASE_OBJECT_COLON: goto 
JSON_PHASE_OBJECT_COLON; } + } else if (resumable && eos(state)) { + return false; } else { - raise_parse_error("expected ',' or ']' after array value", state); + raise_parse_error("expected ',' or ']' after array value", state, false); } - UNREACHABLE_RETURN(Qundef); + JSON_UNREACHABLE_RETURN(false); } JSON_PHASE_OBJECT_COMMA: { @@ -1722,9 +1794,9 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) if (RB_LIKELY(next_char == ',')) { state->cursor++; + json_eat_whitespace(state); if (config->allow_trailing_comma) { - json_eat_whitespace(state); if (peek(state) == '}') { // Trailing comma: stay in COMMA to close on the next iteration. goto JSON_PHASE_OBJECT_COMMA; @@ -1740,7 +1812,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) // Temporary rewind cursor in case an error is raised const char *final_cursor = state->cursor; - state->cursor = frame->start_cursor; + state->cursor = state->start + frame->start_offset; VALUE object = json_decode_object(state, config, count); state->cursor = final_cursor; @@ -1754,23 +1826,25 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) case JSON_PHASE_ARRAY_COMMA: goto JSON_PHASE_ARRAY_COMMA; case JSON_PHASE_OBJECT_COMMA: goto JSON_PHASE_OBJECT_COMMA; case JSON_PHASE_VALUE: goto JSON_PHASE_VALUE; - case JSON_PHASE_OBJECT_KEY: UNREACHABLE_RETURN(Qundef); + case JSON_PHASE_OBJECT_KEY: JSON_UNREACHABLE_RETURN(false); case JSON_PHASE_OBJECT_COLON: goto JSON_PHASE_OBJECT_COLON; } + } else if (resumable && eos(state)) { + return false; } else { - raise_parse_error("expected ',' or '}' after object value, got: %s", state); + raise_parse_error("expected ',' or '}' after object value, got: %s", state, false); } - UNREACHABLE_RETURN(Qundef); + JSON_UNREACHABLE_RETURN(false); } - UNREACHABLE_RETURN(Qundef); + JSON_UNREACHABLE_RETURN(false); } static void json_ensure_eof(JSON_ParserState *state) { json_eat_whitespace(state); if 
(!eos(state)) { - raise_parse_error("unexpected token at end of stream %s", state); + raise_parse_error("unexpected token at end of stream %s", state, false); } } @@ -1968,7 +2042,11 @@ static VALUE cParser_parse(JSON_ParserConfig *config, VALUE src) }; JSON_ParserState *state = &_state; - VALUE result = json_parse_any(state, config); + json_parse_any(state, config, false); + + // The root document value is parsed; it is the lone survivor on + // the rvalue stack. + VALUE result = *rvalue_stack_peek(state->value_stack, 1); // This may be skipped in case of exception, but // it won't cause a leak. @@ -2044,6 +2122,231 @@ static VALUE cJSON_parser_s_allocate(VALUE klass) return TypedData_Make_Struct(klass, JSON_ParserConfig, &JSON_ParserConfig_type, config); } +typedef struct JSON_ResumableParserStruct { + JSON_ParserConfig config; + JSON_ParserState state; // TODO: Should this actually be on the heap? Maybe it should be on the stack and reset on every parse. + rvalue_stack value_stack; + json_frame_stack frames; + VALUE buffer; +} JSON_ResumableParser; + +static void JSON_ResumableParser_mark(void *ptr) +{ + JSON_ResumableParser *parser = (JSON_ResumableParser *)ptr; + JSON_ParserConfig_mark(&parser->config); + rvalue_stack_mark(&parser->value_stack); + rb_gc_mark_movable(parser->buffer); +} + +static void JSON_ResumableParser_free(void *ptr) +{ + JSON_ResumableParser *parser = (JSON_ResumableParser *)ptr; + rvalue_stack_free_buffer(&parser->value_stack); + json_frame_stack_free_buffer(&parser->frames); +} + +static size_t JSON_ResumableParser_memsize(const void *ptr) +{ + const JSON_ResumableParser *parser = (const JSON_ResumableParser *)ptr; + size_t memsize = JSON_ParserConfig_memsize(&parser->config); + memsize += rvalue_stack_memsize(&parser->value_stack); + memsize += json_frame_stack_memsize(&parser->frames); +#ifndef HAVE_RUBY_TYPED_EMBEDDABLE + memsize += ( + sizeof(JSON_ResumableParser) + - sizeof(JSON_ParserState) + - sizeof(JSON_ParserConfig) + - 
sizeof(rvalue_stack) + - sizeof(json_frame_stack) + ); +#endif + return memsize; +} + +static void JSON_ResumableParser_compact(void *ptr) +{ + JSON_ResumableParser *parser = (JSON_ResumableParser *)ptr; + JSON_ParserConfig_compact(&parser->config); + rvalue_stack_compact(&parser->value_stack); + parser->buffer = rb_gc_location(parser->buffer); +} + +static const rb_data_type_t JSON_ResumableParser_type = { + .wrap_struct_name = "JSON::Ext::ResumableParser", + .function = { + JSON_ResumableParser_mark, + JSON_ResumableParser_free, + JSON_ResumableParser_memsize, + JSON_ResumableParser_compact, + }, + // TODO: Figure how to set RUBY_TYPED_WB_PROTECTED + // Need to the the rvalue_stack, marked by another object. + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE, +}; + +static VALUE cResumableParser_allocate(VALUE klass) +{ + JSON_ResumableParser *parser; + VALUE obj = TypedData_Make_Struct(klass, JSON_ResumableParser, &JSON_ResumableParser_type, parser); + + parser->state.frames = &parser->frames; + json_frame_stack_push(&parser->state, (json_frame){ + .type = JSON_FRAME_ROOT, + .phase = JSON_PHASE_VALUE, + }); + return obj; +} + +static inline JSON_ResumableParser *cResumableParser_get(VALUE self) +{ + JSON_ResumableParser *parser; + TypedData_Get_Struct(self, JSON_ResumableParser, &JSON_ResumableParser_type, parser); + return parser; +} + +// TODO: doc +// TODO: optional arg +static VALUE cResumableParser_initialize(VALUE self, VALUE opts) +{ + rb_check_frozen(self); + JSON_ResumableParser *parser = cResumableParser_get(self); + + parser_config_init(&parser->config, opts, self); + + return self; +} + +// TODO: doc +static VALUE cResumableParser_feed(VALUE self, VALUE str) +{ + rb_check_frozen(self); + str = convert_encoding(str); + if (!RSTRING_LEN(str)) { + return self; + } + + JSON_ResumableParser *parser = cResumableParser_get(self); + + size_t offset = parser->state.cursor - parser->state.start; + const size_t remaining = parser->state.end - 
parser->state.cursor; + + if (!remaining) { + parser->buffer = RB_OBJ_FROZEN_RAW(str) ? str : rb_obj_hide(rb_str_new_shared(str)); + offset = 0; + } else { + JSON_ASSERT(parser->buffer); + + const size_t size = parser->state.end - parser->state.start; + const size_t consumed = size - remaining; + char *old_ptr = RSTRING_PTR(parser->buffer); + + if (RB_OBJ_FROZEN_RAW(parser->buffer)) { + VALUE new_buffer = rb_obj_hide(rb_utf8_str_new(old_ptr + consumed, remaining)); // copy the unread tail AND record its length + rb_str_modify_expand(new_buffer, RSTRING_LEN(str)); // reserve room for the chunk appended below + offset = 0; + parser->buffer = new_buffer; + } else if (consumed > (size / 2) && size >= 512) { + MEMMOVE(old_ptr, old_ptr + consumed, char, remaining); + rb_str_set_len(parser->buffer, remaining); + offset = 0; + } + rb_str_append(parser->buffer, str); + } + + long len; + const char *start; + RSTRING_GETMEM(parser->buffer, start, len); + parser->state.start = start; + parser->state.end = start + len; + parser->state.cursor = parser->state.start + offset; + + return self; +} + +struct json_parse_any_args { + JSON_ParserState *state; + JSON_ParserConfig *config; +}; + +static VALUE json_parse_any_resumable_safe(VALUE _args) +{ + struct json_parse_any_args *args = (struct json_parse_any_args *)_args; + return (VALUE)json_parse_any(args->state, args->config, true); +} + +// TODO: doc +static VALUE cResumableParser_parse(VALUE self) +{ + JSON_ResumableParser *parser = cResumableParser_get(self); + if (!parser->buffer) { + return Qfalse; + } + VALUE Vsource = parser->buffer; // Prevent compaction + + // TODO: prevent reentrancy + + // self may have moved, so we need to update all pointers + // We might be better off keeping JSON_ParserState on the stack + // and only persist what we need. 
+ parser->state.value_stack_handle = &self; + parser->state.frame_stack_handle = &self; + parser->state.value_stack = &parser->value_stack; + parser->state.frames = &parser->frames; + + json_frame *frame = json_frame_stack_peek(&parser->frames); + + if (frame->phase == JSON_PHASE_DONE) { + frame->phase = JSON_PHASE_VALUE; + rvalue_stack_pop(parser->state.value_stack, 1); + } + + struct json_parse_any_args args = { + .state = &parser->state, + .config = &parser->config, + }; + int status; + bool complete = rb_protect(json_parse_any_resumable_safe, (VALUE)&args, &status); + if (status) { + complete = false; + if (RTEST(rb_ivar_get(rb_errinfo(), rb_intern("@eos")))) { + rb_set_errinfo(Qnil); // EOS error: swallow it (and clear $!) so the caller can feed more input + } else { + rb_jump_tag(status); // reraise + } + } + RB_GC_GUARD(Vsource); + return complete ? Qtrue : Qfalse; +} + +// TODO: doc +static VALUE cResumableParser_value(VALUE self) +{ + JSON_ResumableParser *parser = cResumableParser_get(self); + json_frame *frame = json_frame_stack_peek(&parser->frames); + + if (frame->phase == JSON_PHASE_DONE) { + return *rvalue_stack_peek(parser->state.value_stack, 1); + } else { + rb_raise(rb_eArgError, "no ready value"); // TODO: Figure out the best exception and message + } +} + +// TODO: doc +static VALUE cResumableParser_rest(VALUE self) +{ + JSON_ResumableParser *parser = cResumableParser_get(self); + + if (!parser->buffer) { + return rb_utf8_str_new("", 0); + } + + size_t offset = parser->state.cursor - parser->state.start; + const char *ptr; + long len; + RSTRING_GETMEM(parser->buffer, ptr, len); + return rb_utf8_str_new(ptr + offset, len - offset); +} + void Init_parser(void) { #ifdef HAVE_RB_EXT_RACTOR_SAFE @@ -2058,12 +2361,20 @@ void Init_parser(void) eNestingError = rb_path2class("JSON::NestingError"); rb_gc_register_mark_object(eNestingError); rb_define_alloc_func(cParserConfig, cJSON_parser_s_allocate); - rb_define_method(cParserConfig, "initialize", cParserConfig_initialize, 1); + 
rb_define_private_method(cParserConfig, "initialize", cParserConfig_initialize, 1); rb_define_method(cParserConfig, "parse", cParserConfig_parse, 1); VALUE cParser = rb_define_class_under(mExt, "Parser", rb_cObject); rb_define_singleton_method(cParser, "parse", cParser_m_parse, 2); + VALUE cResumableParser = rb_define_class_under(mExt, "ResumableParser", rb_cObject); + rb_define_alloc_func(cResumableParser, cResumableParser_allocate); + rb_define_private_method(cResumableParser, "initialize", cResumableParser_initialize, 1); + rb_define_method(cResumableParser, "<<", cResumableParser_feed, 1); + rb_define_method(cResumableParser, "parse", cResumableParser_parse, 0); + rb_define_method(cResumableParser, "value", cResumableParser_value, 0); + rb_define_method(cResumableParser, "rest", cResumableParser_rest, 0); + CNaN = rb_const_get(mJSON, rb_intern("NaN")); rb_gc_register_mark_object(CNaN); diff --git a/test/json/resumable_parser_test.rb b/test/json/resumable_parser_test.rb new file mode 100644 index 00000000..52c63025 --- /dev/null +++ b/test/json/resumable_parser_test.rb @@ -0,0 +1,95 @@ +# frozen_string_literal: true +require_relative 'test_helper' + +class JSONResumableParserTest < Test::Unit::TestCase + include JSON + + def setup + omit "JRuby not supported" if RUBY_ENGINE == "jruby" + @parser = new_parser + end + + def test_parse_document_direct + @parser << '[true]' + assert_equal true, @parser.parse + assert_equal [true], @parser.value + end + + def test_parse_multiple_documents_direct + @parser << '[true]{}[1, 2, 3]' + + assert_equal true, @parser.parse + assert_equal [true], @parser.value + + assert_equal true, @parser.parse + assert_equal({}, @parser.value) + + assert_equal true, @parser.parse + assert_equal [1, 2, 3], @parser.value + end + + def test_parse_byte_by_byte_array + assert_resumed_parsing('[]') + assert_resumed_parsing('[ ]') + assert_resumed_parsing('[true]') + assert_resumed_parsing('[12]') + assert_resumed_parsing('[ 12 ]') + 
+ assert_resumed_parsing('[ 12.3 ]') + assert_resumed_parsing('[ 12.3e12 ]') + assert_resumed_parsing('[ 1e12 ]') + assert_resumed_parsing('[-12]') + assert_resumed_parsing('[ -12 ]') + assert_resumed_parsing('[ -12.3 ]') + assert_resumed_parsing('[ -12.3e12 ]') + assert_resumed_parsing('[ -1e12 ]') + end + + def test_parse_byte_by_byte_object + assert_resumed_parsing('{}') + assert_resumed_parsing('{ }') + assert_resumed_parsing('{"test" : true}') + assert_resumed_parsing('{ "test":12, "value" : { "key": 42} }') + end + + def test_parse_byte_by_byte_string + assert_resumed_parsing(JSON.generate('test')) + assert_resumed_parsing(JSON.generate('te\\st')) + assert_resumed_parsing('"te\\u2028st"') + assert_resumed_parsing(JSON.generate("te\u2028st")) + assert_resumed_parsing(JSON.generate("te \u2028 st")) + end + + def test_parse_byte_by_byte_numbers + # TODO: should we actually support types other than Array and Object (and perhaps String) at the top level? + # Top values other than arrays and objects are ambiguous. + # e.g. if you stream numbers without a separator: 123 followed by 456 is ambiguous with 123456. + assert_resumed_parsing('123 ') + end + + def test_rest + @parser << '[1, 2, 3, "unterminated string' + refute @parser.parse + assert_equal '"unterminated string', @parser.rest + end + + private + + def assert_resumed_parsing(json, parser = @parser) + expected = JSON.parse(json) + + last_parsed_byte_index = 0 + json.each_byte do |byte| + parser << byte.chr + last_parsed_byte_index += 1 + break if parser.parse + end + actual = parser.value + assert_equal expected, actual + remaining_bytes = (json.bytesize - last_parsed_byte_index) + assert_equal 0, remaining_bytes, "unconsumed bytes: #{actual.inspect}, remaining: #{json.byteslice(last_parsed_byte_index, remaining_bytes).inspect}" + end + + def new_parser(options = {}) + JSON::Ext::ResumableParser.new(options) + end +end