From ecf70acaf7c543152fee18e4b7c3a50f53d0e3ff Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Fri, 5 Jun 2026 09:36:20 +0200 Subject: [PATCH 1/7] Experiment: ResumableParser Fix: https://github.com/ruby/json/issues/983 Numerous known issues TODO: - `object_start_cursor` recorded in frame becomes invalid if the buffer string is reallocated or spilled. - The buffer needs to be shrunk sometimes. - A lot more testing is needed. - Unclear what to do with top-level numbers (and perhaps true/false/null) - API is anything but final - I'd like to be able to "pop" the value, so we don't uselessly keep a reference to it. - Then the methods need to be documented. - It would be worth trying to make `json_parse_any` exception-free. - Right now EOF errors have been eliminated in favor of returning `false`. - We could try to do the same with syntax errors. - But then we need to `rb_protect` when calling back into Ruby or other unsafe APIs, so perhaps it's best to just accept it. --- ext/json/ext/json.h | 13 ++ ext/json/ext/parser/parser.c | 362 +++++++++++++++++++++++++---- test/json/resumable_parser_test.rb | 91 ++++++++ 3 files changed, 418 insertions(+), 48 deletions(-) create mode 100644 test/json/resumable_parser_test.rb diff --git a/ext/json/ext/json.h b/ext/json/ext/json.h index cf9420d4..a29d37be 100644 --- a/ext/json/ext/json.h +++ b/ext/json/ext/json.h @@ -11,6 +11,9 @@ #if defined(RUBY_DEBUG) && RUBY_DEBUG # define JSON_ASSERT RUBY_ASSERT +# ifndef JSON_DEBUG +# define JSON_DEBUG 1 +# endif #else # ifdef JSON_DEBUG # include @@ -20,8 +23,18 @@ # endif #endif +#ifdef JSON_DEBUG +# define JSON_UNREACHABLE_RETURN(val) rb_bug("Unreachable") +#else +# define JSON_UNREACHABLE_RETURN UNREACHABLE_RETURN +#endif + /* shims */ +#ifndef UNDEF_P +#define UNDEF_P(val) (val == Qundef) +#endif + #if SIZEOF_UINT64_T == SIZEOF_LONG_LONG # define INT64T2NUM(x) LL2NUM(x) # define UINT64T2NUM(x) ULL2NUM(x) diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index c0631728..e59107d8 
100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -206,7 +206,7 @@ static rvalue_stack *rvalue_stack_spill(rvalue_stack *old_stack, VALUE *handle, static rvalue_stack *rvalue_stack_grow(rvalue_stack *stack, VALUE *handle, rvalue_stack **stack_ref) { - long required = stack->capa * 2; + long required = stack->capa ? stack->capa * 2 : RVALUE_STACK_INITIAL_CAPA; if (stack->type == RVALUE_STACK_STACK_ALLOCATED) { stack = rvalue_stack_spill(stack, handle, stack_ref); @@ -222,8 +222,14 @@ static VALUE rvalue_stack_push(rvalue_stack *stack, VALUE value, VALUE *handle, if (RB_UNLIKELY(stack->head >= stack->capa)) { stack = rvalue_stack_grow(stack, handle, stack_ref); } + stack->ptr[stack->head] = value; stack->head++; + + if (*handle) { + RB_OBJ_WRITTEN(*handle, Qundef, value); + } + return value; } @@ -420,7 +426,7 @@ static json_frame_stack *json_frame_stack_spill(json_frame_stack *old_stack, VAL static json_frame_stack *json_frame_stack_grow(json_frame_stack *stack, VALUE *handle, json_frame_stack **stack_ref) { - long required = stack->capa * 2; + long required = stack->capa ? 
stack->capa * 2 : JSON_FRAME_STACK_INITIAL_CAPA; if (stack->type == RVALUE_STACK_STACK_ALLOCATED) { stack = json_frame_stack_spill(stack, handle, stack_ref); @@ -488,7 +494,7 @@ static const rb_data_type_t JSON_Parser_frame_stack_type = { .dfree = json_frame_stack_free, .dsize = json_frame_stack_memsize, }, - .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE, + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE, }; static json_frame_stack *json_frame_stack_spill(json_frame_stack *old_stack, VALUE *handle, json_frame_stack **stack_ref) @@ -948,6 +954,8 @@ NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_Parser p = ++pe; break; } + case 0: + return Qundef; default: if ((unsigned char)*pe < 0x20) { if (!config->allow_control_characters) { @@ -1260,8 +1268,7 @@ static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfi state->cursor++; } while (string_scan(state)); - raise_parse_error("unexpected end of input, expected closing \"", state); - return Qfalse; + return Qundef; } ALWAYS_INLINE(static) VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name) @@ -1270,7 +1277,7 @@ ALWAYS_INLINE(static) VALUE json_parse_string(JSON_ParserState *state, JSON_Pars const char *start = state->cursor; if (RB_UNLIKELY(!string_scan(state))) { - raise_parse_error("unexpected end of input, expected closing \"", state); + return Qundef; } VALUE string; @@ -1369,7 +1376,7 @@ static inline VALUE json_parse_number(JSON_ParserState *state, JSON_ParserConfig int mantissa_digits = json_parse_digits(state, &mantissa); if (RB_UNLIKELY((first_digit == '0' && mantissa_digits > 1) || (negative && mantissa_digits == 0))) { - raise_parse_error_at("invalid number: %s", state, start); + return Qundef; } // Parse fractional part @@ -1382,7 +1389,7 @@ static inline VALUE json_parse_number(JSON_ParserState *state, JSON_ParserConfig mantissa_digits += fractional_digits; if 
(RB_UNLIKELY(!fractional_digits)) { - raise_parse_error_at("invalid number: %s", state, start); + return Qundef; } } @@ -1402,7 +1409,7 @@ static inline VALUE json_parse_number(JSON_ParserState *state, JSON_ParserConfig int exponent_digits = json_parse_digits(state, &abs_exponent); if (RB_UNLIKELY(!exponent_digits)) { - raise_parse_error_at("invalid number: %s", state, start); + return Qundef; } if (RB_UNLIKELY(exponent_digits >= 20 || abs_exponent > (uint64_t)INT64_MAX)) { @@ -1447,16 +1454,16 @@ static inline long json_frame_entry_count(const json_frame *frame, const rvalue_ // moves on to expecting a ',' or its closing bracket. The caller passes the // frame it already has in hand -- the one that was expecting the value -- which // after a container close is the freshly re-exposed parent. -static inline void json_value_completed(json_frame *frame) +static inline enum json_frame_phase json_value_completed(json_frame *frame) { JSON_ASSERT((int)JSON_PHASE_DONE == (int)JSON_FRAME_ROOT); JSON_ASSERT((int)JSON_PHASE_ARRAY_COMMA == (int)JSON_FRAME_ARRAY); JSON_ASSERT((int)JSON_PHASE_OBJECT_COMMA == (int)JSON_FRAME_OBJECT); - frame->phase = (enum json_frame_phase) frame->type; + return frame->phase = (enum json_frame_phase) frame->type; } -ALWAYS_INLINE(static) bool json_match_keyword(JSON_ParserState *state, const char *keyword, size_t offset) +ALWAYS_INLINE(static) bool json_match_keyword(JSON_ParserState *state, bool resumable, const char *keyword, size_t offset) { // It is assumed that since `keyword` is always a literal, the compiler is able to constantize this // `strlen` and several other computations in that routine, such as eliminating the `if (resumable)` branch. 
@@ -1468,8 +1475,11 @@ ALWAYS_INLINE(static) bool json_match_keyword(JSON_ParserState *state, const cha if (rest(state) >= len && (memcmp(state->cursor + offset, keyword + offset, len - offset) == 0)) { state->cursor += len; return true; + } else if (resumable && rest(state) < len && memcmp(state->cursor, keyword, rest(state)) == 0) { + return false; } - return false; + + raise_parse_error("unexpected token %s", state); } // Parse an arbitrary JSON value iteratively. This is a state machine driven @@ -1477,7 +1487,7 @@ ALWAYS_INLINE(static) bool json_match_keyword(JSON_ParserState *state, const cha // resume purely from the frame stack. A JSON_FRAME_ROOT frame sits at the // bottom of the stack, so the stack is never empty mid-parse and the document // itself is just another frame whose value, once parsed, leaves its phase DONE. -static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) +ALWAYS_INLINE(static) bool json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config, bool resumable) { json_frame *frame = json_frame_stack_peek(state->frames); @@ -1489,12 +1499,12 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) case JSON_PHASE_OBJECT_KEY: goto JSON_PHASE_OBJECT_KEY; case JSON_PHASE_OBJECT_COLON: goto JSON_PHASE_OBJECT_COLON; } - UNREACHABLE_RETURN(Qundef); + JSON_UNREACHABLE_RETURN(Qundef); JSON_PHASE_DONE: { // The root document value is parsed; it is the lone survivor on // the rvalue stack. 
- return *rvalue_stack_peek(state->value_stack, 1); + return true; } JSON_PHASE_VALUE: { @@ -1503,68 +1513,119 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) VALUE value; switch (peek(state)) { case 'n': - if (json_match_keyword(state, "null", 0)) { + if (json_match_keyword(state, resumable, "null", 0)) { value = Qnil; break; } - raise_parse_error("unexpected token %s", state); + return false; case 't': - if (json_match_keyword(state, "true", 0)) { + if (json_match_keyword(state, resumable, "true", 0)) { value = Qtrue; break; } - raise_parse_error("unexpected token %s", state); + return false; case 'f': - if (json_match_keyword(state, "false", 1)) { + if (json_match_keyword(state, resumable, "false", 1)) { value = Qfalse; break; } - raise_parse_error("unexpected token %s", state); + return false; case 'N': - // Note: memcmp with a small power of two compile to an integer comparison - if (config->allow_nan && json_match_keyword(state, "NaN", 1)) { + if (!config->allow_nan) { + raise_parse_error("unexpected token %s", state); + } + + if (json_match_keyword(state, resumable, "NaN", 1)) { value = CNaN; break; } - raise_parse_error("unexpected token %s", state); + return false; case 'I': - if (config->allow_nan && json_match_keyword(state, "Infinity", 0)) { + if (!config->allow_nan) { + raise_parse_error("unexpected token %s", state); + } + + if (json_match_keyword(state, resumable, "Infinity", 0)) { value = CInfinity; break; } - raise_parse_error("unexpected token %s", state); + return false; case '-': { + const char *start = state->cursor; state->cursor++; - if (config->allow_nan && json_match_keyword(state, "Infinity", 0)) { - value = CMinusInfinity; - } else { - value = json_parse_negative_number(state, config); + + value = json_parse_negative_number(state, config); + + if (RB_UNLIKELY(UNDEF_P(value))) { + if (config->allow_nan && peek(state) == 'I') { + if (json_match_keyword(state, resumable, "Infinity", 0)) { + value = 
CMinusInfinity; + break; + } + state->cursor = start; + return false; + } + } + + if (resumable && peek(state) == 0) { + state->cursor = start; + return false; + } + + if (RB_UNLIKELY(UNDEF_P(value))) { + raise_parse_error_at("invalid number: %s", state, start); } break; } - case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { + const char *start = state->cursor; value = json_parse_positive_number(state, config); + + if (resumable && peek(state) == 0) { + state->cursor = start; + return false; + } + + if (RB_UNLIKELY(UNDEF_P(value))) { + raise_parse_error_at("invalid number: %s", state, start); + } break; + } - case '"': + case '"': { // %r{\A"[^"\\\t\n\x00]*(?:\\[bfnrtu\\/"][^"\\]*)*"} + const char *start = state->cursor; value = json_parse_string(state, config, false); + + if (RB_UNLIKELY(UNDEF_P(value))) { + if (resumable) { + state->cursor = start; + return false; + } else { + raise_parse_error("unexpected end of input, expected closing \"", state); + } + } break; + } case '[': { - state->cursor++; + const char *start = state->cursor++; json_eat_whitespace(state); - if (peek(state) == ']') { + const char next = peek(state); + if (next == ']') { state->cursor++; value = json_decode_array(state, config, 0); break; + } else if (resumable && next == 0) { + state->cursor = start; + return false; } state->current_nesting++; @@ -1581,6 +1642,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) }); goto JSON_PHASE_VALUE; } + case '{': { const char *object_start_cursor = state->cursor; @@ -1591,6 +1653,9 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) state->cursor++; value = json_decode_object(state, config, 0); break; + } else if (resumable && peek(state) == 0) { + state->cursor = object_start_cursor; + return false; } state->current_nesting++; @@ 
-1609,6 +1674,9 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) } case 0: + if (resumable) { + return false; + } raise_parse_error("unexpected end of input", state); default: @@ -1623,10 +1691,10 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) case JSON_PHASE_ARRAY_COMMA: goto JSON_PHASE_ARRAY_COMMA; case JSON_PHASE_OBJECT_COMMA: goto JSON_PHASE_OBJECT_COMMA; case JSON_PHASE_VALUE: goto JSON_PHASE_VALUE; - case JSON_PHASE_OBJECT_KEY: UNREACHABLE_RETURN(Qundef); + case JSON_PHASE_OBJECT_KEY: JSON_UNREACHABLE_RETURN(false); case JSON_PHASE_OBJECT_COLON: goto JSON_PHASE_OBJECT_COLON; } - UNREACHABLE_RETURN(Qundef); + JSON_UNREACHABLE_RETURN(false); } JSON_PHASE_OBJECT_KEY: { @@ -1634,10 +1702,23 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) json_eat_whitespace(state); + const char *start = state->cursor; + if (RB_LIKELY(peek(state) == '"')) { - json_push_value(state, config, json_parse_string(state, config, true)); + VALUE string = json_parse_string(state, config, true); + if (UNDEF_P(string)) { + if (resumable) { + state->cursor = start; + return false; + } else { + raise_parse_error("unexpected end of input, expected closing \"", state); + } + } + json_push_value(state, config, string); frame->phase = JSON_PHASE_OBJECT_COLON; goto JSON_PHASE_OBJECT_COLON; + } else if (resumable && peek(state) == 0) { + return false; } else { // The message differs for the first key vs. 
a key after a // ',': the first is the only one reached with nothing pushed @@ -1648,7 +1729,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) raise_parse_error("expected object key, got: %s", state); } } - UNREACHABLE_RETURN(Qundef); + JSON_UNREACHABLE_RETURN(false); } JSON_PHASE_OBJECT_COLON: { @@ -1660,6 +1741,8 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) state->cursor++; frame->phase = JSON_PHASE_VALUE; goto JSON_PHASE_VALUE; + } else if (resumable && peek(state) == 0) { + return false; } else { // First colon (only the first pair's key is pushed, nothing // else) vs. a later one. @@ -1669,7 +1752,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) raise_parse_error("expected ':' after object key, got: %s", state); } } - UNREACHABLE_RETURN(Qundef); + JSON_UNREACHABLE_RETURN(false); } JSON_PHASE_ARRAY_COMMA: { @@ -1695,6 +1778,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) long count = json_frame_entry_count(frame, state->value_stack); state->current_nesting--; state->in_array--; + json_frame_stack_pop(state->frames); json_push_value(state, config, json_decode_array(state, config, count)); frame = json_frame_stack_peek(state->frames); @@ -1705,13 +1789,15 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) case JSON_PHASE_ARRAY_COMMA: goto JSON_PHASE_ARRAY_COMMA; case JSON_PHASE_OBJECT_COMMA: goto JSON_PHASE_OBJECT_COMMA; case JSON_PHASE_VALUE: goto JSON_PHASE_VALUE; - case JSON_PHASE_OBJECT_KEY: UNREACHABLE_RETURN(Qundef); + case JSON_PHASE_OBJECT_KEY: JSON_UNREACHABLE_RETURN(false); case JSON_PHASE_OBJECT_COLON: goto JSON_PHASE_OBJECT_COLON; } + } else if (resumable && next_char == 0) { + return false; } else { raise_parse_error("expected ',' or ']' after array value", state); } - UNREACHABLE_RETURN(Qundef); + JSON_UNREACHABLE_RETURN(false); } JSON_PHASE_OBJECT_COMMA: { @@ -1722,9 
+1808,9 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) if (RB_LIKELY(next_char == ',')) { state->cursor++; + json_eat_whitespace(state); if (config->allow_trailing_comma) { - json_eat_whitespace(state); if (peek(state) == '}') { // Trailing comma: stay in COMMA to close on the next iteration. goto JSON_PHASE_OBJECT_COMMA; @@ -1754,16 +1840,18 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) case JSON_PHASE_ARRAY_COMMA: goto JSON_PHASE_ARRAY_COMMA; case JSON_PHASE_OBJECT_COMMA: goto JSON_PHASE_OBJECT_COMMA; case JSON_PHASE_VALUE: goto JSON_PHASE_VALUE; - case JSON_PHASE_OBJECT_KEY: UNREACHABLE_RETURN(Qundef); + case JSON_PHASE_OBJECT_KEY: JSON_UNREACHABLE_RETURN(false); case JSON_PHASE_OBJECT_COLON: goto JSON_PHASE_OBJECT_COLON; } + } else if (resumable && next_char == 0) { + return false; } else { raise_parse_error("expected ',' or '}' after object value, got: %s", state); } - UNREACHABLE_RETURN(Qundef); + JSON_UNREACHABLE_RETURN(false); } - UNREACHABLE_RETURN(Qundef); + JSON_UNREACHABLE_RETURN(Qundef); } static void json_ensure_eof(JSON_ParserState *state) @@ -1968,7 +2056,11 @@ static VALUE cParser_parse(JSON_ParserConfig *config, VALUE src) }; JSON_ParserState *state = &_state; - VALUE result = json_parse_any(state, config); + json_parse_any(state, config, false); + + // The root document value is parsed; it is the lone survivor on + // the rvalue stack. + VALUE result = *rvalue_stack_peek(state->value_stack, 1); // This may be skipped in case of exception, but // it won't cause a leak. @@ -2044,6 +2136,173 @@ static VALUE cJSON_parser_s_allocate(VALUE klass) return TypedData_Make_Struct(klass, JSON_ParserConfig, &JSON_ParserConfig_type, config); } +typedef struct JSON_ResumableParserStruct { + JSON_ParserConfig config; + JSON_ParserState state; // TODO: Should this actually be on the heap? Maybe it should be on the stack and reset on every parse. 
+ rvalue_stack value_stack; + json_frame_stack frames; + VALUE buffer; +} JSON_ResumableParser; + +static void JSON_ResumableParser_mark(void *ptr) +{ + JSON_ResumableParser *parser = (JSON_ResumableParser *)ptr; + JSON_ParserConfig_mark(&parser->config); + rvalue_stack_mark(&parser->value_stack); + rb_gc_mark_movable(parser->buffer); +} + +static void JSON_ResumableParser_free(void *ptr) +{ + JSON_ResumableParser *parser = (JSON_ResumableParser *)ptr; + rvalue_stack_free_buffer(&parser->value_stack); + json_frame_stack_free_buffer(&parser->frames); +} + +static size_t JSON_ResumableParser_memsize(const void *ptr) +{ + const JSON_ResumableParser *parser = (const JSON_ResumableParser *)ptr; + size_t memsize = JSON_ParserConfig_memsize(&parser->config); + memsize += rvalue_stack_memsize(&parser->value_stack); + memsize += json_frame_stack_memsize(&parser->frames); +#ifndef HAVE_RUBY_TYPED_EMBEDDABLE + memsize += ( + sizeof(JSON_ResumableParser) + - sizeof(JSON_ParserState) + - sizeof(JSON_ParserConfig) + - sizeof(rvalue_stack) + - sizeof(json_frame_stack) + ); +#endif + return memsize; +} + +static void JSON_ResumableParser_compact(void *ptr) +{ + JSON_ResumableParser *parser = (JSON_ResumableParser *)ptr; + JSON_ParserConfig_compact(&parser->config); + rvalue_stack_compact(&parser->value_stack); + parser->buffer = rb_gc_location(parser->buffer); +} + +static const rb_data_type_t JSON_ResumableParser_type = { + .wrap_struct_name = "JSON::Ext::ResumableParser", + .function = { + JSON_ResumableParser_mark, + JSON_ResumableParser_free, + JSON_ResumableParser_memsize, + JSON_ResumableParser_compact, + }, + // TODO: Figure how to set RUBY_TYPED_WB_PROTECTED + // Need to the the rvalue_stack, marked by another object. 
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE, +}; + +static VALUE cResumableParser_allocate(VALUE klass) +{ + JSON_ResumableParser *parser; + VALUE obj = TypedData_Make_Struct(klass, JSON_ResumableParser, &JSON_ResumableParser_type, parser); + + parser->state.frames = &parser->frames; + json_frame_stack_push(&parser->state, (json_frame){ + .type = JSON_FRAME_ROOT, + .phase = JSON_PHASE_VALUE, + }); + return obj; +} + +static inline JSON_ResumableParser *cResumableParser_get(VALUE self) +{ + JSON_ResumableParser *parser; + TypedData_Get_Struct(self, JSON_ResumableParser, &JSON_ResumableParser_type, parser); + return parser; +} + +// TODO: doc +// TODO: optional arg +static VALUE cResumableParser_initialize(VALUE self, VALUE opts) +{ + rb_check_frozen(self); + JSON_ResumableParser *parser = cResumableParser_get(self); + + parser_config_init(&parser->config, opts, self); + + return self; +} + +// TODO: doc +static VALUE cResumableParser_feed(VALUE self, VALUE str) +{ + rb_check_frozen(self); + str = convert_encoding(str); + + JSON_ResumableParser *parser = cResumableParser_get(self); + + // TODO: Figure out some efficient buffering. Shrink buffer, etc. 
+ // This is just a quick stub to see what changes the parser need to be resumable + + // TODO: We also store some cursors in the frame stack (JSON_FRAME_OBJECT) + const size_t offset = parser->state.cursor - parser->state.start; + if (parser->buffer) { + rb_str_append(parser->buffer, str); + } else { + RB_OBJ_WRITE(self, &parser->buffer, rb_str_new_shared(str)); + } + + long len; + const char *start; + RSTRING_GETMEM(parser->buffer, start, len); + parser->state.start = start; + parser->state.end = start + len; + parser->state.cursor = parser->state.start + offset; + + return self; +} + +// TODO: doc +static VALUE cResumableParser_parse(VALUE self) +{ + JSON_ResumableParser *parser = cResumableParser_get(self); + if (!parser->buffer) { + return Qfalse; + } + VALUE Vsource = parser->buffer; // Prevent compaction + + // TODO: prevent reentrancy + + // self may have moved, so we need to update all pointers + // We might be better off keeping JSON_ParserState on the stack + // and only persist what we need. + parser->state.value_stack_handle = &self; + parser->state.frame_stack_handle = &self; + parser->state.value_stack = &parser->value_stack; + parser->state.frames = &parser->frames; + + json_frame *frame = json_frame_stack_peek(&parser->frames); + + if (frame->phase == JSON_PHASE_DONE) { + frame->phase = JSON_PHASE_VALUE; + rvalue_stack_pop(parser->state.value_stack, 1); + } + + bool complete = json_parse_any(&parser->state, &parser->config, true); + RB_GC_GUARD(Vsource); + return complete ? 
Qtrue : Qfalse; +} + +// TODO: doc +static VALUE cResumableParser_value(VALUE self) +{ + JSON_ResumableParser *parser = cResumableParser_get(self); + json_frame *frame = json_frame_stack_peek(&parser->frames); + + if (frame->phase == JSON_PHASE_DONE) { + return *rvalue_stack_peek(parser->state.value_stack, 1); + } else { + rb_raise(rb_eArgError, "no ready value"); // TODO: Figure out the best exception and message + } +} + void Init_parser(void) { #ifdef HAVE_RB_EXT_RACTOR_SAFE @@ -2058,12 +2317,19 @@ void Init_parser(void) eNestingError = rb_path2class("JSON::NestingError"); rb_gc_register_mark_object(eNestingError); rb_define_alloc_func(cParserConfig, cJSON_parser_s_allocate); - rb_define_method(cParserConfig, "initialize", cParserConfig_initialize, 1); + rb_define_private_method(cParserConfig, "initialize", cParserConfig_initialize, 1); rb_define_method(cParserConfig, "parse", cParserConfig_parse, 1); VALUE cParser = rb_define_class_under(mExt, "Parser", rb_cObject); rb_define_singleton_method(cParser, "parse", cParser_m_parse, 2); + VALUE cResumableParser = rb_define_class_under(mExt, "ResumableParser", rb_cObject); + rb_define_alloc_func(cResumableParser, cResumableParser_allocate); + rb_define_private_method(cResumableParser, "initialize", cResumableParser_initialize, 1); + rb_define_method(cResumableParser, "<<", cResumableParser_feed, 1); + rb_define_method(cResumableParser, "parse", cResumableParser_parse, 0); + rb_define_method(cResumableParser, "value", cResumableParser_value, 0); + CNaN = rb_const_get(mJSON, rb_intern("NaN")); rb_gc_register_mark_object(CNaN); diff --git a/test/json/resumable_parser_test.rb b/test/json/resumable_parser_test.rb new file mode 100644 index 00000000..44cd5c7a --- /dev/null +++ b/test/json/resumable_parser_test.rb @@ -0,0 +1,91 @@ +# frozen_string_literal: true +require_relative 'test_helper' + +class JSONResumageParserTest < Test::Unit::TestCase + include JSON + + def setup + omit "JRuby not supported" if RUBY_ENGINE == 
"jruby" + @parser = new_parser + end + + def test_parse_document_direct + @parser << '[true]' + assert_equal true, @parser.parse + assert_equal [true], @parser.value + end + + def test_parse_multiple_documents_direct + @parser << '[true]{}[1, 2, 3]' + + assert_equal true, @parser.parse + assert_equal [true], @parser.value + + assert_equal true, @parser.parse + assert_equal({}, @parser.value) + + assert_equal true, @parser.parse + assert_equal [1, 2, 3], @parser.value + end + + def test_parse_byte_by_byte_array + assert_resumed_parsing('[]') + assert_resumed_parsing('[ ]') + assert_resumed_parsing('[true]') + assert_resumed_parsing('[12]') + assert_resumed_parsing('[ 12 ]') + assert_resumed_parsing('[ 12.3 ]') + assert_resumed_parsing('[ 12.3e12 ]') + assert_resumed_parsing('[ 1e12 ]') + assert_resumed_parsing('[-12]') + assert_resumed_parsing('[ -12 ]') + assert_resumed_parsing('[ -12.3 ]') + assert_resumed_parsing('[ -12.3e12 ]') + assert_resumed_parsing('[ -1e12 ]') + end + + def test_parse_byte_by_byte_object + assert_resumed_parsing('{}') + assert_resumed_parsing('{ }') + assert_resumed_parsing('{"test" : true}') + assert_resumed_parsing('{ "test":12, "value" : { "key": 42} }') + end + + def test_parse_byte_by_byte_string + assert_resumed_parsing(JSON.generate('test')) + assert_resumed_parsing(JSON.generate('te\\st')) + assert_resumed_parsing('"te\\u2028st"') + assert_resumed_parsing(JSON.generate("te\u2028st")) + assert_resumed_parsing(JSON.generate("te \u2028 st")) + end + + def test_parse_byte_by_byte_numbers + # TODO: should we actually types other than Array and Object and perhaps String at the top level? + # Top values other than arrays and objects are ambiguous. + # e.g. if you stream numbers without a separator: 123 followed by 456 is ambiguous with 123456. 
+ assert_resumed_parsing('123 ') + # assert_resumed_parsing('123tru') + # assert_resumed_parsing('e') + end + + private + + def assert_resumed_parsing(json, parser = @parser) + expected = JSON.parse(json) + + last_parsed_byte_index = 0 + json.each_byte do |byte| + parser << byte.chr + last_parsed_byte_index += 1 + break if parser.parse + end + actual = parser.value + assert_equal expected, actual + remaining_bytes = (json.bytesize - last_parsed_byte_index) + assert_equal 0, remaining_bytes, "unconsumed bytes: #{actual.inspect}, remaining: #{json.byteslice(-1, remaining_bytes).inspect}" + end + + def new_parser(options = {}) + JSON::Ext::ResumableParser.new(options) + end +end From 964129a01c5c5eff0d2e016936ed05262310df65 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Sat, 6 Jun 2026 10:36:14 +0200 Subject: [PATCH 2/7] Remove write barriers from rvalue_stack and cResumableParser --- ext/json/ext/parser/parser.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index e59107d8..26a6fc42 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -226,10 +226,6 @@ static VALUE rvalue_stack_push(rvalue_stack *stack, VALUE value, VALUE *handle, stack->ptr[stack->head] = value; stack->head++; - if (*handle) { - RB_OBJ_WRITTEN(*handle, Qundef, value); - } - return value; } From 22e1b695d874efb45e1aea3090f2e72c09f1d96b Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Sat, 6 Jun 2026 13:50:35 +0200 Subject: [PATCH 3/7] record start_offset instead of start_cursor --- ext/json/ext/parser/parser.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index 26a6fc42..bef4d4b4 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -374,7 +374,7 @@ typedef struct json_frame_struct { enum json_frame_type type; enum json_frame_phase phase; long value_stack_head; // rvalue_stack->head when this 
container opened - const char *start_cursor; // object frames only (the '{'); NULL otherwise + size_t start_offset; // object frames only (the '{'); NULL otherwise } json_frame; typedef struct json_frame_stack_struct { @@ -1664,7 +1664,7 @@ ALWAYS_INLINE(static) bool json_parse_any(JSON_ParserState *state, JSON_ParserCo .type = JSON_FRAME_OBJECT, .phase = JSON_PHASE_OBJECT_KEY, .value_stack_head = state->value_stack->head, - .start_cursor = object_start_cursor, + .start_offset = object_start_cursor - state->start, }); goto JSON_PHASE_OBJECT_KEY; } @@ -1822,7 +1822,7 @@ ALWAYS_INLINE(static) bool json_parse_any(JSON_ParserState *state, JSON_ParserCo // Temporary rewind cursor in case an error is raised const char *final_cursor = state->cursor; - state->cursor = frame->start_cursor; + state->cursor = state->start + frame->start_offset; VALUE object = json_decode_object(state, config, count); state->cursor = final_cursor; @@ -2237,7 +2237,6 @@ static VALUE cResumableParser_feed(VALUE self, VALUE str) // TODO: Figure out some efficient buffering. Shrink buffer, etc. 
// This is just a quick stub to see what changes the parser need to be resumable - // TODO: We also store some cursors in the frame stack (JSON_FRAME_OBJECT) const size_t offset = parser->state.cursor - parser->state.start; if (parser->buffer) { rb_str_append(parser->buffer, str); From df4451f3fbe056b88b4f168c78b1955652683b39 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Sat, 6 Jun 2026 14:25:13 +0200 Subject: [PATCH 4/7] Improve resumable parser buffering --- ext/json/ext/parser/parser.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index bef4d4b4..f8dd3cb8 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -2231,17 +2231,36 @@ static VALUE cResumableParser_feed(VALUE self, VALUE str) { rb_check_frozen(self); str = convert_encoding(str); + if (!RSTRING_LEN(str)) { + return self; + } JSON_ResumableParser *parser = cResumableParser_get(self); - // TODO: Figure out some efficient buffering. Shrink buffer, etc. - // This is just a quick stub to see what changes the parser need to be resumable + size_t offset = parser->state.cursor - parser->state.start; + const size_t remaining = parser->state.end - parser->state.cursor; - const size_t offset = parser->state.cursor - parser->state.start; - if (parser->buffer) { - rb_str_append(parser->buffer, str); + if (!remaining) { + parser->buffer = RB_OBJ_FROZEN_RAW(str) ? 
str : rb_obj_hide(rb_str_new_shared(str)); + offset = 0; } else { - RB_OBJ_WRITE(self, &parser->buffer, rb_str_new_shared(str)); + JSON_ASSERT(parser->buffer); + + const size_t size = parser->state.end - parser->state.start; + const size_t consumed = size - remaining; + char *old_ptr = RSTRING_PTR(parser->buffer); + + if (RB_OBJ_FROZEN_RAW(parser->buffer)) { + VALUE new_buffer = rb_obj_hide(rb_str_buf_new(remaining + RSTRING_LEN(str))); + MEMCPY(RSTRING_PTR(new_buffer), old_ptr + consumed, char, remaining); + offset = 0; + parser->buffer = new_buffer; + } else if (consumed > (size / 2) && size >= 512) { + MEMMOVE(old_ptr, old_ptr + consumed, char, remaining); + rb_str_set_len(parser->buffer, remaining); + offset = 0; + } + rb_str_append(parser->buffer, str); } long len; From dfd783fda4f8341cb039b85fc7768cb809d2bbe4 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Sat, 6 Jun 2026 19:16:30 +0200 Subject: [PATCH 5/7] eos instead of peek == 0 --- ext/json/ext/parser/parser.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index f8dd3cb8..c83b16bb 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -1568,7 +1568,7 @@ ALWAYS_INLINE(static) bool json_parse_any(JSON_ParserState *state, JSON_ParserCo } } - if (resumable && peek(state) == 0) { + if (resumable && eos(state)) { state->cursor = start; return false; } @@ -1583,7 +1583,7 @@ ALWAYS_INLINE(static) bool json_parse_any(JSON_ParserState *state, JSON_ParserCo const char *start = state->cursor; value = json_parse_positive_number(state, config); - if (resumable && peek(state) == 0) { + if (resumable && eos(state)) { state->cursor = start; return false; } @@ -1649,7 +1649,7 @@ ALWAYS_INLINE(static) bool json_parse_any(JSON_ParserState *state, JSON_ParserCo state->cursor++; value = json_decode_object(state, config, 0); break; - } else if (resumable && peek(state) == 0) { + } else if (resumable && 
eos(state)) { state->cursor = object_start_cursor; return false; } @@ -1713,7 +1713,7 @@ ALWAYS_INLINE(static) bool json_parse_any(JSON_ParserState *state, JSON_ParserCo json_push_value(state, config, string); frame->phase = JSON_PHASE_OBJECT_COLON; goto JSON_PHASE_OBJECT_COLON; - } else if (resumable && peek(state) == 0) { + } else if (resumable && eos(state)) { return false; } else { // The message differs for the first key vs. a key after a @@ -1737,7 +1737,7 @@ ALWAYS_INLINE(static) bool json_parse_any(JSON_ParserState *state, JSON_ParserCo state->cursor++; frame->phase = JSON_PHASE_VALUE; goto JSON_PHASE_VALUE; - } else if (resumable && peek(state) == 0) { + } else if (resumable && eos(state)) { return false; } else { // First colon (only the first pair's key is pushed, nothing From 2ce68e12e283ec232ae446888b692b6a421872ee Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Sat, 6 Jun 2026 19:59:51 +0200 Subject: [PATCH 6/7] Mark parser error with `@eos` In some cases this allows removing some branches from the fast path. Since we may call back into Ruby, we'll never truly be free of exceptions anyway, so we might as well pass some info through there. 
--- ext/json/ext/parser/parser.c | 160 +++++++++++++++++++---------------- 1 file changed, 87 insertions(+), 73 deletions(-) diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index c83b16bb..d2c7a388 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -642,26 +642,29 @@ static VALUE build_parse_error_message(const char *format, JSON_ParserState *sta return message; } -static VALUE parse_error_new(VALUE message, long line, long column) +static VALUE parse_error_new(VALUE message, long line, long column, bool eos) { VALUE exc = rb_exc_new_str(rb_path2class("JSON::ParserError"), message); rb_ivar_set(exc, rb_intern("@line"), LONG2NUM(line)); rb_ivar_set(exc, rb_intern("@column"), LONG2NUM(column)); + if (eos) { + rb_ivar_set(exc, rb_intern("@eos"), Qtrue); + } return exc; } -NORETURN(static) void raise_parse_error(const char *format, JSON_ParserState *state) +NORETURN(static) void raise_parse_error(const char *format, JSON_ParserState *state, bool eos) { long line, column; cursor_position(state, &line, &column); VALUE message = build_parse_error_message(format, state, line, column); - rb_exc_raise(parse_error_new(message, line, column)); + rb_exc_raise(parse_error_new(message, line, column, eos)); } -NORETURN(static) void raise_parse_error_at(const char *format, JSON_ParserState *state, const char *at) +NORETURN(static) void raise_parse_error_at(const char *format, JSON_ParserState *state, const char *at, bool eos) { state->cursor = at; - raise_parse_error(format, state); + raise_parse_error(format, state, eos); } /* unicode */ @@ -686,7 +689,7 @@ static const signed char digit_values[256] = { static uint32_t unescape_unicode(JSON_ParserState *state, const char *sp, const char *spe) { if (RB_UNLIKELY(sp > spe - 4)) { - raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2); + raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2, true); } const 
unsigned char *p = (const unsigned char *)sp; @@ -697,7 +700,7 @@ static uint32_t unescape_unicode(JSON_ParserState *state, const char *sp, const const signed char b3 = digit_values[p[3]]; if (RB_UNLIKELY((signed char)(b0 | b1 | b2 | b3) < 0)) { - raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2); + raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2, false); } return ((uint32_t)b0 << 12) | ((uint32_t)b1 << 8) | ((uint32_t)b2 << 4) | (uint32_t)b3; @@ -731,7 +734,7 @@ json_eat_comments(JSON_ParserState *state) while (true) { const char *next_match = memchr(state->cursor, '*', state->end - state->cursor); if (!next_match) { - raise_parse_error_at("unterminated comment, expected closing '*/'", state, start); + raise_parse_error_at("unterminated comment, expected closing '*/'", state, start, true); } state->cursor = next_match + 1; @@ -743,7 +746,7 @@ json_eat_comments(JSON_ParserState *state) break; } default: - raise_parse_error_at("unexpected token %s", state, start); + raise_parse_error_at("unexpected token %s", state, start, eos(state)); break; } } @@ -934,13 +937,13 @@ NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_Parser uint32_t sur = unescape_unicode(state, pe + 2, stringEnd); if (RB_UNLIKELY((sur & 0xFC00) != 0xDC00)) { - raise_parse_error_at("invalid surrogate pair at %s", state, p); + raise_parse_error_at("invalid surrogate pair at %s", state, p, false); } ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16) | (sur & 0x3FF)); pe += 5; } else { - raise_parse_error_at("incomplete surrogate pair at %s", state, p); + raise_parse_error_at("incomplete surrogate pair at %s", state, p, false); break; } } @@ -956,16 +959,16 @@ NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_Parser if ((unsigned char)*pe < 0x20) { if (!config->allow_control_characters) { if (*pe == '\n') { - raise_parse_error_at("Invalid unescaped newline character 
(\\n) in string: %s", state, pe - 1); + raise_parse_error_at("Invalid unescaped newline character (\\n) in string: %s", state, pe - 1, false); } - raise_parse_error_at("invalid ASCII control character in string: %s", state, pe - 1); + raise_parse_error_at("invalid ASCII control character in string: %s", state, pe - 1, false); } } if (config->allow_invalid_escape) { APPEND_CHAR(*pe); } else { - raise_parse_error_at("invalid escape character in string: %s", state, pe - 1); + raise_parse_error_at("invalid escape character in string: %s", state, pe - 1, false); } break; } @@ -1120,7 +1123,7 @@ NORETURN(static) void raise_duplicate_key_error(JSON_ParserState *state, VALUE d long line, column; cursor_position(state, &line, &column); rb_str_concat(message, build_parse_error_message("", state, line, column)) ; - rb_exc_raise(parse_error_new(message, line, column)); + rb_exc_raise(parse_error_new(message, line, column, false)); } NOINLINE(static) void json_on_duplicate_key(JSON_ParserState *state, JSON_ParserConfig *config, size_t count, const VALUE *pairs) @@ -1256,7 +1259,7 @@ static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfi } default: if (!config->allow_control_characters) { - raise_parse_error("invalid ASCII control character in string: %s", state); + raise_parse_error("invalid ASCII control character in string: %s", state, false); } break; } @@ -1459,7 +1462,7 @@ static inline enum json_frame_phase json_value_completed(json_frame *frame) return frame->phase = (enum json_frame_phase) frame->type; } -ALWAYS_INLINE(static) bool json_match_keyword(JSON_ParserState *state, bool resumable, const char *keyword, size_t offset) +ALWAYS_INLINE(static) void json_match_keyword(JSON_ParserState *state, bool resumable, const char *keyword, size_t offset) { // It is assumed that since `keyword` is always a literal, the compiler is able to constantize this // `strlen` and several other computations in that routine, such as eliminating the `if 
(resumable)` branch. @@ -1470,12 +1473,11 @@ ALWAYS_INLINE(static) bool json_match_keyword(JSON_ParserState *state, bool resu // That's why we sometime compare starting from the first byte and sometimes from the second. if (rest(state) >= len && (memcmp(state->cursor + offset, keyword + offset, len - offset) == 0)) { state->cursor += len; - return true; - } else if (resumable && rest(state) < len && memcmp(state->cursor, keyword, rest(state)) == 0) { - return false; + return; } - raise_parse_error("unexpected token %s", state); + bool eos = rest(state) < len && memcmp(state->cursor, keyword, rest(state)) == 0; + raise_parse_error("unexpected token %s", state, eos); } // Parse an arbitrary JSON value iteratively. This is a state machine driven @@ -1509,47 +1511,37 @@ ALWAYS_INLINE(static) bool json_parse_any(JSON_ParserState *state, JSON_ParserCo VALUE value; switch (peek(state)) { case 'n': - if (json_match_keyword(state, resumable, "null", 0)) { - value = Qnil; - break; - } - return false; + json_match_keyword(state, resumable, "null", 0); + value = Qnil; + break; case 't': - if (json_match_keyword(state, resumable, "true", 0)) { - value = Qtrue; - break; - } - return false; + json_match_keyword(state, resumable, "true", 0); + value = Qtrue; + break; case 'f': - if (json_match_keyword(state, resumable, "false", 1)) { - value = Qfalse; - break; - } - return false; + json_match_keyword(state, resumable, "false", 1); + value = Qfalse; + break; case 'N': if (!config->allow_nan) { - raise_parse_error("unexpected token %s", state); + raise_parse_error("unexpected token %s", state, false); } - if (json_match_keyword(state, resumable, "NaN", 1)) { - value = CNaN; - break; - } - return false; + json_match_keyword(state, resumable, "NaN", 1); + value = CNaN; + break; case 'I': if (!config->allow_nan) { - raise_parse_error("unexpected token %s", state); + raise_parse_error("unexpected token %s", state, false); } - if (json_match_keyword(state, resumable, "Infinity", 0)) { - 
value = CInfinity; - break; - } - return false; + json_match_keyword(state, resumable, "Infinity", 0); + value = CInfinity; + break; case '-': { const char *start = state->cursor; @@ -1559,22 +1551,22 @@ ALWAYS_INLINE(static) bool json_parse_any(JSON_ParserState *state, JSON_ParserCo if (RB_UNLIKELY(UNDEF_P(value))) { if (config->allow_nan && peek(state) == 'I') { - if (json_match_keyword(state, resumable, "Infinity", 0)) { - value = CMinusInfinity; - break; - } state->cursor = start; - return false; + json_match_keyword(state, resumable, "-Infinity", 1); + value = CMinusInfinity; + break; } } + // Top level numbers are ambiguous when parsing streams, we can't + // know if we parsed all the digits if we hit EOS. if (resumable && eos(state)) { state->cursor = start; return false; } if (RB_UNLIKELY(UNDEF_P(value))) { - raise_parse_error_at("invalid number: %s", state, start); + raise_parse_error_at("invalid number: %s", state, start, false); } break; } @@ -1583,13 +1575,15 @@ ALWAYS_INLINE(static) bool json_parse_any(JSON_ParserState *state, JSON_ParserCo const char *start = state->cursor; value = json_parse_positive_number(state, config); + // Top level numbers are ambiguous when parsing streams, we can't + // know if we parsed all the digits if we hit EOS. 
if (resumable && eos(state)) { state->cursor = start; - return false; + raise_parse_error("streamed number", state, true); } if (RB_UNLIKELY(UNDEF_P(value))) { - raise_parse_error_at("invalid number: %s", state, start); + raise_parse_error_at("invalid number: %s", state, start, false); } break; } @@ -1600,12 +1594,11 @@ ALWAYS_INLINE(static) bool json_parse_any(JSON_ParserState *state, JSON_ParserCo value = json_parse_string(state, config, false); if (RB_UNLIKELY(UNDEF_P(value))) { + bool is_eos = eos(state); if (resumable) { state->cursor = start; - return false; - } else { - raise_parse_error("unexpected end of input, expected closing \"", state); } + raise_parse_error("unexpected end of input, expected closing \"", state, is_eos); } break; } @@ -1670,13 +1663,10 @@ ALWAYS_INLINE(static) bool json_parse_any(JSON_ParserState *state, JSON_ParserCo } case 0: - if (resumable) { - return false; - } - raise_parse_error("unexpected end of input", state); + raise_parse_error("unexpected end of input", state, eos(state)); default: - raise_parse_error("unexpected character: %s", state); + raise_parse_error("unexpected character: %s", state, false); } json_push_value(state, config, value); @@ -1707,7 +1697,7 @@ ALWAYS_INLINE(static) bool json_parse_any(JSON_ParserState *state, JSON_ParserCo state->cursor = start; return false; } else { - raise_parse_error("unexpected end of input, expected closing \"", state); + raise_parse_error("unexpected end of input, expected closing \"", state, false); } } json_push_value(state, config, string); @@ -1720,9 +1710,9 @@ ALWAYS_INLINE(static) bool json_parse_any(JSON_ParserState *state, JSON_ParserCo // ',': the first is the only one reached with nothing pushed // for this object yet. 
if (json_frame_entry_count(frame, state->value_stack) == 0) { - raise_parse_error("expected object key, got %s", state); + raise_parse_error("expected object key, got %s", state, false); } else { - raise_parse_error("expected object key, got: %s", state); + raise_parse_error("expected object key, got: %s", state, false); } } JSON_UNREACHABLE_RETURN(false); @@ -1743,9 +1733,9 @@ ALWAYS_INLINE(static) bool json_parse_any(JSON_ParserState *state, JSON_ParserCo // First colon (only the first pair's key is pushed, nothing // else) vs. a later one. if (json_frame_entry_count(frame, state->value_stack) == 1) { - raise_parse_error("expected ':' after object key", state); + raise_parse_error("expected ':' after object key", state, false); } else { - raise_parse_error("expected ':' after object key, got: %s", state); + raise_parse_error("expected ':' after object key, got: %s", state, false); } } JSON_UNREACHABLE_RETURN(false); @@ -1791,7 +1781,7 @@ ALWAYS_INLINE(static) bool json_parse_any(JSON_ParserState *state, JSON_ParserCo } else if (resumable && next_char == 0) { return false; } else { - raise_parse_error("expected ',' or ']' after array value", state); + raise_parse_error("expected ',' or ']' after array value", state, false); } JSON_UNREACHABLE_RETURN(false); } @@ -1842,7 +1832,7 @@ ALWAYS_INLINE(static) bool json_parse_any(JSON_ParserState *state, JSON_ParserCo } else if (resumable && next_char == 0) { return false; } else { - raise_parse_error("expected ',' or '}' after object value, got: %s", state); + raise_parse_error("expected ',' or '}' after object value, got: %s", state, false); } JSON_UNREACHABLE_RETURN(false); } @@ -1854,7 +1844,7 @@ static void json_ensure_eof(JSON_ParserState *state) { json_eat_whitespace(state); if (!eos(state)) { - raise_parse_error("unexpected token at end of stream %s", state); + raise_parse_error("unexpected token at end of stream %s", state, false); } } @@ -2273,6 +2263,17 @@ static VALUE cResumableParser_feed(VALUE self, VALUE 
str) return self; } +struct json_parse_any_args { + JSON_ParserState *state; + JSON_ParserConfig *config; +}; + +static VALUE json_parse_any_resumable_safe(VALUE _args) +{ + struct json_parse_any_args *args = (struct json_parse_any_args *)_args; + return (VALUE)json_parse_any(args->state, args->config, true); +} + // TODO: doc static VALUE cResumableParser_parse(VALUE self) { @@ -2299,7 +2300,20 @@ static VALUE cResumableParser_parse(VALUE self) rvalue_stack_pop(parser->state.value_stack, 1); } - bool complete = json_parse_any(&parser->state, &parser->config, true); + struct json_parse_any_args args = { + .state = &parser->state, + .config = &parser->config, + }; + int status; + bool complete = rb_protect(json_parse_any_resumable_safe, (VALUE)&args, &status); + if (status) { + complete = false; + if (RTEST(rb_ivar_get(rb_errinfo(), rb_intern("@eos")))) { + complete = false; // is an EOS error + } else { + rb_jump_tag(status); // reraise + } + } RB_GC_GUARD(Vsource); return complete ? Qtrue : Qfalse; } From 07bae133621b056c83590538ebc3e74b010d64c1 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Sat, 6 Jun 2026 20:07:10 +0200 Subject: [PATCH 7/7] Implement ResumableParser#rest --- ext/json/ext/parser/parser.c | 17 +++++++++++++++++ test/json/resumable_parser_test.rb | 8 ++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index d2c7a388..44b581e6 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -2331,6 +2331,22 @@ static VALUE cResumableParser_value(VALUE self) } } +// TODO: doc +static VALUE cResumableParser_rest(VALUE self) +{ + JSON_ResumableParser *parser = cResumableParser_get(self); + + if (!parser->buffer) { + return rb_utf8_str_new("", 0); + } + + size_t offset = parser->state.cursor - parser->state.start; + const char *ptr; + long len; + RSTRING_GETMEM(parser->buffer, ptr, len); + return rb_utf8_str_new(ptr + offset, len - offset); +} + void 
Init_parser(void) { #ifdef HAVE_RB_EXT_RACTOR_SAFE @@ -2357,6 +2373,7 @@ void Init_parser(void) rb_define_method(cResumableParser, "<<", cResumableParser_feed, 1); rb_define_method(cResumableParser, "parse", cResumableParser_parse, 0); rb_define_method(cResumableParser, "value", cResumableParser_value, 0); + rb_define_method(cResumableParser, "rest", cResumableParser_rest, 0); CNaN = rb_const_get(mJSON, rb_intern("NaN")); rb_gc_register_mark_object(CNaN); diff --git a/test/json/resumable_parser_test.rb b/test/json/resumable_parser_test.rb index 44cd5c7a..52c63025 100644 --- a/test/json/resumable_parser_test.rb +++ b/test/json/resumable_parser_test.rb @@ -64,8 +64,12 @@ def test_parse_byte_by_byte_numbers # Top values other than arrays and objects are ambiguous. # e.g. if you stream numbers without a separator: 123 followed by 456 is ambiguous with 123456. assert_resumed_parsing('123 ') - # assert_resumed_parsing('123tru') - # assert_resumed_parsing('e') + end + + def test_rest + @parser << '[1, 2, 3, "unterminated string' + refute @parser.parse + assert_equal '"unterminated string', @parser.rest end private