From 87fc2c516bff207f880c71526926842fd8dcc77e Mon Sep 17 00:00:00 2001 From: Josh Haberman Date: Wed, 7 Jan 2015 18:02:09 -0800 Subject: Changes from Google-internal development. * JSON parser expanded to handle split buffers. * bugfix to the protobuf decoder. --- upb/json/parser.c | 1086 +++++++++++++++++++++++++-------------- upb/json/parser.h | 21 +- upb/json/parser.rl | 788 +++++++++++++++++++--------- upb/pb/compile_decoder.c | 17 +- upb/pb/compile_decoder_x64.dasc | 3 + upb/pb/compile_decoder_x64.h | 7 +- upb/pb/decoder.c | 6 +- upb/pb/decoder.int.h | 4 +- upb/upb.c | 6 +- 9 files changed, 1302 insertions(+), 636 deletions(-) (limited to 'upb') diff --git a/upb/json/parser.c b/upb/json/parser.c index fe2b586..cfe1def 100644 --- a/upb/json/parser.c +++ b/upb/json/parser.c @@ -35,6 +35,9 @@ #define PARSER_CHECK_RETURN(x) if (!(x)) return false +// Used to signal that a capture has been suspended. +static char suspend_capture; + static upb_selector_t getsel_for_handlertype(upb_json_parser *p, upb_handlertype_t type) { upb_selector_t sel; @@ -48,41 +51,6 @@ static upb_selector_t parser_getsel(upb_json_parser *p) { p, upb_handlers_getprimitivehandlertype(p->top->f)); } -static void start_member(upb_json_parser *p) { - assert(!p->top->f); - assert(!p->accumulated); - p->accumulated_len = 0; -} - -static bool end_member(upb_json_parser *p) { - // TODO(haberman): support keys that span buffers or have escape sequences. - assert(!p->top->f); - assert(p->accumulated); - const upb_fielddef *f = - upb_msgdef_ntof(p->top->m, p->accumulated, p->accumulated_len); - - if (!f) { - // TODO(haberman): Ignore unknown fields if requested/configured to do so. 
- upb_status_seterrf(p->status, "No such field: %.*s\n", - (int)p->accumulated_len, p->accumulated); - return false; - } - - p->top->f = f; - p->accumulated = NULL; - - return true; -} - -static void start_object(upb_json_parser *p) { - upb_sink_startmsg(&p->top->sink); -} - -static void end_object(upb_json_parser *p) { - upb_status status; - upb_sink_endmsg(&p->top->sink, &status); -} - static bool check_stack(upb_json_parser *p) { if ((p->top + 1) == p->limit) { upb_status_seterrmsg(p->status, "Nesting too deep"); @@ -92,83 +60,28 @@ static bool check_stack(upb_json_parser *p) { return true; } -static bool start_subobject(upb_json_parser *p) { - assert(p->top->f); - - if (!upb_fielddef_issubmsg(p->top->f)) { - upb_status_seterrf(p->status, - "Object specified for non-message/group field: %s", - upb_fielddef_name(p->top->f)); - return false; - } - - if (!check_stack(p)) return false; - - upb_jsonparser_frame *inner = p->top + 1; - - upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STARTSUBMSG); - upb_sink_startsubmsg(&p->top->sink, sel, &inner->sink); - inner->m = upb_fielddef_msgsubdef(p->top->f); - inner->f = NULL; - p->top = inner; +// There are GCC/Clang built-ins for overflow checking which we could start +// using if there was any performance benefit to it. +static bool checked_add(size_t a, size_t b, size_t *c) { + if (SIZE_MAX - a < b) return false; + *c = a + b; return true; } -static void end_subobject(upb_json_parser *p) { - p->top--; - upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSUBMSG); - upb_sink_endsubmsg(&p->top->sink, sel); -} - -static bool start_array(upb_json_parser *p) { - assert(p->top->f); - - if (!upb_fielddef_isseq(p->top->f)) { - upb_status_seterrf(p->status, - "Array specified for non-repeated field: %s", - upb_fielddef_name(p->top->f)); - return false; +static size_t saturating_multiply(size_t a, size_t b) { + // size_t is unsigned, so this is defined behavior even on overflow. 
+ size_t ret = a * b; + if (b != 0 && ret / b != a) { + ret = SIZE_MAX; } - - if (!check_stack(p)) return false; - - upb_jsonparser_frame *inner = p->top + 1; - upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STARTSEQ); - upb_sink_startseq(&p->top->sink, sel, &inner->sink); - inner->m = p->top->m; - inner->f = p->top->f; - p->top = inner; - - return true; + return ret; } -static void end_array(upb_json_parser *p) { - assert(p->top > p->stack); - p->top--; - upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSEQ); - upb_sink_endseq(&p->top->sink, sel); -} +/* Base64 decoding ************************************************************/ -static void clear_member(upb_json_parser *p) { p->top->f = NULL; } - -static bool parser_putbool(upb_json_parser *p, bool val) { - if (upb_fielddef_type(p->top->f) != UPB_TYPE_BOOL) { - upb_status_seterrf(p->status, - "Boolean value specified for non-bool field: %s", - upb_fielddef_name(p->top->f)); - return false; - } - - bool ok = upb_sink_putbool(&p->top->sink, parser_getsel(p), val); - UPB_ASSERT_VAR(ok, ok); - return true; -} - -static void start_text(upb_json_parser *p, const char *ptr) { - p->text_begin = ptr; -} +// TODO(haberman): make this streaming. static const signed char b64table[] = { -1, -1, -1, -1, -1, -1, -1, -1, @@ -288,89 +201,323 @@ badpadding: return false; } -static bool end_text(upb_json_parser *p, const char *ptr, bool is_num) { - assert(!p->accumulated); // TODO: handle this case. - p->accumulated = p->text_begin; - p->accumulated_len = ptr - p->text_begin; - if (p->top->f && upb_fielddef_isstring(p->top->f)) { - // This is a string field (as opposed to a member name). 
- upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STRING); - if (upb_fielddef_type(p->top->f) == UPB_TYPE_BYTES) { - PARSER_CHECK_RETURN(base64_push(p, sel, p->accumulated, - p->accumulated_len)); - } else { - upb_sink_putstring(&p->top->sink, sel, p->accumulated, p->accumulated_len, NULL); +/* Accumulate buffer **********************************************************/ + +// Functionality for accumulating a buffer. +// +// Some parts of the parser need an entire value as a contiguous string. For +// example, to look up a member name in a hash table, or to turn a string into +// a number, the relevant library routines need the input string to be in +// contiguous memory, even if the value spanned two or more buffers in the +// input. These routines handle that. +// +// In the common case we can just point to the input buffer to get this +// contiguous string and avoid any actual copy. So we optimistically begin +// this way. But there are a few cases where we must instead copy into a +// separate buffer: +// +// 1. The string was not contiguous in the input (it spanned buffers). +// +// 2. The string included escape sequences that need to be interpreted to get +// the true value in a contiguous buffer. + +static void assert_accumulate_empty(upb_json_parser *p) { + UPB_UNUSED(p); + assert(p->accumulated == NULL); + assert(p->accumulated_len == 0); +} + +static void accumulate_clear(upb_json_parser *p) { + p->accumulated = NULL; + p->accumulated_len = 0; +} + +// Used internally by accumulate_append(). 
+static bool accumulate_realloc(upb_json_parser *p, size_t need) { + size_t new_size = UPB_MAX(p->accumulate_buf_size, 128); + while (new_size < need) { + new_size = saturating_multiply(new_size, 2); + } + + void *mem = realloc(p->accumulate_buf, new_size); + if (!mem) { + upb_status_seterrmsg(p->status, "Out of memory allocating buffer."); + return false; + } + + p->accumulate_buf = mem; + p->accumulate_buf_size = new_size; + return true; +} + +// Logically appends the given data to the append buffer. +// If "can_alias" is true, we will try to avoid actually copying, but the buffer +// must be valid until the next accumulate_append() call (if any). +static bool accumulate_append(upb_json_parser *p, const char *buf, size_t len, + bool can_alias) { + if (!p->accumulated && can_alias) { + p->accumulated = buf; + p->accumulated_len = len; + return true; + } + + if (p->accumulate_buf_size - p->accumulated_len < len) { + size_t need; + if (!checked_add(p->accumulated_len, len, &need)) { + upb_status_seterrmsg(p->status, "Integer overflow."); + return false; } - p->accumulated = NULL; - } else if (p->top->f && - upb_fielddef_type(p->top->f) == UPB_TYPE_ENUM && - !is_num) { - - // Enum case: resolve enum symbolic name to integer value. 
- const upb_enumdef *enumdef = - (const upb_enumdef*)upb_fielddef_subdef(p->top->f); - - int32_t int_val = 0; - if (upb_enumdef_ntoi(enumdef, p->accumulated, p->accumulated_len, - &int_val)) { - upb_selector_t sel = parser_getsel(p); - upb_sink_putint32(&p->top->sink, sel, int_val); - } else { - upb_status_seterrmsg(p->status, "Enum value name unknown"); + + if (!accumulate_realloc(p, need)) { return false; } - p->accumulated = NULL; } + if (p->accumulated != p->accumulate_buf) { + memcpy(p->accumulate_buf, p->accumulated, p->accumulated_len); + p->accumulated = p->accumulate_buf; + } + + memcpy(p->accumulate_buf + p->accumulated_len, buf, len); + p->accumulated_len += len; return true; } -static bool start_stringval(upb_json_parser *p) { - assert(p->top->f); +// Returns a pointer to the data accumulated since the last accumulate_clear() +// call, and writes the length to *len. This with point either to the input +// buffer or a temporary accumulate buffer. +static const char *accumulate_getptr(upb_json_parser *p, size_t *len) { + assert(p->accumulated); + *len = p->accumulated_len; + return p->accumulated; +} - if (upb_fielddef_isstring(p->top->f)) { - if (!check_stack(p)) return false; - // Start a new parser frame: parser frames correspond one-to-one with - // handler frames, and string events occur in a sub-frame. - upb_jsonparser_frame *inner = p->top + 1; - upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STARTSTR); - upb_sink_startstr(&p->top->sink, sel, 0, &inner->sink); - inner->m = p->top->m; - inner->f = p->top->f; - p->top = inner; +/* Mult-part text data ********************************************************/ + +// When we have text data in the input, it can often come in multiple segments. +// For example, there may be some raw string data followed by an escape +// sequence. The two segments are processed with different logic. Also buffer +// seams in the input can cause multiple segments. 
+// +// As we see segments, there are two main cases for how we want to process them: +// +// 1. we want to push the captured input directly to string handlers. +// +// 2. we need to accumulate all the parts into a contiguous buffer for further +// processing (field name lookup, string->number conversion, etc). + +// This is the set of states for p->multipart_state. +enum { + // We are not currently processing multipart data. + MULTIPART_INACTIVE = 0, + + // We are processing multipart data by accumulating it into a contiguous + // buffer. + MULTIPART_ACCUMULATE = 1, + + // We are processing multipart data by pushing each part directly to the + // current string handlers. + MULTIPART_PUSHEAGERLY = 2 +}; - return true; - } else if (upb_fielddef_type(p->top->f) == UPB_TYPE_ENUM) { - // Do nothing -- symbolic enum names in quotes remain in the - // current parser frame. +// Start a multi-part text value where we accumulate the data for processing at +// the end. +static void multipart_startaccum(upb_json_parser *p) { + assert_accumulate_empty(p); + assert(p->multipart_state == MULTIPART_INACTIVE); + p->multipart_state = MULTIPART_ACCUMULATE; +} + +// Start a multi-part text value where we immediately push text data to a string +// value with the given selector. +static void multipart_start(upb_json_parser *p, upb_selector_t sel) { + assert_accumulate_empty(p); + assert(p->multipart_state == MULTIPART_INACTIVE); + p->multipart_state = MULTIPART_PUSHEAGERLY; + p->string_selector = sel; +} + +static bool multipart_text(upb_json_parser *p, const char *buf, size_t len, + bool can_alias) { + switch (p->multipart_state) { + case MULTIPART_INACTIVE: + upb_status_seterrmsg( + p->status, "Internal error: unexpected state MULTIPART_INACTIVE"); + return false; + + case MULTIPART_ACCUMULATE: + if (!accumulate_append(p, buf, len, can_alias)) { + return false; + } + break; + + case MULTIPART_PUSHEAGERLY: { + const upb_bufhandle *handle = can_alias ? 
p->handle : NULL; + upb_sink_putstring(&p->top->sink, p->string_selector, buf, len, handle); + break; + } + } + + return true; +} + +// Note: this invalidates the accumulate buffer! Call only after reading its +// contents. +static void multipart_end(upb_json_parser *p) { + assert(p->multipart_state != MULTIPART_INACTIVE); + p->multipart_state = MULTIPART_INACTIVE; + accumulate_clear(p); +} + + +/* Input capture **************************************************************/ + +// Functionality for capturing a region of the input as text. Gracefully +// handles the case where a buffer seam occurs in the middle of the captured +// region. + +static void capture_begin(upb_json_parser *p, const char *ptr) { + assert(p->multipart_state != MULTIPART_INACTIVE); + assert(p->capture == NULL); + p->capture = ptr; +} + +static bool capture_end(upb_json_parser *p, const char *ptr) { + assert(p->capture); + if (multipart_text(p, p->capture, ptr - p->capture, true)) { + p->capture = NULL; return true; } else { - upb_status_seterrf(p->status, - "String specified for non-string/non-enum field: %s", - upb_fielddef_name(p->top->f)); return false; } +} +// This is called at the end of each input buffer (ie. when we have hit a +// buffer seam). If we are in the middle of capturing the input, this +// processes the unprocessed capture region. +static void capture_suspend(upb_json_parser *p, const char **ptr) { + if (!p->capture) return; + + if (multipart_text(p, p->capture, *ptr - p->capture, false)) { + // We use this as a signal that we were in the middle of capturing, and + // that capturing should resume at the beginning of the next buffer. + // + // We can't use *ptr here, because we have no guarantee that this pointer + // will be valid when we resume (if the underlying memory is freed, then + // using the pointer at all, even to compare to NULL, is likely undefined + // behavior). 
+ p->capture = &suspend_capture; + } else { + // Need to back up the pointer to the beginning of the capture, since + // we were not able to actually preserve it. + *ptr = p->capture; + } } -static void end_stringval(upb_json_parser *p) { - if (upb_fielddef_isstring(p->top->f)) { - upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSTR); - upb_sink_endstr(&p->top->sink, sel); - p->top--; +static void capture_resume(upb_json_parser *p, const char *ptr) { + if (p->capture) { + assert(p->capture == &suspend_capture); + p->capture = ptr; } } + +/* Callbacks from the parser **************************************************/ + +// These are the functions called directly from the parser itself. +// We define these in the same order as their declarations in the parser. + +static char escape_char(char in) { + switch (in) { + case 'r': return '\r'; + case 't': return '\t'; + case 'n': return '\n'; + case 'f': return '\f'; + case 'b': return '\b'; + case '/': return '/'; + case '"': return '"'; + case '\\': return '\\'; + default: + assert(0); + return 'x'; + } +} + +static bool escape(upb_json_parser *p, const char *ptr) { + char ch = escape_char(*ptr); + return multipart_text(p, &ch, 1, false); +} + +static void start_hex(upb_json_parser *p) { + p->digit = 0; +} + +static void hexdigit(upb_json_parser *p, const char *ptr) { + char ch = *ptr; + + p->digit <<= 4; + + if (ch >= '0' && ch <= '9') { + p->digit += (ch - '0'); + } else if (ch >= 'a' && ch <= 'f') { + p->digit += ((ch - 'a') + 10); + } else { + assert(ch >= 'A' && ch <= 'F'); + p->digit += ((ch - 'A') + 10); + } +} + +static bool end_hex(upb_json_parser *p) { + uint32_t codepoint = p->digit; + + // emit the codepoint as UTF-8. + char utf8[3]; // support \u0000 -- \uFFFF -- need only three bytes. 
+ int length = 0; + if (codepoint <= 0x7F) { + utf8[0] = codepoint; + length = 1; + } else if (codepoint <= 0x07FF) { + utf8[1] = (codepoint & 0x3F) | 0x80; + codepoint >>= 6; + utf8[0] = (codepoint & 0x1F) | 0xC0; + length = 2; + } else /* codepoint <= 0xFFFF */ { + utf8[2] = (codepoint & 0x3F) | 0x80; + codepoint >>= 6; + utf8[1] = (codepoint & 0x3F) | 0x80; + codepoint >>= 6; + utf8[0] = (codepoint & 0x0F) | 0xE0; + length = 3; + } + // TODO(haberman): Handle high surrogates: if codepoint is a high surrogate + // we have to wait for the next escape to get the full code point). + + return multipart_text(p, utf8, length, false); +} + +static void start_text(upb_json_parser *p, const char *ptr) { + capture_begin(p, ptr); +} + +static bool end_text(upb_json_parser *p, const char *ptr) { + return capture_end(p, ptr); +} + static void start_number(upb_json_parser *p, const char *ptr) { - start_text(p, ptr); - assert(p->accumulated == NULL); + multipart_startaccum(p); + capture_begin(p, ptr); } -static void end_number(upb_json_parser *p, const char *ptr) { - end_text(p, ptr, true); - const char *myend = p->accumulated + p->accumulated_len; +static bool end_number(upb_json_parser *p, const char *ptr) { + if (!capture_end(p, ptr)) { + return false; + } + + size_t len; + const char *buf = accumulate_getptr(p, &len); + const char *myend = buf + len; char *end; switch (upb_fielddef_type(p->top->f)) { @@ -378,7 +525,7 @@ static void end_number(upb_json_parser *p, const char *ptr) { case UPB_TYPE_INT32: { long val = strtol(p->accumulated, &end, 0); if (val > INT32_MAX || val < INT32_MIN || errno == ERANGE || end != myend) - assert(false); + goto err; else upb_sink_putint32(&p->top->sink, parser_getsel(p), val); break; @@ -386,7 +533,7 @@ static void end_number(upb_json_parser *p, const char *ptr) { case UPB_TYPE_INT64: { long long val = strtoll(p->accumulated, &end, 0); if (val > INT64_MAX || val < INT64_MIN || errno == ERANGE || end != myend) - assert(false); + goto err; 
else upb_sink_putint64(&p->top->sink, parser_getsel(p), val); break; @@ -394,7 +541,7 @@ static void end_number(upb_json_parser *p, const char *ptr) { case UPB_TYPE_UINT32: { unsigned long val = strtoul(p->accumulated, &end, 0); if (val > UINT32_MAX || errno == ERANGE || end != myend) - assert(false); + goto err; else upb_sink_putuint32(&p->top->sink, parser_getsel(p), val); break; @@ -402,7 +549,7 @@ static void end_number(upb_json_parser *p, const char *ptr) { case UPB_TYPE_UINT64: { unsigned long long val = strtoull(p->accumulated, &end, 0); if (val > UINT64_MAX || errno == ERANGE || end != myend) - assert(false); + goto err; else upb_sink_putuint64(&p->top->sink, parser_getsel(p), val); break; @@ -410,7 +557,7 @@ static void end_number(upb_json_parser *p, const char *ptr) { case UPB_TYPE_DOUBLE: { double val = strtod(p->accumulated, &end); if (errno == ERANGE || end != myend) - assert(false); + goto err; else upb_sink_putdouble(&p->top->sink, parser_getsel(p), val); break; @@ -418,7 +565,7 @@ static void end_number(upb_json_parser *p, const char *ptr) { case UPB_TYPE_FLOAT: { float val = strtof(p->accumulated, &end); if (errno == ERANGE || end != myend) - assert(false); + goto err; else upb_sink_putfloat(&p->top->sink, parser_getsel(p), val); break; @@ -427,230 +574,380 @@ static void end_number(upb_json_parser *p, const char *ptr) { assert(false); } - p->accumulated = NULL; + multipart_end(p); + return true; + +err: + upb_status_seterrf(p->status, "error parsing number: %.*s", buf, len); + multipart_end(p); + return false; } -static char escape_char(char in) { - switch (in) { - case 'r': return '\r'; - case 't': return '\t'; - case 'n': return '\n'; - case 'f': return '\f'; - case 'b': return '\b'; - case '/': return '/'; - case '"': return '"'; - case '\\': return '\\'; +static bool parser_putbool(upb_json_parser *p, bool val) { + if (upb_fielddef_type(p->top->f) != UPB_TYPE_BOOL) { + upb_status_seterrf(p->status, + "Boolean value specified for non-bool 
field: %s", + upb_fielddef_name(p->top->f)); + return false; + } + + bool ok = upb_sink_putbool(&p->top->sink, parser_getsel(p), val); + UPB_ASSERT_VAR(ok, ok); + return true; +} + +static bool start_stringval(upb_json_parser *p) { + assert(p->top->f); + + if (upb_fielddef_isstring(p->top->f)) { + if (!check_stack(p)) return false; + + // Start a new parser frame: parser frames correspond one-to-one with + // handler frames, and string events occur in a sub-frame. + upb_jsonparser_frame *inner = p->top + 1; + upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STARTSTR); + upb_sink_startstr(&p->top->sink, sel, 0, &inner->sink); + inner->m = p->top->m; + inner->f = p->top->f; + p->top = inner; + + if (upb_fielddef_type(p->top->f) == UPB_TYPE_STRING) { + // For STRING fields we push data directly to the handlers as it is + // parsed. We don't do this yet for BYTES fields, because our base64 + // decoder is not streaming. + // + // TODO(haberman): make base64 decoding streaming also. + multipart_start(p, getsel_for_handlertype(p, UPB_HANDLER_STRING)); + return true; + } else { + multipart_startaccum(p); + return true; + } + } else if (upb_fielddef_type(p->top->f) == UPB_TYPE_ENUM) { + // No need to push a frame -- symbolic enum names in quotes remain in the + // current parser frame. + // + // Enum string values must accumulate so we can look up the value in a table + // once it is complete. + multipart_startaccum(p); + return true; + } else { + upb_status_seterrf(p->status, + "String specified for non-string/non-enum field: %s", + upb_fielddef_name(p->top->f)); + return false; + } +} + +static bool end_stringval(upb_json_parser *p) { + bool ok = true; + + switch (upb_fielddef_type(p->top->f)) { + case UPB_TYPE_BYTES: + if (!base64_push(p, getsel_for_handlertype(p, UPB_HANDLER_STRING), + p->accumulated, p->accumulated_len)) { + return false; + } + // Fall through. 
+ + case UPB_TYPE_STRING: { + upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSTR); + upb_sink_endstr(&p->top->sink, sel); + p->top--; + break; + } + + case UPB_TYPE_ENUM: { + // Resolve enum symbolic name to integer value. + const upb_enumdef *enumdef = + (const upb_enumdef*)upb_fielddef_subdef(p->top->f); + + size_t len; + const char *buf = accumulate_getptr(p, &len); + + int32_t int_val = 0; + ok = upb_enumdef_ntoi(enumdef, buf, len, &int_val); + + if (ok) { + upb_selector_t sel = parser_getsel(p); + upb_sink_putint32(&p->top->sink, sel, int_val); + } else { + upb_status_seterrf(p->status, "Enum value unknown: '%.*s'", len, buf); + } + + break; + } + default: - assert(0); - return 'x'; + assert(false); + upb_status_seterrmsg(p->status, "Internal error in JSON decoder"); + ok = false; + break; } + + multipart_end(p); + return ok; } -static void escape(upb_json_parser *p, const char *ptr) { - char ch = escape_char(*ptr); - upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STRING); - upb_sink_putstring(&p->top->sink, sel, &ch, 1, NULL); +static void start_member(upb_json_parser *p) { + assert(!p->top->f); + multipart_startaccum(p); } -static uint8_t hexdigit(char ch) { - if (ch >= '0' && ch <= '9') { - return ch - '0'; - } else if (ch >= 'a' && ch <= 'f') { - return ch - 'a' + 10; - } else { - assert(ch >= 'A' && ch <= 'F'); - return ch - 'A' + 10; +static bool end_member(upb_json_parser *p) { + assert(!p->top->f); + size_t len; + const char *buf = accumulate_getptr(p, &len); + + const upb_fielddef *f = upb_msgdef_ntof(p->top->m, buf, len); + + if (!f) { + // TODO(haberman): Ignore unknown fields if requested/configured to do so. 
+ upb_status_seterrf(p->status, "No such field: %.*s\n", (int)len, buf); + return false; + } + + p->top->f = f; + multipart_end(p); + + return true; +} + +static void clear_member(upb_json_parser *p) { p->top->f = NULL; } + +static bool start_subobject(upb_json_parser *p) { + assert(p->top->f); + + if (!upb_fielddef_issubmsg(p->top->f)) { + upb_status_seterrf(p->status, + "Object specified for non-message/group field: %s", + upb_fielddef_name(p->top->f)); + return false; } + + if (!check_stack(p)) return false; + + upb_jsonparser_frame *inner = p->top + 1; + + upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STARTSUBMSG); + upb_sink_startsubmsg(&p->top->sink, sel, &inner->sink); + inner->m = upb_fielddef_msgsubdef(p->top->f); + inner->f = NULL; + p->top = inner; + + return true; } -static void start_hex(upb_json_parser *p, const char *ptr) { - start_text(p, ptr); +static void end_subobject(upb_json_parser *p) { + p->top--; + upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSUBMSG); + upb_sink_endsubmsg(&p->top->sink, sel); } -static void hex(upb_json_parser *p, const char *end) { - const char *start = p->text_begin; - UPB_ASSERT_VAR(end, end - start == 4); - uint16_t codepoint = - (hexdigit(start[0]) << 12) | - (hexdigit(start[1]) << 8) | - (hexdigit(start[2]) << 4) | - hexdigit(start[3]); - // emit the codepoint as UTF-8. - char utf8[3]; // support \u0000 -- \uFFFF -- need only three bytes. 
- int length = 0; - if (codepoint <= 0x7F) { - utf8[0] = codepoint; - length = 1; - } else if (codepoint <= 0x07FF) { - utf8[1] = (codepoint & 0x3F) | 0x80; - codepoint >>= 6; - utf8[0] = (codepoint & 0x1F) | 0xC0; - length = 2; - } else /* codepoint <= 0xFFFF */ { - utf8[2] = (codepoint & 0x3F) | 0x80; - codepoint >>= 6; - utf8[1] = (codepoint & 0x3F) | 0x80; - codepoint >>= 6; - utf8[0] = (codepoint & 0x0F) | 0xE0; - length = 3; +static bool start_array(upb_json_parser *p) { + assert(p->top->f); + + if (!upb_fielddef_isseq(p->top->f)) { + upb_status_seterrf(p->status, + "Array specified for non-repeated field: %s", + upb_fielddef_name(p->top->f)); + return false; } - // TODO(haberman): Handle high surrogates: if codepoint is a high surrogate - // we have to wait for the next escape to get the full code point). - upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STRING); - upb_sink_putstring(&p->top->sink, sel, utf8, length, NULL); + if (!check_stack(p)) return false; + + upb_jsonparser_frame *inner = p->top + 1; + upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STARTSEQ); + upb_sink_startseq(&p->top->sink, sel, &inner->sink); + inner->m = p->top->m; + inner->f = p->top->f; + p->top = inner; + + return true; } +static void end_array(upb_json_parser *p) { + assert(p->top > p->stack); + + p->top--; + upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSEQ); + upb_sink_endseq(&p->top->sink, sel); +} + +static void start_object(upb_json_parser *p) { + upb_sink_startmsg(&p->top->sink); +} + +static void end_object(upb_json_parser *p) { + upb_status status; + upb_sink_endmsg(&p->top->sink, &status); +} + + #define CHECK_RETURN_TOP(x) if (!(x)) goto error + +/* The actual parser **********************************************************/ + // What follows is the Ragel parser itself. The language is specified in Ragel // and the actions call our C functions above. 
+// +// Ragel has an extensive set of functionality, and we use only a small part of +// it. There are many action types but we only use a few: +// +// ">" -- transition into a machine +// "%" -- transition out of a machine +// "@" -- transition into a final state of a machine. +// +// "@" transitions are tricky because a machine can transition into a final +// state repeatedly. But in some cases we know this can't happen, for example +// a string which is delimited by a final '"' can only transition into its +// final state once, when the closing '"' is seen. + -#line 596 "upb/json/parser.rl" +#line 901 "upb/json/parser.rl" -#line 514 "upb/json/parser.c" +#line 813 "upb/json/parser.c" static const char _json_actions[] = { 0, 1, 0, 1, 2, 1, 3, 1, - 4, 1, 5, 1, 6, 1, 7, 1, - 9, 1, 11, 1, 12, 1, 13, 1, - 14, 1, 15, 1, 16, 1, 24, 1, - 26, 2, 3, 7, 2, 5, 2, 2, - 5, 7, 2, 10, 8, 2, 12, 14, - 2, 13, 14, 2, 17, 1, 2, 18, - 26, 2, 19, 8, 2, 20, 26, 2, - 21, 26, 2, 22, 26, 2, 23, 26, - 2, 25, 26, 3, 13, 10, 8 + 5, 1, 6, 1, 7, 1, 8, 1, + 10, 1, 12, 1, 13, 1, 14, 1, + 15, 1, 16, 1, 17, 1, 21, 1, + 25, 1, 27, 2, 3, 8, 2, 4, + 5, 2, 6, 2, 2, 6, 8, 2, + 11, 9, 2, 13, 15, 2, 14, 15, + 2, 18, 1, 2, 19, 27, 2, 20, + 9, 2, 22, 27, 2, 23, 27, 2, + 24, 27, 2, 26, 27, 3, 14, 11, + 9 }; static const unsigned char _json_key_offsets[] = { - 0, 0, 4, 9, 14, 18, 22, 27, - 32, 37, 41, 45, 48, 51, 53, 57, - 61, 63, 65, 70, 72, 74, 83, 89, - 95, 101, 107, 109, 118, 118, 118, 123, - 128, 133, 133, 134, 135, 136, 137, 137, - 138, 139, 140, 140, 141, 142, 143, 143, - 148, 153, 157, 161, 166, 171, 176, 180, - 180, 183, 183, 183 + 0, 0, 4, 9, 14, 15, 19, 24, + 29, 34, 38, 42, 45, 48, 50, 54, + 58, 60, 62, 67, 69, 71, 80, 86, + 92, 98, 104, 106, 115, 116, 116, 116, + 121, 126, 131, 132, 133, 134, 135, 135, + 136, 137, 138, 138, 139, 140, 141, 141, + 146, 151, 152, 156, 161, 166, 171, 175, + 175, 178, 178, 178 }; static const char _json_trans_keys[] = { 32, 123, 9, 13, 32, 34, 125, 9, - 13, 32, 34, 
125, 9, 13, 32, 58, - 9, 13, 32, 58, 9, 13, 32, 93, - 125, 9, 13, 32, 44, 125, 9, 13, - 32, 44, 125, 9, 13, 32, 34, 9, - 13, 45, 48, 49, 57, 48, 49, 57, - 46, 69, 101, 48, 57, 69, 101, 48, - 57, 43, 45, 48, 57, 48, 57, 48, - 57, 46, 69, 101, 48, 57, 34, 92, - 34, 92, 34, 47, 92, 98, 102, 110, - 114, 116, 117, 48, 57, 65, 70, 97, - 102, 48, 57, 65, 70, 97, 102, 48, - 57, 65, 70, 97, 102, 48, 57, 65, - 70, 97, 102, 34, 92, 34, 45, 91, - 102, 110, 116, 123, 48, 57, 32, 93, - 125, 9, 13, 32, 44, 93, 9, 13, - 32, 93, 125, 9, 13, 97, 108, 115, - 101, 117, 108, 108, 114, 117, 101, 32, - 34, 125, 9, 13, 32, 34, 125, 9, - 13, 32, 58, 9, 13, 32, 58, 9, - 13, 32, 93, 125, 9, 13, 32, 44, - 125, 9, 13, 32, 44, 125, 9, 13, - 32, 34, 9, 13, 32, 9, 13, 0 + 13, 32, 34, 125, 9, 13, 34, 32, + 58, 9, 13, 32, 93, 125, 9, 13, + 32, 44, 125, 9, 13, 32, 44, 125, + 9, 13, 32, 34, 9, 13, 45, 48, + 49, 57, 48, 49, 57, 46, 69, 101, + 48, 57, 69, 101, 48, 57, 43, 45, + 48, 57, 48, 57, 48, 57, 46, 69, + 101, 48, 57, 34, 92, 34, 92, 34, + 47, 92, 98, 102, 110, 114, 116, 117, + 48, 57, 65, 70, 97, 102, 48, 57, + 65, 70, 97, 102, 48, 57, 65, 70, + 97, 102, 48, 57, 65, 70, 97, 102, + 34, 92, 34, 45, 91, 102, 110, 116, + 123, 48, 57, 34, 32, 93, 125, 9, + 13, 32, 44, 93, 9, 13, 32, 93, + 125, 9, 13, 97, 108, 115, 101, 117, + 108, 108, 114, 117, 101, 32, 34, 125, + 9, 13, 32, 34, 125, 9, 13, 34, + 32, 58, 9, 13, 32, 93, 125, 9, + 13, 32, 44, 125, 9, 13, 32, 44, + 125, 9, 13, 32, 34, 9, 13, 32, + 9, 13, 0 }; static const char _json_single_lengths[] = { - 0, 2, 3, 3, 2, 2, 3, 3, + 0, 2, 3, 3, 1, 2, 3, 3, 3, 2, 2, 1, 3, 0, 2, 2, 0, 0, 3, 2, 2, 9, 0, 0, - 0, 0, 2, 7, 0, 0, 3, 3, - 3, 0, 1, 1, 1, 1, 0, 1, + 0, 0, 2, 7, 1, 0, 0, 3, + 3, 3, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 3, - 3, 2, 2, 3, 3, 3, 2, 0, + 3, 1, 2, 3, 3, 3, 2, 0, 1, 0, 0, 0 }; static const char _json_range_lengths[] = { - 0, 1, 1, 1, 1, 1, 1, 1, + 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 3, - 3, 3, 0, 1, 0, 0, 
1, 1, - 1, 0, 0, 0, 0, 0, 0, 0, + 3, 3, 0, 1, 0, 0, 0, 1, + 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, - 1, 1, 1, 1, 1, 1, 1, 0, + 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0 }; static const short _json_index_offsets[] = { - 0, 0, 4, 9, 14, 18, 22, 27, - 32, 37, 41, 45, 48, 52, 54, 58, - 62, 64, 66, 71, 74, 77, 87, 91, - 95, 99, 103, 106, 115, 116, 117, 122, - 127, 132, 133, 135, 137, 139, 141, 142, - 144, 146, 148, 149, 151, 153, 155, 156, - 161, 166, 170, 174, 179, 184, 189, 193, - 194, 197, 198, 199 + 0, 0, 4, 9, 14, 16, 20, 25, + 30, 35, 39, 43, 46, 50, 52, 56, + 60, 62, 64, 69, 72, 75, 85, 89, + 93, 97, 101, 104, 113, 115, 116, 117, + 122, 127, 132, 134, 136, 138, 140, 141, + 143, 145, 147, 148, 150, 152, 154, 155, + 160, 165, 167, 171, 176, 181, 186, 190, + 191, 194, 195, 196 }; static const char _json_indicies[] = { 0, 2, 0, 1, 3, 4, 5, 3, - 1, 6, 7, 8, 6, 1, 9, 10, - 9, 1, 11, 12, 11, 1, 12, 1, - 1, 12, 13, 14, 15, 16, 14, 1, - 17, 18, 8, 17, 1, 18, 7, 18, - 1, 19, 20, 21, 1, 20, 21, 1, - 23, 24, 24, 22, 25, 1, 24, 24, - 25, 22, 26, 26, 27, 1, 27, 1, - 27, 22, 23, 24, 24, 21, 22, 29, - 30, 28, 32, 33, 31, 34, 34, 34, - 34, 34, 34, 34, 34, 35, 1, 36, - 36, 36, 1, 37, 37, 37, 1, 38, - 38, 38, 1, 39, 39, 39, 1, 41, - 42, 40, 43, 44, 45, 46, 47, 48, - 49, 44, 1, 50, 51, 53, 54, 1, + 1, 6, 7, 8, 6, 1, 9, 1, + 10, 11, 10, 1, 11, 1, 1, 11, + 12, 13, 14, 15, 13, 1, 16, 17, + 8, 16, 1, 17, 7, 17, 1, 18, + 19, 20, 1, 19, 20, 1, 22, 23, + 23, 21, 24, 1, 23, 23, 24, 21, + 25, 25, 26, 1, 26, 1, 26, 21, + 22, 23, 23, 20, 21, 28, 29, 27, + 31, 32, 30, 33, 33, 33, 33, 33, + 33, 33, 33, 34, 1, 35, 35, 35, + 1, 36, 36, 36, 1, 37, 37, 37, + 1, 38, 38, 38, 1, 40, 41, 39, + 42, 43, 44, 45, 46, 47, 48, 43, + 1, 49, 1, 50, 51, 53, 54, 1, 53, 52, 55, 56, 54, 55, 1, 56, - 1, 1, 56, 52, 57, 58, 1, 59, - 1, 60, 1, 61, 1, 62, 63, 1, - 64, 1, 65, 1, 66, 67, 1, 68, - 1, 69, 1, 70, 71, 72, 73, 71, - 1, 74, 75, 76, 74, 1, 77, 78, - 77, 1, 79, 80, 79, 1, 80, 1, - 1, 80, 81, 82, 83, 84, 82, 1, 
- 85, 86, 76, 85, 1, 86, 75, 86, - 1, 87, 88, 88, 1, 1, 1, 1, - 0 + 1, 1, 56, 52, 57, 1, 58, 1, + 59, 1, 60, 1, 61, 62, 1, 63, + 1, 64, 1, 65, 66, 1, 67, 1, + 68, 1, 69, 70, 71, 72, 70, 1, + 73, 74, 75, 73, 1, 76, 1, 77, + 78, 77, 1, 78, 1, 1, 78, 79, + 80, 81, 82, 80, 1, 83, 84, 75, + 83, 1, 84, 74, 84, 1, 85, 86, + 86, 1, 1, 1, 1, 0 }; static const char _json_trans_targs[] = { 1, 0, 2, 3, 4, 56, 3, 4, - 56, 5, 6, 5, 6, 7, 8, 9, - 56, 8, 9, 11, 12, 18, 57, 13, - 15, 14, 16, 17, 20, 58, 21, 20, - 58, 21, 19, 22, 23, 24, 25, 26, - 20, 58, 21, 28, 29, 30, 34, 39, - 43, 47, 59, 59, 31, 30, 33, 31, - 32, 59, 35, 36, 37, 38, 59, 40, - 41, 42, 59, 44, 45, 46, 59, 48, - 49, 55, 48, 49, 55, 50, 51, 50, - 51, 52, 53, 54, 55, 53, 54, 59, - 56 + 56, 5, 5, 6, 7, 8, 9, 56, + 8, 9, 11, 12, 18, 57, 13, 15, + 14, 16, 17, 20, 58, 21, 20, 58, + 21, 19, 22, 23, 24, 25, 26, 20, + 58, 21, 28, 30, 31, 34, 39, 43, + 47, 29, 59, 59, 32, 31, 29, 32, + 33, 35, 36, 37, 38, 59, 40, 41, + 42, 59, 44, 45, 46, 59, 48, 49, + 55, 48, 49, 55, 50, 50, 51, 52, + 53, 54, 55, 53, 54, 59, 56 }; static const char _json_trans_actions[] = { - 0, 0, 0, 21, 75, 48, 0, 42, - 23, 17, 17, 0, 0, 15, 19, 19, - 45, 0, 0, 0, 0, 0, 1, 0, - 0, 0, 0, 0, 3, 13, 0, 0, - 33, 5, 11, 0, 7, 0, 0, 0, - 36, 39, 9, 57, 51, 25, 0, 0, - 0, 29, 60, 54, 15, 0, 27, 0, - 0, 31, 0, 0, 0, 0, 66, 0, - 0, 0, 69, 0, 0, 0, 63, 21, - 75, 48, 0, 42, 23, 17, 17, 0, - 0, 15, 19, 19, 45, 0, 0, 72, - 0 + 0, 0, 0, 21, 77, 53, 0, 47, + 23, 17, 0, 0, 15, 19, 19, 50, + 0, 0, 0, 0, 0, 1, 0, 0, + 0, 0, 0, 3, 13, 0, 0, 35, + 5, 11, 0, 38, 7, 7, 7, 41, + 44, 9, 62, 56, 25, 0, 0, 0, + 31, 29, 33, 59, 15, 0, 27, 0, + 0, 0, 0, 0, 0, 68, 0, 0, + 0, 71, 0, 0, 0, 65, 21, 77, + 53, 0, 47, 23, 17, 0, 0, 15, + 19, 19, 50, 0, 0, 74, 0 }; static const int json_start = 1; @@ -663,13 +960,14 @@ static const int json_en_value_machine = 27; static const int json_en_main = 1; -#line 599 "upb/json/parser.rl" +#line 904 "upb/json/parser.rl" size_t parse(void *closure, 
const void *hd, const char *buf, size_t size, const upb_bufhandle *handle) { UPB_UNUSED(hd); UPB_UNUSED(handle); upb_json_parser *parser = closure; + parser->handle = handle; // Variables used by Ragel's generated code. int cs = parser->current_state; @@ -679,8 +977,10 @@ size_t parse(void *closure, const void *hd, const char *buf, size_t size, const char *p = buf; const char *pe = buf + size; + capture_resume(parser, buf); + -#line 684 "upb/json/parser.c" +#line 984 "upb/json/parser.c" { int _klen; unsigned int _trans; @@ -755,114 +1055,118 @@ _match: switch ( *_acts++ ) { case 0: -#line 517 "upb/json/parser.rl" +#line 816 "upb/json/parser.rl" { p--; {cs = stack[--top]; goto _again;} } break; case 1: -#line 518 "upb/json/parser.rl" +#line 817 "upb/json/parser.rl" { p--; {stack[top++] = cs; cs = 10; goto _again;} } break; case 2: -#line 522 "upb/json/parser.rl" +#line 821 "upb/json/parser.rl" { start_text(parser, p); } break; case 3: -#line 523 "upb/json/parser.rl" - { CHECK_RETURN_TOP(end_text(parser, p, false)); } +#line 822 "upb/json/parser.rl" + { CHECK_RETURN_TOP(end_text(parser, p)); } break; case 4: -#line 529 "upb/json/parser.rl" - { start_hex(parser, p); } +#line 828 "upb/json/parser.rl" + { start_hex(parser); } break; case 5: -#line 530 "upb/json/parser.rl" - { hex(parser, p); } +#line 829 "upb/json/parser.rl" + { hexdigit(parser, p); } break; case 6: -#line 536 "upb/json/parser.rl" - { escape(parser, p); } +#line 830 "upb/json/parser.rl" + { CHECK_RETURN_TOP(end_hex(parser)); } break; case 7: -#line 539 "upb/json/parser.rl" - { {cs = stack[--top]; goto _again;} } +#line 836 "upb/json/parser.rl" + { CHECK_RETURN_TOP(escape(parser, p)); } break; case 8: -#line 540 "upb/json/parser.rl" - { {stack[top++] = cs; cs = 19; goto _again;} } +#line 842 "upb/json/parser.rl" + { p--; {cs = stack[--top]; goto _again;} } break; case 9: -#line 542 "upb/json/parser.rl" - { p--; {stack[top++] = cs; cs = 27; goto _again;} } +#line 845 "upb/json/parser.rl" + { {stack[top++] 
= cs; cs = 19; goto _again;} } break; case 10: -#line 547 "upb/json/parser.rl" - { start_member(parser); } +#line 847 "upb/json/parser.rl" + { p--; {stack[top++] = cs; cs = 27; goto _again;} } break; case 11: -#line 548 "upb/json/parser.rl" - { CHECK_RETURN_TOP(end_member(parser)); } +#line 852 "upb/json/parser.rl" + { start_member(parser); } break; case 12: -#line 551 "upb/json/parser.rl" - { clear_member(parser); } +#line 853 "upb/json/parser.rl" + { CHECK_RETURN_TOP(end_member(parser)); } break; case 13: -#line 557 "upb/json/parser.rl" - { start_object(parser); } +#line 856 "upb/json/parser.rl" + { clear_member(parser); } break; case 14: -#line 560 "upb/json/parser.rl" - { end_object(parser); } +#line 862 "upb/json/parser.rl" + { start_object(parser); } break; case 15: -#line 566 "upb/json/parser.rl" - { CHECK_RETURN_TOP(start_array(parser)); } +#line 865 "upb/json/parser.rl" + { end_object(parser); } break; case 16: -#line 570 "upb/json/parser.rl" - { end_array(parser); } +#line 871 "upb/json/parser.rl" + { CHECK_RETURN_TOP(start_array(parser)); } break; case 17: -#line 575 "upb/json/parser.rl" - { start_number(parser, p); } +#line 875 "upb/json/parser.rl" + { end_array(parser); } break; case 18: -#line 576 "upb/json/parser.rl" - { end_number(parser, p); } +#line 880 "upb/json/parser.rl" + { start_number(parser, p); } break; case 19: -#line 578 "upb/json/parser.rl" - { CHECK_RETURN_TOP(start_stringval(parser)); } +#line 881 "upb/json/parser.rl" + { CHECK_RETURN_TOP(end_number(parser, p)); } break; case 20: -#line 579 "upb/json/parser.rl" - { end_stringval(parser); } +#line 883 "upb/json/parser.rl" + { CHECK_RETURN_TOP(start_stringval(parser)); } break; case 21: -#line 581 "upb/json/parser.rl" - { CHECK_RETURN_TOP(parser_putbool(parser, true)); } +#line 884 "upb/json/parser.rl" + { CHECK_RETURN_TOP(end_stringval(parser)); } break; case 22: -#line 583 "upb/json/parser.rl" - { CHECK_RETURN_TOP(parser_putbool(parser, false)); } +#line 886 "upb/json/parser.rl" + { 
CHECK_RETURN_TOP(parser_putbool(parser, true)); } break; case 23: -#line 585 "upb/json/parser.rl" - { /* null value */ } +#line 888 "upb/json/parser.rl" + { CHECK_RETURN_TOP(parser_putbool(parser, false)); } break; case 24: -#line 587 "upb/json/parser.rl" - { CHECK_RETURN_TOP(start_subobject(parser)); } +#line 890 "upb/json/parser.rl" + { /* null value */ } break; case 25: -#line 588 "upb/json/parser.rl" - { end_subobject(parser); } +#line 892 "upb/json/parser.rl" + { CHECK_RETURN_TOP(start_subobject(parser)); } break; case 26: -#line 593 "upb/json/parser.rl" +#line 893 "upb/json/parser.rl" + { end_subobject(parser); } + break; + case 27: +#line 898 "upb/json/parser.rl" { p--; {cs = stack[--top]; goto _again;} } break; -#line 866 "upb/json/parser.c" +#line 1170 "upb/json/parser.c" } } @@ -875,10 +1179,12 @@ _again: _out: {} } -#line 615 "upb/json/parser.rl" +#line 923 "upb/json/parser.rl" if (p != pe) { upb_status_seterrf(parser->status, "Parse error at %s\n", p); + } else { + capture_suspend(parser, &p); } error: @@ -895,8 +1201,13 @@ bool end(void *closure, const void *hd) { return true; } + +/* Public API *****************************************************************/ + void upb_json_parser_init(upb_json_parser *p, upb_status *status) { p->limit = p->stack + UPB_JSON_MAX_DEPTH; + p->accumulate_buf = NULL; + p->accumulate_buf_size = 0; upb_byteshandler_init(&p->input_handler_); upb_byteshandler_setstring(&p->input_handler_, parse, NULL); upb_byteshandler_setendstr(&p->input_handler_, end, NULL); @@ -906,6 +1217,7 @@ void upb_json_parser_init(upb_json_parser *p, upb_status *status) { void upb_json_parser_uninit(upb_json_parser *p) { upb_byteshandler_uninit(&p->input_handler_); + free(p->accumulate_buf); } void upb_json_parser_reset(upb_json_parser *p) { @@ -916,18 +1228,18 @@ void upb_json_parser_reset(upb_json_parser *p) { int top; // Emit Ragel initialization of the parser. 
-#line 920 "upb/json/parser.c" +#line 1232 "upb/json/parser.c" { cs = json_start; top = 0; } -#line 655 "upb/json/parser.rl" +#line 971 "upb/json/parser.rl" p->current_state = cs; p->parser_top = top; - p->text_begin = NULL; - p->accumulated = NULL; - p->accumulated_len = 0; + accumulate_clear(p); + p->multipart_state = MULTIPART_INACTIVE; + p->capture = NULL; } void upb_json_parser_resetoutput(upb_json_parser *p, upb_sink *sink) { diff --git a/upb/json/parser.h b/upb/json/parser.h index a5833e7..51578f2 100644 --- a/upb/json/parser.h +++ b/upb/json/parser.h @@ -69,15 +69,24 @@ UPB_DEFINE_STRUCT0(upb_json_parser, int parser_stack[UPB_JSON_MAX_DEPTH]; int parser_top; - // A pointer to the beginning of whatever text we are currently parsing. - const char *text_begin; + // The handle for the current buffer. + const upb_bufhandle *handle; - // We have to accumulate text for member names, integers, unicode escapes, and - // base64 partial results. + // Accumulate buffer. See details in parser.rl. const char *accumulated; size_t accumulated_len; - // TODO: add members and code for allocating a buffer when necessary (when the - // member spans input buffers or contains escapes). + char *accumulate_buf; + size_t accumulate_buf_size; + + // Multi-part text data. See details in parser.rl. + int multipart_state; + upb_selector_t string_selector; + + // Input capture. See details in parser.rl. + const char *capture; + + // Intermediate result of parsing a unicode escape sequence. + uint32_t digit; )); UPB_BEGIN_EXTERN_C diff --git a/upb/json/parser.rl b/upb/json/parser.rl index abc76c8..b72bc10 100644 --- a/upb/json/parser.rl +++ b/upb/json/parser.rl @@ -33,6 +33,9 @@ #define PARSER_CHECK_RETURN(x) if (!(x)) return false +// Used to signal that a capture has been suspended. 
+static char suspend_capture; + static upb_selector_t getsel_for_handlertype(upb_json_parser *p, upb_handlertype_t type) { upb_selector_t sel; @@ -46,41 +49,6 @@ static upb_selector_t parser_getsel(upb_json_parser *p) { p, upb_handlers_getprimitivehandlertype(p->top->f)); } -static void start_member(upb_json_parser *p) { - assert(!p->top->f); - assert(!p->accumulated); - p->accumulated_len = 0; -} - -static bool end_member(upb_json_parser *p) { - // TODO(haberman): support keys that span buffers or have escape sequences. - assert(!p->top->f); - assert(p->accumulated); - const upb_fielddef *f = - upb_msgdef_ntof(p->top->m, p->accumulated, p->accumulated_len); - - if (!f) { - // TODO(haberman): Ignore unknown fields if requested/configured to do so. - upb_status_seterrf(p->status, "No such field: %.*s\n", - (int)p->accumulated_len, p->accumulated); - return false; - } - - p->top->f = f; - p->accumulated = NULL; - - return true; -} - -static void start_object(upb_json_parser *p) { - upb_sink_startmsg(&p->top->sink); -} - -static void end_object(upb_json_parser *p) { - upb_status status; - upb_sink_endmsg(&p->top->sink, &status); -} - static bool check_stack(upb_json_parser *p) { if ((p->top + 1) == p->limit) { upb_status_seterrmsg(p->status, "Nesting too deep"); @@ -90,83 +58,28 @@ static bool check_stack(upb_json_parser *p) { return true; } -static bool start_subobject(upb_json_parser *p) { - assert(p->top->f); - - if (!upb_fielddef_issubmsg(p->top->f)) { - upb_status_seterrf(p->status, - "Object specified for non-message/group field: %s", - upb_fielddef_name(p->top->f)); - return false; - } - - if (!check_stack(p)) return false; - - upb_jsonparser_frame *inner = p->top + 1; - - upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STARTSUBMSG); - upb_sink_startsubmsg(&p->top->sink, sel, &inner->sink); - inner->m = upb_fielddef_msgsubdef(p->top->f); - inner->f = NULL; - p->top = inner; +// There are GCC/Clang built-ins for overflow checking which we could start 
+// using if there was any performance benefit to it. +static bool checked_add(size_t a, size_t b, size_t *c) { + if (SIZE_MAX - a < b) return false; + *c = a + b; return true; } -static void end_subobject(upb_json_parser *p) { - p->top--; - upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSUBMSG); - upb_sink_endsubmsg(&p->top->sink, sel); -} - -static bool start_array(upb_json_parser *p) { - assert(p->top->f); - - if (!upb_fielddef_isseq(p->top->f)) { - upb_status_seterrf(p->status, - "Array specified for non-repeated field: %s", - upb_fielddef_name(p->top->f)); - return false; +static size_t saturating_multiply(size_t a, size_t b) { + // size_t is unsigned, so this is defined behavior even on overflow. + size_t ret = a * b; + if (b != 0 && ret / b != a) { + ret = SIZE_MAX; } - - if (!check_stack(p)) return false; - - upb_jsonparser_frame *inner = p->top + 1; - upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STARTSEQ); - upb_sink_startseq(&p->top->sink, sel, &inner->sink); - inner->m = p->top->m; - inner->f = p->top->f; - p->top = inner; - - return true; + return ret; } -static void end_array(upb_json_parser *p) { - assert(p->top > p->stack); - p->top--; - upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSEQ); - upb_sink_endseq(&p->top->sink, sel); -} +/* Base64 decoding ************************************************************/ -static void clear_member(upb_json_parser *p) { p->top->f = NULL; } - -static bool parser_putbool(upb_json_parser *p, bool val) { - if (upb_fielddef_type(p->top->f) != UPB_TYPE_BOOL) { - upb_status_seterrf(p->status, - "Boolean value specified for non-bool field: %s", - upb_fielddef_name(p->top->f)); - return false; - } - - bool ok = upb_sink_putbool(&p->top->sink, parser_getsel(p), val); - UPB_ASSERT_VAR(ok, ok); - return true; -} - -static void start_text(upb_json_parser *p, const char *ptr) { - p->text_begin = ptr; -} +// TODO(haberman): make this streaming. 
static const signed char b64table[] = { -1, -1, -1, -1, -1, -1, -1, -1, @@ -286,89 +199,323 @@ badpadding: return false; } -static bool end_text(upb_json_parser *p, const char *ptr, bool is_num) { - assert(!p->accumulated); // TODO: handle this case. - p->accumulated = p->text_begin; - p->accumulated_len = ptr - p->text_begin; - if (p->top->f && upb_fielddef_isstring(p->top->f)) { - // This is a string field (as opposed to a member name). - upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STRING); - if (upb_fielddef_type(p->top->f) == UPB_TYPE_BYTES) { - PARSER_CHECK_RETURN(base64_push(p, sel, p->accumulated, - p->accumulated_len)); - } else { - upb_sink_putstring(&p->top->sink, sel, p->accumulated, p->accumulated_len, NULL); +/* Accumulate buffer **********************************************************/ + +// Functionality for accumulating a buffer. +// +// Some parts of the parser need an entire value as a contiguous string. For +// example, to look up a member name in a hash table, or to turn a string into +// a number, the relevant library routines need the input string to be in +// contiguous memory, even if the value spanned two or more buffers in the +// input. These routines handle that. +// +// In the common case we can just point to the input buffer to get this +// contiguous string and avoid any actual copy. So we optimistically begin +// this way. But there are a few cases where we must instead copy into a +// separate buffer: +// +// 1. The string was not contiguous in the input (it spanned buffers). +// +// 2. The string included escape sequences that need to be interpreted to get +// the true value in a contiguous buffer. + +static void assert_accumulate_empty(upb_json_parser *p) { + UPB_UNUSED(p); + assert(p->accumulated == NULL); + assert(p->accumulated_len == 0); +} + +static void accumulate_clear(upb_json_parser *p) { + p->accumulated = NULL; + p->accumulated_len = 0; +} + +// Used internally by accumulate_append(). 
+static bool accumulate_realloc(upb_json_parser *p, size_t need) { + size_t new_size = UPB_MAX(p->accumulate_buf_size, 128); + while (new_size < need) { + new_size = saturating_multiply(new_size, 2); + } + + void *mem = realloc(p->accumulate_buf, new_size); + if (!mem) { + upb_status_seterrmsg(p->status, "Out of memory allocating buffer."); + return false; + } + + p->accumulate_buf = mem; + p->accumulate_buf_size = new_size; + return true; +} + +// Logically appends the given data to the append buffer. +// If "can_alias" is true, we will try to avoid actually copying, but the buffer +// must be valid until the next accumulate_append() call (if any). +static bool accumulate_append(upb_json_parser *p, const char *buf, size_t len, + bool can_alias) { + if (!p->accumulated && can_alias) { + p->accumulated = buf; + p->accumulated_len = len; + return true; + } + + if (p->accumulate_buf_size - p->accumulated_len < len) { + size_t need; + if (!checked_add(p->accumulated_len, len, &need)) { + upb_status_seterrmsg(p->status, "Integer overflow."); + return false; } - p->accumulated = NULL; - } else if (p->top->f && - upb_fielddef_type(p->top->f) == UPB_TYPE_ENUM && - !is_num) { - - // Enum case: resolve enum symbolic name to integer value. 
- const upb_enumdef *enumdef = - (const upb_enumdef*)upb_fielddef_subdef(p->top->f); - - int32_t int_val = 0; - if (upb_enumdef_ntoi(enumdef, p->accumulated, p->accumulated_len, - &int_val)) { - upb_selector_t sel = parser_getsel(p); - upb_sink_putint32(&p->top->sink, sel, int_val); - } else { - upb_status_seterrmsg(p->status, "Enum value name unknown"); + + if (!accumulate_realloc(p, need)) { return false; } - p->accumulated = NULL; } + if (p->accumulated != p->accumulate_buf) { + memcpy(p->accumulate_buf, p->accumulated, p->accumulated_len); + p->accumulated = p->accumulate_buf; + } + + memcpy(p->accumulate_buf + p->accumulated_len, buf, len); + p->accumulated_len += len; return true; } -static bool start_stringval(upb_json_parser *p) { - assert(p->top->f); +// Returns a pointer to the data accumulated since the last accumulate_clear() +// call, and writes the length to *len. This will point either to the input +// buffer or a temporary accumulate buffer. +static const char *accumulate_getptr(upb_json_parser *p, size_t *len) { + assert(p->accumulated); + *len = p->accumulated_len; + return p->accumulated; +} - if (upb_fielddef_isstring(p->top->f)) { - if (!check_stack(p)) return false; - // Start a new parser frame: parser frames correspond one-to-one with - // handler frames, and string events occur in a sub-frame. - upb_jsonparser_frame *inner = p->top + 1; - upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STARTSTR); - upb_sink_startstr(&p->top->sink, sel, 0, &inner->sink); - inner->m = p->top->m; - inner->f = p->top->f; - p->top = inner; +/* Multi-part text data *******************************************************/ + +// When we have text data in the input, it can often come in multiple segments. +// For example, there may be some raw string data followed by an escape +// sequence. The two segments are processed with different logic. Also buffer +// seams in the input can cause multiple segments. 
+// +// As we see segments, there are two main cases for how we want to process them: +// +// 1. we want to push the captured input directly to string handlers. +// +// 2. we need to accumulate all the parts into a contiguous buffer for further +// processing (field name lookup, string->number conversion, etc). + +// This is the set of states for p->multipart_state. +enum { + // We are not currently processing multipart data. + MULTIPART_INACTIVE = 0, + + // We are processing multipart data by accumulating it into a contiguous + // buffer. + MULTIPART_ACCUMULATE = 1, + + // We are processing multipart data by pushing each part directly to the + // current string handlers. + MULTIPART_PUSHEAGERLY = 2 +}; - return true; - } else if (upb_fielddef_type(p->top->f) == UPB_TYPE_ENUM) { - // Do nothing -- symbolic enum names in quotes remain in the - // current parser frame. +// Start a multi-part text value where we accumulate the data for processing at +// the end. +static void multipart_startaccum(upb_json_parser *p) { + assert_accumulate_empty(p); + assert(p->multipart_state == MULTIPART_INACTIVE); + p->multipart_state = MULTIPART_ACCUMULATE; +} + +// Start a multi-part text value where we immediately push text data to a string +// value with the given selector. +static void multipart_start(upb_json_parser *p, upb_selector_t sel) { + assert_accumulate_empty(p); + assert(p->multipart_state == MULTIPART_INACTIVE); + p->multipart_state = MULTIPART_PUSHEAGERLY; + p->string_selector = sel; +} + +static bool multipart_text(upb_json_parser *p, const char *buf, size_t len, + bool can_alias) { + switch (p->multipart_state) { + case MULTIPART_INACTIVE: + upb_status_seterrmsg( + p->status, "Internal error: unexpected state MULTIPART_INACTIVE"); + return false; + + case MULTIPART_ACCUMULATE: + if (!accumulate_append(p, buf, len, can_alias)) { + return false; + } + break; + + case MULTIPART_PUSHEAGERLY: { + const upb_bufhandle *handle = can_alias ? 
p->handle : NULL; + upb_sink_putstring(&p->top->sink, p->string_selector, buf, len, handle); + break; + } + } + + return true; +} + +// Note: this invalidates the accumulate buffer! Call only after reading its +// contents. +static void multipart_end(upb_json_parser *p) { + assert(p->multipart_state != MULTIPART_INACTIVE); + p->multipart_state = MULTIPART_INACTIVE; + accumulate_clear(p); +} + + +/* Input capture **************************************************************/ + +// Functionality for capturing a region of the input as text. Gracefully +// handles the case where a buffer seam occurs in the middle of the captured +// region. + +static void capture_begin(upb_json_parser *p, const char *ptr) { + assert(p->multipart_state != MULTIPART_INACTIVE); + assert(p->capture == NULL); + p->capture = ptr; +} + +static bool capture_end(upb_json_parser *p, const char *ptr) { + assert(p->capture); + if (multipart_text(p, p->capture, ptr - p->capture, true)) { + p->capture = NULL; return true; } else { - upb_status_seterrf(p->status, - "String specified for non-string/non-enum field: %s", - upb_fielddef_name(p->top->f)); return false; } +} +// This is called at the end of each input buffer (i.e. when we have hit a +// buffer seam). If we are in the middle of capturing the input, this +// processes the unprocessed capture region. +static void capture_suspend(upb_json_parser *p, const char **ptr) { + if (!p->capture) return; + + if (multipart_text(p, p->capture, *ptr - p->capture, false)) { + // We use this as a signal that we were in the middle of capturing, and + // that capturing should resume at the beginning of the next buffer. + // + // We can't use *ptr here, because we have no guarantee that this pointer + // will be valid when we resume (if the underlying memory is freed, then + // using the pointer at all, even to compare to NULL, is likely undefined + // behavior). 
+ p->capture = &suspend_capture; + } else { + // Need to back up the pointer to the beginning of the capture, since + // we were not able to actually preserve it. + *ptr = p->capture; + } } -static void end_stringval(upb_json_parser *p) { - if (upb_fielddef_isstring(p->top->f)) { - upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSTR); - upb_sink_endstr(&p->top->sink, sel); - p->top--; +static void capture_resume(upb_json_parser *p, const char *ptr) { + if (p->capture) { + assert(p->capture == &suspend_capture); + p->capture = ptr; + } +} + + +/* Callbacks from the parser **************************************************/ + +// These are the functions called directly from the parser itself. +// We define these in the same order as their declarations in the parser. + +static char escape_char(char in) { + switch (in) { + case 'r': return '\r'; + case 't': return '\t'; + case 'n': return '\n'; + case 'f': return '\f'; + case 'b': return '\b'; + case '/': return '/'; + case '"': return '"'; + case '\\': return '\\'; + default: + assert(0); + return 'x'; + } +} + +static bool escape(upb_json_parser *p, const char *ptr) { + char ch = escape_char(*ptr); + return multipart_text(p, &ch, 1, false); +} + +static void start_hex(upb_json_parser *p) { + p->digit = 0; +} + +static void hexdigit(upb_json_parser *p, const char *ptr) { + char ch = *ptr; + + p->digit <<= 4; + + if (ch >= '0' && ch <= '9') { + p->digit += (ch - '0'); + } else if (ch >= 'a' && ch <= 'f') { + p->digit += ((ch - 'a') + 10); + } else { + assert(ch >= 'A' && ch <= 'F'); + p->digit += ((ch - 'A') + 10); } } +static bool end_hex(upb_json_parser *p) { + uint32_t codepoint = p->digit; + + // emit the codepoint as UTF-8. + char utf8[3]; // support \u0000 -- \uFFFF -- need only three bytes. 
+ int length = 0; + if (codepoint <= 0x7F) { + utf8[0] = codepoint; + length = 1; + } else if (codepoint <= 0x07FF) { + utf8[1] = (codepoint & 0x3F) | 0x80; + codepoint >>= 6; + utf8[0] = (codepoint & 0x1F) | 0xC0; + length = 2; + } else /* codepoint <= 0xFFFF */ { + utf8[2] = (codepoint & 0x3F) | 0x80; + codepoint >>= 6; + utf8[1] = (codepoint & 0x3F) | 0x80; + codepoint >>= 6; + utf8[0] = (codepoint & 0x0F) | 0xE0; + length = 3; + } + // TODO(haberman): Handle high surrogates: if codepoint is a high surrogate + // we have to wait for the next escape to get the full code point). + + return multipart_text(p, utf8, length, false); +} + +static void start_text(upb_json_parser *p, const char *ptr) { + capture_begin(p, ptr); +} + +static bool end_text(upb_json_parser *p, const char *ptr) { + return capture_end(p, ptr); +} + static void start_number(upb_json_parser *p, const char *ptr) { - start_text(p, ptr); - assert(p->accumulated == NULL); + multipart_startaccum(p); + capture_begin(p, ptr); } -static void end_number(upb_json_parser *p, const char *ptr) { - end_text(p, ptr, true); - const char *myend = p->accumulated + p->accumulated_len; +static bool end_number(upb_json_parser *p, const char *ptr) { + if (!capture_end(p, ptr)) { + return false; + } + + size_t len; + const char *buf = accumulate_getptr(p, &len); + const char *myend = buf + len; char *end; switch (upb_fielddef_type(p->top->f)) { @@ -376,7 +523,7 @@ static void end_number(upb_json_parser *p, const char *ptr) { case UPB_TYPE_INT32: { long val = strtol(p->accumulated, &end, 0); if (val > INT32_MAX || val < INT32_MIN || errno == ERANGE || end != myend) - assert(false); + goto err; else upb_sink_putint32(&p->top->sink, parser_getsel(p), val); break; @@ -384,7 +531,7 @@ static void end_number(upb_json_parser *p, const char *ptr) { case UPB_TYPE_INT64: { long long val = strtoll(p->accumulated, &end, 0); if (val > INT64_MAX || val < INT64_MIN || errno == ERANGE || end != myend) - assert(false); + goto err; 
else upb_sink_putint64(&p->top->sink, parser_getsel(p), val); break; @@ -392,7 +539,7 @@ static void end_number(upb_json_parser *p, const char *ptr) { case UPB_TYPE_UINT32: { unsigned long val = strtoul(p->accumulated, &end, 0); if (val > UINT32_MAX || errno == ERANGE || end != myend) - assert(false); + goto err; else upb_sink_putuint32(&p->top->sink, parser_getsel(p), val); break; @@ -400,7 +547,7 @@ static void end_number(upb_json_parser *p, const char *ptr) { case UPB_TYPE_UINT64: { unsigned long long val = strtoull(p->accumulated, &end, 0); if (val > UINT64_MAX || errno == ERANGE || end != myend) - assert(false); + goto err; else upb_sink_putuint64(&p->top->sink, parser_getsel(p), val); break; @@ -408,7 +555,7 @@ static void end_number(upb_json_parser *p, const char *ptr) { case UPB_TYPE_DOUBLE: { double val = strtod(p->accumulated, &end); if (errno == ERANGE || end != myend) - assert(false); + goto err; else upb_sink_putdouble(&p->top->sink, parser_getsel(p), val); break; @@ -416,7 +563,7 @@ static void end_number(upb_json_parser *p, const char *ptr) { case UPB_TYPE_FLOAT: { float val = strtof(p->accumulated, &end); if (errno == ERANGE || end != myend) - assert(false); + goto err; else upb_sink_putfloat(&p->top->sink, parser_getsel(p), val); break; @@ -425,84 +572,236 @@ static void end_number(upb_json_parser *p, const char *ptr) { assert(false); } - p->accumulated = NULL; + multipart_end(p); + return true; + +err: + upb_status_seterrf(p->status, "error parsing number: %.*s", buf, len); + multipart_end(p); + return false; } -static char escape_char(char in) { - switch (in) { - case 'r': return '\r'; - case 't': return '\t'; - case 'n': return '\n'; - case 'f': return '\f'; - case 'b': return '\b'; - case '/': return '/'; - case '"': return '"'; - case '\\': return '\\'; +static bool parser_putbool(upb_json_parser *p, bool val) { + if (upb_fielddef_type(p->top->f) != UPB_TYPE_BOOL) { + upb_status_seterrf(p->status, + "Boolean value specified for non-bool field: 
%s", + upb_fielddef_name(p->top->f)); + return false; + } + + bool ok = upb_sink_putbool(&p->top->sink, parser_getsel(p), val); + UPB_ASSERT_VAR(ok, ok); + return true; +} + +static bool start_stringval(upb_json_parser *p) { + assert(p->top->f); + + if (upb_fielddef_isstring(p->top->f)) { + if (!check_stack(p)) return false; + + // Start a new parser frame: parser frames correspond one-to-one with + // handler frames, and string events occur in a sub-frame. + upb_jsonparser_frame *inner = p->top + 1; + upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STARTSTR); + upb_sink_startstr(&p->top->sink, sel, 0, &inner->sink); + inner->m = p->top->m; + inner->f = p->top->f; + p->top = inner; + + if (upb_fielddef_type(p->top->f) == UPB_TYPE_STRING) { + // For STRING fields we push data directly to the handlers as it is + // parsed. We don't do this yet for BYTES fields, because our base64 + // decoder is not streaming. + // + // TODO(haberman): make base64 decoding streaming also. + multipart_start(p, getsel_for_handlertype(p, UPB_HANDLER_STRING)); + return true; + } else { + multipart_startaccum(p); + return true; + } + } else if (upb_fielddef_type(p->top->f) == UPB_TYPE_ENUM) { + // No need to push a frame -- symbolic enum names in quotes remain in the + // current parser frame. + // + // Enum string values must accumulate so we can look up the value in a table + // once it is complete. + multipart_startaccum(p); + return true; + } else { + upb_status_seterrf(p->status, + "String specified for non-string/non-enum field: %s", + upb_fielddef_name(p->top->f)); + return false; + } +} + +static bool end_stringval(upb_json_parser *p) { + bool ok = true; + + switch (upb_fielddef_type(p->top->f)) { + case UPB_TYPE_BYTES: + if (!base64_push(p, getsel_for_handlertype(p, UPB_HANDLER_STRING), + p->accumulated, p->accumulated_len)) { + return false; + } + // Fall through. 
+ + case UPB_TYPE_STRING: { + upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSTR); + upb_sink_endstr(&p->top->sink, sel); + p->top--; + break; + } + + case UPB_TYPE_ENUM: { + // Resolve enum symbolic name to integer value. + const upb_enumdef *enumdef = + (const upb_enumdef*)upb_fielddef_subdef(p->top->f); + + size_t len; + const char *buf = accumulate_getptr(p, &len); + + int32_t int_val = 0; + ok = upb_enumdef_ntoi(enumdef, buf, len, &int_val); + + if (ok) { + upb_selector_t sel = parser_getsel(p); + upb_sink_putint32(&p->top->sink, sel, int_val); + } else { + upb_status_seterrf(p->status, "Enum value unknown: '%.*s'", len, buf); + } + + break; + } + default: - assert(0); - return 'x'; + assert(false); + upb_status_seterrmsg(p->status, "Internal error in JSON decoder"); + ok = false; + break; } + + multipart_end(p); + return ok; } -static void escape(upb_json_parser *p, const char *ptr) { - char ch = escape_char(*ptr); - upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STRING); - upb_sink_putstring(&p->top->sink, sel, &ch, 1, NULL); +static void start_member(upb_json_parser *p) { + assert(!p->top->f); + multipart_startaccum(p); } -static uint8_t hexdigit(char ch) { - if (ch >= '0' && ch <= '9') { - return ch - '0'; - } else if (ch >= 'a' && ch <= 'f') { - return ch - 'a' + 10; - } else { - assert(ch >= 'A' && ch <= 'F'); - return ch - 'A' + 10; +static bool end_member(upb_json_parser *p) { + assert(!p->top->f); + size_t len; + const char *buf = accumulate_getptr(p, &len); + + const upb_fielddef *f = upb_msgdef_ntof(p->top->m, buf, len); + + if (!f) { + // TODO(haberman): Ignore unknown fields if requested/configured to do so. 
+ upb_status_seterrf(p->status, "No such field: %.*s\n", (int)len, buf); + return false; } + + p->top->f = f; + multipart_end(p); + + return true; } -static void start_hex(upb_json_parser *p, const char *ptr) { - start_text(p, ptr); +static void clear_member(upb_json_parser *p) { p->top->f = NULL; } + +static bool start_subobject(upb_json_parser *p) { + assert(p->top->f); + + if (!upb_fielddef_issubmsg(p->top->f)) { + upb_status_seterrf(p->status, + "Object specified for non-message/group field: %s", + upb_fielddef_name(p->top->f)); + return false; + } + + if (!check_stack(p)) return false; + + upb_jsonparser_frame *inner = p->top + 1; + + upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STARTSUBMSG); + upb_sink_startsubmsg(&p->top->sink, sel, &inner->sink); + inner->m = upb_fielddef_msgsubdef(p->top->f); + inner->f = NULL; + p->top = inner; + + return true; } -static void hex(upb_json_parser *p, const char *end) { - const char *start = p->text_begin; - UPB_ASSERT_VAR(end, end - start == 4); - uint16_t codepoint = - (hexdigit(start[0]) << 12) | - (hexdigit(start[1]) << 8) | - (hexdigit(start[2]) << 4) | - hexdigit(start[3]); - // emit the codepoint as UTF-8. - char utf8[3]; // support \u0000 -- \uFFFF -- need only three bytes. 
- int length = 0; - if (codepoint <= 0x7F) { - utf8[0] = codepoint; - length = 1; - } else if (codepoint <= 0x07FF) { - utf8[1] = (codepoint & 0x3F) | 0x80; - codepoint >>= 6; - utf8[0] = (codepoint & 0x1F) | 0xC0; - length = 2; - } else /* codepoint <= 0xFFFF */ { - utf8[2] = (codepoint & 0x3F) | 0x80; - codepoint >>= 6; - utf8[1] = (codepoint & 0x3F) | 0x80; - codepoint >>= 6; - utf8[0] = (codepoint & 0x0F) | 0xE0; - length = 3; +static void end_subobject(upb_json_parser *p) { + p->top--; + upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSUBMSG); + upb_sink_endsubmsg(&p->top->sink, sel); +} + +static bool start_array(upb_json_parser *p) { + assert(p->top->f); + + if (!upb_fielddef_isseq(p->top->f)) { + upb_status_seterrf(p->status, + "Array specified for non-repeated field: %s", + upb_fielddef_name(p->top->f)); + return false; } - // TODO(haberman): Handle high surrogates: if codepoint is a high surrogate - // we have to wait for the next escape to get the full code point). 
- upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STRING); - upb_sink_putstring(&p->top->sink, sel, utf8, length, NULL); + if (!check_stack(p)) return false; + + upb_jsonparser_frame *inner = p->top + 1; + upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STARTSEQ); + upb_sink_startseq(&p->top->sink, sel, &inner->sink); + inner->m = p->top->m; + inner->f = p->top->f; + p->top = inner; + + return true; +} + +static void end_array(upb_json_parser *p) { + assert(p->top > p->stack); + + p->top--; + upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSEQ); + upb_sink_endseq(&p->top->sink, sel); +} + +static void start_object(upb_json_parser *p) { + upb_sink_startmsg(&p->top->sink); +} + +static void end_object(upb_json_parser *p) { + upb_status status; + upb_sink_endmsg(&p->top->sink, &status); } + #define CHECK_RETURN_TOP(x) if (!(x)) goto error + +/* The actual parser **********************************************************/ + // What follows is the Ragel parser itself. The language is specified in Ragel // and the actions call our C functions above. +// +// Ragel has an extensive set of functionality, and we use only a small part of +// it. There are many action types but we only use a few: +// +// ">" -- transition into a machine +// "%" -- transition out of a machine +// "@" -- transition into a final state of a machine. +// +// "@" transitions are tricky because a machine can transition into a final +// state repeatedly. But in some cases we know this can't happen, for example +// a string which is delimited by a final '"' can only transition into its +// final state once, when the closing '"' is seen. 
+ %%{ machine json; @@ -520,24 +819,30 @@ static void hex(upb_json_parser *p, const char *end) { text = /[^\\"]/+ >{ start_text(parser, p); } - %{ CHECK_RETURN_TOP(end_text(parser, p, false)); } + %{ CHECK_RETURN_TOP(end_text(parser, p)); } ; unicode_char = "\\u" /[0-9A-Fa-f]/{4} - >{ start_hex(parser, p); } - %{ hex(parser, p); } + >{ start_hex(parser); } + ${ hexdigit(parser, p); } + %{ CHECK_RETURN_TOP(end_hex(parser)); } ; escape_char = "\\" /[rtbfn"\/\\]/ - >{ escape(parser, p); } + >{ CHECK_RETURN_TOP(escape(parser, p)); } + ; + + string_machine := + (text | unicode_char | escape_char)** + '"' + @{ fhold; fret; } ; - string_machine := (text | unicode_char | escape_char)** '"' @{ fret; } ; - string = '"' @{ fcall string_machine; }; + string = '"' @{ fcall string_machine; } '"'; value2 = ^(space | "]" | "}") >{ fhold; fcall value_machine; } ; @@ -545,7 +850,7 @@ static void hex(upb_json_parser *p, const char *end) { ws string >{ start_member(parser); } - %{ CHECK_RETURN_TOP(end_member(parser)); } + @{ CHECK_RETURN_TOP(end_member(parser)); } ws ":" ws value2 %{ clear_member(parser); } @@ -573,10 +878,10 @@ static void hex(upb_json_parser *p, const char *end) { value = number >{ start_number(parser, p); } - %{ end_number(parser, p); } + %{ CHECK_RETURN_TOP(end_number(parser, p)); } | string >{ CHECK_RETURN_TOP(start_stringval(parser)); } - %{ end_stringval(parser); } + @{ CHECK_RETURN_TOP(end_stringval(parser)); } | "true" %{ CHECK_RETURN_TOP(parser_putbool(parser, true)); } | "false" @@ -602,6 +907,7 @@ size_t parse(void *closure, const void *hd, const char *buf, size_t size, UPB_UNUSED(hd); UPB_UNUSED(handle); upb_json_parser *parser = closure; + parser->handle = handle; // Variables used by Ragel's generated code. 
int cs = parser->current_state; @@ -611,10 +917,14 @@ size_t parse(void *closure, const void *hd, const char *buf, size_t size, const char *p = buf; const char *pe = buf + size; + capture_resume(parser, buf); + %% write exec; if (p != pe) { upb_status_seterrf(parser->status, "Parse error at %s\n", p); + } else { + capture_suspend(parser, &p); } error: @@ -631,8 +941,13 @@ bool end(void *closure, const void *hd) { return true; } + +/* Public API *****************************************************************/ + void upb_json_parser_init(upb_json_parser *p, upb_status *status) { p->limit = p->stack + UPB_JSON_MAX_DEPTH; + p->accumulate_buf = NULL; + p->accumulate_buf_size = 0; upb_byteshandler_init(&p->input_handler_); upb_byteshandler_setstring(&p->input_handler_, parse, NULL); upb_byteshandler_setendstr(&p->input_handler_, end, NULL); @@ -642,6 +957,7 @@ void upb_json_parser_init(upb_json_parser *p, upb_status *status) { void upb_json_parser_uninit(upb_json_parser *p) { upb_byteshandler_uninit(&p->input_handler_); + free(p->accumulate_buf); } void upb_json_parser_reset(upb_json_parser *p) { @@ -654,9 +970,9 @@ void upb_json_parser_reset(upb_json_parser *p) { %% write init; p->current_state = cs; p->parser_top = top; - p->text_begin = NULL; - p->accumulated = NULL; - p->accumulated_len = 0; + accumulate_clear(p); + p->multipart_state = MULTIPART_INACTIVE; + p->capture = NULL; } void upb_json_parser_resetoutput(upb_json_parser *p, upb_sink *sink) { diff --git a/upb/pb/compile_decoder.c b/upb/pb/compile_decoder.c index 64689f6..11aa4e9 100644 --- a/upb/pb/compile_decoder.c +++ b/upb/pb/compile_decoder.c @@ -302,6 +302,7 @@ static void putop(compiler *c, opcode op, ...) 
{ case OP_SETDELIM: case OP_HALT: case OP_RET: + case OP_DISPATCH: put32(c, op); break; case OP_PARSE_DOUBLE: @@ -382,7 +383,7 @@ const char *upb_pbdecoder_getopname(unsigned int op) { OP(ENDSUBMSG), OP(STARTSTR), OP(STRING), OP(ENDSTR), OP(CALL), OP(RET), OP(PUSHLENDELIM), OP(PUSHTAGDELIM), OP(SETDELIM), OP(CHECKDELIM), OP(BRANCH), OP(TAG1), OP(TAG2), OP(TAGN), OP(SETDISPATCH), OP(POP), - OP(SETBIGGROUPNUM), OP(HALT), + OP(SETBIGGROUPNUM), OP(DISPATCH), OP(HALT), }; return op > OP_HALT ? names[0] : names[op]; #undef OP @@ -414,6 +415,7 @@ static void dumpbc(uint32_t *p, uint32_t *end, FILE *f) { upb_handlers_msgdef(method->dest_handlers_))); break; } + case OP_DISPATCH: case OP_STARTMSG: case OP_ENDMSG: case OP_PUSHLENDELIM: @@ -759,6 +761,7 @@ static void compile_method(compiler *c, upb_pbdecodermethod *method) { putop(c, OP_SETDISPATCH, &method->dispatch); putsel(c, OP_STARTMSG, UPB_STARTMSG_SELECTOR, h); label(c, LABEL_FIELD); + uint32_t* start_pc = c->pc; upb_msg_iter i; for(upb_msg_begin(&i, md); !upb_msg_done(&i); upb_msg_next(&i)) { const upb_fielddef *f = upb_msg_iter_field(&i); @@ -774,8 +777,18 @@ static void compile_method(compiler *c, upb_pbdecodermethod *method) { } } + // If there were no fields, or if no handlers were defined, we need to + // generate a non-empty loop body so that we can at least dispatch for unknown + // fields and check for the end of the message. + if (c->pc == start_pc) { + // Check for end-of-message. + putop(c, OP_CHECKDELIM, LABEL_ENDMSG); + // Unconditionally dispatch. + putop(c, OP_DISPATCH, 0); + } + // For now we just loop back to the last field of the message (or if none, - // the DISPATCH opcode for the message. + // the DISPATCH opcode for the message). putop(c, OP_BRANCH, -LABEL_FIELD); // Insert both a label and a dispatch table entry for this end-of-msg. 
diff --git a/upb/pb/compile_decoder_x64.dasc b/upb/pb/compile_decoder_x64.dasc index bac1ac2..3181cab 100644 --- a/upb/pb/compile_decoder_x64.dasc +++ b/upb/pb/compile_decoder_x64.dasc @@ -1124,6 +1124,9 @@ static void jitbytecode(jitcompiler *jc) { jittag(jc, tag, arg >> 8, (int8_t)arg, method); break; } + case OP_DISPATCH: + | call =>jmptarget(jc, &method->dispatch) + break; case OP_HALT: assert(false); } diff --git a/upb/pb/compile_decoder_x64.h b/upb/pb/compile_decoder_x64.h index ba3636e..ef4459d 100644 --- a/upb/pb/compile_decoder_x64.h +++ b/upb/pb/compile_decoder_x64.h @@ -1680,6 +1680,11 @@ static void jitbytecode(jitcompiler *jc) { jittag(jc, tag, arg >> 8, (int8_t)arg, method); break; } + case OP_DISPATCH: + //| call =>jmptarget(jc, &method->dispatch) + dasm_put(Dst, 2151, jmptarget(jc, &method->dispatch)); +# 1129 "upb/pb/compile_decoder_x64.dasc" + break; case OP_HALT: assert(false); } @@ -1688,5 +1693,5 @@ static void jitbytecode(jitcompiler *jc) { asmlabel(jc, "eof"); //| nop dasm_put(Dst, 1909); -# 1134 "upb/pb/compile_decoder_x64.dasc" +# 1137 "upb/pb/compile_decoder_x64.dasc" } diff --git a/upb/pb/decoder.c b/upb/pb/decoder.c index ec8c03f..04ca413 100644 --- a/upb/pb/decoder.c +++ b/upb/pb/decoder.c @@ -801,6 +801,9 @@ size_t upb_pbdecoder_decode(void *closure, const void *hd, const char *buf, if (result == DECODE_MISMATCH) goto badtag; if (result >= 0) return result; }) + VMCASE(OP_DISPATCH, { + CHECK_RETURN(dispatch(d)); + }) VMCASE(OP_HALT, { return size; }) @@ -859,7 +862,8 @@ bool upb_pbdecoder_end(void *closure, const void *handler_data) { // Rewind from OP_TAG* to OP_CHECKDELIM. 
assert(getop(*d->pc) == OP_TAG1 || getop(*d->pc) == OP_TAG2 || - getop(*d->pc) == OP_TAGN); + getop(*d->pc) == OP_TAGN || + getop(*d->pc) == OP_DISPATCH); d->pc = p; } upb_pbdecoder_decode(closure, handler_data, &dummy, 0, NULL); diff --git a/upb/pb/decoder.int.h b/upb/pb/decoder.int.h index 9b35b70..302701e 100644 --- a/upb/pb/decoder.int.h +++ b/upb/pb/decoder.int.h @@ -66,7 +66,9 @@ typedef enum { // | unused (24) | opc | // | upb_inttable* (32 or 64) | - OP_HALT = 36, // No arg. + OP_DISPATCH = 36, // No arg. + + OP_HALT = 37, // No arg. } opcode; #define OP_MAX OP_HALT diff --git a/upb/upb.c b/upb/upb.c index 52f55d4..c58d307 100644 --- a/upb/upb.c +++ b/upb/upb.c @@ -32,8 +32,10 @@ static void nullz(upb_status *status) { } void upb_status_clear(upb_status *status) { - upb_status blank = UPB_STATUS_INIT; - upb_status_copy(status, &blank); + if (!status) return; + status->ok_ = true; + status->code_ = 0; + status->msg[0] = '\0'; } bool upb_ok(const upb_status *status) { return status->ok_; } -- cgit v1.2.3