From 8f8113b4fff748b57b0ff2f1a301e86b4703be84 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Tue, 9 Dec 2014 12:27:22 -0800 Subject: JSON test, symbolic enum names in JSON, and a few improvements. - Added a JSON test that round-trips (parses then re-serializes) several test messages, ensuring that the re-serialized form matches the original exactly. - Added support for printing and parsing symbolic enum names (rather than integer values) in JSON. - Updated JSON printer to properly handle string fields that come in multiple pieces. ('bytes' fields still do not support this, and this work is more challenging because it requires making the base64 encoder resumable. Base64 encoding is not separable at an input-byte granularity, unlike string escaping.) - Fixed a < vs. <= bug in UTF-8 encoding generation (oops). --- upb/json/parser.c | 139 ++++++++++++++++++++++++++++++++--------------------- upb/json/parser.rl | 69 ++++++++++++++++++-------- upb/json/printer.c | 122 +++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 240 insertions(+), 90 deletions(-) (limited to 'upb/json') diff --git a/upb/json/parser.c b/upb/json/parser.c index 2687713..78fc6c0 100644 --- a/upb/json/parser.c +++ b/upb/json/parser.c @@ -288,7 +288,7 @@ badpadding: return false; } -static bool end_text(upb_json_parser *p, const char *ptr) { +static bool end_text(upb_json_parser *p, const char *ptr, bool is_num) { assert(!p->accumulated); // TODO: handle this case. p->accumulated = p->text_begin; p->accumulated_len = ptr - p->text_begin; @@ -302,6 +302,24 @@ static bool end_text(upb_json_parser *p, const char *ptr) { upb_sink_putstring(&p->top->sink, sel, p->accumulated, p->accumulated_len, NULL); } p->accumulated = NULL; + } else if (p->top->f && + upb_fielddef_type(p->top->f) == UPB_TYPE_ENUM && + !is_num) { + + // Enum case: resolve enum symbolic name to integer value. + const upb_enumdef *enumdef = + (const upb_enumdef*)upb_fielddef_subdef(p->top->f); + + int32_t int_val = 0; + if (upb_enumdef_ntoi(enumdef, p->accumulated, p->accumulated_len, + &int_val)) { + upb_selector_t sel = getsel(p); + upb_sink_putint32(&p->top->sink, sel, int_val); + } else { + upb_status_seterrmsg(p->status, "Enum value name unknown"); + return false; + } + p->accumulated = NULL; } return true; @@ -310,29 +328,38 @@ static bool end_text(upb_json_parser *p, const char *ptr) { static bool start_stringval(upb_json_parser *p) { assert(p->top->f); - if (!upb_fielddef_isstring(p->top->f)) { + if (upb_fielddef_isstring(p->top->f)) { + if (!check_stack(p)) return false; + + // Start a new parser frame: parser frames correspond one-to-one with + // handler frames, and string events occur in a sub-frame. + upb_jsonparser_frame *inner = p->top + 1; + upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STARTSTR); + upb_sink_startstr(&p->top->sink, sel, 0, &inner->sink); + inner->m = p->top->m; + inner->f = p->top->f; + p->top = inner; + + return true; + } else if (upb_fielddef_type(p->top->f) == UPB_TYPE_ENUM) { + // Do nothing -- symbolic enum names in quotes remain in the + // current parser frame. + return true; + } else { upb_status_seterrf(p->status, - "String specified for non-string field: %s", + "String specified for non-string/non-enum field: %s", upb_fielddef_name(p->top->f)); return false; } - if (!check_stack(p)) return false; - - upb_jsonparser_frame *inner = p->top + 1; // TODO: check for overflow. - upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STARTSTR); - upb_sink_startstr(&p->top->sink, sel, 0, &inner->sink); - inner->m = p->top->m; - inner->f = p->top->f; - p->top = inner; - - return true; } static void end_stringval(upb_json_parser *p) { - p->top--; - upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSTR); - upb_sink_endstr(&p->top->sink, sel); + if (upb_fielddef_isstring(p->top->f)) { + upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSTR); + upb_sink_endstr(&p->top->sink, sel); + p->top--; + } } static void start_number(upb_json_parser *p, const char *ptr) { @@ -341,7 +368,7 @@ static void start_number(upb_json_parser *p, const char *ptr) { } static void end_number(upb_json_parser *p, const char *ptr) { - end_text(p, ptr); + end_text(p, ptr, true); const char *myend = p->accumulated + p->accumulated_len; char *end; @@ -450,15 +477,15 @@ static void hex(upb_json_parser *p, const char *end) { // emit the codepoint as UTF-8. char utf8[3]; // support \u0000 -- \uFFFF -- need only three bytes. int length = 0; - if (codepoint < 0x7F) { + if (codepoint <= 0x7F) { utf8[0] = codepoint; length = 1; - } else if (codepoint < 0x07FF) { + } else if (codepoint <= 0x07FF) { utf8[1] = (codepoint & 0x3F) | 0x80; codepoint >>= 6; utf8[0] = (codepoint & 0x1F) | 0xC0; length = 2; - } else /* codepoint < 0xFFFF */ { + } else /* codepoint <= 0xFFFF */ { utf8[2] = (codepoint & 0x3F) | 0x80; codepoint >>= 6; utf8[1] = (codepoint & 0x3F) | 0x80; @@ -478,11 +505,11 @@ static void hex(upb_json_parser *p, const char *end) { // What follows is the Ragel parser itself. The language is specified in Ragel // and the actions call our C functions above. -#line 568 "upb/json/parser.rl" +#line 595 "upb/json/parser.rl" -#line 486 "upb/json/parser.c" +#line 513 "upb/json/parser.c" static const char _json_actions[] = { 0, 1, 0, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7, 1, @@ -635,7 +662,7 @@ static const int json_en_value_machine = 27; static const int json_en_main = 1; -#line 571 "upb/json/parser.rl" +#line 598 "upb/json/parser.rl" size_t parse(void *closure, const void *hd, const char *buf, size_t size, const upb_bufhandle *handle) { @@ -652,7 +679,7 @@ size_t parse(void *closure, const void *hd, const char *buf, size_t size, const char *pe = buf + size; -#line 656 "upb/json/parser.c" +#line 683 "upb/json/parser.c" { int _klen; unsigned int _trans; @@ -727,114 +754,114 @@ _match: switch ( *_acts++ ) { case 0: -#line 489 "upb/json/parser.rl" +#line 516 "upb/json/parser.rl" { p--; {cs = stack[--top]; goto _again;} } break; case 1: -#line 490 "upb/json/parser.rl" +#line 517 "upb/json/parser.rl" { p--; {stack[top++] = cs; cs = 10; goto _again;} } break; case 2: -#line 494 "upb/json/parser.rl" +#line 521 "upb/json/parser.rl" { start_text(parser, p); } break; case 3: -#line 495 "upb/json/parser.rl" - { CHECK_RETURN_TOP(end_text(parser, p)); } +#line 522 "upb/json/parser.rl" + { CHECK_RETURN_TOP(end_text(parser, p, false)); } break; case 4: -#line 501 "upb/json/parser.rl" +#line 528 "upb/json/parser.rl" { start_hex(parser, p); } break; case 5: -#line 502 "upb/json/parser.rl" +#line 529 "upb/json/parser.rl" { hex(parser, p); } break; case 6: -#line 508 "upb/json/parser.rl" +#line 535 "upb/json/parser.rl" { escape(parser, p); } break; case 7: -#line 511 "upb/json/parser.rl" +#line 538 "upb/json/parser.rl" { {cs = stack[--top]; goto _again;} } break; case 8: -#line 512 "upb/json/parser.rl" +#line 539 "upb/json/parser.rl" { {stack[top++] = cs; cs = 19; goto _again;} } break; case 9: -#line 514 "upb/json/parser.rl" +#line 541 "upb/json/parser.rl" { p--; {stack[top++] = cs; cs = 27; goto _again;} } break; case 10: -#line 519 "upb/json/parser.rl" +#line 546 "upb/json/parser.rl" { start_member(parser); } break; case 11: -#line 520 "upb/json/parser.rl" +#line 547 "upb/json/parser.rl" { CHECK_RETURN_TOP(end_member(parser)); } break; case 12: -#line 523 "upb/json/parser.rl" +#line 550 "upb/json/parser.rl" { clear_member(parser); } break; case 13: -#line 529 "upb/json/parser.rl" +#line 556 "upb/json/parser.rl" { start_object(parser); } break; case 14: -#line 532 "upb/json/parser.rl" +#line 559 "upb/json/parser.rl" { end_object(parser); } break; case 15: -#line 538 "upb/json/parser.rl" +#line 565 "upb/json/parser.rl" { CHECK_RETURN_TOP(start_array(parser)); } break; case 16: -#line 542 "upb/json/parser.rl" +#line 569 "upb/json/parser.rl" { end_array(parser); } break; case 17: -#line 547 "upb/json/parser.rl" +#line 574 "upb/json/parser.rl" { start_number(parser, p); } break; case 18: -#line 548 "upb/json/parser.rl" +#line 575 "upb/json/parser.rl" { end_number(parser, p); } break; case 19: -#line 550 "upb/json/parser.rl" +#line 577 "upb/json/parser.rl" { CHECK_RETURN_TOP(start_stringval(parser)); } break; case 20: -#line 551 "upb/json/parser.rl" +#line 578 "upb/json/parser.rl" { end_stringval(parser); } break; case 21: -#line 553 "upb/json/parser.rl" +#line 580 "upb/json/parser.rl" { CHECK_RETURN_TOP(putbool(parser, true)); } break; case 22: -#line 555 "upb/json/parser.rl" +#line 582 "upb/json/parser.rl" { CHECK_RETURN_TOP(putbool(parser, false)); } break; case 23: -#line 557 "upb/json/parser.rl" +#line 584 "upb/json/parser.rl" { /* null value */ } break; case 24: -#line 559 "upb/json/parser.rl" +#line 586 "upb/json/parser.rl" { CHECK_RETURN_TOP(start_subobject(parser)); } break; case 25: -#line 560 "upb/json/parser.rl" +#line 587 "upb/json/parser.rl" { end_subobject(parser); } break; case 26: -#line 565 "upb/json/parser.rl" +#line 592 "upb/json/parser.rl" { p--; {cs = stack[--top]; goto _again;} } break; -#line 838 "upb/json/parser.c" +#line 865 "upb/json/parser.c" } } @@ -847,7 +874,7 @@ _again: _out: {} } -#line 587 "upb/json/parser.rl" +#line 614 "upb/json/parser.rl" if (p != pe) { upb_status_seterrf(parser->status, "Parse error at %s\n", p); @@ -888,13 +915,13 @@ void upb_json_parser_reset(upb_json_parser *p) { int top; // Emit Ragel initialization of the parser. -#line 892 "upb/json/parser.c" +#line 919 "upb/json/parser.c" { cs = json_start; top = 0; } -#line 627 "upb/json/parser.rl" +#line 654 "upb/json/parser.rl" p->current_state = cs; p->parser_top = top; p->text_begin = NULL; diff --git a/upb/json/parser.rl b/upb/json/parser.rl index 92a1566..8ceca77 100644 --- a/upb/json/parser.rl +++ b/upb/json/parser.rl @@ -286,7 +286,7 @@ badpadding: return false; } -static bool end_text(upb_json_parser *p, const char *ptr) { +static bool end_text(upb_json_parser *p, const char *ptr, bool is_num) { assert(!p->accumulated); // TODO: handle this case. p->accumulated = p->text_begin; p->accumulated_len = ptr - p->text_begin; @@ -300,6 +300,24 @@ static bool end_text(upb_json_parser *p, const char *ptr) { upb_sink_putstring(&p->top->sink, sel, p->accumulated, p->accumulated_len, NULL); } p->accumulated = NULL; + } else if (p->top->f && + upb_fielddef_type(p->top->f) == UPB_TYPE_ENUM && + !is_num) { + + // Enum case: resolve enum symbolic name to integer value. + const upb_enumdef *enumdef = + (const upb_enumdef*)upb_fielddef_subdef(p->top->f); + + int32_t int_val = 0; + if (upb_enumdef_ntoi(enumdef, p->accumulated, p->accumulated_len, + &int_val)) { + upb_selector_t sel = getsel(p); + upb_sink_putint32(&p->top->sink, sel, int_val); + } else { + upb_status_seterrmsg(p->status, "Enum value name unknown"); + return false; + } + p->accumulated = NULL; } return true; @@ -308,29 +326,38 @@ static bool end_text(upb_json_parser *p, const char *ptr) { static bool start_stringval(upb_json_parser *p) { assert(p->top->f); - if (!upb_fielddef_isstring(p->top->f)) { + if (upb_fielddef_isstring(p->top->f)) { + if (!check_stack(p)) return false; + + // Start a new parser frame: parser frames correspond one-to-one with + // handler frames, and string events occur in a sub-frame. + upb_jsonparser_frame *inner = p->top + 1; + upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STARTSTR); + upb_sink_startstr(&p->top->sink, sel, 0, &inner->sink); + inner->m = p->top->m; + inner->f = p->top->f; + p->top = inner; + + return true; + } else if (upb_fielddef_type(p->top->f) == UPB_TYPE_ENUM) { + // Do nothing -- symbolic enum names in quotes remain in the + // current parser frame. + return true; + } else { upb_status_seterrf(p->status, - "String specified for non-string field: %s", + "String specified for non-string/non-enum field: %s", upb_fielddef_name(p->top->f)); return false; } - if (!check_stack(p)) return false; - - upb_jsonparser_frame *inner = p->top + 1; // TODO: check for overflow. - upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STARTSTR); - upb_sink_startstr(&p->top->sink, sel, 0, &inner->sink); - inner->m = p->top->m; - inner->f = p->top->f; - p->top = inner; - - return true; } static void end_stringval(upb_json_parser *p) { - p->top--; - upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSTR); - upb_sink_endstr(&p->top->sink, sel); + if (upb_fielddef_isstring(p->top->f)) { + upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSTR); + upb_sink_endstr(&p->top->sink, sel); + p->top--; + } } static void start_number(upb_json_parser *p, const char *ptr) { @@ -339,7 +366,7 @@ static void start_number(upb_json_parser *p, const char *ptr) { } static void end_number(upb_json_parser *p, const char *ptr) { - end_text(p, ptr); + end_text(p, ptr, true); const char *myend = p->accumulated + p->accumulated_len; char *end; @@ -448,15 +475,15 @@ static void hex(upb_json_parser *p, const char *end) { // emit the codepoint as UTF-8. char utf8[3]; // support \u0000 -- \uFFFF -- need only three bytes. int length = 0; - if (codepoint < 0x7F) { + if (codepoint <= 0x7F) { utf8[0] = codepoint; length = 1; - } else if (codepoint < 0x07FF) { + } else if (codepoint <= 0x07FF) { utf8[1] = (codepoint & 0x3F) | 0x80; codepoint >>= 6; utf8[0] = (codepoint & 0x1F) | 0xC0; length = 2; - } else /* codepoint < 0xFFFF */ { + } else /* codepoint <= 0xFFFF */ { utf8[2] = (codepoint & 0x3F) | 0x80; codepoint >>= 6; utf8[1] = (codepoint & 0x3F) | 0x80; @@ -492,7 +519,7 @@ static void hex(upb_json_parser *p, const char *end) { text = /[^\\"]/+ >{ start_text(parser, p); } - %{ CHECK_RETURN_TOP(end_text(parser, p)); } + %{ CHECK_RETURN_TOP(end_text(parser, p, false)); } ; unicode_char = diff --git a/upb/json/printer.c b/upb/json/printer.c index 44e6f83..28f3e4a 100644 --- a/upb/json/printer.c +++ b/upb/json/printer.c @@ -69,10 +69,10 @@ static inline char* json_nice_escape(char c) { } } -// Write a properly quoted and escaped string. +// Write a properly escaped string chunk. The surrounding quotes are *not* +// printed; this is so that the caller has the option of emitting the string +// content in chunks. static void putstring(upb_json_printer *p, const char *buf, unsigned int len) { - print_data(p, "\"", 1); - const char* unescaped_run = NULL; for (unsigned int i = 0; i < len; i++) { char c = buf[i]; @@ -112,8 +112,6 @@ static void putstring(upb_json_printer *p, const char *buf, unsigned int len) { if (unescaped_run) { print_data(p, unescaped_run, &buf[len] - unescaped_run); } - - print_data(p, "\"", 1); } #define CHKLENGTH(x) if (!(x)) return -1; @@ -158,8 +156,9 @@ static bool putkey(void *closure, const void *handler_data) { upb_json_printer *p = closure; const strpc *key = handler_data; print_comma(p); + print_data(p, "\"", 1); putstring(p, key->ptr, key->len); - print_data(p, ":", 1); + print_data(p, "\":", 2); return true; } @@ -200,6 +199,47 @@ TYPE_HANDLERS(uint64_t, fmt_uint64); #undef TYPE_HANDLERS +typedef struct { + void *keyname; + const upb_enumdef *enumdef; +} EnumHandlerData; + +static bool scalar_enum(void *closure, const void *handler_data, + int32_t val) { + const EnumHandlerData *hd = handler_data; + upb_json_printer *p = closure; + CHK(putkey(closure, hd->keyname)); + + const char *symbolic_name = upb_enumdef_iton(hd->enumdef, val); + if (symbolic_name) { + print_data(p, "\"", 1); + putstring(p, symbolic_name, strlen(symbolic_name)); + print_data(p, "\"", 1); + } else { + putint32_t(closure, NULL, val); + } + + return true; +} + +static bool repeated_enum(void *closure, const void *handler_data, + int32_t val) { + const EnumHandlerData *hd = handler_data; + upb_json_printer *p = closure; + print_comma(p); + + const char *symbolic_name = upb_enumdef_iton(hd->enumdef, val); + if (symbolic_name) { + print_data(p, "\"", 1); + putstring(p, symbolic_name, strlen(symbolic_name)); + print_data(p, "\"", 1); + } else { + putint32_t(closure, NULL, val); + } + + return true; +} + static void *scalar_startsubmsg(void *closure, const void *handler_data) { return putkey(closure, handler_data) ? closure : UPB_BREAK; } @@ -310,27 +350,60 @@ static size_t putbytes(void *closure, const void *handler_data, const char *str, } size_t bytes = to - data; + print_data(p, "\"", 1); putstring(p, data, bytes); + print_data(p, "\"", 1); return len; } +static void *scalar_startstr(void *closure, const void *handler_data, + size_t size_hint) { + UPB_UNUSED(handler_data); + UPB_UNUSED(size_hint); + upb_json_printer *p = closure; + CHK(putkey(closure, handler_data)); + print_data(p, "\"", 1); + return p; +} + static size_t scalar_str(void *closure, const void *handler_data, const char *str, size_t len, const upb_bufhandle *handle) { - CHK(putkey(closure, handler_data)); CHK(putstr(closure, handler_data, str, len, handle)); return len; } +static bool scalar_endstr(void *closure, const void *handler_data) { + UPB_UNUSED(handler_data); + upb_json_printer *p = closure; + print_data(p, "\"", 1); + return true; +} + +static void *repeated_startstr(void *closure, const void *handler_data, + size_t size_hint) { + UPB_UNUSED(handler_data); + UPB_UNUSED(size_hint); + upb_json_printer *p = closure; + print_comma(p); + print_data(p, "\"", 1); + return p; +} + static size_t repeated_str(void *closure, const void *handler_data, const char *str, size_t len, const upb_bufhandle *handle) { - upb_json_printer *p = closure; - print_comma(p); CHK(putstr(closure, handler_data, str, len, handle)); return len; } +static bool repeated_endstr(void *closure, const void *handler_data) { + UPB_UNUSED(handler_data); + upb_json_printer *p = closure; + print_data(p, "\"", 1); + return true; +} + static size_t scalar_bytes(void *closure, const void *handler_data, const char *str, size_t len, const upb_bufhandle *handle) { @@ -381,21 +454,44 @@ void sethandlers(const void *closure, upb_handlers *h) { TYPE(UPB_TYPE_FLOAT, float, float); TYPE(UPB_TYPE_DOUBLE, double, double); TYPE(UPB_TYPE_BOOL, bool, bool); - TYPE(UPB_TYPE_ENUM, int32, int32_t); TYPE(UPB_TYPE_INT32, int32, int32_t); TYPE(UPB_TYPE_UINT32, uint32, uint32_t); TYPE(UPB_TYPE_INT64, int64, int64_t); TYPE(UPB_TYPE_UINT64, uint64, uint64_t); + case UPB_TYPE_ENUM: { + // For now, we always emit symbolic names for enums. We may want an + // option later to control this behavior, but we will wait for a real + // need first. + EnumHandlerData *hd = malloc(sizeof(EnumHandlerData)); + hd->enumdef = (const upb_enumdef *)upb_fielddef_subdef(f); + hd->keyname = newstrpc(h, f); + upb_handlers_addcleanup(h, hd, free); + upb_handlerattr enum_attr = UPB_HANDLERATTR_INITIALIZER; + upb_handlerattr_sethandlerdata(&enum_attr, hd); + + if (upb_fielddef_isseq(f)) { + upb_handlers_setint32(h, f, repeated_enum, &enum_attr); + } else { + upb_handlers_setint32(h, f, scalar_enum, &enum_attr); + } + + upb_handlerattr_uninit(&enum_attr); + break; + } case UPB_TYPE_STRING: - // XXX: this doesn't support strings that span buffers yet. if (upb_fielddef_isseq(f)) { + upb_handlers_setstartstr(h, f, repeated_startstr, &empty_attr); upb_handlers_setstring(h, f, repeated_str, &empty_attr); + upb_handlers_setendstr(h, f, repeated_endstr, &empty_attr); } else { - upb_handlers_setstring(h, f, scalar_str, &name_attr); + upb_handlers_setstartstr(h, f, scalar_startstr, &name_attr); + upb_handlers_setstring(h, f, scalar_str, &empty_attr); + upb_handlers_setendstr(h, f, scalar_endstr, &empty_attr); } break; case UPB_TYPE_BYTES: - // XXX: this doesn't support strings that span buffers yet. + // XXX: this doesn't support strings that span buffers yet. The base64 + // encoder will need to be made resumable for this to work properly. if (upb_fielddef_isseq(f)) { upb_handlers_setstring(h, f, repeated_bytes, &empty_attr); } else { -- cgit v1.2.3