From f3052474d4687ce92d51c3c4833b0ebc8acbf50d Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Fri, 5 Dec 2014 11:16:21 -0800 Subject: Update JSON parser to emit UTF8 to string fields appropriately. --- upb/json/parser.rl | 46 +++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) (limited to 'upb/json/parser.rl') diff --git a/upb/json/parser.rl b/upb/json/parser.rl index 75860e5..f5e1634 100644 --- a/upb/json/parser.rl +++ b/upb/json/parser.rl @@ -46,13 +46,13 @@ static upb_selector_t getsel(upb_json_parser *p) { p, upb_handlers_getprimitivehandlertype(p->top->f)); } -static void start_member(upb_json_parser *p) { +static void start_member(upb_json_parser *p, const char *ptr) { assert(!p->top->f); assert(!p->accumulated); p->accumulated_len = 0; } -static bool end_member(upb_json_parser *p) { +static bool end_member(upb_json_parser *p, const char *ptr) { // TODO(haberman): support keys that span buffers or have escape sequences. assert(!p->top->f); assert(p->accumulated); @@ -305,7 +305,7 @@ static bool end_text(upb_json_parser *p, const char *ptr) { return true; } -static bool start_stringval(upb_json_parser *p) { +static bool start_stringval(upb_json_parser *p, const char *ptr) { assert(p->top->f); if (!upb_fielddef_isstring(p->top->f)) { @@ -327,7 +327,7 @@ static bool start_stringval(upb_json_parser *p) { return true; } -static void end_stringval(upb_json_parser *p) { +static void end_stringval(upb_json_parser *p, const char *ptr) { p->top--; upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSTR); upb_sink_endstr(&p->top->sink, sel); @@ -438,7 +438,6 @@ static void start_hex(upb_json_parser *p, const char *ptr) { } static void hex(upb_json_parser *p, const char *end) { - UPB_UNUSED(end); const char *start = p->text_begin; assert(end - start == 4); uint16_t codepoint = @@ -446,9 +445,30 @@ static void hex(upb_json_parser *p, const char *end) { (hexdigit(start[1]) << 8) | (hexdigit(start[2]) << 4) | hexdigit(start[3]); - // TODO(haberman): convert to UTF-8 and emit (though if it is a high surrogate + // emit the codepoint as UTF-8. + char utf8[3]; // support \u0000 -- \uFFFF -- need only three bytes. + int length = 0; + if (codepoint < 0x7F) { + utf8[0] = codepoint; + length = 1; + } else if (codepoint < 0x07FF) { + utf8[1] = (codepoint & 0x3F) | 0x80; + codepoint >>= 6; + utf8[0] = (codepoint & 0x1F) | 0xC0; + length = 2; + } else /* codepoint < 0xFFFF */ { + utf8[2] = (codepoint & 0x3F) | 0x80; + codepoint >>= 6; + utf8[1] = (codepoint & 0x3F) | 0x80; + codepoint >>= 6; + utf8[0] = (codepoint & 0x0F) | 0xE0; + length = 3; + } + // TODO(haberman): Handle high surrogates: if codepoint is a high surrogate // we have to wait for the next escape to get the full code point). - UPB_UNUSED(codepoint); + + upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_STRING); + upb_sink_putstring(&p->top->sink, sel, utf8, length, NULL); } #define CHECK_RETURN_TOP(x) if (!(x)) goto error @@ -496,8 +516,8 @@ static void hex(upb_json_parser *p, const char *end) { member = ws string - >{ start_member(parser); } - %{ CHECK_RETURN_TOP(end_member(parser)); } + >{ start_member(parser, p); } + %{ CHECK_RETURN_TOP(end_member(parser, p)); } ws ":" ws value2 %{ clear_member(parser); } @@ -527,8 +547,8 @@ static void hex(upb_json_parser *p, const char *end) { >{ start_number(parser, p); } %{ end_number(parser, p); } | string - >{ CHECK_RETURN_TOP(start_stringval(parser)); } - %{ end_stringval(parser); } + >{ CHECK_RETURN_TOP(start_stringval(parser, p)); } + %{ end_stringval(parser, p); } | "true" %{ CHECK_RETURN_TOP(putbool(parser, true)); } | "false" @@ -551,8 +571,6 @@ static void hex(upb_json_parser *p, const char *end) { size_t parse(void *closure, const void *hd, const char *buf, size_t size, const upb_bufhandle *handle) { - UPB_UNUSED(hd); - UPB_UNUSED(handle); upb_json_parser *parser = closure; // Variables used by Ragel's generated code. @@ -578,8 +596,6 @@ error: } bool end(void *closure, const void *hd) { - UPB_UNUSED(closure); - UPB_UNUSED(hd); return true; } -- cgit v1.2.3 From 98adb44547cc6667f803e59af20ef0bd835211e6 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Fri, 5 Dec 2014 13:30:54 -0800 Subject: Sync update: haberman@'s internal changes. --- upb/json/parser.c | 32 ++++++++++++++++++-------------- upb/json/parser.rl | 22 +++++++++++++--------- 2 files changed, 31 insertions(+), 23 deletions(-) (limited to 'upb/json/parser.rl') diff --git a/upb/json/parser.c b/upb/json/parser.c index 9a331e4..2687713 100644 --- a/upb/json/parser.c +++ b/upb/json/parser.c @@ -48,13 +48,13 @@ static upb_selector_t getsel(upb_json_parser *p) { p, upb_handlers_getprimitivehandlertype(p->top->f)); } -static void start_member(upb_json_parser *p, const char *ptr) { +static void start_member(upb_json_parser *p) { assert(!p->top->f); assert(!p->accumulated); p->accumulated_len = 0; } -static bool end_member(upb_json_parser *p, const char *ptr) { +static bool end_member(upb_json_parser *p) { // TODO(haberman): support keys that span buffers or have escape sequences. assert(!p->top->f); assert(p->accumulated); @@ -307,7 +307,7 @@ static bool end_text(upb_json_parser *p, const char *ptr) { return true; } -static bool start_stringval(upb_json_parser *p, const char *ptr) { +static bool start_stringval(upb_json_parser *p) { assert(p->top->f); if (!upb_fielddef_isstring(p->top->f)) { @@ -329,7 +329,7 @@ static bool start_stringval(upb_json_parser *p, const char *ptr) { return true; } -static void end_stringval(upb_json_parser *p, const char *ptr) { +static void end_stringval(upb_json_parser *p) { p->top--; upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSTR); upb_sink_endstr(&p->top->sink, sel); @@ -441,7 +441,7 @@ static void start_hex(upb_json_parser *p, const char *ptr) { static void hex(upb_json_parser *p, const char *end) { const char *start = p->text_begin; - assert(end - start == 4); + UPB_ASSERT_VAR(end, end - start == 4); uint16_t codepoint = (hexdigit(start[0]) << 12) | (hexdigit(start[1]) << 8) | @@ -639,6 +639,8 @@ static const int json_en_main = 1; size_t parse(void *closure, const void *hd, const char *buf, size_t size, const upb_bufhandle *handle) { + UPB_UNUSED(hd); + UPB_UNUSED(handle); upb_json_parser *parser = closure; // Variables used by Ragel's generated code. @@ -650,7 +652,7 @@ size_t parse(void *closure, const void *hd, const char *buf, size_t size, const char *pe = buf + size; -#line 654 "upb/json/parser.c" +#line 656 "upb/json/parser.c" { int _klen; unsigned int _trans; @@ -766,11 +768,11 @@ _match: break; case 10: #line 519 "upb/json/parser.rl" - { start_member(parser, p); } + { start_member(parser); } break; case 11: #line 520 "upb/json/parser.rl" - { CHECK_RETURN_TOP(end_member(parser, p)); } + { CHECK_RETURN_TOP(end_member(parser)); } break; case 12: #line 523 "upb/json/parser.rl" @@ -802,11 +804,11 @@ _match: break; case 19: #line 550 "upb/json/parser.rl" - { CHECK_RETURN_TOP(start_stringval(parser, p)); } + { CHECK_RETURN_TOP(start_stringval(parser)); } break; case 20: #line 551 "upb/json/parser.rl" - { end_stringval(parser, p); } + { end_stringval(parser); } break; case 21: #line 553 "upb/json/parser.rl" @@ -832,7 +834,7 @@ _match: #line 565 "upb/json/parser.rl" { p--; {cs = stack[--top]; goto _again;} } break; -#line 836 "upb/json/parser.c" +#line 838 "upb/json/parser.c" } } @@ -845,7 +847,7 @@ _again: _out: {} } -#line 585 "upb/json/parser.rl" +#line 587 "upb/json/parser.rl" if (p != pe) { upb_status_seterrf(parser->status, "Parse error at %s\n", p); @@ -860,6 +862,8 @@ error: } bool end(void *closure, const void *hd) { + UPB_UNUSED(closure); + UPB_UNUSED(hd); return true; } @@ -884,13 +888,13 @@ void upb_json_parser_reset(upb_json_parser *p) { int top; // Emit Ragel initialization of the parser. -#line 888 "upb/json/parser.c" +#line 892 "upb/json/parser.c" { cs = json_start; top = 0; } -#line 623 "upb/json/parser.rl" +#line 627 "upb/json/parser.rl" p->current_state = cs; p->parser_top = top; p->text_begin = NULL; diff --git a/upb/json/parser.rl b/upb/json/parser.rl index f5e1634..92a1566 100644 --- a/upb/json/parser.rl +++ b/upb/json/parser.rl @@ -46,13 +46,13 @@ static upb_selector_t getsel(upb_json_parser *p) { p, upb_handlers_getprimitivehandlertype(p->top->f)); } -static void start_member(upb_json_parser *p, const char *ptr) { +static void start_member(upb_json_parser *p) { assert(!p->top->f); assert(!p->accumulated); p->accumulated_len = 0; } -static bool end_member(upb_json_parser *p, const char *ptr) { +static bool end_member(upb_json_parser *p) { // TODO(haberman): support keys that span buffers or have escape sequences. assert(!p->top->f); assert(p->accumulated); @@ -305,7 +305,7 @@ static bool end_text(upb_json_parser *p, const char *ptr) { return true; } -static bool start_stringval(upb_json_parser *p, const char *ptr) { +static bool start_stringval(upb_json_parser *p) { assert(p->top->f); if (!upb_fielddef_isstring(p->top->f)) { @@ -327,7 +327,7 @@ static bool start_stringval(upb_json_parser *p, const char *ptr) { return true; } -static void end_stringval(upb_json_parser *p, const char *ptr) { +static void end_stringval(upb_json_parser *p) { p->top--; upb_selector_t sel = getsel_for_handlertype(p, UPB_HANDLER_ENDSTR); upb_sink_endstr(&p->top->sink, sel); @@ -439,7 +439,7 @@ static void start_hex(upb_json_parser *p, const char *ptr) { static void hex(upb_json_parser *p, const char *end) { const char *start = p->text_begin; - assert(end - start == 4); + UPB_ASSERT_VAR(end, end - start == 4); uint16_t codepoint = (hexdigit(start[0]) << 12) | (hexdigit(start[1]) << 8) | @@ -516,8 +516,8 @@ static void hex(upb_json_parser *p, const char *end) { member = ws string - >{ start_member(parser, p); } - %{ CHECK_RETURN_TOP(end_member(parser, p)); } + >{ start_member(parser); } + %{ CHECK_RETURN_TOP(end_member(parser)); } ws ":" ws value2 %{ clear_member(parser); } @@ -547,8 +547,8 @@ static void hex(upb_json_parser *p, const char *end) { >{ start_number(parser, p); } %{ end_number(parser, p); } | string - >{ CHECK_RETURN_TOP(start_stringval(parser, p)); } - %{ end_stringval(parser, p); } + >{ CHECK_RETURN_TOP(start_stringval(parser)); } + %{ end_stringval(parser); } | "true" %{ CHECK_RETURN_TOP(putbool(parser, true)); } | "false" @@ -571,6 +571,8 @@ static void hex(upb_json_parser *p, const char *end) { size_t parse(void *closure, const void *hd, const char *buf, size_t size, const upb_bufhandle *handle) { + UPB_UNUSED(hd); + UPB_UNUSED(handle); upb_json_parser *parser = closure; // Variables used by Ragel's generated code. @@ -596,6 +598,8 @@ error: } bool end(void *closure, const void *hd) { + UPB_UNUSED(closure); + UPB_UNUSED(hd); return true; } -- cgit v1.2.3