From f74534b42ac9ac8b0ff496cb0da83f1201bbf8da Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Sun, 8 May 2011 13:05:12 -0700 Subject: Decoder redesign in preparation for packed fields and start/endseq. --- Makefile | 13 +- perf-regression-test.py | 1 + perf-tests.sh | 12 +- src/upb.c | 47 +++-- src/upb.h | 8 +- src/upb_decoder.c | 463 ++++++++++++++++++++++++----------------------- src/upb_decoder.h | 4 + src/upb_decoder_x86.dasc | 64 +++---- src/upb_def.h | 10 +- src/upb_msg.c | 9 +- src/upb_stream.c | 49 +++-- src/upb_stream.h | 46 ++--- src/upb_string.h | 5 + src/upb_textprinter.c | 9 +- src/upb_varint.c | 54 ++++++ src/upb_varint.h | 50 +---- tests/test_decoder.c | 67 +++++-- 17 files changed, 480 insertions(+), 431 deletions(-) create mode 100644 src/upb_varint.c diff --git a/Makefile b/Makefile index 08327f0..98ffef4 100644 --- a/Makefile +++ b/Makefile @@ -77,6 +77,7 @@ CORE= \ src/upb_string.c \ src/upb_def.c \ src/upb_msg.c \ + src/upb_varint.c \ # Common encoders/decoders -- you're almost certain to want these. STREAM= \ @@ -211,9 +212,13 @@ SIMPLE_TESTS= \ tests/test_string \ tests/test_def \ tests/test_varint \ - tests/tests -# tests/test_decoder \ - tests/test_stream \ + tests/tests \ + +INTERACTIVE_TESTS= \ + tests/test_decoder \ + +# tests/test_stream \ + SIMPLE_CXX_TESTS= \ tests/test_table @@ -225,7 +230,7 @@ VARIADIC_TESTS= \ TESTS=$(SIMPLE_TESTS) $(SIMPLE_CXX_TESTS) $(VARIADIC_TESTS) -tests: $(TESTS) +tests: $(TESTS) $(INTERACTIVE_TESTS) $(TESTS): $(LIBUPB) tests/tests: tests/test.proto.pb diff --git a/perf-regression-test.py b/perf-regression-test.py index 09f4569..4e2cf09 100755 --- a/perf-regression-test.py +++ b/perf-regression-test.py @@ -10,6 +10,7 @@ set -v # Generate numbers for baseline. rm -rf perf-tmp git clone . 
perf-tmp +cp perf-tests.sh perf-tmp (cd perf-tmp && ./perf-tests.sh upb) cp perf-tmp/perf-tests.out perf-tests.baseline diff --git a/perf-tests.sh b/perf-tests.sh index 8630182..118922d 100755 --- a/perf-tests.sh +++ b/perf-tests.sh @@ -16,20 +16,16 @@ run_with_flags () { NAME=$2 make clean - echo "$FLAGS -fprofile-generate" > perf-cppflags - make upb_benchmarks - make benchmark - - make clean_leave_profile - echo "$FLAGS -fprofile-use" > perf-cppflags + echo "$FLAGS" > perf-cppflags make upb_benchmarks make benchmark | sed -e "s/^/$NAME./g" | tee -a perf-tests.out } -if [ x`uname -m` = xx86_64 ]; then +#if [ x`uname -m` = xx86_64 ]; then run_with_flags "-DNDEBUG -m32" "plain32" run_with_flags "-DNDEBUG -fomit-frame-pointer -m32" "omitfp32" -fi +#fi run_with_flags "-DNDEBUG " "plain" run_with_flags "-DNDEBUG -fomit-frame-pointer" "omitfp" +run_with_flags "-DNDEBUG -DUPB_USE_JIT_X64" "jit" diff --git a/src/upb.c b/src/upb.c index b80de29..82c7fc2 100644 --- a/src/upb.c +++ b/src/upb.c @@ -13,31 +13,30 @@ #include "upb_string.h" #define alignof(t) offsetof(struct { char c; t x; }, x) -#define TYPE_INFO(wire_type, ctype, allows_delimited, inmemory_type) \ - {alignof(ctype), sizeof(ctype), wire_type, \ - (1 << wire_type) | (allows_delimited << UPB_WIRE_TYPE_DELIMITED), \ - UPB_TYPE(inmemory_type), #ctype}, +#define TYPE_INFO(wire_type, ctype, inmemory_type) \ + {alignof(ctype), sizeof(ctype), wire_type, UPB_TYPE(inmemory_type), #ctype}, const upb_type_info upb_types[] = { - {0, 0, 0, 0, 0, ""}, // There is no type 0. 
- TYPE_INFO(UPB_WIRE_TYPE_64BIT, double, 1, DOUBLE) // DOUBLE - TYPE_INFO(UPB_WIRE_TYPE_32BIT, float, 1, FLOAT) // FLOAT - TYPE_INFO(UPB_WIRE_TYPE_VARINT, int64_t, 1, INT64) // INT64 - TYPE_INFO(UPB_WIRE_TYPE_VARINT, uint64_t, 1, UINT64) // UINT64 - TYPE_INFO(UPB_WIRE_TYPE_VARINT, int32_t, 1, INT32) // INT32 - TYPE_INFO(UPB_WIRE_TYPE_64BIT, uint64_t, 1, UINT64) // FIXED64 - TYPE_INFO(UPB_WIRE_TYPE_32BIT, uint32_t, 1, UINT32) // FIXED32 - TYPE_INFO(UPB_WIRE_TYPE_VARINT, bool, 1, BOOL) // BOOL - TYPE_INFO(UPB_WIRE_TYPE_DELIMITED, void*, 1, STRING) // STRING - TYPE_INFO(UPB_WIRE_TYPE_START_GROUP, void*, 0, MESSAGE) // GROUP - TYPE_INFO(UPB_WIRE_TYPE_DELIMITED, void*, 1, MESSAGE) // MESSAGE - TYPE_INFO(UPB_WIRE_TYPE_DELIMITED, void*, 1, STRING) // BYTES - TYPE_INFO(UPB_WIRE_TYPE_VARINT, uint32_t, 1, UINT32) // UINT32 - TYPE_INFO(UPB_WIRE_TYPE_VARINT, uint32_t, 1, INT32) // ENUM - TYPE_INFO(UPB_WIRE_TYPE_32BIT, int32_t, 1, INT32) // SFIXED32 - TYPE_INFO(UPB_WIRE_TYPE_64BIT, int64_t, 1, INT64) // SFIXED64 - TYPE_INFO(UPB_WIRE_TYPE_VARINT, int32_t, 1, INT32) // SINT32 - TYPE_INFO(UPB_WIRE_TYPE_VARINT, int64_t, 1, INT64) // SINT64 + {0, 0, 0, 0, ""}, // There is no type 0. 
+ TYPE_INFO(UPB_WIRE_TYPE_64BIT, double, DOUBLE) // DOUBLE + TYPE_INFO(UPB_WIRE_TYPE_32BIT, float, FLOAT) // FLOAT + TYPE_INFO(UPB_WIRE_TYPE_VARINT, int64_t, INT64) // INT64 + TYPE_INFO(UPB_WIRE_TYPE_VARINT, uint64_t, UINT64) // UINT64 + TYPE_INFO(UPB_WIRE_TYPE_VARINT, int32_t, INT32) // INT32 + TYPE_INFO(UPB_WIRE_TYPE_64BIT, uint64_t, UINT64) // FIXED64 + TYPE_INFO(UPB_WIRE_TYPE_32BIT, uint32_t, UINT32) // FIXED32 + TYPE_INFO(UPB_WIRE_TYPE_VARINT, bool, BOOL) // BOOL + TYPE_INFO(UPB_WIRE_TYPE_DELIMITED, void*, STRING) // STRING + TYPE_INFO(UPB_WIRE_TYPE_START_GROUP, void*, MESSAGE) // GROUP + TYPE_INFO(UPB_WIRE_TYPE_DELIMITED, void*, MESSAGE) // MESSAGE + TYPE_INFO(UPB_WIRE_TYPE_DELIMITED, void*, STRING) // BYTES + TYPE_INFO(UPB_WIRE_TYPE_VARINT, uint32_t, UINT32) // UINT32 + TYPE_INFO(UPB_WIRE_TYPE_VARINT, uint32_t, INT32) // ENUM + TYPE_INFO(UPB_WIRE_TYPE_32BIT, int32_t, INT32) // SFIXED32 + TYPE_INFO(UPB_WIRE_TYPE_64BIT, int64_t, INT64) // SFIXED64 + TYPE_INFO(UPB_WIRE_TYPE_VARINT, int32_t, INT32) // SINT32 + TYPE_INFO(UPB_WIRE_TYPE_VARINT, int64_t, INT64) // SINT64 + TYPE_INFO(UPB_WIRE_TYPE_END_GROUP, void*, INT64) // ENDGROUP }; #ifdef NDEBUG @@ -64,7 +63,7 @@ void upb_copyerr(upb_status *to, upb_status *from) void upb_clearerr(upb_status *status) { status->code = UPB_OK; - upb_string_recycle(&status->str); + if (status->str) upb_string_recycle(&status->str); } void upb_printerr(upb_status *status) { diff --git a/src/upb.h b/src/upb.h index 38808ac..9253748 100644 --- a/src/upb.h +++ b/src/upb.h @@ -126,7 +126,6 @@ typedef struct { uint8_t align; uint8_t size; upb_wire_type_t native_wire_type; - uint8_t allowed_wire_types; // For packable fields, also allows delimited. uint8_t inmemory_type; // For example, INT32, SINT32, and SFIXED32 -> INT32 char *ctype; } upb_type_info; @@ -168,11 +167,11 @@ typedef int32_t upb_strlen_t; // The type of a upb_value. This is like a upb_fieldtype_t, but adds the // constant UPB_VALUETYPE_ARRAY to represent an array. 
typedef uint8_t upb_valuetype_t; +#define UPB_TYPE_ENDGROUP 19 // Need to increase if more real types are added! #define UPB_VALUETYPE_ARRAY 32 #define UPB_VALUETYPE_BYTESRC 32 #define UPB_VALUETYPE_RAW 33 #define UPB_VALUETYPE_FIELDDEF 34 -#define UPB_TYPE_ENDGROUP 35 // A single .proto value. The owner must have an out-of-band way of knowing // the type, so that it knows which union member to use. @@ -231,11 +230,6 @@ UPB_VALUE_ACCESSORS(fielddef, fielddef, upb_fielddef*, UPB_VALUETYPE_FIELDDEF); extern upb_value UPB_NO_VALUE; -INLINE void upb_value_setraw(upb_value *val, uint64_t cval) { - SET_TYPE(val->type, UPB_VALUETYPE_RAW); - val->val.uint64 = cval; -} - INLINE upb_atomic_refcount_t *upb_value_getrefcount(upb_value val) { assert(val.type == UPB_TYPE(MESSAGE) || val.type == UPB_TYPE(STRING) || diff --git a/src/upb_decoder.c b/src/upb_decoder.c index d952954..a10c0ba 100644 --- a/src/upb_decoder.c +++ b/src/upb_decoder.c @@ -19,44 +19,50 @@ #include "upb_decoder_x86.h" #endif -/* Decoding/Buffering of individual values ************************************/ +// A group continues until an END_GROUP tag is seen. +#define UPB_GROUPEND UINT32_MAX +// A non-packed repeated field ends when a diff. field is seen (or submsg end). +#define UPB_REPEATEDEND (UINT32_MAX-1) -// Performs zig-zag decoding, which is used by sint32 and sint64. -INLINE int32_t upb_zzdec_32(uint32_t n) { return (n >> 1) ^ -(int32_t)(n & 1); } -INLINE int64_t upb_zzdec_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); } +// It's unfortunate that we have to micro-manage the compiler this way, +// especially since this tuning is necessarily specific to one hardware +// configuration. But empirically on a Core i7, performance increases 30-50% +// with these annotations. Every instance where these appear, gcc 4.2.1 made +// the wrong decision and degraded performance in benchmarks. 
+#define FORCEINLINE static __attribute__((always_inline)) +#define NOINLINE static __attribute__((noinline)) + +static void upb_decoder_exit(upb_decoder *d) { siglongjmp(d->exitjmp, 1); } + +/* Decoding/Buffering of wire types *******************************************/ #define UPB_MAX_VARINT_ENCODED_SIZE 10 -INLINE void upb_decoder_advance(upb_decoder *d, size_t len) { - d->ptr += len; -} +static void upb_decoder_advance(upb_decoder *d, size_t len) { d->ptr += len; } +static size_t upb_decoder_bufleft(upb_decoder *d) { return d->end - d->ptr; } -INLINE size_t upb_decoder_offset(upb_decoder *d) { +size_t upb_decoder_offset(upb_decoder *d) { size_t offset = d->buf_stream_offset; if (d->buf) offset += (d->ptr - d->buf); return offset; } -INLINE size_t upb_decoder_bufleft(upb_decoder *d) { - return d->end - d->ptr; -} - -INLINE void upb_dstate_setmsgend(upb_decoder *d) { - uint32_t end_offset = d->dispatcher.top->end_offset; - d->submsg_end = (end_offset == UINT32_MAX) ? - (void*)UINTPTR_MAX : d->buf + end_offset; +static void upb_decoder_setmsgend(upb_decoder *d) { + uint32_t end = d->dispatcher.top->end_offset; + d->submsg_end = (end == UINT32_MAX) ? (void*)UINTPTR_MAX : d->buf + end; } // Pulls the next buffer from the bytesrc. Should be called only when the // current buffer is completely empty. -static bool upb_pullbuf(upb_decoder *d) { +static void upb_pullbuf(upb_decoder *d, bool need) { assert(upb_decoder_bufleft(d) == 0); int32_t last_buf_len = d->buf ? 
upb_string_len(d->bufstr) : -1; upb_string_recycle(&d->bufstr); if (!upb_bytesrc_getstr(d->bytesrc, d->bufstr, d->status)) { d->buf = NULL; d->end = NULL; - return false; + if (need) upb_seterr(d->status, UPB_ERROR, "Unexpected EOF."); + upb_decoder_exit(d); } if (last_buf_len != -1) { d->buf_stream_offset += last_buf_len; @@ -70,290 +76,256 @@ static bool upb_pullbuf(upb_decoder *d) { d->jit_end = d->end - 20; upb_string_recycle(&d->tmp); upb_string_substr(d->tmp, d->bufstr, 0, 0); - upb_dstate_setmsgend(d); - return true; + upb_decoder_setmsgend(d); } // Called only from the slow path, this function copies the next "len" bytes -// from the stream to "data", adjusting the dstate appropriately. -static bool upb_getbuf(upb_decoder *d, void *data, size_t bytes_wanted) { +// from the stream to "data", adjusting the decoder state appropriately. +static void upb_getbuf(upb_decoder *d, void *data, size_t bytes, bool need) { while (1) { - size_t to_copy = UPB_MIN(bytes_wanted, upb_decoder_bufleft(d)); + size_t to_copy = UPB_MIN(bytes, upb_decoder_bufleft(d)); memcpy(data, d->ptr, to_copy); upb_decoder_advance(d, to_copy); - bytes_wanted -= to_copy; - if (bytes_wanted == 0) return true; - if (!upb_pullbuf(d)) return false; + bytes -= to_copy; + if (bytes == 0) return; + upb_pullbuf(d, need); } } -// We use this path when we don't have UPB_MAX_VARINT_ENCODED_SIZE contiguous -// bytes available in our current buffer. We don't inline this because we -// accept that it will be slow and we don't want to pay for two copies of it. 
-static bool upb_decode_varint_slow(upb_decoder *d, upb_value *val) { - char byte = 0x80; - uint64_t val64 = 0; +NOINLINE uint64_t upb_decode_varint_slow(upb_decoder *d, bool need) { + uint8_t byte = 0x80; + uint64_t u64 = 0; int bitpos; - for(bitpos = 0; - bitpos < 70 && (byte & 0x80) && upb_getbuf(d, &byte, 1); - bitpos += 7) - val64 |= ((uint64_t)byte & 0x7F) << bitpos; - - if(bitpos == 70) { - upb_seterr(d->status, UPB_ERROR, - "Varint was unterminated after 10 bytes.\n"); - return false; - } else if (d->status->code == UPB_EOF && bitpos == 0) { - // Regular EOF. - return false; - } else if (d->status->code == UPB_EOF && (byte & 0x80)) { - upb_seterr(d->status, UPB_ERROR, - "Provided data ended in the middle of a varint.\n"); - return false; - } else { - // Success. - upb_value_setraw(val, val64); - return true; + for(bitpos = 0; bitpos < 70 && (byte & 0x80); bitpos += 7) { + upb_getbuf(d, &byte, 1, need); + u64 |= ((uint64_t)byte & 0x7F) << bitpos; } -} -typedef struct { - upb_wire_type_t wire_type; - upb_field_number_t field_number; -} upb_tag; + if(bitpos == 70 && (byte & 0x80)) { + upb_seterr(d->status, UPB_ERROR, "Unterminated varint.\n"); + upb_decoder_exit(d); + } + return u64; +} -INLINE bool upb_decode_tag(upb_decoder *d, uint32_t *tag) { +// For tags and delimited lengths, which must be <=32bit and are usually small. +FORCEINLINE uint32_t upb_decode_varint32(upb_decoder *d, bool need) { const char *p = d->ptr; - upb_value val; - // Nearly all tag varints will be either 1 byte (1-16) or 2 bytes (17-2048). + uint32_t ret; + uint64_t u64; + // Nearly all will be either 1 byte (1-16) or 2 bytes (17-2048). if (upb_decoder_bufleft(d) < 2) goto slow; // unlikely. - *tag = *p & 0x7f; + ret = *p & 0x7f; if ((*(p++) & 0x80) == 0) goto done; // predictable if fields are in order - *tag |= (*p & 0x7f) << 7; + ret |= (*p & 0x7f) << 7; if ((*(p++) & 0x80) == 0) goto done; // likely slow: - // Decode a full varint starting over from ptr. 
- if (!upb_decode_varint_slow(d, &val)) return false; - *tag = upb_value_getint64(val); - p = d->ptr; // Trick the next line into not overwriting us. + u64 = upb_decode_varint_slow(d, need); + if (u64 > 0xffffffff) { + upb_seterr(d->status, UPB_ERROR, "Unterminated 32-bit varint.\n"); + upb_decoder_exit(d); + } + ret = (uint32_t)u64; + p = d->ptr; // Turn the next line into a nop. done: upb_decoder_advance(d, p - d->ptr); - return true; + return ret; } -INLINE bool upb_decode_varint(upb_decoder *d, upb_value *val) { +FORCEINLINE uint64_t upb_decode_varint(upb_decoder *d) { if (upb_decoder_bufleft(d) >= 16) { // Common (fast) case. upb_decoderet r = upb_vdecode_fast(d->ptr); if (r.p == NULL) { upb_seterr(d->status, UPB_ERROR, "Unterminated varint.\n"); - return false; + upb_decoder_exit(d); } - upb_value_setraw(val, r.val); upb_decoder_advance(d, r.p - d->ptr); - return true; + return r.val; } else { - return upb_decode_varint_slow(d, val); + return upb_decode_varint_slow(d, true); } } -INLINE bool upb_decode_fixed(upb_decoder *d, size_t bytes, upb_value *val) { +FORCEINLINE void upb_decode_fixed(upb_decoder *d, void *val, size_t bytes) { if (upb_decoder_bufleft(d) >= bytes) { // Common (fast) case. memcpy(val, d->ptr, bytes); upb_decoder_advance(d, bytes); } else { - if (!upb_getbuf(d, val, bytes)) return false; + upb_getbuf(d, val, bytes, true); } - return true; } -// "val" initially holds the length of the string, this is replaced by the -// contents of the string. 
-INLINE bool upb_decode_string(upb_decoder *d, upb_value *val, - upb_string **str) { - upb_string_recycle(str); - uint32_t strlen = upb_value_getint32(*val); +FORCEINLINE uint32_t upb_decode_fixed32(upb_decoder *d) { + uint32_t u32; + upb_decode_fixed(d, &u32, sizeof(uint32_t)); + return u32; +} +FORCEINLINE uint64_t upb_decode_fixed64(upb_decoder *d) { + uint64_t u64; + upb_decode_fixed(d, &u64, sizeof(uint64_t)); + return u64; +} + +INLINE upb_string *upb_decode_string(upb_decoder *d) { + upb_string_recycle(&d->tmp); + uint32_t strlen = upb_decode_varint32(d, true); if (upb_decoder_bufleft(d) >= strlen) { // Common (fast) case. - upb_string_substr(*str, d->bufstr, d->ptr - d->buf, strlen); + upb_string_substr(d->tmp, d->bufstr, d->ptr - d->buf, strlen); upb_decoder_advance(d, strlen); } else { - if (!upb_getbuf(d, upb_string_getrwbuf(*str, strlen), strlen)) - return false; + upb_getbuf(d, upb_string_getrwbuf(d->tmp, strlen), strlen, true); } - upb_value_setstr(val, *str); - return true; + return d->tmp; } - -/* The main decoding loop *****************************************************/ - -extern upb_wire_type_t upb_expected_wire_types[]; -// Returns true if wt is the correct on-the-wire type for ft. -INLINE bool upb_check_type(upb_wire_type_t wt, upb_fieldtype_t ft) { - // This doesn't currently support packed arrays. 
- return upb_types[ft].native_wire_type == wt; +INLINE void upb_pop(upb_decoder *d) { + //if (d->dispatcher.top->end_offset == UPB_REPEATEDEND) + // upb_dispatch_endseq(&d->dispatcher); + d->f = d->dispatcher.top->f; + upb_dispatch_endsubmsg(&d->dispatcher); + upb_decoder_setmsgend(d); } -static upb_flow_t upb_pop(upb_decoder *d) { - upb_flow_t ret = upb_dispatch_endsubmsg(&d->dispatcher); - upb_dstate_setmsgend(d); - return ret; +INLINE void upb_push(upb_decoder *d, upb_fieldent *f, uint32_t end) { + upb_dispatch_startsubmsg(&d->dispatcher, f, end); + upb_decoder_setmsgend(d); } -static upb_flow_t upb_decoder_skipsubmsg(upb_decoder *d) { - if (d->dispatcher.top->f->type == UPB_TYPE(GROUP)) { - fprintf(stderr, "upb_decoder: Can't skip groups yet.\n"); - abort(); - } - upb_decoder_advance(d, d->dispatcher.top->end_offset - (d->ptr - d->buf)); + +/* Decoding of .proto types ***************************************************/ + +// Technically, we are losing data if we see a 32-bit varint that is not +// properly sign-extended. We could detect this and error about the data loss, +// but proto2 does not do this, so we pass. 
+ +#define T(type, wt, valtype, convfunc) \ + INLINE void upb_decode_ ## type(upb_decoder *d, upb_fieldent *f) { \ + upb_value val; \ + upb_value_set ## valtype(&val, (convfunc)(upb_decode_ ## wt(d))); \ + upb_dispatch_value(&d->dispatcher, f, val); \ + } \ + +static double upb_asdouble(uint64_t n) { return *(double*)&n; } +static float upb_asfloat(uint32_t n) { return *(float*)&n; } +static int32_t upb_zzdec_32(uint32_t n) { return (n >> 1) ^ -(int32_t)(n & 1); } +static int64_t upb_zzdec_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); } + +T(INT32, varint, int32, int32_t) +T(INT64, varint, int64, int64_t) +T(UINT32, varint, uint32, uint32_t) +T(UINT64, varint, uint64, uint64_t) +T(FIXED32, fixed32, uint32, uint32_t) +T(FIXED64, fixed64, uint64, uint64_t) +T(SFIXED32, fixed32, int32, int32_t) +T(SFIXED64, fixed64, int64, int64_t) +T(BOOL, varint, bool, bool) +T(ENUM, varint, int32, int32_t) +T(DOUBLE, fixed64, double, upb_asdouble) +T(FLOAT, fixed32, float, upb_asfloat) +T(SINT32, varint, int32, upb_zzdec_32) +T(SINT64, varint, int64, upb_zzdec_64) +T(STRING, string, str, upb_string*) + +static void upb_decode_GROUP(upb_decoder *d, upb_fieldent *f) { + upb_push(d, f, UPB_GROUPEND); +} +static void upb_endgroup(upb_decoder *d, upb_fieldent *f) { + (void)f; upb_pop(d); - return UPB_CONTINUE; } - -static upb_flow_t upb_push(upb_decoder *d, upb_handlers_fieldent *f, - uint32_t end_offset) { - upb_flow_t flow = upb_dispatch_startsubmsg(&d->dispatcher, f, end_offset); - upb_dstate_setmsgend(d); - return flow; +static void upb_decode_MESSAGE(upb_decoder *d, upb_fieldent *f) { + upb_push(d, f, upb_decode_varint32(d, true) + (d->ptr - d->buf)); } -void upb_decoder_decode(upb_decoder *d, upb_status *status) { - d->status = status; -#define CHECK_FLOW(expr) \ - switch (expr) { \ - case UPB_BREAK: goto callback_err; \ - case UPB_SKIPSUBMSG: upb_decoder_skipsubmsg(d); continue; \ - default: break; /* continue normally. 
*/ \ - } -#define CHECK(expr) if (!expr) { assert(!upb_ok(status)); goto err; } - - CHECK(upb_pullbuf(d)); - if (upb_dispatch_startmsg(&d->dispatcher) != UPB_CONTINUE) goto err; - - // Main loop: executed once per tag/field pair. - while(1) { - // Check for end-of-submessage. - while (d->ptr >= d->submsg_end) { - if (d->ptr > d->submsg_end) { - upb_seterr(d->status, UPB_ERROR, "Bad submessage end."); - goto err; - } - CHECK_FLOW(upb_pop(d)); - } +/* The main decoding loop *****************************************************/ + +static void upb_unwind(upb_decoder *d) { + // TODO. + (void)d; +} + +static void upb_delimend(upb_decoder *d) { + if (d->ptr > d->submsg_end) { + upb_seterr(d->status, UPB_ERROR, "Bad submessage end."); + upb_decoder_exit(d); + } + upb_pop(d); +} +static void upb_decoder_enterjit(upb_decoder *d) { + (void)d; +#ifdef UPB_USE_JIT_X64 + if (d->jit_code && d->dispatcher.top == d->dispatcher.stack && d->ptr < d->jit_end) { // Decodes as many fields as possible, updating d->ptr appropriately, // before falling through to the slow(er) path. -#ifdef UPB_USE_JIT_X64 void (*upb_jit_decode)(upb_decoder *d) = (void*)d->jit_code; - if (d->jit_code && d->dispatcher.top == d->dispatcher.stack && d->ptr < d->jit_end) { - //const char *before = d->ptr; - //fprintf(stderr, "Entering JIT, JIT bytes left: %zd\n", d->jit_end - d->ptr); - upb_jit_decode(d); - //fprintf(stderr, "Exiting JIT, parsed %zd bytes\n", d->ptr - before); - //fprintf(stderr, "ptr: %p, effective_end: %p, jit_end: %p, effective_end-ptr=%d\n", - // d->ptr, d->effective_end, d->jit_end, d->effective_end - d->ptr); - } + upb_jit_decode(d); + } #endif +} - // Parse/handle tag. - uint32_t tag; - if (!upb_decode_tag(d, &tag)) { - if (status->code == UPB_EOF && upb_dispatcher_stackempty(&d->dispatcher)) { - // Normal end-of-file. 
- upb_clearerr(status); - upb_dispatch_endmsg(&d->dispatcher, status); - return; - } else { - if (status->code == UPB_EOF) { - upb_seterr(status, UPB_ERROR, - "Input ended in the middle of a submessage."); - } - goto err; - } +INLINE upb_fieldent *upb_decode_tag(upb_decoder *d) { + while (1) { + uint32_t tag = upb_decode_varint32(d, false); + upb_fieldent *f = upb_dispatcher_lookup(&d->dispatcher, tag); + if (f) { + d->f = f; + return f; } - - // Decode wire data. Hopefully this branch will predict pretty well - // since most types will read a varint here. - upb_value val; - uint8_t wire_type = tag & 0x7; - switch (wire_type) { - case UPB_WIRE_TYPE_START_GROUP: - break; // Nothing to do now, below we will push appropriately. - case UPB_WIRE_TYPE_END_GROUP: - // Strictly speaking we should also check the field number here. - if(d->dispatcher.top->f->type != UPB_TYPE(GROUP)) { - upb_seterr(status, UPB_ERROR, "Unexpected END_GROUP tag."); - goto err; - } - CHECK_FLOW(upb_pop(d)); - continue; // We have no value to dispatch. - case UPB_WIRE_TYPE_VARINT: + switch (tag & 0x7) { + case UPB_WIRE_TYPE_VARINT: upb_decode_varint(d); break; + case UPB_WIRE_TYPE_32BIT: upb_decoder_advance(d, 4); break; + case UPB_WIRE_TYPE_64BIT: upb_decoder_advance(d, 8); break; case UPB_WIRE_TYPE_DELIMITED: - // For the delimited case we are parsing the length. - CHECK(upb_decode_varint(d, &val)); - break; - case UPB_WIRE_TYPE_32BIT: - CHECK(upb_decode_fixed(d, 4, &val)); - break; - case UPB_WIRE_TYPE_64BIT: - CHECK(upb_decode_fixed(d, 8, &val)); + upb_decoder_advance(d, upb_decode_varint32(d, true)); break; } + // TODO: deliver to unknown field callback. + while (d->ptr >= d->submsg_end) upb_delimend(d); + } - // Look up field by tag number. - upb_dispatcher_field *f = upb_dispatcher_lookup(&d->dispatcher, tag); - - if (!f) { - if (wire_type == UPB_WIRE_TYPE_DELIMITED) - CHECK(upb_decode_string(d, &val, &d->tmp)); - // TODO. 
- CHECK_FLOW(upb_dispatch_unknownval(&d->dispatcher, 0, UPB_NO_VALUE)); - continue; - } + // Have to handle both packed and non-packed sequences of primitives. + //if (d->dispatcher.top->end_offset == UPB_REPEATEDEND && d->f != f) { + // upb_dispatch_endseq(&d->dispatcher); + //} else if (f->is_repeated_primitive) { + // if ((tag & 0x7) == UPB_WIRE_TYPE_DELIMITED) { + // upb_pushseq(d, f, upb_decode_varint32(d, true) + (d->ptr - d->buf)); + // } else if (d->f != f) { + // upb_dispatch_startseq(d, f, UPB_REPEATEDEND); + // } + //} +} - // Perform any further massaging of the data now that we have the field's - // type. Now we can distinguish strings from submessages, and we know - // about zig-zag-encoded types. - // TODO: handle packed encoding. - // TODO: if we were being paranoid, we could check for 32-bit-varint types - // that the top 32 bits all match the highest bit of the low 32 bits. - // If this is not true we are losing data. But the main protobuf library - // doesn't check this, and it would slow us down, so pass for now. - switch (f->type) { - case UPB_TYPE(GROUP): - CHECK_FLOW(upb_push(d, f, UINT32_MAX)); - continue; // We have no value to dispatch. - case UPB_TYPE(MESSAGE): - CHECK_FLOW(upb_push(d, f, upb_value_getuint32(val) + (d->ptr - d->buf))); - continue; // We have no value to dispatch. - case UPB_TYPE(STRING): - case UPB_TYPE(BYTES): - CHECK(upb_decode_string(d, &val, &d->tmp)); - break; - case UPB_TYPE(SINT32): - upb_value_setint32(&val, upb_zzdec_32(upb_value_getint32(val))); - break; - case UPB_TYPE(SINT64): - upb_value_setint64(&val, upb_zzdec_64(upb_value_getint64(val))); - break; - default: -#ifndef NDEBUG - val.type = upb_types[f->type].inmemory_type; -#endif - break; // Other types need no further processing at this point. - } - CHECK_FLOW(upb_dispatch_value(&d->dispatcher, f, val)); +void upb_decoder_onexit(upb_decoder *d) { + if (d->status->code == UPB_EOF && upb_dispatcher_stackempty(&d->dispatcher)) { + // Normal end-of-file. 
+ upb_clearerr(d->status); + upb_dispatch_endmsg(&d->dispatcher, d->status); + } else { + if (d->status->code == UPB_EOF) + upb_seterr(d->status, UPB_ERROR, "Input ended mid-submessage."); } +} -callback_err: - if (upb_ok(status)) { - upb_seterr(status, UPB_ERROR, "Callback returned UPB_BREAK"); +void upb_decoder_decode(upb_decoder *d, upb_status *status) { + if (sigsetjmp(d->exitjmp, 0)) { + upb_decoder_onexit(d); + return; + } + d->status = status; + upb_pullbuf(d, true); + upb_dispatch_startmsg(&d->dispatcher); + while(1) { // Main loop: executed once per tag/field pair. + while (d->ptr >= d->submsg_end) upb_delimend(d); + upb_decoder_enterjit(d); + // if (!d->dispatcher.top->is_packed) + upb_fieldent *f = upb_decode_tag(d); + f->decode(d, f); } -err: - assert(!upb_ok(status)); } void upb_decoder_init(upb_decoder *d, upb_handlers *handlers) { @@ -363,9 +335,38 @@ void upb_decoder_init(upb_decoder *d, upb_handlers *handlers) { if (d->dispatcher.handlers->should_jit) upb_decoder_makejit(d); #endif d->bufstr = NULL; - d->buf = NULL; d->tmp = NULL; upb_string_recycle(&d->tmp); + + // Set function pointers for each field's decode function. 
+ for (int i = 0; i < handlers->msgs_len; i++) { + upb_msgent *m = &handlers->msgs[i]; + for(upb_inttable_iter i = upb_inttable_begin(&m->fieldtab); !upb_inttable_done(i); + i = upb_inttable_next(&m->fieldtab, i)) { + upb_fieldent *f = upb_inttable_iter_value(i); + switch (f->type) { + case UPB_TYPE(INT32): f->decode = &upb_decode_INT32; break; + case UPB_TYPE(INT64): f->decode = &upb_decode_INT64; break; + case UPB_TYPE(UINT32): f->decode = &upb_decode_UINT32; break; + case UPB_TYPE(UINT64): f->decode = &upb_decode_UINT64; break; + case UPB_TYPE(FIXED32): f->decode = &upb_decode_FIXED32; break; + case UPB_TYPE(FIXED64): f->decode = &upb_decode_FIXED64; break; + case UPB_TYPE(SFIXED32): f->decode = &upb_decode_SFIXED32; break; + case UPB_TYPE(SFIXED64): f->decode = &upb_decode_SFIXED64; break; + case UPB_TYPE(BOOL): f->decode = &upb_decode_BOOL; break; + case UPB_TYPE(ENUM): f->decode = &upb_decode_ENUM; break; + case UPB_TYPE(DOUBLE): f->decode = &upb_decode_DOUBLE; break; + case UPB_TYPE(FLOAT): f->decode = &upb_decode_FLOAT; break; + case UPB_TYPE(SINT32): f->decode = &upb_decode_SINT32; break; + case UPB_TYPE(SINT64): f->decode = &upb_decode_SINT64; break; + case UPB_TYPE(STRING): f->decode = &upb_decode_STRING; break; + case UPB_TYPE(BYTES): f->decode = &upb_decode_STRING; break; + case UPB_TYPE(GROUP): f->decode = &upb_decode_GROUP; break; + case UPB_TYPE(MESSAGE): f->decode = &upb_decode_MESSAGE; break; + case UPB_TYPE_ENDGROUP: f->decode = &upb_endgroup; break; + } + } + } } void upb_decoder_reset(upb_decoder *d, upb_bytesrc *bytesrc, void *closure) { diff --git a/src/upb_decoder.h b/src/upb_decoder.h index 1be31c4..954b33c 100644 --- a/src/upb_decoder.h +++ b/src/upb_decoder.h @@ -17,6 +17,7 @@ #ifndef UPB_DECODER_H_ #define UPB_DECODER_H_ +#include #include #include #include "upb_stream.h" @@ -60,6 +61,8 @@ struct _upb_decoder { // MIN(end, submsg_end) const char *effective_end; + upb_fieldent *f; + // Where we will store any errors that occur. 
upb_status *status; @@ -72,6 +75,7 @@ struct _upb_decoder { char *debug_info; struct dasm_State *dynasm; + sigjmp_buf exitjmp; }; // A upb_decoder decodes the binary protocol buffer format, writing the data it diff --git a/src/upb_decoder_x86.dasc b/src/upb_decoder_x86.dasc index d02f7d1..fd2652a 100644 --- a/src/upb_decoder_x86.dasc +++ b/src/upb_decoder_x86.dasc @@ -135,7 +135,7 @@ void upb_reg_jit_gdb(upb_decoder *d) { |// Checks PTR for end-of-buffer. |.macro check_eob, m | cmp PTR, DECODER->effective_end -|| if (m->endgroup_f) { +|| if (m->is_group) { | jae ->exit_jit || } else { | jae =>m->jit_endofbuf_pclabel @@ -194,7 +194,7 @@ void upb_reg_jit_gdb(upb_decoder *d) { | |.macro setmsgend, m | mov rsi, DECODER->jit_end -|| if (m->endgroup_f) { +|| if (m->is_group) { | mov64 rax, 0xffffffffffffffff | mov qword DECODER->submsg_end, rax | mov DECODER->effective_end, rsi @@ -253,8 +253,8 @@ void upb_reg_jit_gdb(upb_decoder *d) { // PTR should point to the beginning of the tag. static void upb_decoder_jit_field(upb_decoder *d, uint32_t tag, uint32_t next_tag, - upb_handlers_msgent *m, - upb_handlers_fieldent *f, upb_handlers_fieldent *next_f) { + upb_msgent *m, + upb_fieldent *f, upb_fieldent *next_f) { int tag_size = upb_value_size(tag); // PC-label for the dispatch table. @@ -388,7 +388,7 @@ static void upb_decoder_jit_field(upb_decoder *d, uint32_t tag, uint32_t next_ta | mov DECODER->dispatcher.top, rax | mov FRAME, rax - upb_handlers_msgent *sub_m = upb_handlers_getmsgent(d->dispatcher.handlers, f); + upb_msgent *sub_m = upb_handlers_getmsgent(d->dispatcher.handlers, f); if (sub_m->jit_parent_field_done_pclabel != UPB_MULTIPLE) { | jmp =>sub_m->jit_startmsg_pclabel; } else { @@ -433,10 +433,11 @@ static void upb_decoder_jit_field(upb_decoder *d, uint32_t tag, uint32_t next_ta } static int upb_compare_uint32(const void *a, const void *b) { + // TODO: always put ENDGROUP at the end. 
return *(uint32_t*)a - *(uint32_t*)b; } -static void upb_decoder_jit_msg(upb_decoder *d, upb_handlers_msgent *m) { +static void upb_decoder_jit_msg(upb_decoder *d, upb_msgent *m) { |=>m->jit_startmsg_pclabel: // Call startmsg handler (if any): if (m->startmsg != upb_startmsg_nop) { @@ -466,32 +467,24 @@ static void upb_decoder_jit_msg(upb_decoder *d, upb_handlers_msgent *m) { } qsort(keys, num_keys, sizeof(uint32_t), &upb_compare_uint32); - - upb_handlers_fieldent *last_f = NULL; + upb_fieldent *last_f = NULL; uint32_t last_tag = 0; for(int i = 0; i < num_keys; i++) { uint32_t key = keys[i]; - upb_handlers_fieldent *f = upb_inttable_lookup(&m->fieldtab, key); - uint32_t tag = upb_vencode(key); + upb_fieldent *f = upb_inttable_lookup(&m->fieldtab, key); + uint32_t tag = upb_vencode32(key); if (last_f) upb_decoder_jit_field(d, last_tag, tag, m, last_f, f); last_tag = tag; last_f = f; } + upb_decoder_jit_field(d, last_tag, 0, m, last_f, NULL); free(keys); - if (m->endgroup_f) { - uint32_t tag = m->endgroup_f->number << 3 | UPB_WIRE_TYPE_END_GROUP; - upb_decoder_jit_field(d, last_tag, tag, m, last_f, m->endgroup_f); - upb_decoder_jit_field(d, tag, 0, m, m->endgroup_f, NULL); - } else { - upb_decoder_jit_field(d, last_tag, 0, m, last_f, NULL); - } - // --------- New code section (does not fall through) ------------------------ // End-of-buf / end-of-message. - if (!m->endgroup_f) { + if (!m->is_group) { // This case doesn't exist for groups, because there eob really means // eob, so that case just exits the jit directly. 
|=>m->jit_endofbuf_pclabel: @@ -560,16 +553,14 @@ static void upb_decoder_jit(upb_decoder *d) { | callp abort } -void upb_decoder_jit_assignfieldlabs(upb_handlers_fieldent *f, +void upb_decoder_jit_assignfieldlabs(upb_fieldent *f, uint32_t *pclabel_count) { f->jit_pclabel = (*pclabel_count)++; f->jit_pclabel_notypecheck = (*pclabel_count)++; f->jit_submsg_done_pclabel = (*pclabel_count)++; } -void upb_decoder_jit_assignmsglabs(upb_handlers *h, - upb_handlers_msgent *m, - uint32_t *pclabel_count) { +void upb_decoder_jit_assignmsglabs(upb_msgent *m, uint32_t *pclabel_count) { m->jit_startmsg_pclabel = (*pclabel_count)++; m->jit_endofbuf_pclabel = (*pclabel_count)++; m->jit_endofmsg_pclabel = (*pclabel_count)++; @@ -581,30 +572,22 @@ void upb_decoder_jit_assignmsglabs(upb_handlers *h, i = upb_inttable_next(&m->fieldtab, i)) { uint32_t key = upb_inttable_iter_key(i); m->max_field_number = UPB_MAX(m->max_field_number, key); - upb_handlers_fieldent *f = upb_inttable_iter_value(i); + upb_fieldent *f = upb_inttable_iter_value(i); upb_decoder_jit_assignfieldlabs(f, pclabel_count); - if (f->type == UPB_TYPE(GROUP)) { - upb_handlers_msgent *sub_m = upb_handlers_getmsgent(h, f); - sub_m->endgroup_f = malloc(sizeof(*sub_m->endgroup_f)); - memcpy(sub_m->endgroup_f, f, sizeof(*f)); - sub_m->endgroup_f->type = UPB_TYPE_ENDGROUP; - upb_decoder_jit_assignfieldlabs(sub_m->endgroup_f, pclabel_count); - } } // XXX: Won't work for large field numbers; will need to use a upb_table. - // +2 to cover group case, in case group number is larger than all tags. - m->tablearray = malloc((m->max_field_number + 2) * sizeof(void*)); + m->tablearray = malloc((m->max_field_number + 1) * sizeof(void*)); } // Second pass: for messages that have only one parent, link them to the field // from which they are called. 
-void upb_decoder_jit_assignmsglabs2(upb_handlers *h, upb_handlers_msgent *m) { +void upb_decoder_jit_assignmsglabs2(upb_handlers *h, upb_msgent *m) { upb_inttable_iter i; for(i = upb_inttable_begin(&m->fieldtab); !upb_inttable_done(i); i = upb_inttable_next(&m->fieldtab, i)) { - upb_handlers_fieldent *f = upb_inttable_iter_value(i); + upb_fieldent *f = upb_inttable_iter_value(i); if (upb_issubmsgtype(f->type)) { - upb_handlers_msgent *sub_m = upb_handlers_getmsgent(h, f); + upb_msgent *sub_m = upb_handlers_getmsgent(h, f); if (sub_m->jit_parent_field_done_pclabel == UPB_NONE) { sub_m->jit_parent_field_done_pclabel = f->jit_submsg_done_pclabel; } else { @@ -621,7 +604,7 @@ void upb_decoder_makejit(upb_decoder *d) { uint32_t pclabel_count = 1; upb_handlers *h = d->dispatcher.handlers; for (int i = 0; i < h->msgs_len; i++) - upb_decoder_jit_assignmsglabs(h, &h->msgs[i], &pclabel_count); + upb_decoder_jit_assignmsglabs(&h->msgs[i], &pclabel_count); for (int i = 0; i < h->msgs_len; i++) upb_decoder_jit_assignmsglabs2(h, &h->msgs[i]); @@ -648,9 +631,9 @@ void upb_decoder_makejit(upb_decoder *d) { // Create dispatch tables. 
for (int i = 0; i < h->msgs_len; i++) { - upb_handlers_msgent *m = &h->msgs[i]; + upb_msgent *m = &h->msgs[i]; for (uint32_t j = 0; j <= m->max_field_number; j++) { - upb_handlers_fieldent *f = NULL; + upb_fieldent *f = NULL; for (int k = 0; k < 8; k++) { f = upb_inttable_lookup(&m->fieldtab, (j << 3) | k); if (f) break; @@ -662,9 +645,6 @@ void upb_decoder_makejit(upb_decoder *d) { m->tablearray[j] = d->jit_code + dasm_getpclabel(d, 0); } } - if (m->endgroup_f) { - m->tablearray[m->endgroup_f->number] = d->jit_code + dasm_getpclabel(d, m->endgroup_f->jit_pclabel); - } } dasm_free(d); diff --git a/src/upb_def.h b/src/upb_def.h index e19aeba..ba1c410 100644 --- a/src/upb_def.h +++ b/src/upb_def.h @@ -117,12 +117,14 @@ struct _upb_fielddef { INLINE bool upb_issubmsgtype(upb_fieldtype_t type) { return type == UPB_TYPE(GROUP) || type == UPB_TYPE(MESSAGE); } -INLINE bool upb_issubmsg(upb_fielddef *f) { - return upb_issubmsgtype(f->type); +INLINE bool upb_isstringtype(upb_fieldtype_t type) { + return type == UPB_TYPE(STRING) || type == UPB_TYPE(BYTES); } -INLINE bool upb_isstring(upb_fielddef *f) { - return f->type == UPB_TYPE(STRING) || f->type == UPB_TYPE(BYTES); +INLINE bool upb_isprimitivetype(upb_fieldtype_t type) { + return !upb_issubmsgtype(type) && !upb_isstringtype(type); } +INLINE bool upb_issubmsg(upb_fielddef *f) { return upb_issubmsgtype(f->type); } +INLINE bool upb_isstring(upb_fielddef *f) { return upb_isstringtype(f->type); } INLINE bool upb_isarray(upb_fielddef *f) { return f->label == UPB_LABEL(REPEATED); } diff --git a/src/upb_msg.c b/src/upb_msg.c index 2cc503e..64947c8 100644 --- a/src/upb_msg.c +++ b/src/upb_msg.c @@ -207,7 +207,7 @@ static upb_flow_t upb_msg_dispatch(upb_msg *msg, upb_msgdef *md, upb_dispatcher *d); static upb_flow_t upb_msg_pushval(upb_value val, upb_fielddef *f, - upb_dispatcher *d, upb_handlers_fieldent *hf) { + upb_dispatcher *d, upb_fieldent *hf) { #define CHECK_FLOW(x) do { \ upb_flow_t flow = x; if (flow != UPB_CONTINUE) 
return flow; \ } while(0) @@ -237,7 +237,7 @@ static upb_flow_t upb_msg_dispatch(upb_msg *msg, upb_msgdef *md, for(i = upb_msg_begin(md); !upb_msg_done(i); i = upb_msg_next(md, i)) { upb_fielddef *f = upb_msg_iter_field(i); if (!upb_msg_has(msg, f)) continue; - upb_handlers_fieldent *hf = upb_dispatcher_lookup(d, f->number); + upb_fieldent *hf = upb_dispatcher_lookup(d, f->number); if (!hf) continue; upb_value val = upb_msg_get(msg, f); if (upb_isarray(f)) { @@ -464,11 +464,12 @@ upb_sflow_t upb_msgsink_startsubmsg_r(void *_m, upb_value _fval) { void upb_msg_regdhandlers(upb_handlers *h) { upb_register_all(h, NULL, NULL, NULL, NULL, NULL, NULL); for (int i = 0; i < h->msgs_len; i++) { - upb_handlers_msgent *m = &h->msgs[i]; + upb_msgent *m = &h->msgs[i]; upb_inttable_iter iter = upb_inttable_begin(&m->fieldtab); for(; !upb_inttable_done(iter); iter = upb_inttable_next(&m->fieldtab, iter)) { - upb_handlers_fieldent *fe = upb_inttable_iter_value(iter); + upb_fieldent *fe = upb_inttable_iter_value(iter); + if (fe->type == UPB_TYPE_ENDGROUP) continue; upb_fielddef *f = upb_value_getfielddef(fe->fval); uint16_t msg_size = 0; uint8_t set_flags_bytes = 0; diff --git a/src/upb_stream.c b/src/upb_stream.c index a408925..fe3a552 100644 --- a/src/upb_stream.c +++ b/src/upb_stream.c @@ -47,13 +47,13 @@ upb_flow_t upb_unknownval_nop(void *closure, upb_field_number_t fieldnum, return UPB_CONTINUE; } -static void upb_msgent_init(upb_handlers_msgent *e) { - upb_inttable_init(&e->fieldtab, 8, sizeof(upb_handlers_fieldent)); +static void upb_msgent_init(upb_msgent *e) { + upb_inttable_init(&e->fieldtab, 8, sizeof(upb_fieldent)); e->startmsg = &upb_startmsg_nop; e->endmsg = &upb_endmsg_nop; e->unknownval = &upb_unknownval_nop; - e->endgroup_f = NULL; e->tablearray = NULL; + e->is_group = false; } void upb_handlers_init(upb_handlers *h, upb_msgdef *md) { @@ -76,20 +76,19 @@ void upb_handlers_uninit(upb_handlers *h) { for (int i = 0; i < h->msgs_len; i++) { 
upb_inttable_free(&h->msgs[i].fieldtab); free(h->msgs[i].tablearray); - free(h->msgs[i].endgroup_f); } free(h->msgs); upb_msgdef_unref(h->toplevel_msgdef); } -static upb_handlers_fieldent *upb_handlers_getorcreate_without_fval( +static upb_fieldent *upb_handlers_getorcreate_without_fval( upb_handlers *h, upb_field_number_t fieldnum, upb_fieldtype_t type, bool repeated) { uint32_t tag = fieldnum << 3 | upb_types[type].native_wire_type; - upb_handlers_fieldent *f = - upb_inttable_lookup(&h->msgent->fieldtab, tag); + upb_fieldent *f = upb_inttable_lookup(&h->msgent->fieldtab, tag); if (!f) { - upb_handlers_fieldent new_f = {false, type, repeated, fieldnum, -1, UPB_NO_VALUE, - {&upb_value_nop}, &upb_endsubmsg_nop, 0, 0, 0}; + upb_fieldent new_f = {false, type, repeated, + repeated && upb_isprimitivetype(type), fieldnum, -1, UPB_NO_VALUE, + {&upb_value_nop}, &upb_endsubmsg_nop, 0, 0, 0, NULL}; if (upb_issubmsgtype(type)) new_f.cb.startsubmsg = &upb_startsubmsg_nop; upb_inttable_insert(&h->msgent->fieldtab, tag, &new_f); @@ -100,10 +99,10 @@ static upb_handlers_fieldent *upb_handlers_getorcreate_without_fval( return f; } -static upb_handlers_fieldent *upb_handlers_getorcreate( +static upb_fieldent *upb_handlers_getorcreate( upb_handlers *h, upb_field_number_t fieldnum, upb_fieldtype_t type, bool repeated, upb_value fval) { - upb_handlers_fieldent *f = + upb_fieldent *f = upb_handlers_getorcreate_without_fval(h, fieldnum, type, repeated); f->fval = fval; return f; @@ -164,7 +163,7 @@ void upb_register_typed_submsg(upb_handlers *h, upb_field_number_t fieldnum, upb_startsubmsg_handler_t start, upb_endsubmsg_handler_t end, upb_value fval) { - upb_handlers_fieldent *f = upb_handlers_getorcreate(h, fieldnum, type, repeated, fval); + upb_fieldent *f = upb_handlers_getorcreate(h, fieldnum, type, repeated, fval); f->cb.startsubmsg = start ? start : &upb_startsubmsg_nop; f->endsubmsg = end ? 
end : &upb_endsubmsg_nop; } @@ -172,14 +171,14 @@ void upb_register_typed_submsg(upb_handlers *h, upb_field_number_t fieldnum, void upb_handlers_typed_link(upb_handlers *h, upb_field_number_t fieldnum, upb_fieldtype_t type, bool repeated, int frames) { assert(frames <= (h->top - h->stack)); - upb_handlers_fieldent *f = + upb_fieldent *f = upb_handlers_getorcreate_without_fval(h, fieldnum, type, repeated); f->msgent_index = (h->top - frames)->msgent_index; } void upb_handlers_typed_push(upb_handlers *h, upb_field_number_t fieldnum, upb_fieldtype_t type, bool repeated) { - upb_handlers_fieldent *f = + upb_fieldent *f = upb_handlers_getorcreate_without_fval(h, fieldnum, type, repeated); if (h->top == h->limit) abort(); // TODO: make growable. ++h->top; @@ -201,6 +200,15 @@ void upb_handlers_typed_push(upb_handlers *h, upb_field_number_t fieldnum, assert(f); h->top->msgdef = upb_downcast_msgdef(f->def); } + if (type == UPB_TYPE(GROUP)) { + // Insert a fieldent for ENDGROUP so we can easily dispatch endgroup when + // we see it in the submessage. + // TODO: assert that no other fields in the group are registered with the + // same name or number. 
+ upb_register_typed_submsg(h, fieldnum, UPB_TYPE_ENDGROUP, false, NULL, NULL, + UPB_NO_VALUE); + h->msgent->is_group = true; + } } void upb_handlers_push(upb_handlers *h, upb_fielddef *f, @@ -226,15 +234,15 @@ void upb_handlers_pop(upb_handlers *h, upb_fielddef *f) { /* upb_dispatcher *************************************************************/ -static upb_handlers_fieldent toplevel_f = { - false, UPB_TYPE(GROUP), false, 0, +static upb_fieldent toplevel_f = { + false, UPB_TYPE(GROUP), false, false, 0, 0, // msgent_index #ifdef NDEBUG {{0}}, #else {{0}, UPB_VALUETYPE_RAW}, #endif - {NULL}, NULL, 0, 0, 0}; + {NULL}, NULL, 0, 0, 0, NULL}; void upb_dispatcher_init(upb_dispatcher *d, upb_handlers *h) { d->handlers = h; @@ -255,6 +263,7 @@ void upb_dispatcher_reset(upb_dispatcher *d, void *top_closure, uint32_t top_end d->top = d->stack; d->top->closure = top_closure; d->top->end_offset = top_end_offset; + d->top->is_packed = false; } void upb_dispatcher_uninit(upb_dispatcher *d) { @@ -285,8 +294,7 @@ void upb_dispatch_endmsg(upb_dispatcher *d, upb_status *status) { upb_copyerr(status, &d->status); } -upb_flow_t upb_dispatch_startsubmsg(upb_dispatcher *d, - upb_dispatcher_field *f, +upb_flow_t upb_dispatch_startsubmsg(upb_dispatcher *d, upb_fieldent *f, size_t userval) { ++d->current_depth; if (upb_dispatcher_skipping(d)) return UPB_SKIPSUBMSG; @@ -308,6 +316,7 @@ upb_flow_t upb_dispatch_startsubmsg(upb_dispatcher *d, d->top->f = f; d->top->end_offset = userval; d->top->closure = sflow.closure; + d->top->is_packed = false; d->msgent = upb_handlers_getmsgent(d->handlers, f); d->dispatch_table = &d->msgent->fieldtab; return upb_dispatch_startmsg(d); @@ -319,7 +328,7 @@ upb_flow_t upb_dispatch_endsubmsg(upb_dispatcher *d) { flow = UPB_SKIPSUBMSG; } else { assert(d->top > d->stack); - upb_dispatcher_field *old_f = d->top->f; + upb_fieldent *old_f = d->top->f; d->msgent->endmsg(d->top->closure, &d->status); --d->top; d->msgent = upb_handlers_getmsgent(d->handlers, 
d->top->f); diff --git a/src/upb_stream.h b/src/upb_stream.h index d024675..e749964 100644 --- a/src/upb_stream.h +++ b/src/upb_stream.h @@ -88,11 +88,12 @@ upb_sflow_t upb_startsubmsg_nop(void *closure, upb_value fval); upb_flow_t upb_endsubmsg_nop(void *closure, upb_value fval); upb_flow_t upb_unknownval_nop(void *closure, upb_field_number_t fieldnum, upb_value val); - -typedef struct { +struct _upb_decoder; +typedef struct _upb_fieldent { bool junk; upb_fieldtype_t type; bool repeated; + bool is_repeated_primitive; uint32_t number; // For upb_issubmsg(f) only, the index into the msgdef array of the submsg. // -1 if unset (indicates that submsg should be skipped). @@ -106,23 +107,26 @@ typedef struct { uint32_t jit_pclabel; uint32_t jit_pclabel_notypecheck; uint32_t jit_submsg_done_pclabel; -} upb_handlers_fieldent; + void (*decode)(struct _upb_decoder *d, struct _upb_fieldent *f); +} upb_fieldent; -typedef struct _upb_handlers_msgent { +typedef struct _upb_msgent { upb_startmsg_handler_t startmsg; upb_endmsg_handler_t endmsg; upb_unknownval_handler_t unknownval; - // Maps field number -> upb_handlers_fieldent. + // Maps field number -> upb_fieldent. upb_inttable fieldtab; uint32_t jit_startmsg_pclabel; uint32_t jit_endofbuf_pclabel; uint32_t jit_endofmsg_pclabel; uint32_t jit_unknownfield_pclabel; - upb_handlers_fieldent *endgroup_f; // NULL if not a group. + bool is_group; int32_t jit_parent_field_done_pclabel; uint32_t max_field_number; + // Currently keyed on field number. Could also try keying it + // on encoded or decoded tag, or on encoded field number. void **tablearray; -} upb_handlers_msgent; +} upb_msgent; typedef struct { upb_msgdef *msgdef; @@ -131,10 +135,10 @@ typedef struct { struct _upb_handlers { // Array of msgdefs, [0]=toplevel. - upb_handlers_msgent *msgs; + upb_msgent *msgs; int msgs_len, msgs_size; upb_msgdef *toplevel_msgdef; // We own a ref. 
- upb_handlers_msgent *msgent; + upb_msgent *msgent; upb_handlers_frame stack[UPB_MAX_TYPE_DEPTH], *top, *limit; bool should_jit; }; @@ -272,12 +276,11 @@ void upb_handlers_typed_push(upb_handlers *h, upb_field_number_t fieldnum, upb_fieldtype_t type, bool repeated); void upb_handlers_typed_pop(upb_handlers *h); -INLINE upb_handlers_msgent *upb_handlers_getmsgent(upb_handlers *h, - upb_handlers_fieldent *f) { +INLINE upb_msgent *upb_handlers_getmsgent(upb_handlers *h, upb_fieldent *f) { assert(f->msgent_index != -1); return &h->msgs[f->msgent_index]; } -upb_handlers_fieldent *upb_handlers_lookup(upb_inttable *dispatch_table, upb_field_number_t fieldnum); +upb_fieldent *upb_handlers_lookup(upb_inttable *dispatch_table, upb_field_number_t fieldnum); /* upb_dispatcher *************************************************************/ @@ -298,11 +301,12 @@ upb_handlers_fieldent *upb_handlers_lookup(upb_inttable *dispatch_table, upb_fie // consumed, like if this is a submessage of a larger stream. typedef struct { - upb_handlers_fieldent *f; + upb_fieldent *f; void *closure; // Relative to the beginning of this buffer. // For groups and the top-level: UINT32_MAX. uint32_t end_offset; + bool is_packed; // == !upb_issubmsg(f) && end_offset != UPB_REPATEDEND } upb_dispatcher_frame; typedef struct { @@ -311,7 +315,7 @@ typedef struct { upb_handlers *handlers; // Msg and dispatch table for the current level. - upb_handlers_msgent *msgent; + upb_msgent *msgent; upb_inttable *dispatch_table; // The number of startsubmsg calls without a corresponding endsubmsg call. 
@@ -342,8 +346,6 @@ INLINE bool upb_dispatcher_noframe(upb_dispatcher *d) { } -typedef upb_handlers_fieldent upb_dispatcher_field; - void upb_dispatcher_init(upb_dispatcher *d, upb_handlers *h); void upb_dispatcher_reset(upb_dispatcher *d, void *top_closure, uint32_t top_end_offset); void upb_dispatcher_uninit(upb_dispatcher *d); @@ -352,20 +354,20 @@ upb_flow_t upb_dispatch_startmsg(upb_dispatcher *d); void upb_dispatch_endmsg(upb_dispatcher *d, upb_status *status); // Looks up a field by number for the current message. -INLINE upb_dispatcher_field *upb_dispatcher_lookup(upb_dispatcher *d, - upb_field_number_t n) { - return (upb_dispatcher_field*)upb_inttable_fastlookup( - d->dispatch_table, n, sizeof(upb_dispatcher_field)); +INLINE upb_fieldent *upb_dispatcher_lookup(upb_dispatcher *d, + upb_field_number_t n) { + return (upb_fieldent*)upb_inttable_fastlookup( + d->dispatch_table, n, sizeof(upb_fieldent)); } // Dispatches values or submessages -- the client is responsible for having // previously looked up the field. 
upb_flow_t upb_dispatch_startsubmsg(upb_dispatcher *d, - upb_dispatcher_field *f, + upb_fieldent *f, size_t userval); upb_flow_t upb_dispatch_endsubmsg(upb_dispatcher *d); -INLINE upb_flow_t upb_dispatch_value(upb_dispatcher *d, upb_dispatcher_field *f, +INLINE upb_flow_t upb_dispatch_value(upb_dispatcher *d, upb_fieldent *f, upb_value val) { if (upb_dispatcher_skipping(d)) return UPB_SKIPSUBMSG; upb_flow_t flow = f->cb.value(d->top->closure, f->fval, val); diff --git a/src/upb_string.h b/src/upb_string.h index 5aa5f3b..e017268 100644 --- a/src/upb_string.h +++ b/src/upb_string.h @@ -182,6 +182,11 @@ INLINE void upb_string_recycle(upb_string **_str) { str->len = 0; _upb_string_release(str); } else { + //if (!str) { + // printf("!str\n"); + //} + //else if (upb_atomic_read(&str->refcount) != 1) { printf("refcount: %d\n", upb_atomic_read(&str->refcount)); } + //else { printf("Some other reason.\n"); } upb_string_unref(str); *_str = upb_string_new(); } diff --git a/src/upb_textprinter.c b/src/upb_textprinter.c index 948d28c..52be51e 100644 --- a/src/upb_textprinter.c +++ b/src/upb_textprinter.c @@ -7,9 +7,10 @@ #include "upb_textprinter.h" +#include +#include #include #include -#include struct _upb_textprinter { upb_bytesink *bytesink; @@ -99,10 +100,12 @@ static upb_flow_t upb_textprinter_value(void *_p, upb_value fval, #define CASE(fmtstr, member) \ CHECK(upb_bytesink_printf(p->bytesink, &p->status, fmtstr, upb_value_get ## member(val))); break; switch(f->type) { + // TODO: figure out what we should really be doing for these + // floating-point formats. 
case UPB_TYPE(DOUBLE): - CASE("%0.f", double); + CHECK(upb_bytesink_printf(p->bytesink, &p->status, "%.*g", DBL_DIG, upb_value_getdouble(val))); break; case UPB_TYPE(FLOAT): - CASE("%0.f", float) + CHECK(upb_bytesink_printf(p->bytesink, &p->status, "%.*g", FLT_DIG+2, upb_value_getfloat(val))); break; case UPB_TYPE(INT64): case UPB_TYPE(SFIXED64): case UPB_TYPE(SINT64): diff --git a/src/upb_varint.c b/src/upb_varint.c new file mode 100644 index 0000000..25052aa --- /dev/null +++ b/src/upb_varint.c @@ -0,0 +1,54 @@ +/* + * upb - a minimalist implementation of protocol buffers. + * + * Copyright (c) 2011 Google Inc. See LICENSE for details. + * Author: Josh Haberman + */ + +#include "upb_varint.h" + +// Given an encoded varint v, returns an integer with a single bit set that +// indicates the end of the varint. Subtracting one from this value will +// yield a mask that leaves only bits that are part of the varint. Returns +// 0 if the varint is unterminated. +INLINE uint64_t upb_get_vstopbit(uint64_t v) { + uint64_t cbits = v | 0x7f7f7f7f7f7f7f7fULL; + return ~cbits & (cbits+1); +} +INLINE uint64_t upb_get_vmask(uint64_t v) { return upb_get_vstopbit(v) - 1; } + +upb_decoderet upb_vdecode_max8_massimino(upb_decoderet r) { + uint64_t b; + memcpy(&b, r.p, sizeof(b)); + uint64_t stop_bit = upb_get_vstopbit(b); + b = (b & 0x7f7f7f7f7f7f7f7fULL) & (stop_bit - 1); + b += b & 0x007f007f007f007fULL; + b += 3 * (b & 0x0000ffff0000ffffULL); + b += 15 * (b & 0x00000000ffffffffULL); + if (stop_bit == 0) { + // Error: unterminated varint. 
+ upb_decoderet err_r = {(void*)0, 0}; + return err_r; + } + upb_decoderet my_r = {r.p + ((__builtin_ctzll(stop_bit) + 1) / 8), + r.val | (b << 7)}; + return my_r; +} + +upb_decoderet upb_vdecode_max8_wright(upb_decoderet r) { + uint64_t b; + memcpy(&b, r.p, sizeof(b)); + uint64_t stop_bit = upb_get_vstopbit(b); + b &= (stop_bit - 1); + b = ((b & 0x7f007f007f007f00) >> 1) | (b & 0x007f007f007f007f); + b = ((b & 0xffff0000ffff0000) >> 2) | (b & 0x0000ffff0000ffff); + b = ((b & 0xffffffff00000000) >> 4) | (b & 0x00000000ffffffff); + if (stop_bit == 0) { + // Error: unterminated varint. + upb_decoderet err_r = {(void*)0, 0}; + return err_r; + } + upb_decoderet my_r = {r.p + ((__builtin_ctzll(stop_bit) + 1) / 8), + r.val | (b << 14)}; + return my_r; +} diff --git a/src/upb_varint.h b/src/upb_varint.h index 7ca93ec..fb44cd9 100644 --- a/src/upb_varint.h +++ b/src/upb_varint.h @@ -75,53 +75,11 @@ done: return r; } -// Given an encoded varint v, returns an integer with a single bit set that -// indicates the end of the varint. Subtracting one from this value will -// yield a mask that leaves only bits that are part of the varint. Returns -// 0 if the varint is unterminated. -INLINE uint64_t upb_get_vstopbit(uint64_t v) { - uint64_t cbits = v | 0x7f7f7f7f7f7f7f7fULL; - return ~cbits & (cbits+1); -} -INLINE uint64_t upb_get_vmask(uint64_t v) { return upb_get_vstopbit(v) - 1; } - // Decodes a varint of at most 8 bytes without branching (except for error). -INLINE upb_decoderet upb_vdecode_max8_wright(upb_decoderet r) { - uint64_t b; - memcpy(&b, r.p, sizeof(b)); - uint64_t stop_bit = upb_get_vstopbit(b); - b &= (stop_bit - 1); - b = ((b & 0x7f007f007f007f00) >> 1) | (b & 0x007f007f007f007f); - b = ((b & 0xffff0000ffff0000) >> 2) | (b & 0x0000ffff0000ffff); - b = ((b & 0xffffffff00000000) >> 4) | (b & 0x00000000ffffffff); - if (stop_bit == 0) { - // Error: unterminated varint. 
- upb_decoderet err_r = {(void*)0, 0}; - return err_r; - } - upb_decoderet my_r = {r.p + ((__builtin_ctzll(stop_bit) + 1) / 8), - r.val | (b << 14)}; - return my_r; -} +upb_decoderet upb_vdecode_max8_wright(upb_decoderet r); // Another implementation of the previous. -INLINE upb_decoderet upb_vdecode_max8_massimino(upb_decoderet r) { - uint64_t b; - memcpy(&b, r.p, sizeof(b)); - uint64_t stop_bit = upb_get_vstopbit(b); - b = (b & 0x7f7f7f7f7f7f7f7fULL) & (stop_bit - 1); - b += b & 0x007f007f007f007fULL; - b += 3 * (b & 0x0000ffff0000ffffULL); - b += 15 * (b & 0x00000000ffffffffULL); - if (stop_bit == 0) { - // Error: unterminated varint. - upb_decoderet err_r = {(void*)0, 0}; - return err_r; - } - upb_decoderet my_r = {r.p + ((__builtin_ctzll(stop_bit) + 1) / 8), - r.val | (b << 7)}; - return my_r; -} +upb_decoderet upb_vdecode_max8_massimino(upb_decoderet r); // Template for a function that checks the first two bytes with branching // and dispatches 2-10 bytes with a separate function. @@ -169,8 +127,8 @@ INLINE size_t upb_value_size(uint64_t val) { return val == 0 ? 1 : high_bit / 8 + 1; } -// Currently only works with 32-bit varints. -INLINE uint64_t upb_vencode(uint32_t val) { +// Encodes a 32-bit varint, *not* sign-extended. 
+INLINE uint64_t upb_vencode32(uint32_t val) { uint64_t ret = 0; for (int bitpos = 0; val; bitpos+=8, val >>=7) { if (bitpos > 0) ret |= (1 << (bitpos-1)); diff --git a/tests/test_decoder.c b/tests/test_decoder.c index 714871a..5f01179 100644 --- a/tests/test_decoder.c +++ b/tests/test_decoder.c @@ -2,38 +2,73 @@ #include "upb_decoder.h" #include "upb_textprinter.h" #include "upb_stdio.h" +#include "upb_glue.h" + +int main(int argc, char *argv[]) { + if (argc < 3) { + fprintf(stderr, "Usage: test_decoder \n"); + return 1; + } -int main() { upb_symtab *symtab = upb_symtab_new(); - upb_symtab_add_descriptorproto(symtab); - upb_def *fds = upb_symtab_lookup( - symtab, UPB_STRLIT("google.protobuf.FileDescriptorSet")); + upb_string *desc = upb_strreadfile(argv[1]); + if (!desc) { + fprintf(stderr, "Couldn't open descriptor file: %s\n", argv[1]); + return 1; + } + + upb_status status = UPB_STATUS_INIT; + upb_parsedesc(symtab, desc, &status); + if (!upb_ok(&status)) { + fprintf(stderr, "Error parsing descriptor: "); + upb_printerr(&status); + return 1; + } + upb_string_unref(desc); + + upb_string *name = upb_strdupc(argv[2]); + upb_def *md = upb_symtab_lookup(symtab, name); + upb_string_unref(name); + if (!md) { + fprintf(stderr, "Descriptor did not contain message: %s\n", argv[2]); + return 1; + } + + upb_msgdef *m = upb_dyncast_msgdef(md); + if (!m) { + fprintf(stderr, "Def was not a msgdef.\n"); + return 1; + } upb_stdio *in = upb_stdio_new(); upb_stdio_reset(in, stdin); upb_stdio *out = upb_stdio_new(); upb_stdio_reset(out, stdout); - upb_decoder d; - upb_decoder_init(&d, upb_downcast_msgdef(fds)); - upb_decoder_reset(&d, upb_stdio_bytesrc(in)); - upb_textprinter *p = upb_textprinter_new(); + upb_handlers handlers; - upb_handlers_init(&handlers); - upb_textprinter_reset(p, &handlers, upb_stdio_bytesink(out), false); - upb_src *src = upb_decoder_src(&d); - upb_src_sethandlers(src, &handlers); + upb_handlers_init(&handlers, m); + upb_textprinter *p = 
upb_textprinter_new(); + upb_textprinter_reset(p, upb_stdio_bytesink(out), false); + upb_textprinter_reghandlers(&handlers); - upb_status status = UPB_STATUS_INIT; - upb_src_run(src, &status); + upb_decoder d; + upb_decoder_init(&d, &handlers); + upb_decoder_reset(&d, upb_stdio_bytesrc(in), p); + + upb_clearerr(&status); + upb_decoder_decode(&d, &status); - assert(upb_ok(&status)); + if (!upb_ok(&status)) { + fprintf(stderr, "Error parsing input: "); + upb_printerr(&status); + } upb_status_uninit(&status); upb_stdio_free(in); upb_stdio_free(out); upb_decoder_uninit(&d); upb_textprinter_free(p); - upb_def_unref(fds); + upb_def_unref(UPB_UPCAST(m)); upb_symtab_unref(symtab); // Prevent C library from holding buffers open, so Valgrind doesn't see -- cgit v1.2.3