From 9116c697f845e7ca215628029800c36f7dfbfaee Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Fri, 15 Jan 2010 18:12:28 -0800 Subject: upb_parser -> upb_decoder --- Makefile | 2 +- benchmarks/parsetostruct.upb_table.c | 18 +- src/upb_data.c | 14 +- src/upb_data.h | 4 +- src/upb_decoder.c | 507 +++++++++++++++++++++++++++++++++++ src/upb_decoder.h | 57 ++++ src/upb_def.c | 2 +- src/upb_parse.c | 507 ----------------------------------- src/upb_parse.h | 57 ---- tests/test_vs_proto2.cc | 4 +- tests/tests.c | 2 +- tools/upbc.c | 2 +- 12 files changed, 588 insertions(+), 588 deletions(-) create mode 100644 src/upb_decoder.c create mode 100644 src/upb_decoder.h delete mode 100644 src/upb_parse.c delete mode 100644 src/upb_parse.h diff --git a/Makefile b/Makefile index 0d5da5d..7235ff9 100644 --- a/Makefile +++ b/Makefile @@ -46,7 +46,7 @@ clean: cd lang_ext/python && python setup.py clean --all # The core library (src/libupb.a) -SRC=src/upb.c src/upb_parse.c src/upb_table.c src/upb_def.c src/upb_data.c \ +SRC=src/upb.c src/upb_decoder.c src/upb_table.c src/upb_def.c src/upb_data.c \ src/upb_encoder.c descriptor/descriptor.c src/upb_text.c # Override the optimization level for upb_def.o, because it is not in the # critical path but gets very large when -O3 is used. diff --git a/benchmarks/parsetostruct.upb_table.c b/benchmarks/parsetostruct.upb_table.c index 7614b30..72799e4 100644 --- a/benchmarks/parsetostruct.upb_table.c +++ b/benchmarks/parsetostruct.upb_table.c @@ -3,18 +3,18 @@ #include "upb_data.h" #include "upb_def.h" -#include "upb_parse.h" +#include "upb_decoder.h" static struct upb_symtab *s; static upb_strptr str; static struct upb_msgdef *def; static upb_msg *msgs[NUM_MESSAGES]; -static upb_parser *parser; +static upb_decoder *decoder; static upb_msgsink *sink; static bool initialize() { - // Initialize upb state, parse descriptor. + // Initialize upb state, decode descriptor. 
struct upb_status status = UPB_STATUS_INIT; s = upb_symtab_new(); upb_strptr fds = upb_strreadfile(MESSAGE_DESCRIPTOR_FILE); @@ -49,7 +49,7 @@ static bool initialize() fprintf(stderr, "Error reading " MESSAGE_FILE "\n"); return false; } - parser = upb_parser_new(def); + decoder = upb_decoder_new(def); sink = upb_msgsink_new(def); return true; } @@ -60,7 +60,7 @@ static void cleanup() upb_msg_unref(msgs[i], def); upb_string_unref(str); upb_symtab_unref(s); - upb_parser_free(parser); + upb_decoder_free(decoder); upb_msgsink_free(sink); } @@ -69,11 +69,11 @@ static size_t run(int i) struct upb_status status = UPB_STATUS_INIT; upb_msg *msg = msgs[i%NUM_MESSAGES]; upb_msgsink_reset(sink, msg); - upb_parser_reset(parser, upb_msgsink_sink(sink)); + upb_decoder_reset(decoder, upb_msgsink_sink(sink)); upb_msg_clear(msg, def); - size_t parsed = upb_parser_parse(parser, str, &status); - if(!upb_ok(&status) || parsed != upb_strlen(str)) { - fprintf(stderr, "Parse error: %s\n", status.msg); + size_t decoded = upb_decoder_decode(decoder, str, &status); + if(!upb_ok(&status) || decoded != upb_strlen(str)) { + fprintf(stderr, "Decode error: %s\n", status.msg); return 0; } return upb_strlen(str); diff --git a/src/upb_data.c b/src/upb_data.c index fd0415c..a7027d6 100644 --- a/src/upb_data.c +++ b/src/upb_data.c @@ -6,8 +6,8 @@ #include #include "upb_data.h" +#include "upb_decoder.h" #include "upb_def.h" -#include "upb_parse.h" static uint32_t round_up_to_pow2(uint32_t v) { @@ -282,18 +282,18 @@ void _upb_msg_free(upb_msg *msg, struct upb_msgdef *md) free(msg); } -void upb_msg_parsestr(upb_msg *msg, struct upb_msgdef *md, upb_strptr str, - struct upb_status *status) +void upb_msg_decodestr(upb_msg *msg, struct upb_msgdef *md, upb_strptr str, + struct upb_status *status) { - upb_parser *p = upb_parser_new(md); + upb_decoder *d = upb_decoder_new(md); upb_msgsink *s = upb_msgsink_new(md); upb_msgsink_reset(s, msg); - upb_parser_reset(p, upb_msgsink_sink(s)); + upb_decoder_reset(d, 
upb_msgsink_sink(s)); upb_msg_clear(msg, md); - upb_parser_parse(p, str, status); + upb_decoder_decode(d, str, status); - upb_parser_free(p); + upb_decoder_free(d); upb_msgsink_free(s); } diff --git a/src/upb_data.h b/src/upb_data.h index 22a5ea9..a11b84a 100644 --- a/src/upb_data.h +++ b/src/upb_data.h @@ -516,8 +516,8 @@ INLINE void upb_msg_clear(upb_msg *msg, struct upb_msgdef *md) { // A convenience function for parsing an entire protobuf all at once, without // having to worry about setting up the appropriate objects. -void upb_msg_parsestr(upb_msg *msg, struct upb_msgdef *md, upb_strptr str, - struct upb_status *status); +void upb_msg_decodestr(upb_msg *msg, struct upb_msgdef *md, upb_strptr str, + struct upb_status *status); /* upb_msgsrc *****************************************************************/ diff --git a/src/upb_decoder.c b/src/upb_decoder.c new file mode 100644 index 0000000..ff9753d --- /dev/null +++ b/src/upb_decoder.c @@ -0,0 +1,507 @@ +/* + * upb - a minimalist implementation of protocol buffers. + * + * Copyright (c) 2008-2009 Joshua Haberman. See LICENSE for details. + */ + +#include "upb_decoder.h" + +#include +#include +#include +#include "upb_def.h" + +/* Functions to read wire values. *********************************************/ + +// These functions are internal to the decode, but might be moved into an +// internal header file if we at some point in the future opt to do code +// generation, because the generated code would want to inline these functions. +// The same applies to the functions to read .proto values below. + +const uint8_t *upb_get_v_uint64_t_full(const uint8_t *buf, const uint8_t *end, + uint64_t *val, + struct upb_status *status); + +// Gets a varint (wire type: UPB_WIRE_TYPE_VARINT). 
+// "buf" may equal "end" here (for example when a tag was the last byte of the
+// supplied data), so the fast path checks bounds before touching *buf.
+//
+// On success stores the decoded value in *val and returns a pointer to the
+// first byte after the varint.  On failure the error is reported through
+// "status" by upb_get_v_uint64_t_full().
+INLINE const uint8_t *upb_get_v_uint64_t(const uint8_t *buf, const uint8_t *end,
+                                         uint64_t *val,
+                                         struct upb_status *status)
+{
+  // We inline this common case (1-byte varints), if that fails we dispatch to
+  // the full (non-inlined) version.  An exhausted buffer also takes the slow
+  // path, which reports the condition instead of reading past "end".
+  if(buf < end && (*buf & 0x80) == 0) {
+    *val = *buf & 0x7f;
+    return buf + 1;
+  } else {
+    return upb_get_v_uint64_t_full(buf, end, val, status);
+  }
+}
+
+// Gets a varint -- called when we only need 32 bits of it.  Note that a 32-bit
+// varint is not a true wire type.  The whole varint is still consumed; only
+// the low 32 bits of its value are kept.
+INLINE const uint8_t *upb_get_v_uint32_t(const uint8_t *buf, const uint8_t *end,
+                                         uint32_t *val,
+                                         struct upb_status *status)
+{
+  uint64_t val64;
+  const uint8_t *ret = upb_get_v_uint64_t(buf, end, &val64, status);
+  *val = (uint32_t)val64;  // Discard the high bits.
+  return ret;
+}
+
+// Gets a fixed-length 32-bit integer (wire type: UPB_WIRE_TYPE_32BIT).
+// If fewer than 4 bytes remain, sets status->code to
+// UPB_STATUS_NEED_MORE_DATA and returns "end" without writing *val.
+INLINE const uint8_t *upb_get_f_uint32_t(const uint8_t *buf, const uint8_t *end,
+                                         uint32_t *val, struct upb_status *status)
+{
+  const uint8_t *uint32_end = buf + sizeof(uint32_t);
+  if(uint32_end > end) {
+    status->code = UPB_STATUS_NEED_MORE_DATA;
+    return end;
+  }
+#if UPB_UNALIGNED_READS_OK
+  // NOTE(review): this direct load yields the same value as the byte-wise
+  // branch below only on a little-endian host -- confirm that
+  // UPB_UNALIGNED_READS_OK is never set for big-endian targets.
+  *val = *(uint32_t*)buf;
+#else
+  // Assemble the value byte by byte, least-significant byte first.
+#define SHL(val, bits) ((uint32_t)(val) << (bits))
+  *val = SHL(buf[0], 0) | SHL(buf[1], 8) | SHL(buf[2], 16) | SHL(buf[3], 24);
+#undef SHL
+#endif
+  return uint32_end;
+}
+
+// Gets a fixed-length 64-bit integer (wire type: UPB_WIRE_TYPE_64BIT).
+// If fewer than 8 bytes remain, sets status->code to
+// UPB_STATUS_NEED_MORE_DATA and returns "end" without writing *val.
+INLINE const uint8_t *upb_get_f_uint64_t(const uint8_t *buf, const uint8_t *end,
+                                         uint64_t *val,
+                                         struct upb_status *status)
+{
+  const uint8_t *uint64_end = buf + sizeof(uint64_t);
+  if(uint64_end > end) {
+    status->code = UPB_STATUS_NEED_MORE_DATA;
+    return end;
+  }
+#if UPB_UNALIGNED_READS_OK
+  // NOTE(review): this direct load yields the same value as the byte-wise
+  // branch below only on a little-endian host -- confirm that
+  // UPB_UNALIGNED_READS_OK is never set for big-endian targets.
+  *val = *(uint64_t*)buf;
+#else
+  // Assemble the value byte by byte, least-significant byte first.
+#define SHL(val, bits) ((uint64_t)(val) << (bits))
+  *val = SHL(buf[0], 0) | SHL(buf[1], 8) | SHL(buf[2], 16) | SHL(buf[3], 24) |
+         SHL(buf[4], 32) | SHL(buf[5], 40) | SHL(buf[6], 48) | SHL(buf[7], 56);
+#undef SHL
+#endif
+  return uint64_end;
+}
+
+// Skips over a varint without decoding it.  Returns a pointer to the first
+// byte after the varint.  On failure returns "end" and sets status->code to:
+//   UPB_STATUS_NEED_MORE_DATA      if the buffer ran out before the varint
+//                                  was terminated, or
+//   UPB_ERROR_UNTERMINATED_VARINT  if 10 bytes were read and all of them had
+//                                  the continuation bit set.
+INLINE const uint8_t *upb_skip_v_uint64_t(const uint8_t *buf,
+                                          const uint8_t *end,
+                                          struct upb_status *status)
+{
+  // A 64-bit varint occupies at most 10 bytes; never scan further than that.
+  int remaining = 10;
+  uint8_t last = 0x80;
+  for(; buf < end && remaining > 0 && (last & 0x80); buf++, remaining--)
+    last = *buf;
+
+  if(last & 0x80) {
+    // Still inside the varint: either it is too long to be valid, or we ran
+    // out of input before reaching its final byte.
+    if(remaining == 0) {
+      status->code = UPB_ERROR_UNTERMINATED_VARINT;
+    } else {
+      status->code = UPB_STATUS_NEED_MORE_DATA;
+    }
+    buf = end;
+  }
+  return buf;
+}
+
+// Skips over a fixed-length 32-bit value.  If fewer than 4 bytes remain, sets
+// status->code to UPB_STATUS_NEED_MORE_DATA and returns "end".
+INLINE const uint8_t *upb_skip_f_uint32_t(const uint8_t *buf,
+                                          const uint8_t *end,
+                                          struct upb_status *status)
+{
+  const uint8_t *uint32_end = buf + sizeof(uint32_t);
+  if(uint32_end > end) {
+    status->code = UPB_STATUS_NEED_MORE_DATA;
+    return end;
+  }
+  return uint32_end;
+}
+
+// Skips over a fixed-length 64-bit value.  If fewer than 8 bytes remain, sets
+// status->code to UPB_STATUS_NEED_MORE_DATA and returns "end".
+INLINE const uint8_t *upb_skip_f_uint64_t(const uint8_t *buf,
+                                          const uint8_t *end,
+                                          struct upb_status *status)
+{
+  const uint8_t *uint64_end = buf + sizeof(uint64_t);
+  if(uint64_end > end) {
+    status->code = UPB_STATUS_NEED_MORE_DATA;
+    return end;
+  }
+  return uint64_end;
+}
+
+/* Functions to read .proto values. *******************************************/
+
+// Performs zig-zag decoding, which is used by sint32 and sint64.
+// Zig-zag maps 0, 1, 2, 3, ... to 0, -1, 1, -2, ...: the low bit carries the
+// sign and the remaining bits carry the magnitude.
+INLINE int32_t upb_zzdec_32(uint32_t n) { return (n >> 1) ^ -(int32_t)(n & 1); }
+INLINE int64_t upb_zzdec_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); }
+
+// Use macros to define a set of two functions for each .proto type:
+//
+//  // Reads and converts a .proto value from buf, placing it in d.
+//  // "end" indicates the end of the current buffer (if the buffer does
+//  // not contain the entire value UPB_STATUS_NEED_MORE_DATA is returned).
+//  // On success, a pointer will be returned to the first byte that was
+//  // not consumed.
+//  const uint8_t *upb_get_INT32(const uint8_t *buf, const uint8_t *end,
+//                               int32_t *d, struct upb_status *status);
+//
+//  // Given an already read wire value s (source), convert it to a .proto
+//  // value and return it.
+//  int32_t upb_wvtov_INT32(uint32_t s);
+//
+// These are the most efficient functions to call if you want to decode a value
+// for a known type.
+
+// Emits the signature of upb_wvtov_<type>(); used both as a prototype and as
+// the head of the definition whose body follows each T(...) line below.
+#define WVTOV(type, wire_t, val_t) \
+  INLINE val_t upb_wvtov_ ## type(wire_t s)
+
+// Emits upb_get_<type>(): reads the wire value with upb_get_<v_or_f>_<wire_t>()
+// and converts it with upb_wvtov_<type>().  "tmp" is zero-initialized so that
+// *d is well defined even when the read fails without writing it.
+#define GET(type, v_or_f, wire_t, val_t, member_name) \
+  INLINE const uint8_t *upb_get_ ## type(const uint8_t *buf, const uint8_t *end, \
+                                   val_t *d, struct upb_status *status) { \
+    wire_t tmp = 0; \
+    const uint8_t *ret = upb_get_ ## v_or_f ## _ ## wire_t(buf, end, &tmp, status); \
+    *d = upb_wvtov_ ## type(tmp); \
+    return ret; \
+  }
+
+#define T(type, v_or_f, wire_t, val_t, member_name) \
+  WVTOV(type, wire_t, val_t);  /* prototype for GET below */ \
+  GET(type, v_or_f, wire_t, val_t, member_name) \
+  WVTOV(type, wire_t, val_t)
+
+T(INT32,    v, uint32_t, int32_t,  int32)   { return (int32_t)s;      }
+T(INT64,    v, uint64_t, int64_t,  int64)   { return (int64_t)s;      }
+T(UINT32,   v, uint32_t, uint32_t, uint32)  { return s;               }
+T(UINT64,   v, uint64_t, uint64_t, uint64)  { return s;               }
+T(SINT32,   v, uint32_t, int32_t,  int32)   { return upb_zzdec_32(s); }
+T(SINT64,   v, uint64_t, int64_t,  int64)   { return upb_zzdec_64(s); }
+T(FIXED32,  f, uint32_t, uint32_t, uint32)  { return s;               }
+T(FIXED64,  f, uint64_t, uint64_t, uint64)  { return s;               }
+T(SFIXED32, f, uint32_t, int32_t,  int32)   { return (int32_t)s;      }
+T(SFIXED64, f, uint64_t, int64_t,  int64)   { return (int64_t)s;      }
+// A bool is true when the varint is non-zero.  It is read at full 64-bit width
+// so that a value whose low 32 bits happen to be zero still decodes as true.
+T(BOOL,     v, uint64_t, bool,     _bool)   { return s != 0;          }
+T(ENUM,     v, uint32_t, int32_t,  int32)   { return (int32_t)s;      }
+T(DOUBLE,   f, uint64_t, double,   _double) {
+  // Reinterpret the 64 wire bits as a double through the value union.
+  union upb_value v;
+  v.uint64 = s;
+  return v._double;
+}
+T(FLOAT,    f, uint32_t, float,    _float)  {
+  // Reinterpret the 32 wire bits as a float through the value union.
+  union upb_value v;
+  v.uint32 = s;
+  return v._float;
+}
+
+#undef WVTOV
+#undef GET
+#undef T
+
+// Parses a tag, places the result in *tag.  The low three bits of the varint
+// are the wire type and the remaining bits are the field number.  Returns the
+// position after the tag; read errors are reported through "status".
+INLINE const uint8_t *decode_tag(const uint8_t *buf, const uint8_t *end,
+                                 struct upb_tag *tag, struct upb_status *status)
+{
+  uint32_t tag_int;
+  const uint8_t *ret = upb_get_v_uint32_t(buf, end, &tag_int, status);
+  tag->wire_type    = (upb_wire_type_t)(tag_int & 0x07);
+  tag->field_number = tag_int >> 3;
+  return ret;
+}
+
+
+/**
+ * Decodes a 64-bit varint of any length.  The inline upb_get_v_uint64_t()
+ * handles only the 1-byte case itself and dispatches here for everything else.
+ */
+const uint8_t *upb_get_v_uint64_t_full(const uint8_t *buf, const uint8_t *end,
+                                       uint64_t *val, struct upb_status *status)
+{
+  // A 64-bit varint is at most 10 bytes (10 * 7 = 70 payload bits).  Capping
+  // the loop there keeps the shift count below 64; shifting a uint64_t by 64
+  // or more bits is undefined behavior.
+  uint8_t last = 0x80;
+  int bitpos = 0;
+  *val = 0;
+
+  for(; buf < end && bitpos < 70 && (last & 0x80); buf++, bitpos += 7)
+    *val |= ((uint64_t)((last = *buf) & 0x7F)) << bitpos;
+
+  if(last & 0x80) {
+    // Still inside the varint: either it is too long to be valid, or the
+    // input ran out before its final byte.
+    if(bitpos >= 70) {
+      upb_seterr(status, UPB_ERROR_UNTERMINATED_VARINT,
+                 "Varint was unterminated after 10 bytes.\n");
+    } else {
+      upb_seterr(status, UPB_STATUS_NEED_MORE_DATA,
+                 "Provided data ended in the middle of a varint.\n");
+    }
+    buf = end;
+  }
+
+  return buf;
+}
+
+// Decodes a single varint, 64-bit or 32-bit wire value into *wv.  Delimited
+// values and groups are not handled: for those, status->code is set to
+// UPB_STATUS_ERROR and "end" is returned.
+const uint8_t *upb_decode_wire_value(uint8_t *buf, uint8_t *end,
+                                     upb_wire_type_t wt,
+                                     union upb_wire_value *wv,
+                                     struct upb_status *status)
+{
+  switch(wt) {
+    case UPB_WIRE_TYPE_VARINT:
+      return upb_get_v_uint64_t(buf, end, &wv->varint, status);
+    case UPB_WIRE_TYPE_64BIT:
+      return upb_get_f_uint64_t(buf, end, &wv->_64bit, status);
+    case UPB_WIRE_TYPE_32BIT:
+      return upb_get_f_uint32_t(buf, end, &wv->_32bit, status);
+    default:
+      status->code = UPB_STATUS_ERROR;  // Doesn't handle delimited, groups.
+      return end;
+  }
+}
+
+/**
+ * Advances buf past the current wire value (of type wt) and returns the new
+ * position.  Group markers carry no payload of their own, so buf is returned
+ * unchanged for them.
+ */
+static const uint8_t *skip_wire_value(const uint8_t *buf, const uint8_t *end,
+                                      upb_wire_type_t wt,
+                                      struct upb_status *status)
+{
+  switch(wt) {
+    case UPB_WIRE_TYPE_VARINT:
+      return upb_skip_v_uint64_t(buf, end, status);
+    case UPB_WIRE_TYPE_64BIT:
+      return upb_skip_f_uint64_t(buf, end, status);
+    case UPB_WIRE_TYPE_32BIT:
+      return upb_skip_f_uint32_t(buf, end, status);
+    case UPB_WIRE_TYPE_START_GROUP:
+      // TODO: skip to matching end group.
+      /* fallthrough */
+    case UPB_WIRE_TYPE_END_GROUP:
+      return buf;
+    default:
+      status->code = UPB_STATUS_ERROR;
+      return end;
+  }
+}
+
+// Decodes one scalar value of .proto type "ft" into the storage that "v"
+// points at.  Types without a case below (strings, bytes, messages, groups)
+// are not scalar values; for them "end" is returned and "status" is left
+// untouched.
+static const uint8_t *upb_decode_value(const uint8_t *buf, const uint8_t *end,
+                                       upb_field_type_t ft,
+                                       union upb_value_ptr v,
+                                       struct upb_status *status)
+{
+#define CASE(t, member_name) \
+  case UPB_TYPE(t): return upb_get_ ## t(buf, end, v.member_name, status);
+
+  switch(ft) {
+    CASE(DOUBLE,   _double)
+    CASE(FLOAT,    _float)
+    CASE(INT32,    int32)
+    CASE(INT64,    int64)
+    CASE(UINT32,   uint32)
+    CASE(UINT64,   uint64)
+    CASE(SINT32,   int32)
+    CASE(SINT64,   int64)
+    CASE(FIXED32,  uint32)
+    CASE(FIXED64,  uint64)
+    CASE(SFIXED32, int32)
+    CASE(SFIXED64, int64)
+    CASE(BOOL,     _bool)
+    CASE(ENUM,     int32)
+    default: return end;
+  }
+
+#undef CASE
+}
+
+// One entry of the decoder's nesting stack.
+struct upb_decoder_frame {
+  struct upb_msgdef *msgdef;   // Type of the message being decoded at this level.
+  struct upb_fielddef *field;  // Field through which the next level was entered.
+  size_t end_offset;  // For groups, 0.
+};
+
+struct upb_decoder {
+  // Immutable state of the decoder.
+  struct upb_msgdef *toplevel_msgdef;
+  upb_sink *sink;
+
+  // State pertaining to a particular decode (resettable).
+  // Stack entries store the offset where the submsg ends (for groups, 0).
+  struct upb_decoder_frame stack[UPB_MAX_NESTING], *top, *limit;
+  size_t completed_offset;
+  void *udata;
+};
+
+// Allocates a decoder for messages of type "msgdef".  Returns NULL if the
+// allocation fails.  The decoder must be reset before it is first used.
+upb_decoder *upb_decoder_new(struct upb_msgdef *msgdef)
+{
+  upb_decoder *d = malloc(sizeof(*d));
+  if(!d) return NULL;
+  d->toplevel_msgdef = msgdef;
+  d->limit = &d->stack[UPB_MAX_NESTING];
+  return d;
+}
+
+void upb_decoder_free(upb_decoder *d)
+{
+  free(d);
+}
+
+// Rewinds the decoder to the start of a new top-level message and attaches
+// the sink that will receive the decoded data.
+void upb_decoder_reset(upb_decoder *d, upb_sink *sink)
+{
+  d->top = d->stack;
+  d->completed_offset = 0;
+  d->sink = sink;
+  d->top->msgdef = d->toplevel_msgdef;
+  // The top-level message is not delimited (we can keep receiving data for it
+  // indefinitely), so we treat it like a group.
+  d->top->end_offset = 0;
+}
+
+// Translates the current frame's end offset into a pointer relative to
+// "start" (the buffer position that corresponds to d->completed_offset).
+// Frames without a known end (groups, top level) yield a sentinel pointer.
+static const void *get_msgend(upb_decoder *d, const uint8_t *start)
+{
+  if(d->top->end_offset > 0)
+    return start + (d->top->end_offset - d->completed_offset);
+  else
+    return (void*)UINTPTR_MAX;  // group.
+}
+
+// True if submsg_end is the sentinel that get_msgend() returns for groups.
+static bool isgroup(const void *submsg_end)
+{
+  return submsg_end == (void*)UINTPTR_MAX;
+}
+
+extern upb_wire_type_t upb_expected_wire_types[];
+// Returns true if wt is the correct on-the-wire type for ft.
+INLINE bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) {
+  // This doesn't currently support packed arrays.
+  return upb_type_info[ft].expected_wire_type == wt;
+}
+
+
+/**
+ * Pushes a new stack frame for a submessage with the given len (which will
+ * be zero if the submessage is a group).
+ *
+ * Returns the end pointer for the new frame.  If the nesting limit would be
+ * exceeded, an error is set on "status", NULL is returned and the stack is
+ * left unchanged, so d->top still refers to a valid frame.
+ */
+static const uint8_t *push(upb_decoder *d, const uint8_t *start,
+                           uint32_t submsg_len, struct upb_fielddef *f,
+                           struct upb_status *status)
+{
+  // Check before advancing: callers read d->top->msgdef right after push()
+  // returns, so d->top must never be left pointing past the stack.
+  if(d->top + 1 >= d->limit) {
+    upb_seterr(status, UPB_STATUS_ERROR,
+               "Nesting exceeded maximum (%d levels)\n",
+               UPB_MAX_NESTING);
+    return NULL;
+  }
+  d->top->field = f;
+  d->top++;
+  struct upb_decoder_frame *frame = d->top;
+  // Groups have no known length and must keep end_offset == 0 even when
+  // completed_offset is non-zero (i.e. on a later decode call for a stream).
+  frame->end_offset = submsg_len ? d->completed_offset + submsg_len : 0;
+  frame->msgdef = upb_downcast_msgdef(f->def);
+
+  upb_sink_onstart(d->sink, f);
+  return get_msgend(d, start);
+}
+
+/**
+ * Pops a stack frame, returning a pointer for where the next submsg should
+ * end (or a pointer that is out of range for a group).  The caller must make
+ * sure that the stack is not already at its base.
+ */
+static const void *pop(upb_decoder *d, const uint8_t *start)
+{
+  d->top--;
+  upb_sink_onend(d->sink, d->top->field);
+  return get_msgend(d, start);
+}
+
+
+// Decodes as much of "str" as possible, delivering the data to the sink given
+// to upb_decoder_reset().  Returns the number of bytes covered by fully
+// processed tag/value pairs; errors are reported through "status".
+size_t upb_decoder_decode(upb_decoder *d, upb_strptr str, struct upb_status *status)
+{
+  // buf is our current offset, moves from start to end.
+  const uint8_t *buf = (uint8_t*)upb_string_getrobuf(str);
+  const uint8_t *const start = buf;  // ptr equivalent of d->completed_offset
+  const uint8_t *const end = buf + upb_strlen(str);
+
+  // When we have fully decoded a tag/value pair, we advance this.
+  const uint8_t *completed = buf;
+
+  const uint8_t *submsg_end = get_msgend(d, start);
+  struct upb_msgdef *msgdef = d->top->msgdef;
+  upb_sink_status sink_status = UPB_SINK_CONTINUE;
+
+  // We need to check the status of operations that can fail, but we do so as
+  // late as possible to avoid introducing branches that have to wait on
+  // (status->code) which must be loaded from memory.  We must always check
+  // before calling a user callback.
+#define CHECK_STATUS() do { if(!upb_ok(status)) goto err; } while(0)
+
+  // Main loop: executed once per tag/field pair.
+  while(sink_status == UPB_SINK_CONTINUE && buf < end) {
+    // Parse/handle tag.
+    struct upb_tag tag;
+    buf = decode_tag(buf, end, &tag, status);
+    if(tag.wire_type == UPB_WIRE_TYPE_END_GROUP) {
+      CHECK_STATUS();
+      // The top-level frame also looks like a group (it has no end offset),
+      // but there is nothing beneath it to pop, so an END_GROUP there is an
+      // error as well.
+      if(!isgroup(submsg_end) || d->top == d->stack) {
+        upb_seterr(status, UPB_STATUS_ERROR, "End group seen but current "
+                   "message is not a group, byte offset: %zu",
+                   d->completed_offset + (completed - start));
+        goto err;
+      }
+      submsg_end = pop(d, start);
+      msgdef = d->top->msgdef;
+      completed = buf;
+      continue;
+    }
+
+    // Look up field by tag number.
+    struct upb_fielddef *f = upb_msg_itof(msgdef, tag.field_number);
+
+    // Parse/handle field.
+    if(tag.wire_type == UPB_WIRE_TYPE_DELIMITED) {
+      int32_t delim_len;
+      buf = upb_get_INT32(buf, end, &delim_len, status);
+      CHECK_STATUS();  // Checking decode_tag() and upb_get_INT32().
+      if(delim_len < 0) {
+        // A negative length would move buf backwards.
+        upb_seterr(status, UPB_STATUS_ERROR,
+                   "Delimited field has a negative length.");
+        goto err;
+      }
+      const uint8_t *delim_end = buf + delim_len;
+      if(f && f->type == UPB_TYPE(MESSAGE)) {
+        submsg_end = push(d, start, delim_end - start, f, status);
+        msgdef = d->top->msgdef;
+      } else {
+        if(f && upb_isstringtype(f->type)) {
+          int32_t str_start = buf - start;
+          sink_status =
+              upb_sink_onstr(d->sink, f, str, str_start, str_start + delim_len);
+        } // else { TODO: packed arrays }
+        // If field was not found, it is skipped silently.
+        buf = delim_end;  // Could be >end.
+      }
+    } else {
+      if(!f || !upb_check_type(tag.wire_type, f->type)) {
+        buf = skip_wire_value(buf, end, tag.wire_type, status);
+      } else if (f->type == UPB_TYPE(GROUP)) {
+        submsg_end = push(d, start, 0, f, status);
+        msgdef = d->top->msgdef;
+      } else {
+        union upb_value val;
+        buf = upb_decode_value(buf, end, f->type, upb_value_addrof(&val),
+                               status);
+        CHECK_STATUS();  // Checking upb_decode_value().
+        sink_status = upb_sink_onvalue(d->sink, f, val);
+      }
+    }
+    CHECK_STATUS();
+
+    // Close every delimited submessage that ends exactly at buf.
+    while(buf >= submsg_end) {
+      if(buf > submsg_end) {
+        upb_seterr(status, UPB_STATUS_ERROR, "Expected submsg end offset "
+                   "did not lie on a tag/value boundary.");
+        goto err;
+      }
+      submsg_end = pop(d, start);
+      msgdef = d->top->msgdef;
+    }
+    // while(buf < d->packed_end) { TODO: packed arrays }
+    completed = buf;
+  }
+
+  size_t read;
+err:
+  // Only fully processed tag/value pairs count as consumed.
+  read = (char*)completed - (char*)start;
+  d->completed_offset += read;
+  return read;
+}
diff --git a/src/upb_decoder.h b/src/upb_decoder.h
new file mode 100644
index 0000000..4251fda
--- /dev/null
+++ b/src/upb_decoder.h
@@ -0,0 +1,57 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * upb_decoder implements a high performance, callback-based, stream-oriented
+ * decoder (comparable to the SAX model in XML parsers).  For parsing protobufs
+ * into in-memory messages (a more DOM-like model), see the routines in
+ * upb_msg.h, which are layered on top of this decoder.
+ *
+ * TODO: the decoder currently does not support returning unknown values.
This + * can easily be added when it is needed. + * + * Copyright (c) 2009 Joshua Haberman. See LICENSE for details. + */ + +#ifndef UPB_DECODER_H_ +#define UPB_DECODER_H_ + +#include +#include +#include "upb.h" +#include "descriptor.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* upb_decoder *****************************************************************/ + +// A upb_decoder decodes the binary protocol buffer format, writing the data it +// decodes to a upb_sink. +struct upb_decoder; +typedef struct upb_decoder upb_decoder; + +// Allocates and frees a upb_decoder, respectively. +upb_decoder *upb_decoder_new(struct upb_msgdef *md); +void upb_decoder_free(upb_decoder *p); + +// Resets the internal state of an already-allocated decoder. This puts it in a +// state where it has not seen any data, and expects the next data to be from +// the beginning of a new protobuf. Parsers must be reset before they can be +// used. A decoder can be reset multiple times. +void upb_decoder_reset(upb_decoder *p, upb_sink *sink); + +// Decodes protobuf data out of str, returning how much data was decoded. The +// next call to upb_decoder_decode should begin with the first byte that was +// not decoded. "status" indicates whether an error occurred. +// +// TODO: provide the following guarantee: +// retval will always be >= len. 
+size_t upb_decoder_decode(upb_decoder *p, upb_strptr str, + struct upb_status *status); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* UPB_DECODER_H_ */ diff --git a/src/upb_def.c b/src/upb_def.c index 24a57a6..2ab1d11 100644 --- a/src/upb_def.c +++ b/src/upb_def.c @@ -829,7 +829,7 @@ void upb_symtab_add_desc(struct upb_symtab *s, upb_strptr desc, struct upb_status *status) { upb_msg *fds = upb_msg_new(s->fds_msgdef); - upb_msg_parsestr(fds, s->fds_msgdef, desc, status); + upb_msg_decodestr(fds, s->fds_msgdef, desc, status); if(!upb_ok(status)) return; upb_symtab_addfds(s, (google_protobuf_FileDescriptorSet*)fds, status); upb_msg_unref(fds, s->fds_msgdef); diff --git a/src/upb_parse.c b/src/upb_parse.c deleted file mode 100644 index be22203..0000000 --- a/src/upb_parse.c +++ /dev/null @@ -1,507 +0,0 @@ -/* - * upb - a minimalist implementation of protocol buffers. - * - * Copyright (c) 2008-2009 Joshua Haberman. See LICENSE for details. - */ - -#include "upb_parse.h" - -#include -#include -#include -#include "upb_def.h" - -/* Functions to read wire values. *********************************************/ - -// These functions are internal to the parser, but might be moved into an -// internal header file if we at some point in the future opt to do code -// generation, because the generated code would want to inline these functions. -// The same applies to the functions to read .proto values below. - -const uint8_t *upb_get_v_uint64_t_full(const uint8_t *buf, const uint8_t *end, - uint64_t *val, - struct upb_status *status); - -// Gets a varint (wire type: UPB_WIRE_TYPE_VARINT). -INLINE const uint8_t *upb_get_v_uint64_t(const uint8_t *buf, const uint8_t *end, - uint64_t *val, - struct upb_status *status) -{ - // We inline this common case (1-byte varints), if that fails we dispatch to - // the full (non-inlined) version. 
- if((*buf & 0x80) == 0) { - *val = *buf & 0x7f; - return buf + 1; - } else { - return upb_get_v_uint64_t_full(buf, end, val, status); - } -} - -// Gets a varint -- called when we only need 32 bits of it. Note that a 32-bit -// varint is not a true wire type. -INLINE const uint8_t *upb_get_v_uint32_t(const uint8_t *buf, const uint8_t *end, - uint32_t *val, - struct upb_status *status) -{ - uint64_t val64; - const uint8_t *ret = upb_get_v_uint64_t(buf, end, &val64, status); - *val = (uint32_t)val64; // Discard the high bits. - return ret; -} - -// Gets a fixed-length 32-bit integer (wire type: UPB_WIRE_TYPE_32BIT). -INLINE const uint8_t *upb_get_f_uint32_t(const uint8_t *buf, const uint8_t *end, - uint32_t *val, struct upb_status *status) -{ - const uint8_t *uint32_end = buf + sizeof(uint32_t); - if(uint32_end > end) { - status->code = UPB_STATUS_NEED_MORE_DATA; - return end; - } -#if UPB_UNALIGNED_READS_OK - *val = *(uint32_t*)buf; -#else -#define SHL(val, bits) ((uint32_t)val << bits) - *val = SHL(buf[0], 0) | SHL(buf[1], 8) | SHL(buf[2], 16) | SHL(buf[3], 24); -#undef SHL -#endif - return uint32_end; -} - -// Gets a fixed-length 64-bit integer (wire type: UPB_WIRE_TYPE_64BIT). 
-INLINE const uint8_t *upb_get_f_uint64_t(const uint8_t *buf, const uint8_t *end, - uint64_t *val, - struct upb_status *status) -{ - const uint8_t *uint64_end = buf + sizeof(uint64_t); - if(uint64_end > end) { - status->code = UPB_STATUS_NEED_MORE_DATA; - return end; - } -#if UPB_UNALIGNED_READS_OK - *val = *(uint64_t*)buf; -#else -#define SHL(val, bits) ((uint64_t)val << bits) - *val = SHL(buf[0], 0) | SHL(buf[1], 8) | SHL(buf[2], 16) | SHL(buf[3], 24) | - SHL(buf[4], 32) | SHL(buf[5], 40) | SHL(buf[6], 48) | SHL(buf[7], 56); -#undef SHL -#endif - return uint64_end; -} - -INLINE const uint8_t *upb_skip_v_uint64_t(const uint8_t *buf, - const uint8_t *end, - struct upb_status *status) -{ - const uint8_t *const maxend = buf + 10; - uint8_t last = 0x80; - for(; buf < (uint8_t*)end && (last & 0x80); buf++) - last = *buf; - - if(buf >= end && buf <= maxend && (last & 0x80)) { - status->code = UPB_STATUS_NEED_MORE_DATA; - buf = end; - } else if(buf > maxend) { - status->code = UPB_ERROR_UNTERMINATED_VARINT; - buf = end; - } - return buf; -} - -INLINE const uint8_t *upb_skip_f_uint32_t(const uint8_t *buf, - const uint8_t *end, - struct upb_status *status) -{ - const uint8_t *uint32_end = buf + sizeof(uint32_t); - if(uint32_end > end) { - status->code = UPB_STATUS_NEED_MORE_DATA; - return end; - } - return uint32_end; -} - -INLINE const uint8_t *upb_skip_f_uint64_t(const uint8_t *buf, - const uint8_t *end, - struct upb_status *status) -{ - const uint8_t *uint64_end = buf + sizeof(uint64_t); - if(uint64_end > end) { - status->code = UPB_STATUS_NEED_MORE_DATA; - return end; - } - return uint64_end; -} - -/* Functions to read .proto values. *******************************************/ - -// Performs zig-zag decoding, which is used by sint32 and sint64. 
-INLINE int32_t upb_zzdec_32(uint32_t n) { return (n >> 1) ^ -(int32_t)(n & 1); } -INLINE int64_t upb_zzdec_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); } - -// Use macros to define a set of two functions for each .proto type: -// -// // Reads and converts a .proto value from buf, placing it in d. -// // "end" indicates the end of the current buffer (if the buffer does -// // not contain the entire value UPB_STATUS_NEED_MORE_DATA is returned). -// // On success, a pointer will be returned to the first byte that was -// // not consumed. -// const uint8_t *upb_get_INT32(const uint8_t *buf, const uint8_t *end, -// int32_t *d, struct upb_status *status); -// -// // Given an already read wire value s (source), convert it to a .proto -// // value and return it. -// int32_t upb_wvtov_INT32(uint32_t s); -// -// These are the most efficient functions to call if you want to decode a value -// for a known type. - -#define WVTOV(type, wire_t, val_t) \ - INLINE val_t upb_wvtov_ ## type(wire_t s) - -#define GET(type, v_or_f, wire_t, val_t, member_name) \ - INLINE const uint8_t *upb_get_ ## type(const uint8_t *buf, const uint8_t *end, \ - val_t *d, struct upb_status *status) { \ - wire_t tmp = 0; \ - const uint8_t *ret = upb_get_ ## v_or_f ## _ ## wire_t(buf, end, &tmp, status); \ - *d = upb_wvtov_ ## type(tmp); \ - return ret; \ - } - -#define T(type, v_or_f, wire_t, val_t, member_name) \ - WVTOV(type, wire_t, val_t); /* prototype for GET below */ \ - GET(type, v_or_f, wire_t, val_t, member_name) \ - WVTOV(type, wire_t, val_t) - -T(INT32, v, uint32_t, int32_t, int32) { return (int32_t)s; } -T(INT64, v, uint64_t, int64_t, int64) { return (int64_t)s; } -T(UINT32, v, uint32_t, uint32_t, uint32) { return s; } -T(UINT64, v, uint64_t, uint64_t, uint64) { return s; } -T(SINT32, v, uint32_t, int32_t, int32) { return upb_zzdec_32(s); } -T(SINT64, v, uint64_t, int64_t, int64) { return upb_zzdec_64(s); } -T(FIXED32, f, uint32_t, uint32_t, uint32) { return s; } -T(FIXED64, f, 
uint64_t, uint64_t, uint64) { return s; } -T(SFIXED32, f, uint32_t, int32_t, int32) { return (int32_t)s; } -T(SFIXED64, f, uint64_t, int64_t, int64) { return (int64_t)s; } -T(BOOL, v, uint32_t, bool, _bool) { return (bool)s; } -T(ENUM, v, uint32_t, int32_t, int32) { return (int32_t)s; } -T(DOUBLE, f, uint64_t, double, _double) { - union upb_value v; - v.uint64 = s; - return v._double; -} -T(FLOAT, f, uint32_t, float, _float) { - union upb_value v; - v.uint32 = s; - return v._float; -} - -#undef WVTOV -#undef GET -#undef T - -// Parses a tag, places the result in *tag. -INLINE const uint8_t *parse_tag(const uint8_t *buf, const uint8_t *end, - struct upb_tag *tag, struct upb_status *status) -{ - uint32_t tag_int; - const uint8_t *ret = upb_get_v_uint32_t(buf, end, &tag_int, status); - tag->wire_type = (upb_wire_type_t)(tag_int & 0x07); - tag->field_number = tag_int >> 3; - return ret; -} - - -/** - * Parses a 64-bit varint that is known to be >= 2 bytes (the inline version - * handles 1 and 2 byte varints). 
- */ -const uint8_t *upb_get_v_uint64_t_full(const uint8_t *buf, const uint8_t *end, - uint64_t *val, struct upb_status *status) -{ - const uint8_t *const maxend = buf + 10; - uint8_t last = 0x80; - *val = 0; - int bitpos; - - for(bitpos = 0; buf < (uint8_t*)end && (last & 0x80); buf++, bitpos += 7) - *val |= ((uint64_t)((last = *buf) & 0x7F)) << bitpos; - - if(buf >= end && buf <= maxend && (last & 0x80)) { - upb_seterr(status, UPB_STATUS_NEED_MORE_DATA, - "Provided data ended in the middle of a varint.\n"); - buf = end; - } else if(buf > maxend) { - upb_seterr(status, UPB_ERROR_UNTERMINATED_VARINT, - "Varint was unterminated after 10 bytes.\n"); - buf = end; - } - - return buf; -} - -const uint8_t *upb_parse_wire_value(uint8_t *buf, uint8_t *end, - upb_wire_type_t wt, - union upb_wire_value *wv, - struct upb_status *status) -{ - switch(wt) { - case UPB_WIRE_TYPE_VARINT: - return upb_get_v_uint64_t(buf, end, &wv->varint, status); - case UPB_WIRE_TYPE_64BIT: - return upb_get_f_uint64_t(buf, end, &wv->_64bit, status); - case UPB_WIRE_TYPE_32BIT: - return upb_get_f_uint32_t(buf, end, &wv->_32bit, status); - default: - status->code = UPB_STATUS_ERROR; // Doesn't handle delimited, groups. - return end; - } -} - -/** - * Advances buf past the current wire value (of type wt), saving the result in - * outbuf. - */ -static const uint8_t *skip_wire_value(const uint8_t *buf, const uint8_t *end, - upb_wire_type_t wt, - struct upb_status *status) -{ - switch(wt) { - case UPB_WIRE_TYPE_VARINT: - return upb_skip_v_uint64_t(buf, end, status); - case UPB_WIRE_TYPE_64BIT: - return upb_skip_f_uint64_t(buf, end, status); - case UPB_WIRE_TYPE_32BIT: - return upb_skip_f_uint32_t(buf, end, status); - case UPB_WIRE_TYPE_START_GROUP: - // TODO: skip to matching end group. 
- case UPB_WIRE_TYPE_END_GROUP: - return buf; - default: - status->code = UPB_STATUS_ERROR; - return end; - } -} - -static const uint8_t *upb_parse_value(const uint8_t *buf, const uint8_t *end, - upb_field_type_t ft, - union upb_value_ptr v, - struct upb_status *status) -{ -#define CASE(t, member_name) \ - case UPB_TYPE(t): return upb_get_ ## t(buf, end, v.member_name, status); - - switch(ft) { - CASE(DOUBLE, _double) - CASE(FLOAT, _float) - CASE(INT32, int32) - CASE(INT64, int64) - CASE(UINT32, uint32) - CASE(UINT64, uint64) - CASE(SINT32, int32) - CASE(SINT64, int64) - CASE(FIXED32, uint32) - CASE(FIXED64, uint64) - CASE(SFIXED32, int32) - CASE(SFIXED64, int64) - CASE(BOOL, _bool) - CASE(ENUM, int32) - default: return end; - } - -#undef CASE -} - -struct upb_parser_frame { - struct upb_msgdef *msgdef; - struct upb_fielddef *field; - size_t end_offset; // For groups, 0. -}; - -struct upb_parser { - // Immutable state of the parser. - struct upb_msgdef *toplevel_msgdef; - upb_sink *sink; - - // State pertaining to a particular parse (resettable). - // Stack entries store the offset where the submsg ends (for groups, 0). - struct upb_parser_frame stack[UPB_MAX_NESTING], *top, *limit; - size_t completed_offset; - void *udata; -}; - -upb_parser *upb_parser_new(struct upb_msgdef *msgdef) -{ - upb_parser *p = malloc(sizeof(*p)); - p->toplevel_msgdef = msgdef; - p->limit = &p->stack[UPB_MAX_NESTING]; - return p; -} - -void upb_parser_free(upb_parser *p) -{ - free(p); -} - -void upb_parser_reset(upb_parser *p, upb_sink *sink) -{ - p->top = p->stack; - p->completed_offset = 0; - p->sink = sink; - p->top->msgdef = p->toplevel_msgdef; - // The top-level message is not delimited (we can keep receiving data for it - // indefinitely), so we treat it like a group. 
- p->top->end_offset = 0; -} - -static const void *get_msgend(upb_parser *p, const uint8_t *start) -{ - if(p->top->end_offset > 0) - return start + (p->top->end_offset - p->completed_offset); - else - return (void*)UINTPTR_MAX; // group. -} - -static bool isgroup(const void *submsg_end) -{ - return submsg_end == (void*)UINTPTR_MAX; -} - -extern upb_wire_type_t upb_expected_wire_types[]; -// Returns true if wt is the correct on-the-wire type for ft. -INLINE bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) { - // This doesn't currently support packed arrays. - return upb_type_info[ft].expected_wire_type == wt; -} - - -/** - * Pushes a new stack frame for a submessage with the given len (which will - * be zero if the submessage is a group). - */ -static const uint8_t *push(upb_parser *p, const uint8_t *start, - uint32_t submsg_len, struct upb_fielddef *f, - struct upb_status *status) -{ - p->top->field = f; - p->top++; - if(p->top >= p->limit) { - upb_seterr(status, UPB_STATUS_ERROR, - "Nesting exceeded maximum (%d levels)\n", - UPB_MAX_NESTING); - return NULL; - } - struct upb_parser_frame *frame = p->top; - frame->end_offset = p->completed_offset + submsg_len; - frame->msgdef = upb_downcast_msgdef(f->def); - - upb_sink_onstart(p->sink, f); - return get_msgend(p, start); -} - -/** - * Pops a stack frame, returning a pointer for where the next submsg should - * end (or a pointer that is out of range for a group). - */ -static const void *pop(upb_parser *p, const uint8_t *start) -{ - p->top--; - upb_sink_onend(p->sink, p->top->field); - return get_msgend(p, start); -} - - -size_t upb_parser_parse(upb_parser *p, upb_strptr str, struct upb_status *status) -{ - // buf is our current offset, moves from start to end. 
- const uint8_t *buf = (uint8_t*)upb_string_getrobuf(str); - const uint8_t *const start = buf; // ptr equivalent of p->completed_offset - const uint8_t *const end = buf + upb_strlen(str); - - // When we have fully parsed a tag/value pair, we advance this. - const uint8_t *completed = buf; - - const uint8_t *submsg_end = get_msgend(p, start); - struct upb_msgdef *msgdef = p->top->msgdef; - upb_sink_status sink_status = UPB_SINK_CONTINUE; - - // We need to check the status of operations that can fail, but we do so as - // late as possible to avoid introducing branches that have to wait on - // (status->code) which must be loaded from memory. We must always check - // before calling a user callback. -#define CHECK_STATUS() do { if(!upb_ok(status)) goto err; } while(0) - - // Main loop: executed once per tag/field pair. - while(sink_status == UPB_SINK_CONTINUE && buf < end) { - // Parse/handle tag. - struct upb_tag tag; - buf = parse_tag(buf, end, &tag, status); - if(tag.wire_type == UPB_WIRE_TYPE_END_GROUP) { - CHECK_STATUS(); - if(!isgroup(submsg_end)) { - upb_seterr(status, UPB_STATUS_ERROR, "End group seen but current " - "message is not a group, byte offset: %zd", - p->completed_offset + (completed - start)); - goto err; - } - submsg_end = pop(p, start); - msgdef = p->top->msgdef; - completed = buf; - continue; - } - - // Look up field by tag number. - struct upb_fielddef *f = upb_msg_itof(msgdef, tag.field_number); - - // Parse/handle field. - if(tag.wire_type == UPB_WIRE_TYPE_DELIMITED) { - int32_t delim_len; - buf = upb_get_INT32(buf, end, &delim_len, status); - CHECK_STATUS(); // Checking parse_tag() and upb_get_INT32(). 
- const uint8_t *delim_end = buf + delim_len; - if(f && f->type == UPB_TYPE(MESSAGE)) { - submsg_end = push(p, start, delim_end - start, f, status); - msgdef = p->top->msgdef; - } else { - if(f && upb_isstringtype(f->type)) { - int32_t str_start = buf - start; - sink_status = - upb_sink_onstr(p->sink, f, str, str_start, str_start + delim_len); - } // else { TODO: packed arrays } - // If field was not found, it is skipped silently. - buf = delim_end; // Could be >end. - } - } else { - if(!f || !upb_check_type(tag.wire_type, f->type)) { - buf = skip_wire_value(buf, end, tag.wire_type, status); - } else if (f->type == UPB_TYPE(GROUP)) { - submsg_end = push(p, start, 0, f, status); - msgdef = p->top->msgdef; - } else { - union upb_value val; - buf = upb_parse_value(buf, end, f->type, upb_value_addrof(&val), - status); - CHECK_STATUS(); // Checking upb_parse_value(). - sink_status = upb_sink_onvalue(p->sink, f, val); - } - } - CHECK_STATUS(); - - while(buf >= submsg_end) { - if(buf > submsg_end) { - upb_seterr(status, UPB_STATUS_ERROR, "Expected submsg end offset " - "did not lie on a tag/value boundary."); - goto err; - } - submsg_end = pop(p, start); - msgdef = p->top->msgdef; - } - // while(buf < p->packed_end) { TODO: packed arrays } - completed = buf; - } - - size_t read; -err: - read = (char*)completed - (char*)start; - p->completed_offset += read; - return read; -} diff --git a/src/upb_parse.h b/src/upb_parse.h deleted file mode 100644 index 9afa96c..0000000 --- a/src/upb_parse.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * upb - a minimalist implementation of protocol buffers. - * - * upb_parse implements a high performance, callback-based, stream-oriented - * parser (comparable to the SAX model in XML parsers). For parsing protobufs - * into in-memory messages (a more DOM-like model), see the routines in - * upb_msg.h, which are layered on top of this parser. - * - * TODO: the parser currently does not support returning unknown values. 
This - * can easily be added when it is needed. - * - * Copyright (c) 2009 Joshua Haberman. See LICENSE for details. - */ - -#ifndef UPB_PARSE_H_ -#define UPB_PARSE_H_ - -#include -#include -#include "upb.h" -#include "descriptor.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/* upb_parser *****************************************************************/ - -// A upb_parser parses the binary protocol buffer format, writing the data it -// parses to a upb_sink. -struct upb_parser; -typedef struct upb_parser upb_parser; - -// Allocates and frees a upb_parser, respectively. -upb_parser *upb_parser_new(struct upb_msgdef *md); -void upb_parser_free(upb_parser *p); - -// Resets the internal state of an already-allocated parser. This puts it in a -// state where it has not seen any data, and expects the next data to be from -// the beginning of a new protobuf. Parsers must be reset before they can be -// used. A parser can be reset multiple times. -void upb_parser_reset(upb_parser *p, upb_sink *sink); - -// Parses protobuf data out of str, returning how much data was parsed. The -// next call to upb_parser_parse should begin with the first byte that was -// not parsed. "status" indicates whether an error occurred. -// -// TODO: provide the following guarantee: -// retval will always be >= len. */ -size_t upb_parser_parse(upb_parser *p, upb_strptr str, - struct upb_status *status); - -#ifdef __cplusplus -} /* extern "C" */ -#endif - -#endif /* UPB_PARSE_H_ */ diff --git a/tests/test_vs_proto2.cc b/tests/test_vs_proto2.cc index 4356b05..6cb37b7 100644 --- a/tests/test_vs_proto2.cc +++ b/tests/test_vs_proto2.cc @@ -6,7 +6,7 @@ #include #include "upb_data.h" #include "upb_def.h" -#include "upb_parse.h" +#include "upb_decoder.h" int num_assertions = 0; #define ASSERT(expr) do { \ @@ -171,7 +171,7 @@ void parse_and_compare(MESSAGE_CIDENT *proto2_msg, // Parse to both proto2 and upb. 
ASSERT(proto2_msg->ParseFromArray(upb_string_getrobuf(str), upb_strlen(str))); struct upb_status status = UPB_STATUS_INIT; - upb_msg_parsestr(upb_msg, upb_md, str, &status); + upb_msg_decodestr(upb_msg, upb_md, str, &status); ASSERT(upb_ok(&status)); compare(*proto2_msg, upb_msg, upb_md); } diff --git a/tests/tests.c b/tests/tests.c index cd7a4f0..03026a6 100644 --- a/tests/tests.c +++ b/tests/tests.c @@ -3,7 +3,7 @@ #include #include #include -#include "upb_parse.c" +#include "upb_decoder.c" #include "upb_def.h" int num_assertions = 0; diff --git a/tools/upbc.c b/tools/upbc.c index 1341800..9fbf3d0 100644 --- a/tools/upbc.c +++ b/tools/upbc.c @@ -673,7 +673,7 @@ int main(int argc, char *argv[]) struct upb_symtab *s = upb_symtab_new(); upb_msg *fds_msg = upb_msg_new(s->fds_msgdef); struct upb_status status = UPB_STATUS_INIT; - upb_msg_parsestr(fds_msg, s->fds_msgdef, descriptor, &status); + upb_msg_decodestr(fds_msg, s->fds_msgdef, descriptor, &status); if(!upb_ok(&status)) error("Failed to parse input file descriptor: %s", status.msg); google_protobuf_FileDescriptorSet *fds = (void*)fds_msg; -- cgit v1.2.3