/*
 * upb - a minimalist implementation of protocol buffers.
 *
 * Copyright (c) 2008-2009 Joshua Haberman.  See LICENSE for details.
 */

#include "upb_parse.h"

// NOTE(review): the three system header names below were missing from this
// file.  They are reconstructed from what the code uses (fixed-width integer
// types and UINTPTR_MAX, size_t, malloc/free) -- confirm against upstream.
#include <inttypes.h>
#include <stddef.h>
#include <stdlib.h>
#include "upb_def.h"

/* Functions to read wire values. *********************************************/

// These functions are internal to the parser, but might be moved into an
// internal header file if we at some point in the future opt to do code
// generation, because the generated code would want to inline these functions.
// The same applies to the functions to read .proto values below.

// Out-of-line varint reader, defined further down in this file.
uint8_t *upb_get_v_uint64_t_full(uint8_t *buf, uint8_t *end, uint64_t *val,
                                 struct upb_status *status);

// Gets a varint (wire type: UPB_WIRE_TYPE_VARINT).
//
// Stores the decoded value in *val and returns a pointer to the first byte
// that was not consumed.  Anything other than a complete 1-byte varint is
// delegated to upb_get_v_uint64_t_full(), which also reports failures through
// status.
INLINE uint8_t *upb_get_v_uint64_t(uint8_t *buf, uint8_t *end, uint64_t *val,
                                   struct upb_status *status)
{
  // We inline this common case (1-byte varints), if that fails we dispatch to
  // the full (non-inlined) version.  An exhausted buffer (buf >= end) also
  // takes the slow path so that *buf is never read past the end of the data.
  if(buf < end && (*buf & 0x80) == 0) {
    *val = *buf & 0x7f;
    return buf + 1;
  } else {
    return upb_get_v_uint64_t_full(buf, end, val, status);
  }
}

// Gets a varint -- called when we only need 32 bits of it.
//
// The whole varint is still consumed; only the low 32 bits are stored in *val.
INLINE uint8_t *upb_get_v_uint32_t(uint8_t *buf, uint8_t *end,
                                   uint32_t *val, struct upb_status *status)
{
  uint64_t val64;
  uint8_t *ret = upb_get_v_uint64_t(buf, end, &val64, status);
  *val = (uint32_t)val64;  // Discard the high bits.
  return ret;
}

// Gets a fixed-length 32-bit integer (wire type: UPB_WIRE_TYPE_32BIT).
//
// If fewer than 4 bytes are available, sets status->code to
// UPB_STATUS_NEED_MORE_DATA and returns end without touching *val.
INLINE uint8_t *upb_get_f_uint32_t(uint8_t *buf, uint8_t *end,
                                   uint32_t *val, struct upb_status *status)
{
  // Compare lengths instead of computing buf + 4 first, so that no pointer
  // beyond the end of the buffer is formed when the data is short.
  if(buf > end || (size_t)(end - buf) < sizeof(uint32_t)) {
    status->code = UPB_STATUS_NEED_MORE_DATA;
    return end;
  }
#if UPB_UNALIGNED_READS_OK
  // Direct load in host byte order; it matches the byte-wise path below only
  // on little-endian hosts.
  *val = *(uint32_t*)buf;
#else
#define SHL(val, bits) ((uint32_t)(val) << (bits))
  // Assemble the value with buf[0] as the least significant byte.
  *val = SHL(buf[0], 0) | SHL(buf[1], 8) | SHL(buf[2], 16) | SHL(buf[3], 24);
#undef SHL
#endif
  return buf + sizeof(uint32_t);
}

// Gets a fixed-length 64-bit integer (wire type: UPB_WIRE_TYPE_64BIT).
INLINE uint8_t *upb_get_f_uint64_t(uint8_t *buf, uint8_t *end, uint64_t *val, struct upb_status *status) { uint8_t *uint64_end = buf + sizeof(uint64_t); if(uint64_end > end) { status->code = UPB_STATUS_NEED_MORE_DATA; return end; } #if UPB_UNALIGNED_READS_OK *val = *(uint64_t*)buf; #else #define SHL(val, bits) ((uint64_t)val << bits) *val = SHL(buf[0], 0) | SHL(buf[1], 8) | SHL(buf[2], 16) | SHL(buf[3], 24) | SHL(buf[4], 32) | SHL(buf[5], 40) | SHL(buf[6], 48) | SHL(buf[7], 56); #undef SHL #endif return uint64_end; } INLINE uint8_t *upb_skip_v_uint64_t(uint8_t *buf, uint8_t *end, struct upb_status *status) { uint8_t *const maxend = buf + 10; uint8_t last = 0x80; for(; buf < (uint8_t*)end && (last & 0x80); buf++) last = *buf; if(buf >= end && buf <= maxend && (last & 0x80)) { status->code = UPB_STATUS_NEED_MORE_DATA; buf = end; } else if(buf > maxend) { status->code = UPB_ERROR_UNTERMINATED_VARINT; buf = end; } return buf; } INLINE uint8_t *upb_skip_f_uint32_t(uint8_t *buf, uint8_t *end, struct upb_status *status) { uint8_t *uint32_end = buf + sizeof(uint32_t); if(uint32_end > end) { status->code = UPB_STATUS_NEED_MORE_DATA; return end; } return uint32_end; } INLINE uint8_t *upb_skip_f_uint64_t(uint8_t *buf, uint8_t *end, struct upb_status *status) { uint8_t *uint64_end = buf + sizeof(uint64_t); if(uint64_end > end) { status->code = UPB_STATUS_NEED_MORE_DATA; return end; } return uint64_end; } /* Functions to read .proto values. *******************************************/ // Performs zig-zag decoding, which is used by sint32 and sint64. INLINE int32_t upb_zzdec_32(uint32_t n) { return (n >> 1) ^ -(int32_t)(n & 1); } INLINE int64_t upb_zzdec_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); } // Use macros to define a set of two functions for each .proto type: // // // Reads and converts a .proto value from buf, placing it in d. 
// // "end" indicates the end of the current buffer (if the buffer does // // not contain the entire value UPB_STATUS_NEED_MORE_DATA is returned). // // On success, a pointer will be returned to the first byte that was // // not consumed. // uint8_t *upb_get_INT32(uint8_t *buf, uint8_t *end, int32_t *d, // struct upb_status *status); // // // Given an already read wire value s (source), convert it to a .proto // // value and return it. // int32_t upb_wvtov_INT32(uint32_t s); // // These are the most efficient functions to call if you want to decode a value // for a known type. #define WVTOV(type, wire_t, val_t) \ INLINE val_t upb_wvtov_ ## type(wire_t s) #define GET(type, v_or_f, wire_t, val_t, member_name) \ INLINE uint8_t *upb_get_ ## type(uint8_t *buf, uint8_t *end, val_t *d, \ struct upb_status *status) { \ wire_t tmp = 0; \ uint8_t *ret = upb_get_ ## v_or_f ## _ ## wire_t(buf, end, &tmp, status); \ *d = upb_wvtov_ ## type(tmp); \ return ret; \ } #define T(type, v_or_f, wire_t, val_t, member_name) \ WVTOV(type, wire_t, val_t); /* prototype for GET below */ \ GET(type, v_or_f, wire_t, val_t, member_name) \ WVTOV(type, wire_t, val_t) T(INT32, v, uint32_t, int32_t, int32) { return (int32_t)s; } T(INT64, v, uint64_t, int64_t, int64) { return (int64_t)s; } T(UINT32, v, uint32_t, uint32_t, uint32) { return s; } T(UINT64, v, uint64_t, uint64_t, uint64) { return s; } T(SINT32, v, uint32_t, int32_t, int32) { return upb_zzdec_32(s); } T(SINT64, v, uint64_t, int64_t, int64) { return upb_zzdec_64(s); } T(FIXED32, f, uint32_t, uint32_t, uint32) { return s; } T(FIXED64, f, uint64_t, uint64_t, uint64) { return s; } T(SFIXED32, f, uint32_t, int32_t, int32) { return (int32_t)s; } T(SFIXED64, f, uint64_t, int64_t, int64) { return (int64_t)s; } T(BOOL, v, uint32_t, bool, _bool) { return (bool)s; } T(ENUM, v, uint32_t, int32_t, int32) { return (int32_t)s; } T(DOUBLE, f, uint64_t, double, _double) { union upb_value v; v.uint64 = s; return v._double; } T(FLOAT, f, uint32_t, float, 
_float) { union upb_value v; v.uint32 = s; return v._float; } #undef WVTOV #undef GET #undef T // Parses a tag, places the result in *tag. INLINE uint8_t *parse_tag(uint8_t *buf, uint8_t *end, struct upb_tag *tag, struct upb_status *status) { uint32_t tag_int; uint8_t *ret = upb_get_v_uint32_t(buf, end, &tag_int, status); tag->wire_type = (upb_wire_type_t)(tag_int & 0x07); tag->field_number = tag_int >> 3; return ret; } /** * Parses a 64-bit varint that is known to be >= 2 bytes (the inline version * handles 1 and 2 byte varints). */ uint8_t *upb_get_v_uint64_t_full(uint8_t *buf, uint8_t *end, uint64_t *val, struct upb_status *status) { uint8_t *const maxend = buf + 10; uint8_t last = 0x80; *val = 0; int bitpos; for(bitpos = 0; buf < (uint8_t*)end && (last & 0x80); buf++, bitpos += 7) *val |= ((uint64_t)((last = *buf) & 0x7F)) << bitpos; if(buf >= end && buf <= maxend && (last & 0x80)) { upb_seterr(status, UPB_STATUS_NEED_MORE_DATA, "Provided data ended in the middle of a varint.\n"); buf = end; } else if(buf > maxend) { upb_seterr(status, UPB_ERROR_UNTERMINATED_VARINT, "Varint was unterminated after 10 bytes.\n"); buf = end; } return buf; } uint8_t *upb_parse_wire_value(uint8_t *buf, uint8_t *end, upb_wire_type_t wt, union upb_wire_value *wv, struct upb_status *status) { switch(wt) { case UPB_WIRE_TYPE_VARINT: return upb_get_v_uint64_t(buf, end, &wv->varint, status); case UPB_WIRE_TYPE_64BIT: return upb_get_f_uint64_t(buf, end, &wv->_64bit, status); case UPB_WIRE_TYPE_32BIT: return upb_get_f_uint32_t(buf, end, &wv->_32bit, status); default: status->code = UPB_STATUS_ERROR; // Doesn't handle delimited, groups. return end; } } /** * Advances buf past the current wire value (of type wt), saving the result in * outbuf. 
*/ static uint8_t *skip_wire_value(uint8_t *buf, uint8_t *end, upb_wire_type_t wt, struct upb_status *status) { switch(wt) { case UPB_WIRE_TYPE_VARINT: return upb_skip_v_uint64_t(buf, end, status); case UPB_WIRE_TYPE_64BIT: return upb_skip_f_uint64_t(buf, end, status); case UPB_WIRE_TYPE_32BIT: return upb_skip_f_uint32_t(buf, end, status); case UPB_WIRE_TYPE_START_GROUP: // TODO: skip to matching end group. case UPB_WIRE_TYPE_END_GROUP: return buf; default: status->code = UPB_STATUS_ERROR; return end; } } uint8_t *upb_parse_value(uint8_t *buf, uint8_t *end, upb_field_type_t ft, union upb_value_ptr v, struct upb_status *status) { #define CASE(t, member_name) \ case UPB_TYPENUM(t): return upb_get_ ## t(buf, end, v.member_name, status); switch(ft) { CASE(DOUBLE, _double) CASE(FLOAT, _float) CASE(INT32, int32) CASE(INT64, int64) CASE(UINT32, uint32) CASE(UINT64, uint64) CASE(SINT32, int32) CASE(SINT64, int64) CASE(FIXED32, uint32) CASE(FIXED64, uint64) CASE(SFIXED32, int32) CASE(SFIXED64, int64) CASE(BOOL, _bool) CASE(ENUM, int32) default: return end; } #undef CASE } struct upb_cbparser_frame { struct upb_msgdef *msgdef; size_t end_offset; // For groups, 0. }; struct upb_cbparser { // Immutable state of the parser. struct upb_msgdef *toplevel_msgdef; upb_value_cb value_cb; upb_str_cb str_cb; upb_start_cb start_cb; upb_end_cb end_cb; // State pertaining to a particular parse (resettable). // Stack entries store the offset where the submsg ends (for groups, 0). 
struct upb_cbparser_frame stack[UPB_MAX_NESTING], *top, *limit; size_t completed_offset; void *udata; }; struct upb_cbparser *upb_cbparser_new(struct upb_msgdef *msgdef, upb_value_cb valuecb, upb_str_cb strcb, upb_start_cb startcb, upb_end_cb endcb) { struct upb_cbparser *p = malloc(sizeof(struct upb_cbparser)); p->toplevel_msgdef = msgdef; p->value_cb = valuecb; p->str_cb = strcb; p->start_cb = startcb; p->end_cb = endcb; p->limit = &p->stack[UPB_MAX_NESTING]; return p; } void upb_cbparser_free(struct upb_cbparser *p) { free(p); } void upb_cbparser_reset(struct upb_cbparser *p, void *udata) { p->top = p->stack; p->completed_offset = 0; p->udata = udata; p->top->msgdef = p->toplevel_msgdef; // The top-level message is not delimited (we can keep receiving data for it // indefinitely), so we treat it like a group. p->top->end_offset = 0; } static void *get_msgend(struct upb_cbparser *p, uint8_t *start) { if(p->top->end_offset > 0) return start + (p->top->end_offset - p->completed_offset); else return (void*)UINTPTR_MAX; // group. } static bool isgroup(void *submsg_end) { return submsg_end == (void*)UINTPTR_MAX; } extern upb_wire_type_t upb_expected_wire_types[]; // Returns true if wt is the correct on-the-wire type for ft. INLINE bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) { // This doesn't currently support packed arrays. return upb_type_info[ft].expected_wire_type == wt; } /** * Pushes a new stack frame for a submessage with the given len (which will * be zero if the submessage is a group). 
*/ static uint8_t *push(struct upb_cbparser *p, uint8_t *start, uint32_t submsg_len, struct upb_fielddef *f, struct upb_status *status) { p->top++; if(p->top >= p->limit) { upb_seterr(status, UPB_STATUS_ERROR, "Nesting exceeded maximum (%d levels)\n", UPB_MAX_NESTING); return NULL; } struct upb_cbparser_frame *frame = p->top; frame->end_offset = p->completed_offset + submsg_len; frame->msgdef = f->ref.msg; if(p->start_cb) p->start_cb(p->udata, f); return get_msgend(p, start); } /** * Pops a stack frame, returning a pointer for where the next submsg should * end (or a pointer that is out of range for a group). */ static void *pop(struct upb_cbparser *p, uint8_t *start) { if(p->end_cb) p->end_cb(p->udata); p->top--; return get_msgend(p, start); } size_t upb_cbparser_parse(struct upb_cbparser *p, void *_buf, size_t len, struct upb_status *status) { // buf is our current offset, moves from start to end. uint8_t *buf = _buf; uint8_t *const start = buf; // ptr equivalent of p->completed_offset uint8_t *end = buf + len; // When we have fully parsed a tag/value pair, we advance this. uint8_t *completed = buf; uint8_t *submsg_end = get_msgend(p, start); struct upb_msgdef *msgdef = p->top->msgdef; bool keep_going = true; // Make local copies so optimizer knows they won't change. upb_str_cb str_cb = p->str_cb; upb_value_cb value_cb = p->value_cb; void *udata = p->udata; // We need to check the status of operations that can fail, but we do so as // late as possible to avoid introducing branches that have to wait on // (status->code) which must be loaded from memory. #define CHECK_STATUS() do { if(!upb_ok(status)) goto err; } while(0) // Main loop: parse a tag, find the appropriate fielddef. 
while(keep_going && buf < end) { struct upb_tag tag; buf = parse_tag(buf, end, &tag, status); if(tag.wire_type == UPB_WIRE_TYPE_END_GROUP) { CHECK_STATUS(); if(!isgroup(submsg_end)) { upb_seterr(status, UPB_STATUS_ERROR, "End group seen but current " "message is not a group, byte offset: %zd", p->completed_offset + (completed - start)); goto err; } submsg_end = pop(p, start); msgdef = p->top->msgdef; completed = buf; continue; } struct upb_fielddef *f = upb_msg_fieldbynum(msgdef, tag.field_number); if(tag.wire_type == UPB_WIRE_TYPE_DELIMITED) { int32_t delim_len; buf = upb_get_INT32(buf, end, &delim_len, status); CHECK_STATUS(); uint8_t *delim_end = buf + delim_len; if(f && f->type == UPB_TYPENUM(MESSAGE)) { submsg_end = push(p, start, delim_end - start, f, status); msgdef = p->top->msgdef; } else { if(f && upb_isstringtype(f->type)) { size_t avail_len = UPB_MIN(delim_end, end) - buf; keep_going = str_cb(udata, msgdef, f, buf, avail_len, delim_end - buf); } // else { TODO: packed arrays } // If field was not found, it is skipped silently. buf = delim_end; // Could be >end. } } else { //if(!f || !upb_check_type(tag.wire_type, f->type)) { // buf = skip_wire_value(buf, end, tag.wire_type, status); if (f->type == UPB_TYPENUM(GROUP)) { submsg_end = push(p, start, 0, f, status); msgdef = p->top->msgdef; } else { union upb_value val; buf = upb_parse_value(buf, end, f->type, upb_value_addrof(&val), status); keep_going = value_cb(udata, msgdef, f, val); } } CHECK_STATUS(); while(buf >= submsg_end) { if(buf > submsg_end) { upb_seterr(status, UPB_STATUS_ERROR, "Expected submsg end offset " "did not lie on a tag/value boundary."); goto err; } submsg_end = pop(p, start); msgdef = p->top->msgdef; } // while(buf < p->packed_end) { TODO: packed arrays } completed = buf; } size_t read; err: read = (char*)completed - (char*)start; p->completed_offset += read; return read; }