From 93381f1411def0dba5677b71cd4df859d99777f3 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Mon, 24 Jan 2011 21:15:44 -0800 Subject: Decoder compiles again! But probably doesn't work. --- stream/upb_decoder.c | 241 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 143 insertions(+), 98 deletions(-) (limited to 'stream/upb_decoder.c') diff --git a/stream/upb_decoder.c b/stream/upb_decoder.c index 9a17451..b4b32ff 100644 --- a/stream/upb_decoder.c +++ b/stream/upb_decoder.c @@ -14,27 +14,27 @@ /* Pure Decoding **************************************************************/ // The key fast-path varint-decoding routine. Here we can assume we have at -// least UPB_MAX_ENCODED_SIZE bytes available. There are a lot of +// least UPB_MAX_VARINT_ENCODED_SIZE bytes available. There are a lot of // possibilities for optimization/experimentation here. -INLINE bool upb_decode_varint_fast(uint8_t **ptr, uint64_t &val, +INLINE bool upb_decode_varint_fast(const char **ptr, uint64_t *val, upb_status *status) { - *high = 0; + uint32_t low, high = 0; uint32_t b; - uint8_t *ptr = p->ptr; - b = *(*ptr++); *low = (b & 0x7f) ; if(!(b & 0x80)) goto done; - b = *(*ptr++); *low |= (b & 0x7f) << 7; if(!(b & 0x80)) goto done; - b = *(*ptr++); *low |= (b & 0x7f) << 14; if(!(b & 0x80)) goto done; - b = *(*ptr++); *low |= (b & 0x7f) << 21; if(!(b & 0x80)) goto done; - b = *(*ptr++); *low |= (b & 0x7f) << 28; - *high = (b & 0x7f) >> 3; if(!(b & 0x80)) goto done; - b = *(*ptr++); *high |= (b & 0x7f) << 4; if(!(b & 0x80)) goto done; - b = *(*ptr++); *high |= (b & 0x7f) << 11; if(!(b & 0x80)) goto done; - b = *(*ptr++); *high |= (b & 0x7f) << 18; if(!(b & 0x80)) goto done; - b = *(*ptr++); *high |= (b & 0x7f) << 25; if(!(b & 0x80)) goto done; + b = *(*ptr++); low = (b & 0x7f) ; if(!(b & 0x80)) goto done; + b = *(*ptr++); low |= (b & 0x7f) << 7; if(!(b & 0x80)) goto done; + b = *(*ptr++); low |= (b & 0x7f) << 14; if(!(b & 0x80)) goto done; + b = *(*ptr++); low |= (b & 0x7f) << 21; if(!(b & 0x80)) goto done; + b = *(*ptr++); low |= (b & 0x7f) << 28; + high = (b & 0x7f) >> 3; if(!(b & 0x80)) goto done; + b = *(*ptr++); high |= (b & 0x7f) << 4; if(!(b & 0x80)) goto done; + b = *(*ptr++); high |= (b & 0x7f) << 11; if(!(b & 0x80)) goto done; + b = *(*ptr++); high |= (b & 0x7f) << 18; if(!(b & 0x80)) goto done; + b = *(*ptr++); high |= (b & 0x7f) << 25; if(!(b & 0x80)) goto done; upb_seterr(status, UPB_ERROR, "Unterminated varint"); return false; done: + *val = ((uint64_t)high << 32) | low; return true; } @@ -50,7 +50,7 @@ INLINE int64_t upb_zzdec_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); } typedef struct { upb_msgdef *msgdef; upb_fielddef *field; - size_t end_offset; // For groups, 0. + ssize_t end_offset; // For groups, 0. } upb_decoder_frame; struct upb_decoder { @@ -76,23 +76,50 @@ struct upb_decoder { upb_strlen_t buf_stream_offset; }; +typedef struct { + // Our current position in the data buffer. + const char *ptr; + + // Number of bytes available at ptr, until either end-of-buf or + // end-of-submessage (whichever is smaller). + size_t len; + + // Msgdef for the current level. + upb_msgdef *msgdef; +} upb_dstate; + +INLINE void upb_dstate_advance(upb_dstate *s, size_t len) { + s->ptr += len; + s->len -= len; +} + +static upb_flow_t upb_pop(upb_decoder *d); + +// Constant used to signal that the submessage is a group and therefore we +// don't know its end offset. This cannot be the offset of a real submessage +// end because it takes at least one byte to begin a submessage. +#define UPB_GROUP_END_OFFSET -1 +#define UPB_MAX_VARINT_ENCODED_SIZE 10 + // Called only from the slow path, this function copies the next "len" bytes // from the stream to "data", adjusting "buf" and "len" appropriately. static bool upb_getbuf(upb_decoder *d, void *data, size_t bytes_wanted, - uint8_t **ptr, size_t *len) { + upb_dstate *s) { while (1) { - memcpy(data, *ptr, *len); - bytes_wanted -= *len; - *ptr += *len; + size_t to_copy = UPB_MIN(bytes_wanted, s->len); + memcpy(data, s->ptr, to_copy); + upb_dstate_advance(s, to_copy); + bytes_wanted -= to_copy; if (bytes_wanted == 0) return true; // Did "len" indicate end-of-submessage or end-of-buffer? - size_t buf_offset = d->buf ? (*ptr - upb_string_getrobuf(d->buf)) : 0; + ssize_t buf_offset = + d->buf ? ((const char*)s->ptr - upb_string_getrobuf(d->buf)) : 0; if (d->top->end_offset > 0 && d->top->end_offset == d->buf_stream_offset + buf_offset) { // End-of-submessage. if (bytes_wanted > 0) { - upb_seterr(d->status, UPB_ERROR, "Bad submessage end.") + upb_seterr(d->status, UPB_ERROR, "Bad submessage end."); return false; } if (upb_pop(d) != UPB_CONTINUE) return false; @@ -100,100 +127,121 @@ static bool upb_getbuf(upb_decoder *d, void *data, size_t bytes_wanted, // End-of-buffer. if (d->buf) d->buf_stream_offset += upb_string_len(d->buf); if (!upb_bytesrc_getstr(d->bytesrc, d->buf, d->status)) return false; - *ptr = upb_string_getrobuf(d->buf); + s->ptr = upb_string_getrobuf(d->buf); } // Wait for end-of-submessage or end-of-buffer, whichever comes first. - size_t offset_in_buf = *ptr - upb_string_getrobuf(d->buf); - size_t buf_remaining = upb_string_getbufend(d->buf) - *ptr; - size_t submsg_remaining = + ssize_t offset_in_buf = s->ptr - upb_string_getrobuf(d->buf); + ssize_t buf_remaining = upb_string_getbufend(d->buf) - s->ptr; + ssize_t submsg_remaining = d->top->end_offset - d->buf_stream_offset - offset_in_buf; if (d->top->end_offset == UPB_GROUP_END_OFFSET || buf_remaining > submsg_remaining) { - *len = buf_remaining; + s->len = buf_remaining; } else { // Check that non of our subtraction overflowed. assert(d->top->end_offset > d->buf_stream_offset); assert(d->top->end_offset - d->buf_stream_offset > offset_in_buf); - *len = submsg_remaining; + s->len = submsg_remaining; } } } -// We use this path when we don't have UPB_MAX_ENCODED_SIZE contiguous bytes -// available in our current buffer. We don't inline this because we accept -// that it will be slow and we don't want to pay for two copies of it. -static bool upb_decode_varint_slow(upb_decoder *d) { - uint8_t buf[UPB_MAX_ENCODED_SIZE]; - uint8_t *p = buf, *end = buf + sizeof(buf); - for(int bitpos = 0; p < end && getbyte(d, p) && (last & 0x80); p++, bitpos += 7) - *val |= ((uint64_t)((last = *p) & 0x7F)) << bitpos; - - if(d->status->code == UPB_EOF && (last & 0x80)) { - upb_seterr(status, UPB_ERROR, - "Provided data ended in the middle of a varint.\n"); - } else if(buf == maxend) { - upb_seterr(status, UPB_ERROR, +// We use this path when we don't have UPB_MAX_VARINT_ENCODED_SIZE contiguous +// bytes available in our current buffer. We don't inline this because we +// accept that it will be slow and we don't want to pay for two copies of it. +static bool upb_decode_varint_slow(upb_decoder *d, upb_dstate *s, + upb_value *val) { + char byte = 0x80; + uint64_t val64 = 0; + int bitpos; + for(bitpos = 0; + bitpos < 70 && (byte & 0x80) && upb_getbuf(d, &byte, 1, s); + bitpos += 7) + val64 |= ((uint64_t)byte & 0x7F) << bitpos; + + if(bitpos == 70) { + upb_seterr(d->status, UPB_ERROR, "Varint was unterminated after 10 bytes.\n"); + return false; + } else if (d->status->code == UPB_EOF && (byte & 0x80)) { + upb_seterr(d->status, UPB_ERROR, + "Provided data ended in the middle of a varint.\n"); + return false; } else { // Success. - return; + upb_value_setint64(val, val64); + return true; } } -INLINE bool upb_decode_tag(upb_decoder *d, const uint8_t **_ptr, - const uint8_t **len, upb_tag *tag) { - const uint8_t *ptr = *_ptr, *len = *_end; +typedef struct { + upb_wire_type_t wire_type; + upb_field_number_t field_number; +} upb_tag; + +INLINE bool upb_decode_tag(upb_decoder *d, upb_dstate *s, upb_tag *tag) { + const char *p = s->ptr; uint32_t tag_int; + upb_value val; // Nearly all tag varints will be either 1 byte (1-16) or 2 bytes (17-2048). - if (len - ptr < 2) goto slow; // unlikely. - tag_int = *ptr & 0x7f; - if ((*(ptr++) & 0x80) == 0) goto done; // predictable if fields are in order - tag_int |= (*ptr & 0x7f) << 7; - if ((*(ptr++) & 0x80) != 0) goto slow; // unlikely. + if (s->len < 2) goto slow; // unlikely. + tag_int = *p & 0x7f; + if ((*(p++) & 0x80) == 0) goto done; // predictable if fields are in order + tag_int |= (*p & 0x7f) << 7; + if ((*(p++) & 0x80) == 0) goto done; // likely slow: - if (!upb_decode_varint_slow(d, _ptr, _end)) return false; - ptr = *_ptr; // Trick the next line into not overwriting us. + // Decode a full varint starting over from ptr. + if (!upb_decode_varint_slow(d, s, &val)) return false; + tag_int = upb_value_getint64(val); + p = s->ptr; // Trick the next line into not overwriting us. done: - *_ptr = ptr; + upb_dstate_advance(s, p - s->ptr); tag->wire_type = (upb_wire_type_t)(tag_int & 0x07); tag->field_number = tag_int >> 3; return true; } -INLINE bool upb_decode_varint(upb_decoder *d, ptrs *p, - uint32_t *low, uint32_t *high) { - if (p->len - p->ptr >= UPB_MAX_VARINT_ENCODED_SIZE) - return upb_decode_varint_fast(d); - else - return upb_decode_varint_slow(d); +INLINE bool upb_decode_varint(upb_decoder *d, upb_dstate *s, upb_value *val) { + if (s->len >= UPB_MAX_VARINT_ENCODED_SIZE) { + // Common (fast) case. + uint64_t val64; + const char *p = s->ptr; + if (!upb_decode_varint_fast(&p, &val64, d->status)) return false; + upb_dstate_advance(s, p - s->ptr); + upb_value_setint64(val, val64); + return true; + } else { + return upb_decode_varint_slow(d, s, val); + } } INLINE bool upb_decode_fixed(upb_decoder *d, upb_wire_type_t wt, - uint8_t **ptr, uint8_t **len, upb_value *val) { - static const char table = {0, 8, 0, 0, 0, 4}; + upb_dstate *s, upb_value *val) { + static const char table[] = {0, 8, 0, 0, 0, 4}; size_t bytes = table[wt]; - if (*len - *ptr >= bytes) { + if (s->len >= bytes) { // Common (fast) case. - memcpy(&val, *ptr, bytes); - *ptr += bytes; + memcpy(&val, s->ptr, bytes); + upb_dstate_advance(s, bytes); } else { - if (!upb_getptr(d, &val, bytes, ptr, len)) return false; + if (!upb_getbuf(d, &val, bytes, s)) return false; } return true; } // "val" initially holds the length of the string, this is replaced by the // contents of the string. -INLINE bool upb_decode_string(upb_decoder *d, upb_value *val, upb_string **str) { +INLINE bool upb_decode_string(upb_decoder *d, upb_value *val, upb_string **str, + upb_dstate *s) { upb_string_recycle(str); - upb_strlen_t len = upb_valu_getint32(*val); - if (*len - *ptr >= len) { + uint32_t strlen = upb_value_getint32(*val); + if (s->len >= strlen) { // Common (fast) case. - upb_string_substr(*str, d->buf, *ptr - upb_string_getrobuf(d->buf), len); - *ptr += len; + upb_string_substr(*str, d->buf, s->ptr - upb_string_getrobuf(d->buf), strlen); + upb_dstate_advance(s, strlen); } else { - if (!upb_getbuf(d, upb_string_getrwbuf(*str, len), len, ptr, len)) + if (!upb_getbuf(d, upb_string_getrwbuf(*str, strlen), strlen, s)) return false; } return true; @@ -204,21 +252,22 @@ INLINE bool upb_decode_string(upb_decoder *d, upb_value *val, upb_string **str) extern upb_wire_type_t upb_expected_wire_types[]; // Returns true if wt is the correct on-the-wire type for ft. -INLINE bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) { +INLINE bool upb_check_type(upb_wire_type_t wt, upb_fieldtype_t ft) { // This doesn't currently support packed arrays. - return upb_types[ft].expected_wire_type == wt; + return upb_types[ft].native_wire_type == wt; } -static upb_flow_t upb_push(upb_decoder *d, upb_fielddef *f, - upb_strlen_t submsg_len, upb_field_type_t type) { +static upb_flow_t upb_push(upb_decoder *d, upb_dstate *s, upb_fielddef *f, + upb_strlen_t submsg_len, upb_fieldtype_t type) { d->top->field = f; d->top++; if(d->top >= d->limit) { - upb_seterr(status, UPB_ERROR, "Nesting too deep."); + upb_seterr(d->status, UPB_ERROR, "Nesting too deep."); return UPB_ERROR; } - d->top->end_offset = type == UPB_TYPE(GROUP) ? - UPB_GROUP_END_OFFSET : d->completed_offset + submsg_len; + d->top->end_offset = (type == UPB_TYPE(GROUP)) ? + UPB_GROUP_END_OFFSET : + d->buf_stream_offset + (s->ptr - upb_string_getrobuf(d->buf)) + submsg_len; d->top->msgdef = upb_downcast_msgdef(f->def); return upb_dispatch_startsubmsg(&d->dispatcher, f); } @@ -229,15 +278,11 @@ static upb_flow_t upb_pop(upb_decoder *d) { } void upb_decoder_run(upb_src *src, upb_status *status) { - // We use stack variables for our frequently used vars so the compiler knows - // they can't be changed by external code (like when we dispatch a callback). - - // Our current position in the data buffer. - uint8_t *ptr = NULL; - // Number of bytes available at ptr, until either end-of-buf or - // end-of-submessage (whichever is smaller). - size_t len = 0; - + upb_decoder *d = (upb_decoder*)src; + // We put our dstate on the stack so the compiler knows they can't be changed + // by external code (like when we dispatch a callback). We must be sure not + // to let its address escape this source file. + upb_dstate state = {NULL, 0, d->top->msgdef}; upb_string *str = NULL; // TODO: handle UPB_SKIPSUBMSG @@ -250,14 +295,14 @@ void upb_decoder_run(upb_src *src, upb_status *status) { while(1) { // Parse/handle tag. upb_tag tag; - CHECK(upb_decode_tag(d, &ptr, &len, &tag)); + CHECK(upb_decode_tag(d, &state, &tag)); // Decode wire data. Hopefully this branch will predict pretty well // since most types will read a varint here. upb_value val; switch (tag.wire_type) { case UPB_WIRE_TYPE_END_GROUP: - if(d->top->end_offset != UPB_GROUP_END_OFFSET) + if(d->top->end_offset != UPB_GROUP_END_OFFSET) { upb_seterr(status, UPB_ERROR, "Unexpected END_GROUP tag."); goto err; } @@ -266,21 +311,21 @@ void upb_decoder_run(upb_src *src, upb_status *status) { case UPB_WIRE_TYPE_VARINT: case UPB_WIRE_TYPE_DELIMITED: // For the delimited case we are parsing the length. - CHECK(upb_decode_varint(d, &ptr, &len, &val)); + CHECK(upb_decode_varint(d, &state, &val)); break; case UPB_WIRE_TYPE_32BIT: case UPB_WIRE_TYPE_64BIT: - CHECK(upb_decode_fixed(d, tag.wire_type, &ptr, &len, &val)); + CHECK(upb_decode_fixed(d, tag.wire_type, &state, &val)); break; } // Look up field by tag number. - upb_fielddef *f = upb_msg_itof(d->top->msgdef, tag.field_number); + upb_fielddef *f = upb_msgdef_itof(d->top->msgdef, tag.field_number); if (!f) { if (tag.wire_type == UPB_WIRE_TYPE_DELIMITED) - CHECK(upb_decode_string(d, &val, &str)); - CHECK_FLOW(upb_dispatch_unknownval(d, tag.field_number, val)); + CHECK(upb_decode_string(d, &val, &str, &state)); + CHECK_FLOW(upb_dispatch_unknownval(&d->dispatcher, tag.field_number, val)); } else if (!upb_check_type(tag.wire_type, f->type)) { // TODO: put more details in this error msg. upb_seterr(status, UPB_ERROR, "Field had incorrect type."); @@ -298,11 +343,11 @@ void upb_decoder_run(upb_src *src, upb_status *status) { switch (f->type) { case UPB_TYPE(MESSAGE): case UPB_TYPE(GROUP): - CHECK_FLOW(upb_push(d, start, upb_value_getint32(val), f, status, &msgdef)); + CHECK_FLOW(upb_push(d, &state, f, upb_value_getint32(val), f->type)); continue; // We have no value to dispatch. case UPB_TYPE(STRING): case UPB_TYPE(BYTES): - CHECK(upb_decode_string(d, &val, &str)); + CHECK(upb_decode_string(d, &val, &str, &state)); break; case UPB_TYPE(SINT32): upb_value_setint32(&val, upb_zzdec_32(upb_value_getint32(val))); @@ -313,7 +358,7 @@ void upb_decoder_run(upb_src *src, upb_status *status) { default: break; // Other types need no further processing at this point. } - CHECK_FLOW(upb_dispatch_value(d->sink, f, val, status)); + CHECK_FLOW(upb_dispatch_value(&d->dispatcher, f, val)); } CHECK_FLOW(upb_dispatch_endmsg(&d->dispatcher)); @@ -329,7 +374,7 @@ void upb_decoder_sethandlers(upb_src *src, upb_handlers *handlers) { upb_decoder *d = (upb_decoder*)src; upb_dispatcher_reset(&d->dispatcher, handlers); d->top = d->stack; - d->completed_offset = 0; + d->buf_stream_offset = 0; d->top->msgdef = d->toplevel_msgdef; // The top-level message is not delimited (we can keep receiving data for it // indefinitely), so we treat it like a group. -- cgit v1.2.3