From fe659c8c93c464fcbcfb5739935a2e4341d01fd4 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Sun, 23 Jan 2011 18:59:31 -0800 Subject: Getting closer to a decoder that could actually compile and work. --- core/upb_stream.h | 7 +- core/upb_string.h | 6 ++ stream/upb_decoder.c | 207 +++++++++++++++++++++++++++------------------------ 3 files changed, 119 insertions(+), 101 deletions(-) diff --git a/core/upb_stream.h b/core/upb_stream.h index 54fd930..bf312a8 100644 --- a/core/upb_stream.h +++ b/core/upb_stream.h @@ -40,8 +40,11 @@ typedef enum { UPB_CONTINUE, // Stop processing for now; check status for details. If no status was set, - // a generic error will be returned. If the error is resumable, processing - // will resume by delivering this callback again. + // a generic error will be returned. If the error is resumable, it is not + // (yet) defined where processing will resume -- waiting for real-world + // examples of resumable decoders and resume-requiring clients. upb_src + // implementations that are not capable of resuming will override the return + // status to be non-resumable if a resumable status was set by the handlers. UPB_BREAK, // Skips to the end of the current submessage (or if we are at the top diff --git a/core/upb_string.h b/core/upb_string.h index 04c0ae9..1a7e06b 100644 --- a/core/upb_string.h +++ b/core/upb_string.h @@ -134,6 +134,12 @@ INLINE upb_strlen_t upb_string_len(upb_string *str) { return str->len; } INLINE const char *upb_string_getrobuf(upb_string *str) { return str->ptr; } INLINE void upb_string_endread(upb_string *str) { (void)str; } +// Convenience method for getting the end of the string. Calls +// upb_string_getrobuf() so inherits the caveats of calling that function. +INLINE const char *upb_string_getbufend(upb_string *str) { + return upb_string_getrobuf(str) + upb_string_len(str); +} + // Attempts to recycle the string "str" so it may be reused and have different // data written to it. After the function returns, "str" points to a writable // string, which is either the original string if it had no other references diff --git a/stream/upb_decoder.c b/stream/upb_decoder.c index fbd7eba..9a17451 100644 --- a/stream/upb_decoder.c +++ b/stream/upb_decoder.c @@ -13,23 +13,24 @@ /* Pure Decoding **************************************************************/ -// The key fast-path varint-decoding routine. There are a lot of possibilities -// for optimization/experimentation here. -INLINE bool upb_decode_varint_fast(uint8_t **buf, uint8_t *end, uint64_t &val, +// The key fast-path varint-decoding routine. Here we can assume we have at +// least UPB_MAX_ENCODED_SIZE bytes available. There are a lot of +// possibilities for optimization/experimentation here. +INLINE bool upb_decode_varint_fast(uint8_t **ptr, uint64_t &val, upb_status *status) { *high = 0; uint32_t b; uint8_t *ptr = p->ptr; - b = *(*buf++); *low = (b & 0x7f) ; if(!(b & 0x80)) goto done; - b = *(*buf++); *low |= (b & 0x7f) << 7; if(!(b & 0x80)) goto done; - b = *(*buf++); *low |= (b & 0x7f) << 14; if(!(b & 0x80)) goto done; - b = *(*buf++); *low |= (b & 0x7f) << 21; if(!(b & 0x80)) goto done; - b = *(*buf++); *low |= (b & 0x7f) << 28; + b = *(*ptr++); *low = (b & 0x7f) ; if(!(b & 0x80)) goto done; + b = *(*ptr++); *low |= (b & 0x7f) << 7; if(!(b & 0x80)) goto done; + b = *(*ptr++); *low |= (b & 0x7f) << 14; if(!(b & 0x80)) goto done; + b = *(*ptr++); *low |= (b & 0x7f) << 21; if(!(b & 0x80)) goto done; + b = *(*ptr++); *low |= (b & 0x7f) << 28; *high = (b & 0x7f) >> 3; if(!(b & 0x80)) goto done; - b = *(*buf++); *high |= (b & 0x7f) << 4; if(!(b & 0x80)) goto done; - b = *(*buf++); *high |= (b & 0x7f) << 11; if(!(b & 0x80)) goto done; - b = *(*buf++); *high |= (b & 0x7f) << 18; if(!(b & 0x80)) goto done; - b = *(*buf++); *high |= (b & 0x7f) << 25; if(!(b & 0x80)) goto done; + b = *(*ptr++); *high |= (b & 0x7f) << 4; if(!(b & 0x80)) goto done; + b = *(*ptr++); *high |= (b & 0x7f) << 11; if(!(b & 0x80)) goto done; + b = *(*ptr++); *high |= (b & 0x7f) << 18; if(!(b & 0x80)) goto done; + b = *(*ptr++); *high |= (b & 0x7f) << 25; if(!(b & 0x80)) goto done; upb_seterr(status, UPB_ERROR, "Unterminated varint"); return false; @@ -71,23 +72,51 @@ struct upb_decoder { // Current input buffer. upb_string *buf; - // Our current offset *within* buf. - upb_strlen_t buf_offset; - // The offset within the overall stream represented by the *beginning* of buf. upb_strlen_t buf_stream_offset; }; // Called only from the slow path, this function copies the next "len" bytes -// from the stream to "data", adjusting "buf" and "end" appropriately. -INLINE bool upb_getbuf(upb_decoder *d, void *data, size_t len, - uint8_t **buf, uint8_t **end) { - while (len > 0) { - memcpy(data, *buf, *end-*buf); - len -= (*end-*buf); - if (!upb_bytesrc_getstr(d->bytesrc, d->buf, d->status)) return false; - *buf = upb_string_getrobuf(d->buf); - *end = *buf + upb_string_len(d->buf); +// from the stream to "data", adjusting "buf" and "len" appropriately. +static bool upb_getbuf(upb_decoder *d, void *data, size_t bytes_wanted, + uint8_t **ptr, size_t *len) { + while (1) { + memcpy(data, *ptr, *len); + bytes_wanted -= *len; + *ptr += *len; + if (bytes_wanted == 0) return true; + + // Did "len" indicate end-of-submessage or end-of-buffer? + size_t buf_offset = d->buf ? (*ptr - upb_string_getrobuf(d->buf)) : 0; + if (d->top->end_offset > 0 && + d->top->end_offset == d->buf_stream_offset + buf_offset) { + // End-of-submessage. + if (bytes_wanted > 0) { + upb_seterr(d->status, UPB_ERROR, "Bad submessage end.") + return false; + } + if (upb_pop(d) != UPB_CONTINUE) return false; + } else { + // End-of-buffer. + if (d->buf) d->buf_stream_offset += upb_string_len(d->buf); + if (!upb_bytesrc_getstr(d->bytesrc, d->buf, d->status)) return false; + *ptr = upb_string_getrobuf(d->buf); + } + + // Wait for end-of-submessage or end-of-buffer, whichever comes first. + size_t offset_in_buf = *ptr - upb_string_getrobuf(d->buf); + size_t buf_remaining = upb_string_getbufend(d->buf) - *ptr; + size_t submsg_remaining = + d->top->end_offset - d->buf_stream_offset - offset_in_buf; + if (d->top->end_offset == UPB_GROUP_END_OFFSET || + buf_remaining > submsg_remaining) { + *len = buf_remaining; + } else { + // Check that non of our subtraction overflowed. + assert(d->top->end_offset > d->buf_stream_offset); + assert(d->top->end_offset - d->buf_stream_offset > offset_in_buf); + *len = submsg_remaining; + } } } @@ -112,21 +141,21 @@ static bool upb_decode_varint_slow(upb_decoder *d) { } } -INLINE bool upb_decode_tag(upb_decoder *d, const uint8_t **_buf, - const uint8_t **end, upb_tag *tag) { - const uint8_t *buf = *_buf, *end = *_end; +INLINE bool upb_decode_tag(upb_decoder *d, const uint8_t **_ptr, + const uint8_t **len, upb_tag *tag) { + const uint8_t *ptr = *_ptr, *len = *_end; uint32_t tag_int; // Nearly all tag varints will be either 1 byte (1-16) or 2 bytes (17-2048). - if (end - buf < 2) goto slow; // unlikely. - tag_int = *buf & 0x7f; - if ((*(buf++) & 0x80) == 0) goto done; // predictable if fields are in order - tag_int |= (*buf & 0x7f) << 7; - if ((*(buf++) & 0x80) != 0) goto slow; // unlikely. + if (len - ptr < 2) goto slow; // unlikely. + tag_int = *ptr & 0x7f; + if ((*(ptr++) & 0x80) == 0) goto done; // predictable if fields are in order + tag_int |= (*ptr & 0x7f) << 7; + if ((*(ptr++) & 0x80) != 0) goto slow; // unlikely. slow: - if (!upb_decode_varint_slow(d, _buf, _end)) return false; - buf = *_buf; // Trick the next line into not overwriting us. + if (!upb_decode_varint_slow(d, _ptr, _end)) return false; + ptr = *_ptr; // Trick the next line into not overwriting us. done: - *_buf = buf; + *_ptr = ptr; tag->wire_type = (upb_wire_type_t)(tag_int & 0x07); tag->field_number = tag_int >> 3; return true; @@ -134,22 +163,22 @@ done: INLINE bool upb_decode_varint(upb_decoder *d, ptrs *p, uint32_t *low, uint32_t *high) { - if (p->end - p->ptr >= UPB_MAX_VARINT_ENCODED_SIZE) + if (p->len - p->ptr >= UPB_MAX_VARINT_ENCODED_SIZE) return upb_decode_varint_fast(d); else return upb_decode_varint_slow(d); } INLINE bool upb_decode_fixed(upb_decoder *d, upb_wire_type_t wt, - uint8_t **buf, uint8_t **end, upb_value *val) { + uint8_t **ptr, uint8_t **len, upb_value *val) { static const char table = {0, 8, 0, 0, 0, 4}; size_t bytes = table[wt]; - if (*end - *buf >= bytes) { + if (*len - *ptr >= bytes) { // Common (fast) case. - memcpy(&val, *buf, bytes); - *buf += bytes; + memcpy(&val, *ptr, bytes); + *ptr += bytes; } else { - if (!upb_getbuf(d, &val, bytes, buf, end)) return false; + if (!upb_getptr(d, &val, bytes, ptr, len)) return false; } return true; } @@ -159,12 +188,12 @@ INLINE bool upb_decode_fixed(upb_decoder *d, upb_wire_type_t wt, INLINE bool upb_decode_string(upb_decoder *d, upb_value *val, upb_string **str) { upb_string_recycle(str); upb_strlen_t len = upb_valu_getint32(*val); - if (*end - *buf >= len) { + if (*len - *ptr >= len) { // Common (fast) case. - upb_string_substr(*str, d->buf, *buf - upb_string_getrobuf(d->buf), len); - *buf += len; + upb_string_substr(*str, d->buf, *ptr - upb_string_getrobuf(d->buf), len); + *ptr += len; } else { - if (!upb_getbuf(d, upb_string_getrwbuf(*str, len), len, buf, end)) + if (!upb_getbuf(d, upb_string_getrwbuf(*str, len), len, ptr, len)) return false; } return true; @@ -173,19 +202,6 @@ INLINE bool upb_decode_string(upb_decoder *d, upb_value *val, upb_string **str) /* The main decoding loop *****************************************************/ -static const void *get_msgend(upb_decoder *d) -{ - if(d->top->end_offset > 0) - return upb_string_getrobuf(d->buf) + (d->top->end_offset - d->buf_stream_offset); - else - return (void*)UINTPTR_MAX; // group. -} - -static bool isgroup(const void *submsg_end) -{ - return submsg_end == (void*)UINTPTR_MAX; -} - extern upb_wire_type_t upb_expected_wire_types[]; // Returns true if wt is the correct on-the-wire type for ft. INLINE bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) { @@ -193,76 +209,78 @@ INLINE bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) { return upb_types[ft].expected_wire_type == wt; } -static bool upb_push(upb_decoder *d, const uint8_t *start, - uint32_t submsg_len, upb_fielddef *f, - upb_status *status) -{ +static upb_flow_t upb_push(upb_decoder *d, upb_fielddef *f, + upb_strlen_t submsg_len, upb_field_type_t type) { d->top->field = f; d->top++; if(d->top >= d->limit) { upb_seterr(status, UPB_ERROR, "Nesting too deep."); - return false; + return UPB_ERROR; } - d->top->end_offset = d->completed_offset + submsg_len; + d->top->end_offset = type == UPB_TYPE(GROUP) ? + UPB_GROUP_END_OFFSET : d->completed_offset + submsg_len; d->top->msgdef = upb_downcast_msgdef(f->def); - *submsg_end = get_msgend(d); - if (!upb_dispatch_startsubmsg(&d->dispatcher, f)) return false; - return true; + return upb_dispatch_startsubmsg(&d->dispatcher, f); } -static bool upb_pop(upb_decoder *d, const uint8_t *start, upb_status *status) -{ +static upb_flow_t upb_pop(upb_decoder *d) { d->top--; - upb_dispatch_endsubmsg(&d->dispatcher); - *submsg_end = get_msgend(d); - return true; + return upb_dispatch_endsubmsg(&d->dispatcher); } void upb_decoder_run(upb_src *src, upb_status *status) { - // buf is our current offset, moves from start to end. - const uint8_t *buf = (uint8_t*)upb_string_getrobuf(str) + d->buf_offset; - const uint8_t *end = (uint8_t*)upb_string_getrobuf(str) + upb_string_len(str); - const uint8_t *submsg_end = get_msgend(d, start); - upb_msgdef *msgdef = d->top->msgdef; + // We use stack variables for our frequently used vars so the compiler knows + // they can't be changed by external code (like when we dispatch a callback). + + // Our current position in the data buffer. + uint8_t *ptr = NULL; + // Number of bytes available at ptr, until either end-of-buf or + // end-of-submessage (whichever is smaller). + size_t len = 0; + upb_string *str = NULL; - upb_dispatch_startmsg(&d->dispatcher); +// TODO: handle UPB_SKIPSUBMSG +#define CHECK_FLOW(expr) if ((expr) != UPB_CONTINUE) goto err +#define CHECK(expr) if (!expr) goto err; + + CHECK_FLOW(upb_dispatch_startmsg(&d->dispatcher)); // Main loop: executed once per tag/field pair. while(1) { // Parse/handle tag. upb_tag tag; - CHECK(upb_decode_tag(d, &buf, &end, &tag)); + CHECK(upb_decode_tag(d, &ptr, &len, &tag)); // Decode wire data. Hopefully this branch will predict pretty well // since most types will read a varint here. upb_value val; switch (tag.wire_type) { case UPB_WIRE_TYPE_END_GROUP: - if(!isgroup(submsg_end)) { + if(d->top->end_offset != UPB_GROUP_END_OFFSET) upb_seterr(status, UPB_ERROR, "Unexpected END_GROUP tag."); goto err; } - CHECK(upb_pop(d, start, status, &msgdef, &submsg_end)); - goto check_msgend; // We have no value to dispatch. + CHECK_FLOW(upb_pop(d)); + continue; // We have no value to dispatch. case UPB_WIRE_TYPE_VARINT: case UPB_WIRE_TYPE_DELIMITED: // For the delimited case we are parsing the length. - CHECK(upb_decode_varint(d, &buf, &end, &val)); + CHECK(upb_decode_varint(d, &ptr, &len, &val)); break; case UPB_WIRE_TYPE_32BIT: case UPB_WIRE_TYPE_64BIT: - CHECK(upb_decode_fixed(d, tag.wire_type, &buf, &end, &val)); + CHECK(upb_decode_fixed(d, tag.wire_type, &ptr, &len, &val)); break; } // Look up field by tag number. - upb_fielddef *f = upb_msg_itof(msgdef, tag.field_number); + upb_fielddef *f = upb_msg_itof(d->top->msgdef, tag.field_number); if (!f) { if (tag.wire_type == UPB_WIRE_TYPE_DELIMITED) CHECK(upb_decode_string(d, &val, &str)); - CHECK(upb_dispatch_unknownval(d, tag.field_number, val)); + CHECK_FLOW(upb_dispatch_unknownval(d, tag.field_number, val)); } else if (!upb_check_type(tag.wire_type, f->type)) { // TODO: put more details in this error msg. upb_seterr(status, UPB_ERROR, "Field had incorrect type."); @@ -280,8 +298,8 @@ void upb_decoder_run(upb_src *src, upb_status *status) { switch (f->type) { case UPB_TYPE(MESSAGE): case UPB_TYPE(GROUP): - CHECK(upb_push(d, start, upb_value_getint32(val), f, status, &msgdef)); - goto check_msgend; // We have no value to dispatch. + CHECK_FLOW(upb_push(d, start, upb_value_getint32(val), f, status, &msgdef)); + continue; // We have no value to dispatch. case UPB_TYPE(STRING): case UPB_TYPE(BYTES): CHECK(upb_decode_string(d, &val, &str)); @@ -295,19 +313,10 @@ void upb_decoder_run(upb_src *src, upb_status *status) { default: break; // Other types need no further processing at this point. } - CHECK(upb_dispatch_value(d->sink, f, val, status)); - -check_msgend: - while(buf >= submsg_end) { - if(buf > submsg_end) { - upb_seterr(status, UPB_ERROR, "Bad submessage end.") - goto err; - } - CHECK(upb_pop(d, start, status, &msgdef, &submsg_end)); - } + CHECK_FLOW(upb_dispatch_value(d->sink, f, val, status)); } - CHECK(upb_dispatch_endmsg(&d->dispatcher)); + CHECK_FLOW(upb_dispatch_endmsg(&d->dispatcher)); return; err: -- cgit v1.2.3