From 2d21601d477dfc242d31acabdad119a56d2b7945 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Mon, 29 Jun 2009 17:54:50 -0700 Subject: Reworked low-level parsing: smaller, safer. --- Makefile | 2 +- tests.c | 18 +++++-- upb.h | 3 ++ upb_parse.c | 175 ++++++++++++++++++++++++++++++++++-------------------------- upb_parse.h | 40 ++++---------- 5 files changed, 126 insertions(+), 112 deletions(-) diff --git a/Makefile b/Makefile index 0ac85b6..7bc24fe 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ CC=gcc CXX=g++ CFLAGS=-std=c99 -CPPFLAGS=-O3 -Wall -Wextra -pedantic -g -DUPB_UNALIGNED_READS_OK -fomit-frame-pointer +CPPFLAGS=-O3 -DNDEBUG -Wall -Wextra -pedantic -g -DUPB_UNALIGNED_READS_OK -fomit-frame-pointer OBJ=upb_parse.o upb_table.o upb_msg.o upb_context.o descriptor.o all: $(OBJ) test_table tests clean: diff --git a/tests.c b/tests.c index 854dbf1..128e43b 100644 --- a/tests.c +++ b/tests.c @@ -10,7 +10,7 @@ void test_get_v_uint64_t() uint8_t zero[] = {0x00}; void *zero_buf = zero; uint64_t zero_val = 0; - status = get_v_uint64_t(&zero_buf, &zero_val); + status = get_v_uint64_t(&zero_buf, sizeof(zero), &zero_val); assert(status == UPB_STATUS_OK); assert(zero_val == 0); assert(zero_buf == zero + sizeof(zero)); @@ -18,29 +18,37 @@ void test_get_v_uint64_t() uint8_t one[] = {0x01}; void *one_buf = one; uint64_t one_val = 0; - status = get_v_uint64_t(&one_buf, &one_val); + status = get_v_uint64_t(&one_buf, sizeof(one), &one_val); assert(status == UPB_STATUS_OK); assert(one_val == 1); + assert(one_buf == one + sizeof(one)); uint8_t twobyte[] = {0xAC, 0x02}; void *twobyte_buf = twobyte; uint64_t twobyte_val = 0; - status = get_v_uint64_t(&twobyte_buf, &twobyte_val); + status = get_v_uint64_t(&twobyte_buf, sizeof(twobyte), &twobyte_val); assert(status == UPB_STATUS_OK); assert(twobyte_val == 300); + assert(twobyte_buf == twobyte + sizeof(twobyte)); uint8_t tenbyte[] = {0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x7F}; void *tenbyte_buf = tenbyte; uint64_t tenbyte_val = 0; - status = get_v_uint64_t(&tenbyte_buf, &tenbyte_val); + status = get_v_uint64_t(&tenbyte_buf, sizeof(tenbyte), &tenbyte_val); assert(status == UPB_STATUS_OK); assert(tenbyte_val == 0x89101c305080c101); + assert(tenbyte_buf == tenbyte + sizeof(tenbyte)); uint8_t elevenbyte[] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01}; void *elevenbyte_buf = elevenbyte; uint64_t elevenbyte_val = 0; - status = get_v_uint64_t(&elevenbyte_buf, &elevenbyte_val); + status = get_v_uint64_t(&elevenbyte_buf, sizeof(elevenbyte), &elevenbyte_val); assert(status == UPB_ERROR_UNTERMINATED_VARINT); + status = get_v_uint64_t(&elevenbyte_buf, sizeof(elevenbyte)-1, &elevenbyte_val); + /* Byte 10 is 0x80, so we know it's unterminated. */ + assert(status == UPB_ERROR_UNTERMINATED_VARINT); + status = get_v_uint64_t(&elevenbyte_buf, sizeof(elevenbyte)-2, &elevenbyte_val); + assert(status == UPB_STATUS_NEED_MORE_DATA); } int main() diff --git a/upb.h b/upb.h index 2e62b5a..3281a0e 100644 --- a/upb.h +++ b/upb.h @@ -110,6 +110,9 @@ union upb_symbol_ref { typedef enum upb_status { UPB_STATUS_OK = 0, + // The input byte stream ended in the middle of a record. + UPB_STATUS_NEED_MORE_DATA = 1, + // A varint did not terminate before hitting 64 bits. UPB_ERROR_UNTERMINATED_VARINT = -1, diff --git a/upb_parse.c b/upb_parse.c index 57cca2b..f45ff1f 100644 --- a/upb_parse.c +++ b/upb_parse.c @@ -30,88 +30,93 @@ * To avoid branches, none of these do bounds checking. So we force clients * to overallocate their buffers by >=9 bytes. */ -static upb_status_t get_v_uint64_t(void *restrict *buf, +static size_t min(size_t a, size_t b) { return a < b ? a : b; } + +static upb_status_t get_v_uint64_t(void *restrict *buf, size_t len, uint64_t *restrict val) { - uint8_t *ptr = *buf, b; - uint32_t part0 = 0, part1 = 0, part2 = 0; - - /* From the original proto2 implementation. */ - b = *(ptr++); part0 = (b & 0x7F) ; if (!(b & 0x80)) goto done; - b = *(ptr++); part0 |= (b & 0x7F) << 7; if (!(b & 0x80)) goto done; - b = *(ptr++); part0 |= (b & 0x7F) << 14; if (!(b & 0x80)) goto done; - b = *(ptr++); part0 |= (b & 0x7F) << 21; if (!(b & 0x80)) goto done; - b = *(ptr++); part1 = (b & 0x7F) ; if (!(b & 0x80)) goto done; - b = *(ptr++); part1 |= (b & 0x7F) << 7; if (!(b & 0x80)) goto done; - b = *(ptr++); part1 |= (b & 0x7F) << 14; if (!(b & 0x80)) goto done; - b = *(ptr++); part1 |= (b & 0x7F) << 21; if (!(b & 0x80)) goto done; - b = *(ptr++); part2 = (b & 0x7F) ; if (!(b & 0x80)) goto done; - b = *(ptr++); part2 |= (b & 0x7F) << 7; if (!(b & 0x80)) goto done; - return UPB_ERROR_UNTERMINATED_VARINT; - -done: - *buf = ptr; - *val = (uint64_t)part0 | ((uint64_t)part1 << 28) | ((uint64_t)part2 << 56); - return UPB_STATUS_OK; + uint32_t bitpos, bytes = min(len, 10); + uint8_t *b = *buf; + uint8_t *end = b + bytes; + uint8_t last = 0x80; + *val = 0; + for(bitpos = 0; b < end && (last & 0x80); b++, bitpos += 7) + *val |= ((uint64_t)((last = *b) & 0x7F)) << bitpos; + + if(unlikely(last & 0x80)) { + return bytes < 10 ? UPB_STATUS_NEED_MORE_DATA : UPB_ERROR_UNTERMINATED_VARINT; + } else { + *buf = b; + return UPB_STATUS_OK; + } } -static upb_status_t skip_v_uint64_t(void **buf) +static upb_status_t skip_v_uint64_t(void **buf, size_t len) { + uint32_t bytes = min(len, 10); uint8_t *b = *buf; - for(int i = 0; i < 10; i++, b++) { - if(!(*b & 0x80)) { - *buf = b; - return UPB_STATUS_OK; - } + uint8_t *end = b + bytes; + uint8_t last = 0x80; + for(; b < end && (last & 0x80); b++) + last = *b; + + if(unlikely(b == end)) { + return bytes < 10 ? UPB_STATUS_NEED_MORE_DATA : UPB_ERROR_UNTERMINATED_VARINT; + } else { + *buf = b; + return UPB_STATUS_OK; } - return UPB_ERROR_UNTERMINATED_VARINT; } -static upb_status_t get_v_uint32_t(void *restrict *buf, +static upb_status_t get_v_uint32_t(void *restrict *buf, size_t len, uint32_t *restrict val) { - uint8_t *ptr = *buf, b; - uint32_t result; - - /* From the original proto2 implementation. */ - b = *(ptr++); result = (b & 0x7F) ; if (!(b & 0x80)) goto done; - b = *(ptr++); result |= (b & 0x7F) << 7; if (!(b & 0x80)) goto done; - b = *(ptr++); result |= (b & 0x7F) << 14; if (!(b & 0x80)) goto done; - b = *(ptr++); result |= (b & 0x7F) << 21; if (!(b & 0x80)) goto done; - b = *(ptr++); result = (b & 0x7F) << 28; if (!(b & 0x80)) goto done; - return UPB_ERROR_UNTERMINATED_VARINT; - -done: - *buf = ptr; - *val = result; - return UPB_STATUS_OK; + uint32_t bitpos, bytes = min(len, 5); + uint8_t *b = *buf; + uint8_t *end = b + bytes; + uint8_t last = 0x80; + *val = 0; + for(bitpos = 0; b < end && (last & 0x80); b++, bitpos += 7) + *val |= ((uint32_t)((last = *b) & 0x7F)) << bitpos; + + if(unlikely(b == end)) { + return bytes < 5 ? UPB_STATUS_NEED_MORE_DATA : UPB_ERROR_UNTERMINATED_VARINT; + } else { + *buf = b; + return UPB_STATUS_OK; + } } #define SHL(val, bits) ((uint32_t)val << bits) -static upb_status_t get_f_uint32_t(void *restrict *buf, +static upb_status_t get_f_uint32_t(void *restrict *buf, size_t len, uint32_t *restrict val) { + const uint8_t size = sizeof(uint32_t); + if(unlikely(len < size)) return UPB_STATUS_NEED_MORE_DATA; uint8_t *b = *buf; #if UPB_UNALIGNED_READS_OK *val = *(uint32_t*)b; #else *val = SHL(b[0], 0) | SHL(b[1], 8) | SHL(b[2], 16) | SHL(b[3], 24); #endif - b += sizeof(uint32_t); + b += size; *buf = b; return UPB_STATUS_OK; } #undef SHL -static upb_status_t skip_f_uint32_t(void **buf) +static upb_status_t skip_f_uint32_t(void **buf, size_t len) { - *buf = (char*)*buf + sizeof(uint32_t); + const uint8_t size = sizeof(uint32_t); + if(unlikely(len < size)) return UPB_STATUS_NEED_MORE_DATA; + *buf = (char*)*buf + size; return UPB_STATUS_OK; } -static upb_status_t get_f_uint64_t(void *restrict *buf, +static upb_status_t get_f_uint64_t(void *restrict *buf, size_t len, uint64_t *restrict val) { + if(unlikely(len < sizeof(uint64_t))) return UPB_STATUS_NEED_MORE_DATA; #if UPB_UNALIGNED_READS_OK *val = *(uint64_t*)*buf; *buf = (char*)*buf + sizeof(uint64_t); @@ -124,9 +129,11 @@ static upb_status_t get_f_uint64_t(void *restrict *buf, return UPB_STATUS_OK; } -static upb_status_t skip_f_uint64_t(void **buf) +static upb_status_t skip_f_uint64_t(void **buf, size_t len) { - *buf = (char*)*buf + sizeof(uint64_t); + const uint8_t size = sizeof(uint64_t); + if(unlikely(len < size)) return UPB_STATUS_NEED_MORE_DATA; + *buf = (char*)*buf + size; return UPB_STATUS_OK; } @@ -140,10 +147,10 @@ static int64_t zz_decode_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); } static void wvtov_ ## type(wire_t s, val_t *d) #define GET(type, v_or_f, wire_t, val_t, member_name) \ - static upb_status_t get_ ## type(void **buf, union upb_value *d) { \ + static upb_status_t get_ ## type(void **buf, size_t len, val_t *d) { \ wire_t tmp; \ - CHECK(get_ ## v_or_f ## _ ## wire_t(buf, &tmp)); \ - wvtov_ ## type(tmp, &d->member_name); \ + CHECK(get_ ## v_or_f ## _ ## wire_t(buf, len, &tmp)); \ + wvtov_ ## type(tmp, d); \ return UPB_STATUS_OK; \ } @@ -194,27 +201,27 @@ struct upb_type_info upb_type_info[] = { [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_SINT64] = {alignof(int64_t), sizeof(int64_t), UPB_WIRE_TYPE_VARINT}, }; -upb_status_t upb_parse_tag(void **buf, struct upb_tag *tag) +upb_status_t upb_parse_tag(void **buf, size_t len, struct upb_tag *tag) { uint32_t tag_int; - CHECK(get_v_uint32_t(buf, &tag_int)); + CHECK(get_v_uint32_t(buf, len, &tag_int)); tag->wire_type = (upb_wire_type_t)(tag_int & 0x07); tag->field_number = tag_int >> 3; return UPB_STATUS_OK; } -upb_status_t upb_parse_wire_value(void *buf, size_t *offset, +upb_status_t upb_parse_wire_value(void *buf, size_t len, size_t *offset, upb_wire_type_t wt, union upb_wire_value *wv) { #define READ(expr) CHECK(expr); *offset += ((char*)b-(char*)buf) void *b = buf; switch(wt) { - case UPB_WIRE_TYPE_VARINT: READ(get_v_uint64_t(&b, &wv->varint)); break; - case UPB_WIRE_TYPE_64BIT: READ(get_f_uint64_t(&b, &wv->_64bit)); break; - case UPB_WIRE_TYPE_32BIT: READ(get_f_uint32_t(&b, &wv->_32bit)); break; + case UPB_WIRE_TYPE_VARINT: READ(get_v_uint64_t(&b, len, &wv->varint)); break; + case UPB_WIRE_TYPE_64BIT: READ(get_f_uint64_t(&b, len, &wv->_64bit)); break; + case UPB_WIRE_TYPE_32BIT: READ(get_f_uint32_t(&b, len, &wv->_32bit)); break; case UPB_WIRE_TYPE_DELIMITED: - READ(get_v_uint32_t(&b, &wv->_32bit)); + READ(get_v_uint32_t(&b, len, &wv->_32bit)); size_t new_offset = *offset + wv->_32bit; if (new_offset < *offset) return UPB_ERROR_OVERFLOW; *offset = new_offset; @@ -225,19 +232,19 @@ upb_status_t upb_parse_wire_value(void *buf, size_t *offset, return UPB_STATUS_OK; } -upb_status_t upb_skip_wire_value(void *buf, size_t *offset, +upb_status_t upb_skip_wire_value(void *buf, size_t len, size_t *offset, upb_wire_type_t wt) { void *b = buf; switch(wt) { - case UPB_WIRE_TYPE_VARINT: READ(skip_v_uint64_t(&b)); break; - case UPB_WIRE_TYPE_64BIT: READ(skip_f_uint64_t(&b)); break; - case UPB_WIRE_TYPE_32BIT: READ(skip_f_uint32_t(&b)); break; + case UPB_WIRE_TYPE_VARINT: READ(skip_v_uint64_t(&b, len)); break; + case UPB_WIRE_TYPE_64BIT: READ(skip_f_uint64_t(&b, len)); break; + case UPB_WIRE_TYPE_32BIT: READ(skip_f_uint32_t(&b, len)); break; case UPB_WIRE_TYPE_DELIMITED: { /* Have to get (not skip) the length to skip the bytes. */ - uint32_t len; - READ(get_v_uint32_t(&b, &len)); - size_t new_offset = *offset + len; + uint32_t delim_len; + READ(get_v_uint32_t(&b, len, &delim_len)); + size_t new_offset = *offset + delim_len; if (new_offset < *offset) return UPB_ERROR_OVERFLOW; *offset = new_offset; break; @@ -249,19 +256,31 @@ upb_status_t upb_skip_wire_value(void *buf, size_t *offset, #undef READ } -upb_status_t upb_parse_value(void **b, upb_field_type_t ft, +upb_status_t upb_parse_value(void **b, size_t len, upb_field_type_t ft, union upb_value *v) { -#define CASE(t) \ - case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_ ## t: return get_ ## t(b, v); +#define CASE(t, member_name) \ + case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_ ## t: \ + return get_ ## t(b, len, &v->member_name); switch(ft) { - CASE(DOUBLE) CASE(FLOAT) CASE(INT64) CASE(UINT64) CASE(INT32) CASE(FIXED64) - CASE(FIXED32) CASE(BOOL) CASE(UINT32) CASE(ENUM) CASE(SFIXED32) - CASE(SFIXED64) CASE(SINT32) CASE(SINT64) + CASE(DOUBLE, _double) + CASE(FLOAT, _float) + CASE(INT32, int32) + CASE(INT64, int64) + CASE(UINT32, uint32) + CASE(UINT64, uint64) + CASE(SINT32, int32) + CASE(SINT64, int64) + CASE(FIXED32, uint32) + CASE(FIXED64, uint64) + CASE(SFIXED32, int32) + CASE(SFIXED64, int64) + CASE(BOOL, _bool) + CASE(ENUM, int32) case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_BYTES: case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_STRING: case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE: - return get_UINT32(b, v); + return get_INT32(b, len, &v->int32); default: return 0; /* Including GROUP -- groups have no value. */ } #undef CASE @@ -301,6 +320,7 @@ static upb_status_t push_stack_frame(struct upb_parse_state *s, size_t end, return UPB_STATUS_OK; } +#if 0 upb_status_t upb_parse(struct upb_parse_state *s, void *buf, size_t len, size_t *read) { @@ -314,7 +334,7 @@ upb_status_t upb_parse(struct upb_parse_state *s, void *buf, size_t len, struct upb_tag tag; void *b = buf; - CHECK(upb_parse_tag(&b, &tag)); + CHECK(upb_parse_tag(&b, len, &tag)); int tag_bytes = ((char*)b - (char*)buf); s->offset += tag_bytes; buf = b; @@ -325,7 +345,7 @@ upb_status_t upb_parse(struct upb_parse_state *s, void *buf, size_t len, } void *user_field_desc; - upb_field_type_t ft = s->tag_cb(s, &tag, &user_field_desc); + //upb_field_type_t ft = s->tag_cb(s, &tag, &user_field_desc); if(ft == 0) { CHECK(upb_skip_wire_value(b, &s->offset, tag.wire_type)); } else if(ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_GROUP) { @@ -356,3 +376,4 @@ upb_status_t upb_parse(struct upb_parse_state *s, void *buf, size_t len, *read = s->offset - start_offset; return UPB_STATUS_OK; } +#endif diff --git a/upb_parse.h b/upb_parse.h index 4be272a..fbc5898 100644 --- a/upb_parse.h +++ b/upb_parse.h @@ -29,31 +29,14 @@ void upb_parse_state_init(struct upb_parse_state *state, size_t udata_size); void upb_parse_state_free(struct upb_parse_state *state); /* The callback that is called immediately after a tag has been parsed. The - * client uses it to decide if it wants to process this value or skip it. If - * it wants to process it, it must determine its specific .proto type (at this - * point we only know its wire type) and verify that it matches the wire type. - * The client will then return the .proto type. To skip the value, the client - * should return 0 (which is not a valid .proto type). + * client must either advance the stream beyond the corresponding value or + * return an error to indicate that the stream should rewind to before the + * tag. * - * The client can set user_field_desc to a record describing this field -- this - * pointer will be supplied to the value callback (for simple values) or the - * submsg_start callback (for submessages). - * - * TODO: there needs to be a way to skip a delimited field while still knowing - * its offset and length. That could be through this callback or it could be a - * separate callback. */ -typedef upb_field_type_t (*upb_tag_cb)(struct upb_parse_state *s, - struct upb_tag *tag, - void **user_field_desc); - -/* The callback that is called for individual values. This callback is only - * called when the previously invoked tag_cb has returned nonzero. It receives - * the parsed and converted value as well as the user_field_desc that was set - * by the tag_cb. Note that this function can be called several times in a row - * (ie. with no intervening tag_cb) in the case of packed arrays. For string - * data (bytes and string) str points to the beginning of the string. */ -typedef void (*upb_value_cb)(struct upb_parse_state *s, union upb_value *v, - void *str, void *user_field_desc); + * The client advances the stream beyond the corresponding value by either + * parsing the value or skipping it. */ +typedef upb_field_type_t (*upb_tag_cb)(void **buf, struct upb_parse_state *s, + struct upb_tag *tag); /* Callbacks that are called when a submessage begins and ends, respectively. * Both are called with the submessage's stack frame at the top of the stack. */ @@ -77,7 +60,6 @@ struct upb_parse_state { size_t packed_end_offset; /* 0 if not in a packed array. */ upb_field_type_t packed_type; upb_tag_cb tag_cb; - upb_value_cb value_cb; upb_submsg_start_cb submsg_start_cb; upb_submsg_end_cb submsg_end_cb; }; @@ -94,7 +76,7 @@ upb_status_t upb_parse(struct upb_parse_state *s, void *buf, size_t len, /* Parses a single tag from the character data starting at buf, and updates * buf to point one past the bytes that were consumed. buf will be incremented * by at most ten bytes. */ -upb_status_t upb_parse_tag(void **buf, struct upb_tag *tag); +upb_status_t upb_parse_tag(void **buf, size_t len, struct upb_tag *tag); extern upb_wire_type_t upb_expected_wire_types[]; /* Returns true if wt is the correct on-the-wire type for ft. */ @@ -106,19 +88,19 @@ INLINE bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) { * caller must have previously checked that the wire type is appropriate for * this field type. For delimited data, buf is advanced to the beginning of * the delimited data, not the end. */ -upb_status_t upb_parse_value(void **buf, upb_field_type_t ft, +upb_status_t upb_parse_value(void **buf, size_t len, upb_field_type_t ft, union upb_value *value); /* Parses a wire value with the given type (which must have been obtained from * a tag that was just parsed) and adds the number of bytes that were consumed * to *offset. For delimited types, offset is advanced past the delimited * data. */ -upb_status_t upb_parse_wire_value(void *buf, size_t *offset, +upb_status_t upb_parse_wire_value(void *buf, size_t len, size_t *offset, upb_wire_type_t wt, union upb_wire_value *wv); /* Like the above, but discards the wire value instead of saving it. */ -upb_status_t upb_skip_wire_value(void *buf, size_t *offset, +upb_status_t upb_skip_wire_value(void *buf, size_t len, size_t *offset, upb_wire_type_t wt); #ifdef __cplusplus -- cgit v1.2.3