From d1aa095cb30da50c1dac4263233b45c3a9ab726e Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Wed, 17 Jun 2009 13:08:06 -0700 Subject: High-level parsing interface written (not yet tested). --- test_table.cc | 1 - upb.h | 32 +++++--------- upb_parse.c | 133 +++++++++++++++++++++++++++++++++++++++++++++------------- upb_parse.h | 75 +++++++++++++++++++++++++++++++-- upb_struct.h | 40 +++++++++--------- 5 files changed, 206 insertions(+), 75 deletions(-) diff --git a/test_table.cc b/test_table.cc index 5cbeb41..67e0314 100644 --- a/test_table.cc +++ b/test_table.cc @@ -189,4 +189,3 @@ int main() test(keys4, 64); delete[] keys4; } - diff --git a/upb.h b/upb.h index ae1845a..36abcaa 100644 --- a/upb.h +++ b/upb.h @@ -37,7 +37,7 @@ enum upb_wire_type { UPB_WIRE_TYPE_END_GROUP = 4, UPB_WIRE_TYPE_32BIT = 5 }; -typedef int8_t upb_wire_type_t; +typedef uint8_t upb_wire_type_t; /* A value as it is encoded on-the-wire, except delimited, which is handled * separately. */ @@ -48,8 +48,10 @@ union upb_wire_value { }; /* Value type as defined in a .proto file. The values of this are defined by - * google_protobuf_FieldDescriptorProto_Type (from descriptor.proto). */ -typedef int32_t upb_field_type_t; + * google_protobuf_FieldDescriptorProto_Type (from descriptor.proto). + * Note that descriptor.proto reserves "0" for errors, and we use it to + * represent exceptional circumstances. */ +typedef uint8_t upb_field_type_t; /* A value as described in a .proto file, except delimited, which is handled * separately. */ @@ -76,35 +78,21 @@ struct upb_tag { /* Status codes used as a return value. */ typedef enum upb_status { UPB_STATUS_OK = 0, - UPB_STATUS_SUBMESSAGE_END = 1, - - /** FATAL ERRORS: these indicate corruption, and cannot be recovered. */ // A varint did not terminate before hitting 64 bits. UPB_ERROR_UNTERMINATED_VARINT = -1, - // A submessage ended in the middle of data. + // A submessage or packed array ended in the middle of data. 
UPB_ERROR_BAD_SUBMESSAGE_END = -2, - // Encountered a "group" on the wire (deprecated and unsupported). - UPB_ERROR_GROUP = -3, - // Input was nested more than UPB_MAX_NESTING deep. - UPB_ERROR_STACK_OVERFLOW = -4, + UPB_ERROR_STACK_OVERFLOW = -3, // The input data caused the pb's offset (a size_t) to overflow. - UPB_ERROR_OVERFLOW = -5, - - // Generic error. - UPB_ERROR = -6, - - /** NONFATAL ERRORS: the input was invalid, but we can continue if desired. */ - - // A value was encountered that was not defined in the .proto file. - UPB_ERROR_UNKNOWN_VALUE = 2, + UPB_ERROR_OVERFLOW = -4, - // A field was encoded with the wrong wire type. - UPB_ERROR_MISMATCHED_TYPE = 3 + // An "end group" tag was encountered in an inappropriate place. + UPB_ERROR_SPURIOUS_END_GROUP = -5 } upb_status_t; #ifdef __cplusplus diff --git a/upb_parse.c b/upb_parse.c index 2262417..288fa2e 100644 --- a/upb_parse.c +++ b/upb_parse.c @@ -29,7 +29,7 @@ * To avoid branches, none of these do bounds checking. So we force clients * to overallocate their buffers by >=9 bytes. */ -static upb_status_t get_v_uint64_t(uint8_t *restrict *buf, +static upb_status_t get_v_uint64_t(void *restrict *buf, uint64_t *restrict val) { uint8_t *ptr = *buf, b; @@ -59,7 +59,7 @@ done: #if 0 /* The no-branching version. */ -static upb_status_t get_v_uint64_t(uint8_t *restrict *buf, +static upb_status_t get_v_uint64_t(void *restrict *buf, uint64_t *restrict val) { uint8_t *b = *buf; @@ -95,7 +95,7 @@ static upb_status_t get_v_uint64_t(uint8_t *restrict *buf, } /* The single-branch version. */ -static upb_status_t get_v_uint64_t(uint8_t *restrict *buf, +static upb_status_t get_v_uint64_t(void *restrict *buf, uint64_t *restrict val) { /* Endian-specific! 
*/ @@ -127,7 +127,7 @@ static upb_status_t get_v_uint64_t(uint8_t *restrict *buf, } #endif -static upb_status_t skip_v_uint64_t(uint8_t **buf) +static upb_status_t skip_v_uint64_t(void **buf) { uint8_t *ptr = *buf, b; b = *(ptr++); if (!(b & 0x80)) goto done; @@ -147,7 +147,7 @@ done: return UPB_STATUS_OK; } -static upb_status_t get_v_uint32_t(uint8_t *restrict *buf, +static upb_status_t get_v_uint32_t(void *restrict *buf, uint32_t *restrict val) { uint8_t *ptr = *buf, b; @@ -167,24 +167,25 @@ done: return UPB_STATUS_OK; } -static upb_status_t get_f_uint32_t(uint8_t *restrict *buf, +static upb_status_t get_f_uint32_t(void *restrict *buf, uint32_t *restrict val) { uint8_t *b = *buf; #define SHL(val, bits) ((uint32_t)val << bits) *val = SHL(b[0], 0) | SHL(b[1], 8) | SHL(b[2], 16) | SHL(b[3], 24); #undef SHL - *buf += sizeof(uint32_t); + b += sizeof(uint32_t); + *buf = b; return UPB_STATUS_OK; } -static upb_status_t skip_f_uint32_t(uint8_t **buf) +static upb_status_t skip_f_uint32_t(void **buf) { - *buf += sizeof(uint32_t); + *buf = (char*)*buf + sizeof(uint32_t); return UPB_STATUS_OK; } -static upb_status_t get_f_uint64_t(uint8_t *restrict *buf, +static upb_status_t get_f_uint64_t(void *restrict *buf, uint64_t *restrict val) { uint8_t *b = *buf; @@ -193,13 +194,14 @@ static upb_status_t get_f_uint64_t(uint8_t *restrict *buf, *val = SHL(b[0], 0) | SHL(b[1], 8) | SHL(b[2], 16) | SHL(b[3], 24) | SHL(b[4], 32) | SHL(b[5], 40) | SHL(b[6], 48) | SHL(b[7], 56); #undef SHL - *buf += sizeof(uint64_t); + b += sizeof(uint64_t); + *buf = b; return UPB_STATUS_OK; } -static upb_status_t skip_f_uint64_t(uint8_t **buf) +static upb_status_t skip_f_uint64_t(void **buf) { - *buf += sizeof(uint64_t); + *buf = (char*)*buf + sizeof(uint64_t); return UPB_STATUS_OK; } @@ -213,7 +215,7 @@ static int64_t zz_decode_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); } static void wvtov_ ## type(wire_t s, val_t *d) #define GET(type, v_or_f, wire_t, val_t, member_name) \ - static upb_status_t 
get_ ## type(uint8_t **buf, union upb_value *d) { \ + static upb_status_t get_ ## type(void **buf, union upb_value *d) { \ wire_t tmp; \ CHECK(get_ ## v_or_f ## _ ## wire_t(buf, &tmp)); \ wvtov_ ## type(tmp, &d->member_name); \ @@ -264,7 +266,7 @@ upb_wire_type_t upb_expected_wire_types[] = { [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_SINT64] = UPB_WIRE_TYPE_VARINT, }; -upb_status_t parse_tag(uint8_t **buf, struct upb_tag *tag) +upb_status_t upb_parse_tag(void **buf, struct upb_tag *tag) { uint32_t tag_int; CHECK(get_v_uint32_t(buf, &tag_int)); @@ -273,12 +275,12 @@ upb_status_t parse_tag(uint8_t **buf, struct upb_tag *tag) return UPB_STATUS_OK; } -upb_status_t parse_wire_value(uint8_t *buf, size_t *offset, - upb_wire_type_t wt, - union upb_wire_value *wv) +upb_status_t upb_parse_wire_value(void *buf, size_t *offset, + upb_wire_type_t wt, + union upb_wire_value *wv) { -#define READ(expr) CHECK(expr); *offset += (b-buf) - uint8_t *b = buf; +#define READ(expr) CHECK(expr); *offset += ((char*)b-(char*)buf) + void *b = buf; switch(wt) { case UPB_WIRE_TYPE_VARINT: READ(get_v_uint64_t(&b, &wv->varint)); break; case UPB_WIRE_TYPE_64BIT: READ(get_f_uint64_t(&b, &wv->_64bit)); break; @@ -290,15 +292,15 @@ upb_status_t parse_wire_value(uint8_t *buf, size_t *offset, *offset += new_offset; break; case UPB_WIRE_TYPE_START_GROUP: - case UPB_WIRE_TYPE_END_GROUP: return UPB_ERROR_GROUP; /* TODO */ + case UPB_WIRE_TYPE_END_GROUP: break; } return UPB_STATUS_OK; } -upb_status_t skip_wire_value(uint8_t *buf, size_t *offset, - upb_wire_type_t wt) +upb_status_t upb_skip_wire_value(void *buf, size_t *offset, + upb_wire_type_t wt) { - uint8_t *b = buf; + void *b = buf; switch(wt) { case UPB_WIRE_TYPE_VARINT: READ(skip_v_uint64_t(&b)); break; case UPB_WIRE_TYPE_64BIT: READ(skip_f_uint64_t(&b)); break; @@ -312,14 +314,14 @@ upb_status_t skip_wire_value(uint8_t *buf, size_t *offset, *offset += new_offset; break; } - case UPB_WIRE_TYPE_START_GROUP: - case UPB_WIRE_TYPE_END_GROUP: return 
UPB_ERROR_GROUP; /* TODO */ + case UPB_WIRE_TYPE_START_GROUP: /* TODO: skip to matching end group. */ + case UPB_WIRE_TYPE_END_GROUP: break; } return UPB_STATUS_OK; #undef READ } -upb_status_t upb_parse_value(uint8_t **b, upb_field_type_t ft, +upb_status_t upb_parse_value(void **b, upb_field_type_t ft, union upb_value *v) { #define CASE(t) \ @@ -332,7 +334,82 @@ upb_status_t upb_parse_value(uint8_t **b, upb_field_type_t ft, case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_STRING: case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE: return get_UINT32(b, v); - default: return UPB_ERROR; /* Including GROUP. */ + default: return 0; /* Including GROUP -- groups have no value. */ } #undef CASE } + +static void pop_stack_frame(struct upb_parse_state *s) +{ + s->submsg_end_cb(s); + s->top--; + s->top = (struct upb_parse_stack_frame*)((char*)s->top - s->udata_size); +} + +static upb_status_t push_stack_frame(struct upb_parse_state *s, size_t end, + void *user_field_desc) +{ + s->top++; + s->top = (struct upb_parse_stack_frame*)((char*)s->top + s->udata_size); + if(unlikely(s->top > s->limit)) return UPB_ERROR_STACK_OVERFLOW; + s->top->end_offset = end; + s->submsg_start_cb(s, user_field_desc); + return UPB_STATUS_OK; +} + +upb_status_t upb_parse(struct upb_parse_state *s, void *buf, size_t len, + size_t *read) +{ + size_t start_offset = s->offset; + size_t end_offset = start_offset + len; + while(!s->done && s->offset < end_offset) { + while(s->offset >= s->top->end_offset) pop_stack_frame(s); + while(s->packed_end_offset > s->offset) { + /* Parse a packed field entry. 
*/ + } + + struct upb_tag tag; + void *b = buf; + CHECK(upb_parse_tag(&b, &tag)); + int tag_bytes = ((char*)b - (char*)buf); + s->offset += tag_bytes; + buf = b; + if(unlikely(tag.wire_type == UPB_WIRE_TYPE_END_GROUP)) { + if(unlikely(s->top->end_offset != 0)) return UPB_ERROR_SPURIOUS_END_GROUP; + pop_stack_frame(s); + continue; + } + + void *user_field_desc; + upb_field_type_t ft = s->tag_cb(s, &tag, &user_field_desc); + if(ft == 0) { + CHECK(upb_skip_wire_value(b, &s->offset, tag.wire_type)); + } else if(ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_GROUP) { + /* No length specified, an "end group" tag will mark the end. */ + push_stack_frame(s, 0, user_field_desc); + } else { + /* For all other cases we parse the next value. */ + union upb_value v; + CHECK(upb_parse_value(&b, ft, &v)); + if(ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE) { + /* The value we parsed is the length of the submessage. */ + push_stack_frame(s, s->offset + v.delim_len, user_field_desc); + } else if(ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_STRING || + ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_BYTES) { + s->value_cb(s, &v, b, user_field_desc); + b = (char*)b + v.delim_len; + } else if(tag.wire_type == UPB_WIRE_TYPE_DELIMITED) { + /* Delimited data which is not a string, bytes, or a submessage. + * It must be a packed array. */ + s->packed_type = ft; + s->packed_end_offset = s->offset + v.delim_len; + } else { + /* The common case: a simple value. */ + CHECK(upb_parse_value(&b, ft, &v)); + s->value_cb(s, &v, b, user_field_desc); + } + } + } + *read = s->offset - start_offset; + return UPB_STATUS_OK; +} diff --git a/upb_parse.h b/upb_parse.h index 068d522..7e53cee 100644 --- a/upb_parse.h +++ b/upb_parse.h @@ -18,12 +18,79 @@ extern "C" { #endif +/* High-level parsing interface. **********************************************/ + +struct upb_parse_state; + +/* Initialize and free (respectively) the given parse state, which must have + * been previously allocated. 
udata_size specifies how much space will be + * available at parse_stack_frame.user_data in each frame for user data. */ +void upb_parse_state_init(struct upb_parse_state *state, size_t udata_size); +void upb_parse_state_free(struct upb_parse_state *state); + +/* The callback that is called immediately after a tag has been parsed. The + * client uses it to decide if it wants to process this value or skip it. If + * it wants to process it, it must determine its specific .proto type (at this + * point we only know its wire type) and verify that it matches the wire type. + * The client will then return the .proto type. To skip the value, the client + * should return 0 (which is not a valid .proto type). + * + * The client can set user_field_desc to a record describing this field -- this + * pointer will be supplied to the value callback (for simple values) or the + * submsg_start callback (for submessages). */ +typedef upb_field_type_t (*upb_tag_cb)(struct upb_parse_state *s, + struct upb_tag *tag, + void **user_field_desc); + +/* The callback that is called for individual values. This callback is only + * called when the previously invoked tag_cb has returned nonzero. It receives + * the parsed and converted value as well as the user_field_desc that was set + * by the tag_cb. Note that this function can be called several times in a row + * (ie. with no intervening tag_cb) in the case of packed arrays. For string + * data (bytes and string) str points to the beginning of the string. */ +typedef void (*upb_value_cb)(struct upb_parse_state *s, union upb_value *v, + void *str, void *user_field_desc); + +/* Callbacks that are called when a submessage begins and ends, respectively. + * Both are called with the submessage's stack frame at the top of the stack. 
*/ +typedef void (*upb_submsg_start_cb)(struct upb_parse_state *s, + void *user_field_desc); +typedef void (*upb_submsg_end_cb)(struct upb_parse_state *s); + +/* Each stack frame (one for each level of submessages/groups) has this format, + * where user_data has as many bytes allocated as specified when initialized. */ +struct upb_parse_stack_frame { + size_t end_offset; /* 0 indicates that this is a group. */ + char user_data[]; +}; + +struct upb_parse_state { + size_t offset; + struct upb_parse_stack_frame *stack, *top, *limit; + size_t udata_size; /* How many bytes the user gets in each frame. */ + bool done; /* Any callback can abort processing by setting done=true. */ + /* These are only set if we're in the middle of a packed array. */ + size_t packed_end_offset; /* 0 if not in a packed array. */ + upb_field_type_t packed_type; + upb_tag_cb tag_cb; + upb_value_cb value_cb; + upb_submsg_start_cb submsg_start_cb; + upb_submsg_end_cb submsg_end_cb; +}; + +/* Parses up to len bytes of protobuf data out of buf, calling cb as needed. + * The function returns a upb_status_t status code. Data is parsed + * until no more data can be read from buf, or a callback sets s->done=true, + * or an error occurred. Sets *read to the number of bytes consumed. */ +upb_status_t upb_parse(struct upb_parse_state *s, void *buf, size_t len, + size_t *read); + /* Low-level parsing functions. ***********************************************/ /* Parses a single tag from the character data starting at buf, and updates * buf to point one past the bytes that were consumed. buf will be incremented * by at most ten bytes. */ -upb_status_t parse_tag(uint8_t **buf, struct upb_tag *tag); +upb_status_t upb_parse_tag(void **buf, struct upb_tag *tag); extern upb_wire_type_t upb_expected_wire_types[]; /* Returns true if wt is the correct on-the-wire type for ft. 
*/ @@ -35,19 +102,19 @@ INLINE bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) { * caller must have previously checked that the wire type is appropriate for * this field type. For delimited data, buf is advanced to the beginning of * the delimited data, not the end. */ -upb_status_t upb_parse_value(uint8_t **buf, upb_field_type_t ft, +upb_status_t upb_parse_value(void **buf, upb_field_type_t ft, union upb_value *value); /* Parses a wire value with the given type (which must have been obtained from * a tag that was just parsed) and adds the number of bytes that were consumed * to *offset. For delimited types, offset is advanced past the delimited * data. */ -upb_status_t upb_parse_wire_value(uint8_t *buf, size_t *offset, +upb_status_t upb_parse_wire_value(void *buf, size_t *offset, upb_wire_type_t wt, union upb_wire_value *wv); /* Like the above, but discards the wire value instead of saving it. */ -upb_status_t upb_skip_wire_value(uint8_t *buf, size_t *offset, +upb_status_t upb_skip_wire_value(void *buf, size_t *offset, upb_wire_type_t wt); #ifdef __cplusplus diff --git a/upb_struct.h b/upb_struct.h index c43fdca..d46b007 100644 --- a/upb_struct.h +++ b/upb_struct.h @@ -70,14 +70,14 @@ struct upb_struct_field *upb_struct_find_field_by_number( /* Represents a string or bytes. */ struct upb_string { size_t byte_len; - uint8_t *data; + void *data; }; /* Represents an array (a repeated field) of any type. The interpretation of * the data in the array depends on the type. */ struct upb_array { size_t len; /* Measured in elements. */ - uint8_t *data; /* Size of individual elements is based on type. */ + void *data; /* Size of individual elements is based on type. */ }; /* A generic array of structs, using void* instead of specific types. */ @@ -121,9 +121,9 @@ UPB_DEFINE_PRIMITIVE_ARRAY(bool, bool) /* For each primitive type we define a set of six functions: * * // For fetching out of a struct (s points to the raw struct data). 
- * int32_t *upb_struct_get_int32_ptr(uint8_t *s, struct upb_struct_field *f); - * int32_t upb_struct_get_int32(uint8_t *s, struct upb_struct_field *f); - * void upb_struct_set_int32(uint8_t *s, struct upb_struct_field *f, int32_t val); + * int32_t *upb_struct_get_int32_ptr(void *s, struct upb_struct_field *f); + * int32_t upb_struct_get_int32(void *s, struct upb_struct_field *f); + * void upb_struct_set_int32(void *s, struct upb_struct_field *f, int32_t val); * * // For fetching out of an array. * int32_t *upb_array_get_int32_ptr(struct upb_array *a, int n); @@ -137,15 +137,15 @@ UPB_DEFINE_PRIMITIVE_ARRAY(bool, bool) #define UPB_DEFINE_ACCESSORS(ctype, name, INLINE) \ INLINE ctype *upb_struct_get_ ## name ## _ptr( \ - uint8_t *s, struct upb_struct_field *f) { \ - return (ctype*)(s + f->byte_offset); \ + void *s, struct upb_struct_field *f) { \ + return (ctype*)((char*)s + f->byte_offset); \ } \ INLINE ctype upb_struct_get_ ## name( \ - uint8_t *s, struct upb_struct_field *f) { \ + void *s, struct upb_struct_field *f) { \ return *upb_struct_get_ ## name ## _ptr(s, f); \ } \ INLINE void upb_struct_set_ ## name( \ - uint8_t *s, struct upb_struct_field *f, ctype val) { \ + void *s, struct upb_struct_field *f, ctype val) { \ *upb_struct_get_ ## name ## _ptr(s, f) = val; \ } @@ -173,42 +173,42 @@ UPB_DEFINE_ALL_ACCESSORS(uint64_t, uint64, INLINE) UPB_DEFINE_ALL_ACCESSORS(bool, bool, INLINE) UPB_DEFINE_ALL_ACCESSORS(struct upb_struct_delimited*, bytes, INLINE) UPB_DEFINE_ALL_ACCESSORS(struct upb_struct_delimited*, string, INLINE) -UPB_DEFINE_ALL_ACCESSORS(uint8_t*, substruct, INLINE) +UPB_DEFINE_ALL_ACCESSORS(void*, substruct, INLINE) UPB_DEFINE_ACCESSORS(struct upb_array*, array, INLINE) /* Functions for reading and writing the "set" flags in the pbstruct. Note * that these do not perform any memory management associated with any dynamic * memory these fields may be referencing; that is the client's responsibility. * These *only* set and test the flags. 
*/ -INLINE void upb_struct_set(uint8_t *s, struct upb_struct_field *f) +INLINE void upb_struct_set(void *s, struct upb_struct_field *f) { - s[f->isset_byte_offset] |= f->isset_byte_mask; + ((char*)s)[f->isset_byte_offset] |= f->isset_byte_mask; } -INLINE void upb_struct_unset(uint8_t *s, struct upb_struct_field *f) +INLINE void upb_struct_unset(void *s, struct upb_struct_field *f) { - s[f->isset_byte_offset] &= ~f->isset_byte_mask; + ((char*)s)[f->isset_byte_offset] &= ~f->isset_byte_mask; } -INLINE bool upb_struct_is_set(uint8_t *s, struct upb_struct_field *f) +INLINE bool upb_struct_is_set(void *s, struct upb_struct_field *f) { - return s[f->isset_byte_offset] & f->isset_byte_mask; + return ((char*)s)[f->isset_byte_offset] & f->isset_byte_mask; } INLINE bool upb_struct_all_required_fields_set( - uint8_t *s, struct upb_struct_definition *d) + void *s, struct upb_struct_definition *d) { int num_fields = d->num_required_fields; int i = 0; while(num_fields > 8) { - if(s[i++] != 0xFF) return false; + if(((uint8_t*)s)[i++] != 0xFF) return false; num_fields -= 8; } - if(s[i] != (1 << num_fields) - 1) return false; + if(((uint8_t*)s)[i] != (1 << num_fields) - 1) return false; return true; } -INLINE void upb_struct_clear(uint8_t *s, struct upb_struct_definition *d) +INLINE void upb_struct_clear(void *s, struct upb_struct_definition *d) { memset(s, 0, d->set_flags_bytes); } -- cgit v1.2.3