From d1f78c88faafea7e672c7c45e20f6f040942a92a Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Wed, 3 Jun 2009 22:06:24 -0700 Subject: A bunch more work, a fast table for field lookup. --- Makefile | 8 +-- upb.h | 56 ++++++++++++++--- upb_fieldmap.c | 50 --------------- upb_fieldmap.h | 53 ---------------- upb_parse.c | 189 ++++++++++++++------------------------------------------- upb_parse.h | 110 +++++---------------------------- upb_struct.c | 1 + upb_struct.h | 4 +- upb_table.c | 89 +++++++++++++++++++++++++++ upb_table.h | 75 +++++++++++++++++++++++ 10 files changed, 279 insertions(+), 356 deletions(-) delete mode 100644 upb_fieldmap.c delete mode 100644 upb_fieldmap.h create mode 100644 upb_table.c create mode 100644 upb_table.h diff --git a/Makefile b/Makefile index eb2f10d..4d1efd7 100644 --- a/Makefile +++ b/Makefile @@ -1,16 +1,16 @@ .PHONY: all clean CFLAGS=-std=c99 -O3 -Wall -Wextra -pedantic -OBJ=upb_parse.o upb_fieldmap.o upb_struct.o -all: $(OBJ) tests +OBJ=upb_parse.o upb_table.o upb_struct.o +all: $(OBJ) clean: rm -f $(OBJ) tests upb_parse.o: upb_parse.c upb_parse.h gcc $(CFLAGS) -o upb_parse.o -c upb_parse.c -upb_fieldmap.o: upb_fieldmap.c upb_fieldmap.h - gcc $(CFLAGS) -o upb_fieldmap.o -c upb_fieldmap.c +upb_table.o: upb_table.c upb_table.h + gcc $(CFLAGS) -o upb_table.o -c upb_table.c upb_struct.o: upb_struct.c upb_struct.h gcc $(CFLAGS) -o upb_struct.o -c upb_struct.c diff --git a/upb.h b/upb.h index d63e207..445975b 100644 --- a/upb.h +++ b/upb.h @@ -19,29 +19,30 @@ extern "C" { #define UPB_MAX_NESTING 64 /* A list of types as they are encoded on-the-wire. */ -typedef enum upb_wire_type { +enum upb_wire_type { UPB_WIRE_TYPE_VARINT = 0, UPB_WIRE_TYPE_64BIT = 1, UPB_WIRE_TYPE_DELIMITED = 2, UPB_WIRE_TYPE_START_GROUP = 3, UPB_WIRE_TYPE_END_GROUP = 4, UPB_WIRE_TYPE_32BIT = 5, -} upb_wire_type_t; - -struct upb_delimited { - size_t offset; /* relative to the beginning of the stream. */ - uint32_t len; }; +typedef int8_t upb_wire_type_t; -/* A value as it is encoded on-the-wire. */ +/* A value as it is encoded on-the-wire, except delimited, which is handled + * separately. */ union upb_wire_value { uint64_t varint; uint64_t _64bit; uint32_t _32bit; - struct upb_delimited delimited; }; -/* A value as described in a .proto file. */ +/* Value type as defined in a .proto file. The values of this are defined by + * google_protobuf_FieldDescriptorProto_Type (from descriptor.proto). */ +typedef int32_t upb_field_type_t; + +/* A value as described in a .proto file, except delimited, which is handled + * separately. */ union upb_value { double _double; float _float; @@ -50,9 +51,10 @@ union upb_value { uint32_t uint32; uint64_t uint64; bool _bool; - struct upb_delimited delimited; + uint32_t delim_len; }; +/* The number of a field, eg. "optional string foo = 3". */ typedef int32_t upb_field_number_t; /* A tag occurs before each value on-the-wire. */ @@ -61,6 +63,40 @@ struct upb_tag { upb_wire_type_t wire_type; }; +/* Status codes used as a return value. */ +typedef enum upb_status { + UPB_STATUS_OK = 0, + UPB_STATUS_SUBMESSAGE_END = 1, + + /** FATAL ERRORS: these indicate corruption, and cannot be recovered. */ + + // A varint did not terminate before hitting 64 bits. + UPB_ERROR_UNTERMINATED_VARINT = -1, + + // A submessage ended in the middle of data. + UPB_ERROR_BAD_SUBMESSAGE_END = -2, + + // Encountered a "group" on the wire (deprecated and unsupported). + UPB_ERROR_GROUP = -3, + + // Input was nested more than UPB_MAX_NESTING deep. + UPB_ERROR_STACK_OVERFLOW = -4, + + // The input data caused the pb's offset (a size_t) to overflow. + UPB_ERROR_OVERFLOW = -5, + + // Generic error. + UPB_ERROR = -6, + + /** NONFATAL ERRORS: the input was invalid, but we can continue if desired. */ + + // A value was encountered that was not defined in the .proto file. + UPB_ERROR_UNKNOWN_VALUE = 2, + + // A field was encoded with the wrong wire type. + UPB_ERROR_MISMATCHED_TYPE = 3, +} upb_status_t; + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/upb_fieldmap.c b/upb_fieldmap.c deleted file mode 100644 index 015b2e1..0000000 --- a/upb_fieldmap.c +++ /dev/null @@ -1,50 +0,0 @@ -/* - * upb - a minimalist implementation of protocol buffers. - * - * Copyright (c) 2009 Joshua Haberman. See LICENSE for details. - */ - -#include "upb_fieldmap.h" - -#include - -void pbstream_init_fieldmap(struct pbstream_fieldmap *fieldmap, - struct pbstream_field *fields, - int num_fields) -{ - qsort(fields, num_fields, sizeof(*fields), compare_fields); - - /* Find the largest n for which at least half the fieldnums 8 && maybe_n/(i+1) >= 2) break; - n = maybe_n; - } - - fieldmap->num_fields = num_fields; - fieldmap->fields = malloc(sizeof(*fieldmap->fields)*num_fields); - memcpy(fieldmap->fields, fields, sizeof(*fields)*num_fields); - - fieldmap->array_size = n; - fieldmap->array = malloc(sizeof(*fieldmap->array)*n); - memset(fieldmap->array, 0, sizeof(*fieldmap->array)*n); - - for (int i = 0; i < num_fields && fields[i].field_number <= n; i++) - fieldmap->array[fields[i].field_number-1] = &fieldmap->fields[i]; - - /* Until we support the hashtable part... */ - assert(n == fields[num_fields-1].field_number); -} - -void pbstream_free_fieldmap(struct pbstream_fieldmap *fieldmap) -{ - free(fieldmap->fields); - free(fieldmap->array); -} - -/* Emit definition for inline function. */ -extern void *upb_fieldmap_find(struct upb_fieldmap *fm, - pbstream_field_number_t num, - size_t info_size); diff --git a/upb_fieldmap.h b/upb_fieldmap.h deleted file mode 100644 index 0fb5a3e..0000000 --- a/upb_fieldmap.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * upb - a minimalist implementation of protocol buffers. - * - * Copyright (c) 2009 Joshua Haberman. See LICENSE for details. - * - * A fieldmap is a data structure that supports fast lookup of fields by - * number. It is logically a map of {field_number -> }, where - * is any struct that begins with the field number. Fast lookup - * is important, because it is in the critical path of parsing. */ - -#ifndef UPB_FIELDMAP_H_ -#define UPB_FIELDMAP_H_ - -#include "upb.h" - -#ifdef __cplusplus -extern "C" { -#endif - -struct upb_fieldmap { - int array_size; - void *array; - /* TODO: the hashtable part. */ -}; - -/* Takes an array of num_fields fields and builds an optimized table for fast - * lookup of fields by number. The input fields need not be sorted. This - * fieldmap must be freed with upb_free_fieldmap(). */ -void upb_init_fieldmap(struct upb_fieldmap *fieldmap, - void *fields, - int num_fields, - int field_size); -void upb_free_fieldmap(struct upb_fieldmap *fieldmap); - -/* Looks the given field number up in the fieldmap, and returns the - * corresponding field definition (or NULL if this field number does not exist - * in this fieldmap). */ -inline void *upb_fieldmap_find(struct upb_fieldmap *fm, - upb_field_number_t num, - size_t info_size) -{ - if (num < array_size) { - return (char*)fs->array + (num*info_size); - } else { - /* TODO: the hashtable part. */ - } -} - -#ifdef __cplusplus -} /* extern "C" */ -#endif - -#endif /* UPB_PARSE_H_ */ diff --git a/upb_parse.c b/upb_parse.c index c0fc007..458876e 100644 --- a/upb_parse.c +++ b/upb_parse.c @@ -8,6 +8,7 @@ #include #include +#include "descriptor.h" /* Branch prediction hints for GCC. */ #ifdef __GNUC__ @@ -138,14 +139,10 @@ static int64_t zz_decode_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); } static void wvtov_ ## type(wire_t s, val_t *d) #define GET(type, v_or_f, wire_t, val_t, member_name) \ - static upb_status_t get_ ## type(struct upb_parse_state *s, \ - uint8_t *buf, \ - struct upb_tagged_value *d) { \ + static upb_status_t get_ ## type(uint8_t **buf, union upb_value *d) { \ wire_t tmp; \ - uint8_t *b = buf; \ - CHECK(get_ ## v_or_f ## _ ## wire_t(&b, &tmp)); \ - wvtov_ ## type(tmp, &d->v.member_name); \ - s->offset += (b-buf); \ + CHECK(get_ ## v_or_f ## _ ## wire_t(buf, &tmp)); \ + wvtov_ ## type(tmp, &d->member_name); \ return UPB_STATUS_OK; \ } @@ -172,65 +169,25 @@ T(ENUM, v, uint32_t, int32_t, int32) { *d = (int32_t)s; } #undef GET #undef T -static void wvtov_delimited(uint32_t s, struct upb_delimited *d, size_t o) -{ - d->offset = o; - d->len = s; -} - -/* Use BYTES version for both STRING and BYTES, leave UTF-8 checks to client. */ -static upb_status_t get_BYTES(struct upb_parse_state *s, uint8_t *buf, - struct upb_tagged_value *d) { - uint32_t tmp; - uint8_t *b = buf; - CHECK(get_v_uint32_t(&b, &tmp)); - s->offset += (b-buf); /* advance past length varint. */ - wvtov_delimited(tmp, &d->v.delimited, s->offset); - size_t new_offset = s->offset + d->v.delimited.len; /* skip bytes */ - if (unlikely(new_offset < s->offset)) return UPB_ERROR_OVERFLOW; - s->offset = new_offset; - return UPB_STATUS_OK; -} - -static upb_status_t get_MESSAGE(struct upb_parse_state *s, uint8_t *buf, - struct upb_tagged_value *d) { - /* We're entering a sub-message. */ - uint32_t tmp; - uint8_t *b = buf; - CHECK(get_v_uint32_t(&b, &tmp)); - s->offset += (b-buf); /* advance past length varint. */ - wvtov_delimited(tmp, &d->v.delimited, s->offset); - /* Unlike STRING and BYTES, we *don't* advance past delimited here. */ - if (unlikely(++s->top == s->limit)) return UPB_ERROR_STACK_OVERFLOW; - s->top->fieldset = d->field->fieldset; - s->top->end_offset = d->v.delimited.offset + d->v.delimited.len; - if (unlikely(s->top->end_offset < s->offset)) return UPB_ERROR_OVERFLOW; - return UPB_STATUS_OK; -} - -struct upb_type_info { - upb_wire_type_t expected_wire_type; - upb_status_t (*get)(struct upb_parse_state *s, uint8_t *buf, - struct upb_tagged_value *d); -}; -static struct upb_type_info type_info[] = { - {UPB_WIRE_TYPE_64BIT, get_DOUBLE}, - {UPB_WIRE_TYPE_32BIT, get_FLOAT}, - {UPB_WIRE_TYPE_VARINT, get_INT32}, - {UPB_WIRE_TYPE_VARINT, get_INT64}, - {UPB_WIRE_TYPE_VARINT, get_UINT32}, - {UPB_WIRE_TYPE_VARINT, get_UINT64}, - {UPB_WIRE_TYPE_VARINT, get_SINT32}, - {UPB_WIRE_TYPE_VARINT, get_SINT64}, - {UPB_WIRE_TYPE_32BIT, get_FIXED32}, - {UPB_WIRE_TYPE_64BIT, get_FIXED64}, - {UPB_WIRE_TYPE_32BIT, get_SFIXED32}, - {UPB_WIRE_TYPE_64BIT, get_SFIXED64}, - {UPB_WIRE_TYPE_VARINT, get_BOOL}, - {UPB_WIRE_TYPE_DELIMITED, get_BYTES}, - {UPB_WIRE_TYPE_DELIMITED, get_BYTES}, - {UPB_WIRE_TYPE_VARINT, get_ENUM}, - {UPB_WIRE_TYPE_DELIMITED, get_MESSAGE} +upb_wire_type_t upb_expected_wire_types[] = { + [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_DOUBLE] = UPB_WIRE_TYPE_64BIT, + [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_FLOAT] = UPB_WIRE_TYPE_32BIT, + [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_INT64] = UPB_WIRE_TYPE_VARINT, + [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_UINT64] = UPB_WIRE_TYPE_VARINT, + [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_INT32] = UPB_WIRE_TYPE_VARINT, + [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_FIXED64] = UPB_WIRE_TYPE_64BIT, + [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_FIXED32] = UPB_WIRE_TYPE_32BIT, + [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_BOOL] = UPB_WIRE_TYPE_VARINT, + [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_STRING] = UPB_WIRE_TYPE_DELIMITED, + [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_BYTES] = UPB_WIRE_TYPE_DELIMITED, + [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_GROUP] = -1, /* TODO */ + [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE] = UPB_WIRE_TYPE_DELIMITED, + [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_UINT32] = UPB_WIRE_TYPE_VARINT, + [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_ENUM] = UPB_WIRE_TYPE_VARINT, + [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_SFIXED32] = UPB_WIRE_TYPE_32BIT, + [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_SFIXED64] = UPB_WIRE_TYPE_64BIT, + [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_SINT32] = UPB_WIRE_TYPE_VARINT, + [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_SINT64] = UPB_WIRE_TYPE_VARINT, }; upb_status_t parse_tag(uint8_t **buf, struct upb_tag *tag) @@ -249,22 +206,17 @@ upb_status_t parse_wire_value(uint8_t *buf, size_t *offset, #define READ(expr) CHECK(expr); *offset += (b-buf) uint8_t *b = buf; switch(wt) { - case UPB_WIRE_TYPE_VARINT: - READ(get_v_uint64_t(&b, &wv->varint)); break; - case UPB_WIRE_TYPE_64BIT: - READ(get_f_uint64_t(&b, &wv->_64bit)); break; - case UPB_WIRE_TYPE_32BIT: - READ(get_f_uint32_t(&b, &wv->_32bit)); break; + case UPB_WIRE_TYPE_VARINT: READ(get_v_uint64_t(&b, &wv->varint)); break; + case UPB_WIRE_TYPE_64BIT: READ(get_f_uint64_t(&b, &wv->_64bit)); break; + case UPB_WIRE_TYPE_32BIT: READ(get_f_uint32_t(&b, &wv->_32bit)); break; case UPB_WIRE_TYPE_DELIMITED: - wv->delimited.offset = *offset; - READ(get_v_uint32_t(&b, &wv->delimited.len)); - size_t new_offset = *offset + wv->delimited.len; + READ(get_v_uint32_t(&b, &wv->_32bit)); + size_t new_offset = *offset + wv->_32bit; if (new_offset < *offset) return UPB_ERROR_OVERFLOW; *offset += new_offset; break; case UPB_WIRE_TYPE_START_GROUP: - case UPB_WIRE_TYPE_END_GROUP: - return UPB_ERROR_GROUP; /* deprecated, no plans to support. */ + case UPB_WIRE_TYPE_END_GROUP: return UPB_ERROR_GROUP; /* TODO */ } return UPB_STATUS_OK; } @@ -274,12 +226,9 @@ upb_status_t skip_wire_value(uint8_t *buf, size_t *offset, { uint8_t *b = buf; switch(wt) { - case UPB_WIRE_TYPE_VARINT: - READ(skip_v_uint64_t(&b)); break; - case UPB_WIRE_TYPE_64BIT: - READ(skip_f_uint64_t(&b)); break; - case UPB_WIRE_TYPE_32BIT: - READ(skip_f_uint32_t(&b)); break; + case UPB_WIRE_TYPE_VARINT: READ(skip_v_uint64_t(&b)); break; + case UPB_WIRE_TYPE_64BIT: READ(skip_f_uint64_t(&b)); break; + case UPB_WIRE_TYPE_32BIT: READ(skip_f_uint32_t(&b)); break; case UPB_WIRE_TYPE_DELIMITED: { /* Have to get (not skip) the length to skip the bytes. */ uint32_t len; @@ -290,71 +239,27 @@ upb_status_t skip_wire_value(uint8_t *buf, size_t *offset, break; } case UPB_WIRE_TYPE_START_GROUP: - case UPB_WIRE_TYPE_END_GROUP: - return UPB_ERROR_GROUP; /* deprecated, no plans to support. */ + case UPB_WIRE_TYPE_END_GROUP: return UPB_ERROR_GROUP; /* TODO */ } return UPB_STATUS_OK; #undef READ } -/* Parses and processes the next value from buf. */ -upb_status_t upb_parse_field(struct upb_parse_state *s, - uint8_t *buf, - upb_field_number_t *fieldnum, - struct upb_tagged_value *val, - struct upb_tagged_wire_value *wv) +upb_status_t upb_parse_value(uint8_t **b, upb_field_type_t ft, + union upb_value *v) { - /* Check for end-of-message at the current stack depth. */ - if(unlikely(s->offset >= s->top->end_offset)) { - /* If the end offset isn't an exact field boundary, the pb is corrupt. */ - if(unlikely(s->offset != s->top->end_offset)) - return UPB_ERROR_BAD_SUBMESSAGE_END; - s->top--; - return UPB_STATUS_SUBMESSAGE_END; +#define CASE(t) \ + case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_ ## t: return get_ ## t(b, v); + switch(ft) { + CASE(DOUBLE) CASE(FLOAT) CASE(INT64) CASE(UINT64) CASE(INT32) CASE(FIXED64) + CASE(FIXED32) CASE(BOOL) CASE(UINT32) CASE(ENUM) CASE(SFIXED32) + CASE(SFIXED64) CASE(SINT32) CASE(SINT64) + case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_BYTES: + case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_STRING: + case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE: + return get_UINT32(b, v); + default: return UPB_ERROR; /* Including GROUP. */ } - - struct upb_tag tag; - uint8_t *b = buf; - CHECK(parse_tag(&b, &tag)); - s->offset += (b-buf); - struct upb_field *fd = upb_find_field(s->top->fieldset, - tag.field_number); - upb_status_t unknown_value_status; - if(unlikely(!fd)) { - unknown_value_status = UPB_ERROR_UNKNOWN_VALUE; - goto unknown_value; - } - struct upb_type_info *info = &type_info[fd->type]; - if(unlikely(tag.wire_type != info->expected_wire_type)) { - unknown_value_status = UPB_ERROR_MISMATCHED_TYPE; - goto unknown_value; - } - - *fieldnum = tag.field_number; - val->field = fd; - CHECK(info->get(s, b, val)); - return UPB_STATUS_OK; - -unknown_value: - wv->type = tag.wire_type; - CHECK(parse_wire_value(buf, &s->offset, tag.wire_type, &wv->v)); - return unknown_value_status; -} - -void upb_init_parser( - struct upb_parse_state *state, - struct upb_fieldset *toplevel_fieldset) -{ - state->offset = 0; - state->top = state->stack; - state->limit = state->top + UPB_MAX_STACK; - state->top->fieldset = toplevel_fieldset; - state->top->end_offset = SIZE_MAX; -} - -static int compare_fields(const void *f1, const void *f2) -{ - return ((struct upb_field*)f1)->field_number - - ((struct upb_field*)f2)->field_number; +#undef CASE } diff --git a/upb_parse.h b/upb_parse.h index 75752a0..829e97c 100644 --- a/upb_parse.h +++ b/upb_parse.h @@ -18,99 +18,6 @@ extern "C" { #endif -/* A deserialized value as described in a .proto file. */ -struct upb_tagged_value { - struct upb_field *field; - union upb_value v; -}; - -/* A value as it is encoded on-the-wire, before it has been interpreted as - * any particular .proto type. */ -struct upb_tagged_wire_value { - upb_wire_type_t type; - union upb_wire_value v; -}; - -/* Definition of a single field in a message, for the purposes of the parser's - * fieldmap. Note that this does not include nearly all of the information - * that can be specified about a field in a .proto file. For example, we don't - * even know the field's name. We keep only the information necessary to parse - * the field. */ -struct upb_field { - upb_field_number_t field_number; - int32_t type; /* google_protobuf_FieldDescriptorProto_Type */ - struct upb_fieldset *fieldset; /* if type == MESSAGE */ -}; - -struct upb_parse_stack_frame { - struct upb_fieldset *fieldset; - size_t end_offset; /* unknown for the top frame, so we set to SIZE_MAX */ -}; - -/* The stream parser's state. */ -struct upb_parse_state { - size_t offset; - struct upb_parse_stack_frame stack[UPB_MAX_STACK]; - struct upb_parse_stack_frame *top, *limit; -}; - -/* Call this once before parsing to initialize the data structures. - * message_type can be NULL, in which case all fields will be reported as - * unknown. */ -void upb_init_parser(struct upb_parse_state *state, - struct upb_fieldset *toplevel_fieldset); - -/* Status as returned by upb_parse(). Status codes <0 are fatal errors - * that cannot be recovered. Status codes >0 are unusual but nonfatal events, - * which nonetheless must be handled differently since they do not return data - * in val. */ -typedef enum upb_status { - UPB_STATUS_OK = 0, - UPB_STATUS_SUBMESSAGE_END = 1, // No data is stored in val or wv. - - /** FATAL ERRORS: these indicate corruption, and cannot be recovered. */ - - // A varint did not terminate before hitting 64 bits. - UPB_ERROR_UNTERMINATED_VARINT = -1, - - // A submessage ended in the middle of data. - UPB_ERROR_BAD_SUBMESSAGE_END = -2, - - // Encountered a "group" on the wire (deprecated and unsupported). - UPB_ERROR_GROUP = -3, - - // Input was nested more than UPB_MAX_NESTING deep. - UPB_ERROR_STACK_OVERFLOW = -4, - - // The input data caused the pb's offset (a size_t) to overflow. - UPB_ERROR_OVERFLOW = -5, - - /** NONFATAL ERRORS: the input was invalid, but we can continue if desired. */ - - // A value was encountered that was not defined in the .proto file. The - // unknown value is stored in wv. - UPB_ERROR_UNKNOWN_VALUE = 2, - - // A field was encoded with the wrong wire type. The wire value is stored in - // wv. - UPB_ERROR_MISMATCHED_TYPE = 3, -} upb_status_t; -struct upb_parse_state; - -/* The main parsing function. Parses the next value from buf, storing the - * parsed value in val. If val is of type UPB_TYPE_MESSAGE, then a - * submessage was entered. - * - * IMPORTANT NOTE: for efficiency, the parsing routines do not do bounds checks, - * and may read as much as far as buf+10. So the caller must ensure that buf is - * not within 10 bytes of unmapped memory, or the program will segfault. Clients - * are encouraged to overallocate their buffers by ten bytes to compensate. */ -upb_status_t upb_parse_field(struct upb_parse_state *s, - uint8_t *buf, - upb_field_number_t *fieldnum, - struct upb_tagged_value *val, - struct upb_tagged_wire_value *wv); - /* Low-level parsing functions. ***********************************************/ /* Parses a single tag from the character data starting at buf, and updates @@ -118,6 +25,19 @@ upb_status_t upb_parse_field(struct upb_parse_state *s, * by at most ten bytes. */ upb_status_t parse_tag(uint8_t **buf, struct upb_tag *tag); +extern upb_wire_type_t upb_expected_wire_types[]; +/* Returns true if wt is the correct on-the-wire type for ft. */ +inline bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) { + return upb_expected_wire_types[ft] == wt; +} + +/* Parses and converts a value from the character data starting at buf. The + * caller must have previously checked that the wire type is appropriate for + * this field type. For delimited data, buf is advanced to the beginning of + * the delimited data, not the end. */ +upb_status_t upb_parse_value(uint8_t **buf, upb_field_type_t ft, + union upb_value *value); + /* Parses a wire value with the given type (which must have been obtained from * a tag that was just parsed) and adds the number of bytes that were consumed * to *offset. For delimited types, offset is advanced past the delimited @@ -127,8 +47,8 @@ upb_status_t upb_parse_wire_value(uint8_t *buf, size_t *offset, union upb_wire_value *wv); /* Like the above, but discards the wire value instead of saving it. */ -upb_status_t skip_wire_value(uint8_t *buf, size_t *offset, - upb_wire_type_t wt); +upb_status_t upb_skip_wire_value(uint8_t *buf, size_t *offset, + upb_wire_type_t wt); #ifdef __cplusplus } /* extern "C" */ diff --git a/upb_struct.c b/upb_struct.c index b0b244a..3284796 100644 --- a/upb_struct.c +++ b/upb_struct.c @@ -42,3 +42,4 @@ extern bool upb_struct_is_set(uint8_t *s, struct upb_struct_field *f); extern bool upb_struct_all_required_fields_set( uint8_t *s, struct upb_struct_definition *d); extern void upb_struct_clear(uint8_t *s, struct upb_struct_definition *d); + diff --git a/upb_struct.h b/upb_struct.h index a64b3d4..7d5c219 100644 --- a/upb_struct.h +++ b/upb_struct.h @@ -57,8 +57,8 @@ struct upb_struct_definition { }; /* While these are written to be as fast as possible, it will still be faster - * cache the results of this lookup if possible. These return NULL if no such - * field is found. */ + * to cache the results of this lookup if possible. These return NULL if no + * such field is found. */ struct upb_struct_field *upb_struct_find_field_by_name( struct upb_struct_definition *d, char *name); struct upb_struct_field *upb_struct_find_field_by_number( diff --git a/upb_table.c b/upb_table.c new file mode 100644 index 0000000..656da24 --- /dev/null +++ b/upb_table.c @@ -0,0 +1,89 @@ +/* + * upb - a minimalist implementation of protocol buffers. + * + * Copyright (c) 2009 Joshua Haberman. See LICENSE for details. + */ + +#include "upb_table.h" + +#include + +static int compare_entries(const void *f1, const void *f2) +{ + return ((struct upb_inttable_entry*)f1)->key - + ((struct upb_inttable_entry*)f2)->key; +} + +static uint32_t max(uint32_t a, uint32_t b) { return a > b ? a : b; } + +static uint32_t round_up_to_pow2(uint32_t v) +{ + /* cf. Bit Twiddling Hacks: + * http://www-graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */ + v--; + v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16; + v++; + return v; +} + +void upb_inttable_init(struct upb_inttable *table, void *entries, + int num_entries, int entry_size) +{ + qsort(entries, num_entries, entry_size, compare_entries); + + /* Find the largest n for which at least half the keys n might + * possibly have collisions. Start at 8 to avoid noise of small numbers. */ + upb_inttable_key_t n = 0, maybe_n; + bool all_in_array = true; + for(int i = 0; i < num_entries; i++) { + struct upb_inttable_entry *e = + upb_inttable_entry_get(entries, i, entry_size); + maybe_n = e->key; + if(maybe_n > 8 && maybe_n/(i+1) >= 2) { + all_in_array = false; + break; + } + n = maybe_n; + } + + /* TODO: measure, tweak, optimize this choice of table size. Possibly test + * (at runtime) maximum chain length for each proposed size. */ + uint32_t min_size_by_load = all_in_array ? n : (double)num_entries / 0.85; + uint32_t min_size = max(n, min_size_by_load); + table->size = round_up_to_pow2(min_size); + + table->entries = malloc(table->size * entry_size); + /* Initialize to empty. */ + for(size_t i = 0; i < table->size; i++) { + struct upb_inttable_entry *e = upb_inttable_get(table, i, entry_size); + e->key = UPB_END_OF_CHAIN; + e->next = UPB_END_OF_CHAIN; + } + + /* Insert the elements. */ + for(int i = 0; i < num_entries; i++) { + struct upb_inttable_entry *e = + upb_inttable_entry_get(entries, i, entry_size); + int32_t hash = upb_inttable_hash(table, e->key); + struct upb_inttable_entry *table_e = + upb_inttable_get(table, hash, entry_size); + if(table_e->key != UPB_END_OF_CHAIN) { /* Collision. */ + if(hash == upb_inttable_hash(table, table_e->key)) { + /* Existing element is in its main posisiton. Find an empty slot to + * place our new element. */ + } else { + /* Existing element is not in its main position. Move it to an empty + * slot and put our element in its main position. */ + } + } + } +} + +void upb_inttable_free(struct upb_inttable *table) +{ + free(table->entries); +} + +/* Emit definition for inline functions. */ diff --git a/upb_table.h b/upb_table.h new file mode 100644 index 0000000..684dbff --- /dev/null +++ b/upb_table.h @@ -0,0 +1,75 @@ +/* + * upb - a minimalist implementation of protocol buffers. + * + * Copyright (c) 2009 Joshua Haberman. See LICENSE for details. + */ + +#ifndef UPB_TABLE_H_ +#define UPB_TABLE_H_ + +#include "upb.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef int32_t upb_inttable_key_t; + +#define UPB_END_OF_CHAIN (upb_inttable_key_t)-1 + +struct upb_inttable_entry { + upb_inttable_key_t key; + int32_t next; +}; + +struct upb_inttable { + uint32_t size; /* Is a power of two. */ + void *entries; +}; + +/* Builds an int32_t -> table, optimized for very fast lookup by + * number. table is a pointer to a previously allocated upb_inttable. + * entries points to an array of the desired entries themselves, each of size + * entry_size. The table is allocated in dynamic memory, and does not reference + * the data in entries. Entries may be modified by the function. + * + * The table must be freed with upb_inttable_free. */ +void upb_inttable_init(struct upb_inttable *table, void *entries, + int num_entries, int entry_size); + +inline int32_t upb_inttable_hash(struct upb_inttable * table, + upb_inttable_key_t key) { + return key & (table->size-1); +} + +/* Frees any data that was allocated by upb_inttable_init. */ +void upb_inttable_free(struct upb_inttable *table); + +inline struct upb_inttable_entry *upb_inttable_entry_get( + void *entries, int32_t pos, int entry_size) { + return (struct upb_inttable_entry*)((char*)entries) + pos*entry_size; +} + +inline struct upb_inttable_entry *upb_inttable_get( + struct upb_inttable *table, int32_t pos, int entry_size) { + return upb_inttable_entry_get(table->entries, pos, entry_size); +} + +/* Lookups up key in this table. Inlined because this is in the critical path + * of parsing. */ +inline void *upb_inttable_lookup(struct upb_inttable *table, int32_t key, + int entry_size) { + int32_t pos = upb_inttable_hash(table, key); + do { + struct upb_inttable_entry *e = upb_inttable_get(table, pos, entry_size); + if (key == e->key) return e; + pos = e->next; + } while (pos != UPB_END_OF_CHAIN); + return NULL; /* Not found. */ +} + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* UPB_TABLE_H_ */ -- cgit v1.2.3