A bunch more work, a fast table for field lookup.

author: Joshua Haberman <joshua@reverberate.org> 2009-06-03 22:06:24 -0700
committer: Joshua Haberman <joshua@reverberate.org> 2009-06-03 22:06:24 -0700
commit: d1f78c88faafea7e672c7c45e20f6f040942a92a (patch)
tree: 67d6f959815b464a241d5c6b7c7df1a6755e4f55
parent: 0c80c384756a48d5f731eeafa62b6cd5f3861749 (diff)
10 files changed, 279 insertions, 356 deletions
diff --git a/Makefile b/Makefile
index eb2f10d..4d1efd7 100644
--- a/Makefile
+++ b/Makefile
@@ -1,16 +1,16 @@
 
 .PHONY: all clean
 CFLAGS=-std=c99 -O3 -Wall -Wextra -pedantic
-OBJ=upb_parse.o upb_fieldmap.o upb_struct.o
-all: $(OBJ) tests
+OBJ=upb_parse.o upb_table.o upb_struct.o
+all: $(OBJ)
 clean:
 	rm -f $(OBJ) tests
 
 upb_parse.o: upb_parse.c upb_parse.h
 	gcc $(CFLAGS) -o upb_parse.o -c upb_parse.c
 
-upb_fieldmap.o: upb_fieldmap.c upb_fieldmap.h
-	gcc $(CFLAGS) -o upb_fieldmap.o -c upb_fieldmap.c
+upb_table.o: upb_table.c upb_table.h
+	gcc $(CFLAGS) -o upb_table.o -c upb_table.c
 
 upb_struct.o: upb_struct.c upb_struct.h
 	gcc $(CFLAGS) -o upb_struct.o -c upb_struct.c
diff --git a/upb.h b/upb.h
index d63e207..445975b 100644
--- a/upb.h
+++ b/upb.h
@@ -19,29 +19,30 @@ extern "C" {
 #define UPB_MAX_NESTING 64
 
 /* A list of types as they are encoded on-the-wire. */
-typedef enum upb_wire_type {
+enum upb_wire_type {
   UPB_WIRE_TYPE_VARINT      = 0,
   UPB_WIRE_TYPE_64BIT       = 1,
   UPB_WIRE_TYPE_DELIMITED   = 2,
   UPB_WIRE_TYPE_START_GROUP = 3,
   UPB_WIRE_TYPE_END_GROUP   = 4,
   UPB_WIRE_TYPE_32BIT       = 5,
-} upb_wire_type_t;
-
-struct upb_delimited {
-  size_t offset;  /* relative to the beginning of the stream. */
-  uint32_t len;
 };
+typedef int8_t upb_wire_type_t;
 
-/* A value as it is encoded on-the-wire. */
+/* A value as it is encoded on-the-wire, except delimited, which is handled
+ * separately. */
 union upb_wire_value {
   uint64_t varint;
   uint64_t _64bit;
   uint32_t _32bit;
-  struct upb_delimited delimited;
 };
 
-/* A value as described in a .proto file. */
+/* Value type as defined in a .proto file.  The values of this are defined by
+ * google_protobuf_FieldDescriptorProto_Type (from descriptor.proto).  */
+typedef int32_t upb_field_type_t;
+
+/* A value as described in a .proto file, except delimited, which is handled
+ * separately. */
 union upb_value {
   double _double;
   float  _float;
@@ -50,9 +51,10 @@ union upb_value {
   uint32_t uint32;
   uint64_t uint64;
   bool _bool;
-  struct upb_delimited delimited;
+  uint32_t delim_len;
 };
 
+/* The number of a field, eg. "optional string foo = 3". */
 typedef int32_t upb_field_number_t;
 
 /* A tag occurs before each value on-the-wire. */
@@ -61,6 +63,40 @@ struct upb_tag {
   upb_wire_type_t wire_type;
 };
 
+/* Status codes used as a return value. */
+typedef enum upb_status {
+  UPB_STATUS_OK = 0,
+  UPB_STATUS_SUBMESSAGE_END = 1,
+
+  /** FATAL ERRORS: these indicate corruption, and cannot be recovered. */
+
+  // A varint did not terminate before hitting 64 bits.
+  UPB_ERROR_UNTERMINATED_VARINT = -1,
+
+  // A submessage ended in the middle of data.
+  UPB_ERROR_BAD_SUBMESSAGE_END = -2,
+
+  // Encountered a "group" on the wire (deprecated and unsupported).
+  UPB_ERROR_GROUP = -3,
+
+  // Input was nested more than UPB_MAX_NESTING deep.
+  UPB_ERROR_STACK_OVERFLOW = -4,
+
+  // The input data caused the pb's offset (a size_t) to overflow.
+  UPB_ERROR_OVERFLOW = -5,
+
+  // Generic error.
+  UPB_ERROR = -6,
+
+  /** NONFATAL ERRORS: the input was invalid, but we can continue if desired. */
+
+  // A value was encountered that was not defined in the .proto file.
+  UPB_ERROR_UNKNOWN_VALUE = 2,
+
+  // A field was encoded with the wrong wire type.
+  UPB_ERROR_MISMATCHED_TYPE = 3,
+} upb_status_t;
+
 #ifdef __cplusplus
 }  /* extern "C" */
 #endif
diff --git a/upb_fieldmap.c b/upb_fieldmap.c
deleted file mode 100644
index 015b2e1..0000000
--- a/upb_fieldmap.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * upb - a minimalist implementation of protocol buffers.
- *
- * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
- */
-
-#include "upb_fieldmap.h"
-
-#include <stdlib.h>
-
-void pbstream_init_fieldmap(struct pbstream_fieldmap *fieldmap,
-                            struct pbstream_field *fields,
-                            int num_fields)
-{
-  qsort(fields, num_fields, sizeof(*fields), compare_fields);
-
-  /* Find the largest n for which at least half the fieldnums <n are used.
-   * Start at 8 to avoid noise of small numbers. */
-  pbstream_field_number_t n = 0, maybe_n;
-  for(int i = 0; i < num_fields; i++) {
-    maybe_n = fields[i].field_number;
-    if(maybe_n > 8 && maybe_n/(i+1) >= 2) break;
-    n = maybe_n;
-  }
-
-  fieldmap->num_fields = num_fields;
-  fieldmap->fields = malloc(sizeof(*fieldmap->fields)*num_fields);
-  memcpy(fieldmap->fields, fields, sizeof(*fields)*num_fields);
-
-  fieldmap->array_size = n;
-  fieldmap->array = malloc(sizeof(*fieldmap->array)*n);
-  memset(fieldmap->array, 0, sizeof(*fieldmap->array)*n);
-
-  for (int i = 0; i < num_fields && fields[i].field_number <= n; i++)
-    fieldmap->array[fields[i].field_number-1] = &fieldmap->fields[i];
-
-  /* Until we support the hashtable part... */
-  assert(n == fields[num_fields-1].field_number);
-}
-
-void pbstream_free_fieldmap(struct pbstream_fieldmap *fieldmap)
-{
-  free(fieldmap->fields);
-  free(fieldmap->array);
-}
-
-/* Emit definition for inline function. */
-extern void *upb_fieldmap_find(struct upb_fieldmap *fm,
-                               pbstream_field_number_t num,
-                               size_t info_size);
diff --git a/upb_fieldmap.h b/upb_fieldmap.h
deleted file mode 100644
index 0fb5a3e..0000000
--- a/upb_fieldmap.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * upb - a minimalist implementation of protocol buffers.
- *
- * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
- *
- * A fieldmap is a data structure that supports fast lookup of fields by
- * number.  It is logically a map of {field_number -> <field info>}, where
- * <field info> is any struct that begins with the field number.  Fast lookup
- * is important, because it is in the critical path of parsing. */
-
-#ifndef UPB_FIELDMAP_H_
-#define UPB_FIELDMAP_H_
-
-#include "upb.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct upb_fieldmap {
-  int array_size;
-  void *array;
-  /* TODO: the hashtable part. */
-};
-
-/* Takes an array of num_fields fields and builds an optimized table for fast
- * lookup of fields by number.  The input fields need not be sorted.  This
- * fieldmap must be freed with upb_free_fieldmap(). */
-void upb_init_fieldmap(struct upb_fieldmap *fieldmap,
-                       void *fields,
-                       int num_fields,
-                       int field_size);
-void upb_free_fieldmap(struct upb_fieldmap *fieldmap);
-
-/* Looks the given field number up in the fieldmap, and returns the
- * corresponding field definition (or NULL if this field number does not exist
- * in this fieldmap). */
-inline void *upb_fieldmap_find(struct upb_fieldmap *fm,
-                               upb_field_number_t num,
-                               size_t info_size)
-{
-  if (num < array_size) {
-    return (char*)fs->array + (num*info_size);
-  } else {
-    /* TODO: the hashtable part. */
-  }
-}
-
-#ifdef __cplusplus
-}  /* extern "C" */
-#endif
-
-#endif  /* UPB_PARSE_H_ */
diff --git a/upb_parse.c b/upb_parse.c
index c0fc007..458876e 100644
--- a/upb_parse.c
+++ b/upb_parse.c
@@ -8,6 +8,7 @@
 
 #include <assert.h>
 #include <string.h>
+#include "descriptor.h"
 
 /* Branch prediction hints for GCC. */
 #ifdef __GNUC__
@@ -138,14 +139,10 @@ static int64_t zz_decode_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); }
   static void wvtov_ ## type(wire_t s, val_t *d)
 
 #define GET(type, v_or_f, wire_t, val_t, member_name) \
-  static upb_status_t get_ ## type(struct upb_parse_state *s, \
-                                        uint8_t *buf, \
-                                        struct upb_tagged_value *d) { \
+  static upb_status_t get_ ## type(uint8_t **buf, union upb_value *d) { \
     wire_t tmp; \
-    uint8_t *b = buf; \
-    CHECK(get_ ## v_or_f ## _ ## wire_t(&b, &tmp)); \
-    wvtov_ ## type(tmp, &d->v.member_name); \
-    s->offset += (b-buf); \
+    CHECK(get_ ## v_or_f ## _ ## wire_t(buf, &tmp)); \
+    wvtov_ ## type(tmp, &d->member_name); \
     return UPB_STATUS_OK; \
   }
 
@@ -172,65 +169,25 @@ T(ENUM,     v, uint32_t, int32_t,  int32)   { *d = (int32_t)s;               }
 #undef GET
 #undef T
 
-static void wvtov_delimited(uint32_t s, struct upb_delimited *d, size_t o)
-{
-  d->offset = o;
-  d->len = s;
-}
-
-/* Use BYTES version for both STRING and BYTES, leave UTF-8 checks to client. */
-static upb_status_t get_BYTES(struct upb_parse_state *s, uint8_t *buf,
-                              struct upb_tagged_value *d) {
-  uint32_t tmp;
-  uint8_t *b = buf;
-  CHECK(get_v_uint32_t(&b, &tmp));
-  s->offset += (b-buf);  /* advance past length varint. */
-  wvtov_delimited(tmp, &d->v.delimited, s->offset);
-  size_t new_offset = s->offset + d->v.delimited.len; /* skip bytes */
-  if (unlikely(new_offset < s->offset)) return UPB_ERROR_OVERFLOW;
-  s->offset = new_offset;
-  return UPB_STATUS_OK;
-}
-
-static upb_status_t get_MESSAGE(struct upb_parse_state *s, uint8_t *buf,
-                                struct upb_tagged_value *d) {
-  /* We're entering a sub-message. */
-  uint32_t tmp;
-  uint8_t *b = buf;
-  CHECK(get_v_uint32_t(&b, &tmp));
-  s->offset += (b-buf);  /* advance past length varint. */
-  wvtov_delimited(tmp, &d->v.delimited, s->offset);
-  /* Unlike STRING and BYTES, we *don't* advance past delimited here. */
-  if (unlikely(++s->top == s->limit)) return UPB_ERROR_STACK_OVERFLOW;
-  s->top->fieldset = d->field->fieldset;
-  s->top->end_offset = d->v.delimited.offset + d->v.delimited.len;
-  if (unlikely(s->top->end_offset < s->offset)) return UPB_ERROR_OVERFLOW;
-  return UPB_STATUS_OK;
-}
-
-struct upb_type_info {
-  upb_wire_type_t expected_wire_type;
-  upb_status_t (*get)(struct upb_parse_state *s, uint8_t *buf,
-                           struct upb_tagged_value *d);
-};
-static struct upb_type_info type_info[] = {
-  {UPB_WIRE_TYPE_64BIT,     get_DOUBLE},
-  {UPB_WIRE_TYPE_32BIT,     get_FLOAT},
-  {UPB_WIRE_TYPE_VARINT,    get_INT32},
-  {UPB_WIRE_TYPE_VARINT,    get_INT64},
-  {UPB_WIRE_TYPE_VARINT,    get_UINT32},
-  {UPB_WIRE_TYPE_VARINT,    get_UINT64},
-  {UPB_WIRE_TYPE_VARINT,    get_SINT32},
-  {UPB_WIRE_TYPE_VARINT,    get_SINT64},
-  {UPB_WIRE_TYPE_32BIT,     get_FIXED32},
-  {UPB_WIRE_TYPE_64BIT,     get_FIXED64},
-  {UPB_WIRE_TYPE_32BIT,     get_SFIXED32},
-  {UPB_WIRE_TYPE_64BIT,     get_SFIXED64},
-  {UPB_WIRE_TYPE_VARINT,    get_BOOL},
-  {UPB_WIRE_TYPE_DELIMITED, get_BYTES},
-  {UPB_WIRE_TYPE_DELIMITED, get_BYTES},
-  {UPB_WIRE_TYPE_VARINT,    get_ENUM},
-  {UPB_WIRE_TYPE_DELIMITED, get_MESSAGE}
+upb_wire_type_t upb_expected_wire_types[] = {
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_DOUBLE] = UPB_WIRE_TYPE_64BIT,
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_FLOAT] = UPB_WIRE_TYPE_32BIT,
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_INT64] = UPB_WIRE_TYPE_VARINT,
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_UINT64] = UPB_WIRE_TYPE_VARINT,
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_INT32] = UPB_WIRE_TYPE_VARINT,
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_FIXED64] = UPB_WIRE_TYPE_64BIT,
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_FIXED32] = UPB_WIRE_TYPE_32BIT,
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_BOOL] = UPB_WIRE_TYPE_VARINT,
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_STRING] = UPB_WIRE_TYPE_DELIMITED,
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_BYTES] = UPB_WIRE_TYPE_DELIMITED,
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_GROUP] = -1,  /* TODO */
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE] = UPB_WIRE_TYPE_DELIMITED,
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_UINT32] = UPB_WIRE_TYPE_VARINT,
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_ENUM] = UPB_WIRE_TYPE_VARINT,
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_SFIXED32] = UPB_WIRE_TYPE_32BIT,
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_SFIXED64] = UPB_WIRE_TYPE_64BIT,
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_SINT32] = UPB_WIRE_TYPE_VARINT,
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_SINT64] = UPB_WIRE_TYPE_VARINT,
 };
 
 upb_status_t parse_tag(uint8_t **buf, struct upb_tag *tag)
@@ -249,22 +206,17 @@ upb_status_t parse_wire_value(uint8_t *buf, size_t *offset,
 #define READ(expr) CHECK(expr); *offset += (b-buf)
   uint8_t *b = buf;
   switch(wt) {
-    case UPB_WIRE_TYPE_VARINT:
-      READ(get_v_uint64_t(&b, &wv->varint)); break;
-    case UPB_WIRE_TYPE_64BIT:
-      READ(get_f_uint64_t(&b, &wv->_64bit)); break;
-    case UPB_WIRE_TYPE_32BIT:
-      READ(get_f_uint32_t(&b, &wv->_32bit)); break;
+    case UPB_WIRE_TYPE_VARINT: READ(get_v_uint64_t(&b, &wv->varint)); break;
+    case UPB_WIRE_TYPE_64BIT: READ(get_f_uint64_t(&b, &wv->_64bit)); break;
+    case UPB_WIRE_TYPE_32BIT: READ(get_f_uint32_t(&b, &wv->_32bit)); break;
     case UPB_WIRE_TYPE_DELIMITED:
-      wv->delimited.offset = *offset;
-      READ(get_v_uint32_t(&b, &wv->delimited.len));
-      size_t new_offset = *offset + wv->delimited.len;
+      READ(get_v_uint32_t(&b, &wv->_32bit));
+      size_t new_offset = *offset + wv->_32bit;
       if (new_offset < *offset) return UPB_ERROR_OVERFLOW;
       *offset += new_offset;
       break;
     case UPB_WIRE_TYPE_START_GROUP:
-    case UPB_WIRE_TYPE_END_GROUP:
-      return UPB_ERROR_GROUP;  /* deprecated, no plans to support. */
+    case UPB_WIRE_TYPE_END_GROUP: return UPB_ERROR_GROUP;  /* TODO */
   }
   return UPB_STATUS_OK;
 }
@@ -274,12 +226,9 @@ upb_status_t skip_wire_value(uint8_t *buf, size_t *offset,
 {
   uint8_t *b = buf;
   switch(wt) {
-    case UPB_WIRE_TYPE_VARINT:
-      READ(skip_v_uint64_t(&b)); break;
-    case UPB_WIRE_TYPE_64BIT:
-      READ(skip_f_uint64_t(&b)); break;
-    case UPB_WIRE_TYPE_32BIT:
-      READ(skip_f_uint32_t(&b)); break;
+    case UPB_WIRE_TYPE_VARINT: READ(skip_v_uint64_t(&b)); break;
+    case UPB_WIRE_TYPE_64BIT: READ(skip_f_uint64_t(&b)); break;
+    case UPB_WIRE_TYPE_32BIT: READ(skip_f_uint32_t(&b)); break;
     case UPB_WIRE_TYPE_DELIMITED: {
       /* Have to get (not skip) the length to skip the bytes. */
       uint32_t len;
@@ -290,71 +239,27 @@ upb_status_t skip_wire_value(uint8_t *buf, size_t *offset,
       break;
     }
     case UPB_WIRE_TYPE_START_GROUP:
-    case UPB_WIRE_TYPE_END_GROUP:
-      return UPB_ERROR_GROUP;  /* deprecated, no plans to support. */
+    case UPB_WIRE_TYPE_END_GROUP: return UPB_ERROR_GROUP;  /* TODO */
   }
   return UPB_STATUS_OK;
 #undef READ
 }
 
-/* Parses and processes the next value from buf. */
-upb_status_t upb_parse_field(struct upb_parse_state *s,
-                             uint8_t *buf,
-                             upb_field_number_t *fieldnum,
-                             struct upb_tagged_value *val,
-                             struct upb_tagged_wire_value *wv)
+upb_status_t upb_parse_value(uint8_t **b, upb_field_type_t ft,
+                             union upb_value *v)
 {
-  /* Check for end-of-message at the current stack depth. */
-  if(unlikely(s->offset >= s->top->end_offset)) {
-    /* If the end offset isn't an exact field boundary, the pb is corrupt. */
-    if(unlikely(s->offset != s->top->end_offset))
-      return UPB_ERROR_BAD_SUBMESSAGE_END;
-    s->top--;
-    return UPB_STATUS_SUBMESSAGE_END;
+#define CASE(t) \
+  case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_ ## t: return get_ ## t(b, v);
+  switch(ft) {
+    CASE(DOUBLE) CASE(FLOAT) CASE(INT64) CASE(UINT64) CASE(INT32) CASE(FIXED64)
+    CASE(FIXED32) CASE(BOOL) CASE(UINT32) CASE(ENUM) CASE(SFIXED32)
+    CASE(SFIXED64) CASE(SINT32) CASE(SINT64)
+    case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_BYTES:
+    case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_STRING:
+    case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE:
+      return get_UINT32(b, v);
+    default: return UPB_ERROR;  /* Including GROUP. */
   }
-
-  struct upb_tag tag;
-  uint8_t *b = buf;
-  CHECK(parse_tag(&b, &tag));
-  s->offset += (b-buf);
-  struct upb_field *fd = upb_find_field(s->top->fieldset,
-                                                  tag.field_number);
-  upb_status_t unknown_value_status;
-  if(unlikely(!fd)) {
-    unknown_value_status = UPB_ERROR_UNKNOWN_VALUE;
-    goto unknown_value;
-  }
-  struct upb_type_info *info = &type_info[fd->type];
-  if(unlikely(tag.wire_type != info->expected_wire_type)) {
-    unknown_value_status = UPB_ERROR_MISMATCHED_TYPE;
-    goto unknown_value;
-  }
-
-  *fieldnum = tag.field_number;
-  val->field = fd;
-  CHECK(info->get(s, b, val));
-  return UPB_STATUS_OK;
-
-unknown_value:
-  wv->type = tag.wire_type;
-  CHECK(parse_wire_value(buf, &s->offset, tag.wire_type, &wv->v));
-  return unknown_value_status;
-}
-
-void upb_init_parser(
-    struct upb_parse_state *state,
-    struct upb_fieldset *toplevel_fieldset)
-{
-  state->offset = 0;
-  state->top = state->stack;
-  state->limit = state->top + UPB_MAX_STACK;
-  state->top->fieldset = toplevel_fieldset;
-  state->top->end_offset = SIZE_MAX;
-}
-
-static int compare_fields(const void *f1, const void *f2)
-{
-  return ((struct upb_field*)f1)->field_number -
-         ((struct upb_field*)f2)->field_number;
+#undef CASE
 }
 
diff --git a/upb_parse.h b/upb_parse.h
index 75752a0..829e97c 100644
--- a/upb_parse.h
+++ b/upb_parse.h
@@ -18,99 +18,6 @@
 extern "C" {
 #endif
 
-/* A deserialized value as described in a .proto file. */
-struct upb_tagged_value {
-  struct upb_field *field;
-  union upb_value v;
-};
-
-/* A value as it is encoded on-the-wire, before it has been interpreted as
- * any particular .proto type. */
-struct upb_tagged_wire_value {
-  upb_wire_type_t type;
-  union upb_wire_value v;
-};
-
-/* Definition of a single field in a message, for the purposes of the parser's
- * fieldmap.  Note that this does not include nearly all of the information
- * that can be specified about a field in a .proto file.  For example, we don't
- * even know the field's name.  We keep only the information necessary to parse
- * the field. */
-struct upb_field {
-  upb_field_number_t field_number;
-  int32_t type;  /* google_protobuf_FieldDescriptorProto_Type */
-  struct upb_fieldset *fieldset;  /* if type == MESSAGE */
-};
-
-struct upb_parse_stack_frame {
-  struct upb_fieldset *fieldset;
-  size_t end_offset;  /* unknown for the top frame, so we set to SIZE_MAX */
-};
-
-/* The stream parser's state. */
-struct upb_parse_state {
-  size_t offset;
-  struct upb_parse_stack_frame stack[UPB_MAX_STACK];
-  struct upb_parse_stack_frame *top, *limit;
-};
-
-/* Call this once before parsing to initialize the data structures.
- * message_type can be NULL, in which case all fields will be reported as
- * unknown. */
-void upb_init_parser(struct upb_parse_state *state,
-                     struct upb_fieldset *toplevel_fieldset);
-
-/* Status as returned by upb_parse().  Status codes <0 are fatal errors
- * that cannot be recovered.  Status codes >0 are unusual but nonfatal events,
- * which nonetheless must be handled differently since they do not return data
- * in val. */
-typedef enum upb_status {
-  UPB_STATUS_OK = 0,
-  UPB_STATUS_SUBMESSAGE_END = 1,  // No data is stored in val or wv.
-
-  /** FATAL ERRORS: these indicate corruption, and cannot be recovered. */
-
-  // A varint did not terminate before hitting 64 bits.
-  UPB_ERROR_UNTERMINATED_VARINT = -1,
-
-  // A submessage ended in the middle of data.
-  UPB_ERROR_BAD_SUBMESSAGE_END = -2,
-
-  // Encountered a "group" on the wire (deprecated and unsupported).
-  UPB_ERROR_GROUP = -3,
-
-  // Input was nested more than UPB_MAX_NESTING deep.
-  UPB_ERROR_STACK_OVERFLOW = -4,
-
-  // The input data caused the pb's offset (a size_t) to overflow.
-  UPB_ERROR_OVERFLOW = -5,
-
-  /** NONFATAL ERRORS: the input was invalid, but we can continue if desired. */
-
-  // A value was encountered that was not defined in the .proto file.  The
-  // unknown value is stored in wv.
-  UPB_ERROR_UNKNOWN_VALUE = 2,
-
-  // A field was encoded with the wrong wire type.  The wire value is stored in
-  // wv.
-  UPB_ERROR_MISMATCHED_TYPE = 3,
-} upb_status_t;
-struct upb_parse_state;
-
-/* The main parsing function.  Parses the next value from buf, storing the
- * parsed value in val.  If val is of type UPB_TYPE_MESSAGE, then a
- * submessage was entered.
- *
- * IMPORTANT NOTE: for efficiency, the parsing routines do not do bounds checks,
- * and may read as much as far as buf+10.  So the caller must ensure that buf is
- * not within 10 bytes of unmapped memory, or the program will segfault. Clients
- * are encouraged to overallocate their buffers by ten bytes to compensate. */
-upb_status_t upb_parse_field(struct upb_parse_state *s,
-                             uint8_t *buf,
-                             upb_field_number_t *fieldnum,
-                             struct upb_tagged_value *val,
-                             struct upb_tagged_wire_value *wv);
-
 /* Low-level parsing functions. ***********************************************/
 
 /* Parses a single tag from the character data starting at buf, and updates
@@ -118,6 +25,19 @@ upb_status_t upb_parse_field(struct upb_parse_state *s,
  * by at most ten bytes. */
 upb_status_t parse_tag(uint8_t **buf, struct upb_tag *tag);
 
+extern upb_wire_type_t upb_expected_wire_types[];
+/* Returns true if wt is the correct on-the-wire type for ft. */
+inline bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) {
+  return upb_expected_wire_types[ft] == wt;
+}
+
+/* Parses and converts a value from the character data starting at buf.  The
+ * caller must have previously checked that the wire type is appropriate for
+ * this field type.  For delimited data, buf is advanced to the beginning of
+ * the delimited data, not the end. */
+upb_status_t upb_parse_value(uint8_t **buf, upb_field_type_t ft,
+                             union upb_value *value);
+
 /* Parses a wire value with the given type (which must have been obtained from
  * a tag that was just parsed) and adds the number of bytes that were consumed
  * to *offset.  For delimited types, offset is advanced past the delimited
@@ -127,8 +47,8 @@ upb_status_t upb_parse_wire_value(uint8_t *buf, size_t *offset,
                                   union upb_wire_value *wv);
 
 /* Like the above, but discards the wire value instead of saving it. */
-upb_status_t skip_wire_value(uint8_t *buf, size_t *offset,
-                             upb_wire_type_t wt);
+upb_status_t upb_skip_wire_value(uint8_t *buf, size_t *offset,
+                                 upb_wire_type_t wt);
 
 #ifdef __cplusplus
 }  /* extern "C" */
diff --git a/upb_struct.c b/upb_struct.c
index b0b244a..3284796 100644
--- a/upb_struct.c
+++ b/upb_struct.c
@@ -42,3 +42,4 @@ extern bool upb_struct_is_set(uint8_t *s, struct upb_struct_field *f);
 extern bool upb_struct_all_required_fields_set(
     uint8_t *s, struct upb_struct_definition *d);
 extern void upb_struct_clear(uint8_t *s, struct upb_struct_definition *d);
+
diff --git a/upb_struct.h b/upb_struct.h
index a64b3d4..7d5c219 100644
--- a/upb_struct.h
+++ b/upb_struct.h
@@ -57,8 +57,8 @@ struct upb_struct_definition {
 };
 
 /* While these are written to be as fast as possible, it will still be faster
- * cache the results of this lookup if possible.  These return NULL if no such
- * field is found. */
+ * to cache the results of this lookup if possible.  These return NULL if no
+ * such field is found. */
 struct upb_struct_field *upb_struct_find_field_by_name(
   struct upb_struct_definition *d, char *name);
 struct upb_struct_field *upb_struct_find_field_by_number(
diff --git a/upb_table.c b/upb_table.c
new file mode 100644
index 0000000..656da24
--- /dev/null
+++ b/upb_table.c
@@ -0,0 +1,89 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
+ */
+
+#include "upb_table.h"
+
+#include <stdlib.h>
+
+static int compare_entries(const void *f1, const void *f2)
+{
+  return ((struct upb_inttable_entry*)f1)->key -
+         ((struct upb_inttable_entry*)f2)->key;
+}
+
+static uint32_t max(uint32_t a, uint32_t b) { return a > b ? a : b; }
+
+static uint32_t round_up_to_pow2(uint32_t v)
+{
+  /* cf. Bit Twiddling Hacks:
+   * http://www-graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */
+  v--;
+  v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16;
+  v++;
+  return v;
+}
+
+void upb_inttable_init(struct upb_inttable *table, void *entries,
+                       int num_entries, int entry_size)
+{
+  qsort(entries, num_entries, entry_size, compare_entries);
+
+  /* Find the largest n for which at least half the keys <n are used.  We
+   * make sure our table size is at least n.  This allows all keys <n to be
+   * in their main position (as if it were an array) and only numbers >n might
+   * possibly have collisions.  Start at 8 to avoid noise of small numbers. */
+  upb_inttable_key_t n = 0, maybe_n;
+  bool all_in_array = true;
+  for(int i = 0; i < num_entries; i++) {
+    struct upb_inttable_entry *e =
+        upb_inttable_entry_get(entries, i, entry_size);
+    maybe_n = e->key;
+    if(maybe_n > 8 && maybe_n/(i+1) >= 2) {
+      all_in_array = false;
+      break;
+    }
+    n = maybe_n;
+  }
+
+  /* TODO: measure, tweak, optimize this choice of table size.  Possibly test
+   * (at runtime) maximum chain length for each proposed size. */
+  uint32_t min_size_by_load = all_in_array ? n : (double)num_entries / 0.85;
+  uint32_t min_size = max(n, min_size_by_load);
+  table->size = round_up_to_pow2(min_size);
+
+  table->entries = malloc(table->size * entry_size);
+  /* Initialize to empty. */
+  for(size_t i = 0; i < table->size; i++) {
+    struct upb_inttable_entry *e = upb_inttable_get(table, i, entry_size);
+    e->key = UPB_END_OF_CHAIN;
+    e->next = UPB_END_OF_CHAIN;
+  }
+
+  /* Insert the elements. */
+  for(int i = 0; i < num_entries; i++) {
+    struct upb_inttable_entry *e =
+        upb_inttable_entry_get(entries, i, entry_size);
+    int32_t hash = upb_inttable_hash(table, e->key);
+    struct upb_inttable_entry *table_e =
+        upb_inttable_get(table, hash, entry_size);
+    if(table_e->key != UPB_END_OF_CHAIN) {  /* Collision. */
+      if(hash == upb_inttable_hash(table, table_e->key)) {
+        /* Existing element is in its main posisiton.  Find an empty slot to
+         * place our new element. */
+      } else {
+        /* Existing element is not in its main position.  Move it to an empty
+         * slot and put our element in its main position. */
+      }
+    }
+  }
+}
+
+void upb_inttable_free(struct upb_inttable *table)
+{
+  free(table->entries);
+}
+
+/* Emit definition for inline functions. */
diff --git a/upb_table.h b/upb_table.h
new file mode 100644
index 0000000..684dbff
--- /dev/null
+++ b/upb_table.h
@@ -0,0 +1,75 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
+ */
+
+#ifndef UPB_TABLE_H_
+#define UPB_TABLE_H_
+
+#include "upb.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int32_t upb_inttable_key_t;
+
+#define UPB_END_OF_CHAIN (upb_inttable_key_t)-1
+
+struct upb_inttable_entry {
+  upb_inttable_key_t key;
+  int32_t next;
+};
+
+struct upb_inttable {
+  uint32_t size;  /* Is a power of two. */
+  void *entries;
+};
+
+/* Builds an int32_t -> <entry> table, optimized for very fast lookup by
+ * number.  table is a pointer to a previously allocated upb_inttable.
+ * entries points to an array of the desired entries themselves, each of size
+ * entry_size.  The table is allocated in dynamic memory, and does not reference
+ * the data in entries.  Entries may be modified by the function.
+ *
+ * The table must be freed with upb_inttable_free. */
+void upb_inttable_init(struct upb_inttable *table, void *entries,
+                       int num_entries, int entry_size);
+
+inline int32_t upb_inttable_hash(struct upb_inttable * table,
+                                 upb_inttable_key_t key) {
+  return key & (table->size-1);
+}
+
+/* Frees any data that was allocated by upb_inttable_init. */
+void upb_inttable_free(struct upb_inttable *table);
+
+inline struct upb_inttable_entry *upb_inttable_entry_get(
+    void *entries, int32_t pos, int entry_size) {
+  return (struct upb_inttable_entry*)((char*)entries) + pos*entry_size;
+}
+
+inline struct upb_inttable_entry *upb_inttable_get(
+    struct upb_inttable *table, int32_t pos, int entry_size) {
+  return upb_inttable_entry_get(table->entries, pos, entry_size);
+}
+
+/* Lookups up key in this table.  Inlined because this is in the critical path
+ * of parsing. */
+inline void *upb_inttable_lookup(struct upb_inttable *table, int32_t key,
+                                 int entry_size) {
+  int32_t pos = upb_inttable_hash(table, key);
+  do {
+    struct upb_inttable_entry *e = upb_inttable_get(table, pos, entry_size);
+    if (key == e->key) return e;
+    pos = e->next;
+  } while (pos != UPB_END_OF_CHAIN);
+  return NULL;  /* Not found. */
+}
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+#endif  /* UPB_TABLE_H_ */
author	Joshua Haberman <joshua@reverberate.org>	2009-06-03 22:06:24 -0700
committer	Joshua Haberman <joshua@reverberate.org>	2009-06-03 22:06:24 -0700
commit	d1f78c88faafea7e672c7c45e20f6f040942a92a (patch)
tree	67d6f959815b464a241d5c6b7c7df1a6755e4f55
parent	0c80c384756a48d5f731eeafa62b6cd5f3861749 (diff)