Directory restructuring.

author: Joshua Haberman <joshua@reverberate.org> 2009-07-08 12:06:47 -0700
committer: Joshua Haberman <joshua@reverberate.org> 2009-07-08 12:06:47 -0700
commit: 462b26c1cc041a8fa26deb62cf12f1f351a5b2f6 (patch)
tree: de5a58f8d66d11c13b349448a970f84d57d16cad /src
parent: c7ee14f8ef38a8bc90c0f1db1ad47b2e06612fa3 (diff)
14 files changed, 2553 insertions, 0 deletions
diff --git a/src/upb.h b/src/upb.h
new file mode 100644
index 0000000..64f2c54
--- /dev/null
+++ b/src/upb.h
@@ -0,0 +1,136 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+
+ * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
+ *
+ * This file contains shared definitions that are widely used across upb.
+ */
+
+#ifndef UPB_H_
+#define UPB_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>  /* for size_t. */
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* inline if possible, emit standalone code if required. */
+#ifndef INLINE
+#define INLINE static inline
+#endif
+
+/* The maximum that any submessages can be nested.  Matches proto2's limit. */
+#define UPB_MAX_NESTING 64
+
+/* The maximum number of fields that any one .proto type can have. */
+#define UPB_MAX_FIELDS (1<<16)
+
+/* Nested type names are separated by periods. */
+#define UPB_SYMBOL_SEPARATOR '.'
+#define UPB_SYMBOL_MAX_LENGTH 256
+
+#define UPB_INDEX(base, i, m) (void*)((char*)(base) + ((i)*(m)))
+
+INLINE uint32_t max(uint32_t a, uint32_t b) { return a > b ? a : b; }
+
+/* Value type as defined in a .proto file.  The values of this are defined by
+ * google_protobuf_FieldDescriptorProto_Type (from descriptor.proto).
+ * Note that descriptor.proto reserves "0" for errors, and we use it to
+ * represent exceptional circumstances. */
+typedef uint8_t upb_field_type_t;
+
+/* Label (optional, repeated, required) as defined in a .proto file.  The values
+ * of this are defined by google.protobuf.FieldDescriptorProto.Label (from
+ * descriptor.proto). */
+typedef uint8_t  upb_label_t;
+
+struct upb_type_info {
+  uint8_t align;
+  uint8_t size;
+  uint8_t expected_wire_type;
+};
+
+/* Contains information for all .proto types.  Indexed by upb_field_type_t. */
+extern struct upb_type_info upb_type_info[];
+
+/* A pointer to a .proto value.  The owner must have an out-of-band way of
+ * knowing the type, so it knows which union member to use. */
+union upb_value {
+  double   _double;
+  float    _float;
+  int32_t  int32;
+  int64_t  int64;
+  uint32_t uint32;
+  uint64_t uint64;
+  bool     _bool;
+  struct upb_string **string;
+  struct upb_array **array;
+  void     *message;
+};
+
+union upb_value_ptr {
+  double   *_double;
+  float    *_float;
+  int32_t  *int32;
+  int64_t  *int64;
+  uint32_t *uint32;
+  uint64_t *uint64;
+  bool     *_bool;
+  struct upb_string **string;
+  struct upb_array **array;
+  void     **message;
+  void     *_void;
+};
+
+union upb_symbol_ref {
+  struct upb_msg *msg;
+  struct upb_enum *_enum;
+  struct upb_svc *svc;
+};
+
+/* The number of a field, eg. "optional string foo = 3". */
+typedef int32_t upb_field_number_t;
+
+/* Status codes used as a return value.  Codes >0 are not fatal and can be
+ * resumed. */
+typedef enum upb_status {
+  UPB_STATUS_OK = 0,
+
+  // The input byte stream ended in the middle of a record.
+  UPB_STATUS_NEED_MORE_DATA = 1,
+
+  // The user value callback opted to stop parsing.
+  UPB_STATUS_USER_CANCELLED = 2,
+
+  // A varint did not terminate before hitting 64 bits.
+  UPB_ERROR_UNTERMINATED_VARINT = -1,
+
+  // A submessage or packed array ended in the middle of data.
+  UPB_ERROR_BAD_SUBMESSAGE_END = -2,
+
+  // Input was nested more than UPB_MAX_NESTING deep.
+  UPB_ERROR_STACK_OVERFLOW = -3,
+
+  // The input data caused the pb's offset (a size_t) to overflow.
+  UPB_ERROR_OVERFLOW = -4,
+
+  // An "end group" tag was encountered in an inappropriate place.
+  UPB_ERROR_SPURIOUS_END_GROUP = -5,
+
+  UPB_ERROR_ILLEGAL = -6
+} upb_status_t;
+
+#define UPB_CHECK(func) do { \
+  upb_status_t status = func; \
+  if(status != UPB_STATUS_OK) return status; \
+  } while (0)
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+#endif  /* UPB_H_ */
diff --git a/src/upb_context.c b/src/upb_context.c
new file mode 100644
index 0000000..d857d16
--- /dev/null
+++ b/src/upb_context.c
@@ -0,0 +1,304 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include "descriptor.h"
+#include "upb_context.h"
+#include "upb_enum.h"
+#include "upb_msg.h"
+
+/* Search for a character in a string, in reverse. */
+static int memrchr(char *data, char c, size_t len)
+{
+  int off = len-1;
+  while(off > 0 && data[off] != c) --off;
+  return off;
+}
+
+bool addfd(struct upb_strtable *addto, struct upb_strtable *existingdefs,
+           google_protobuf_FileDescriptorProto *fd);
+
+bool upb_context_init(struct upb_context *c)
+{
+  upb_strtable_init(&c->symtab, 16, sizeof(struct upb_symtab_entry));
+  upb_strtable_init(&c->psymtab, 16, sizeof(struct upb_symtab_entry));
+  /* Add all the types in descriptor.proto so we can parse descriptors. */
+  if(!addfd(&c->psymtab, &c->symtab, &google_protobuf_filedescriptor)) {
+    assert(false);
+    return false;  /* Indicates that upb is buggy or corrupt. */
+  }
+  struct upb_string name = UPB_STRLIT("google.protobuf.FileDescriptorSet");
+  struct upb_symtab_entry *e = upb_strtable_lookup(&c->psymtab, &name);
+  assert(e);
+  c->fds_msg = e->ref.msg;
+  c->fds_size = 16;
+  c->fds_len = 0;
+  c->fds = malloc(sizeof(*c->fds));
+  return true;
+}
+
+static void free_symtab(struct upb_strtable *t)
+{
+  struct upb_symtab_entry *e = upb_strtable_begin(t);
+  for(; e; e = upb_strtable_next(t, &e->e)) {
+    switch(e->type) {
+      case UPB_SYM_MESSAGE: upb_msg_free(e->ref.msg); break;
+      case UPB_SYM_ENUM: upb_enum_free(e->ref._enum); break;
+      default: break;  /* TODO */
+    }
+    free(e->ref.msg);  /* The pointer is the same for all. */
+    free(e->e.key.ptr);
+  }
+  upb_strtable_free(t);
+}
+
+void upb_context_free(struct upb_context *c)
+{
+  free_symtab(&c->symtab);
+  for(size_t i = 0; i < c->fds_len; i++)
+    upb_msgdata_free(c->fds[i], c->fds_msg, true);
+  free_symtab(&c->psymtab);
+  free(c->fds);
+}
+
+struct upb_symtab_entry *upb_context_lookup(struct upb_context *c,
+                                            struct upb_string *symbol)
+{
+  return upb_strtable_lookup(&c->symtab, symbol);
+}
+
+/* Given a symbol and the base symbol inside which it is defined, find the
+ * symbol's definition in t. */
+static struct upb_symtab_entry *resolve(struct upb_strtable *t,
+                                        struct upb_string *base,
+                                        struct upb_string *symbol)
+{
+  if(base->byte_len + symbol->byte_len + 1 >= UPB_SYMBOL_MAX_LENGTH ||
+     symbol->byte_len == 0) return NULL;
+
+  if(symbol->ptr[0] == UPB_SYMBOL_SEPARATOR) {
+    /* Symbols starting with '.' are absolute, so we do a single lookup. */
+    struct upb_string sym_str = {.ptr = symbol->ptr+1,
+                                 .byte_len = symbol->byte_len-1};
+    return upb_strtable_lookup(t, &sym_str);
+  } else {
+    /* Remove components from base until we find an entry or run out. */
+    char sym[UPB_SYMBOL_MAX_LENGTH+1];
+    struct upb_string sym_str = {.ptr = sym};
+    int baselen = base->byte_len;
+    while(1) {
+      /* sym_str = base[0...base_len] + UPB_SYMBOL_SEPARATOR + symbol */
+      memcpy(sym, base->ptr, baselen);
+      sym[baselen] = UPB_SYMBOL_SEPARATOR;
+      memcpy(sym + baselen + 1, symbol->ptr, symbol->byte_len);
+      sym_str.byte_len = baselen + symbol->byte_len + 1;
+
+      struct upb_symtab_entry *e = upb_strtable_lookup(t, &sym_str);
+      if (e) return e;
+      else if(baselen == 0) return NULL;  /* No more scopes to try. */
+
+      baselen = memrchr(base->ptr, UPB_SYMBOL_SEPARATOR, baselen);
+    }
+  }
+}
+
+/* Tries to resolve a symbol in two different tables. */
+union upb_symbol_ref resolve2(struct upb_strtable *t1, struct upb_strtable *t2,
+                              struct upb_string *base, struct upb_string *sym,
+                                 enum upb_symbol_type expected_type) {
+  union upb_symbol_ref nullref = {.msg = NULL};
+  struct upb_symtab_entry *e = resolve(t1, base, sym);
+  if(e == NULL) e = resolve(t2, base, sym);
+
+  if(e && e->type == expected_type) return e->ref;
+  else return nullref;
+}
+
+
+struct upb_symtab_entry *upb_context_resolve(struct upb_context *c,
+                                             struct upb_string *base,
+                                             struct upb_string *symbol) {
+  return resolve(&c->symtab, base, symbol);
+}
+
+/* Joins strings together, for example:
+ *   join("Foo.Bar", "Baz") -> "Foo.Bar.Baz"
+ *   join("", "Baz") -> "Baz"
+ * Caller owns the returned string and must free it. */
+static struct upb_string join(struct upb_string *base, struct upb_string *name) {
+  size_t len = base->byte_len + name->byte_len;
+  if(base->byte_len > 0) len++;  /* For the separator. */
+  struct upb_string joined = {.byte_len=len, .ptr=malloc(len)};
+  if(base->byte_len > 0) {
+    /* nested_base = base + '.' +  d->name */
+    memcpy(joined.ptr, base->ptr, base->byte_len);
+    joined.ptr[base->byte_len] = UPB_SYMBOL_SEPARATOR;
+    memcpy(&joined.ptr[base->byte_len+1], name->ptr, name->byte_len);
+  } else {
+    memcpy(joined.ptr, name->ptr, name->byte_len);
+  }
+  return joined;
+}
+
+static bool insert_enum(struct upb_strtable *t,
+                        google_protobuf_EnumDescriptorProto *ed,
+                        struct upb_string *base)
+{
+  // TODO: re-enable when compiler sets this flag
+  //if(!ed->set_flags.has.name) return false;
+
+  /* We own this and must free it on destruct. */
+  struct upb_string fqname = join(base, ed->name);
+
+  /* Redefinition within a FileDescriptorProto is not allowed. */
+  if(upb_strtable_lookup(t, &fqname)) {
+    free(fqname.ptr);
+    return false;
+  }
+
+  struct upb_symtab_entry e;
+  e.e.key = fqname;
+  e.type = UPB_SYM_ENUM;
+  e.ref._enum = malloc(sizeof(*e.ref._enum));
+  upb_enum_init(e.ref._enum, ed);
+  upb_strtable_insert(t, &e.e);
+
+  return true;
+}
+
+static bool insert_message(struct upb_strtable *t,
+                           google_protobuf_DescriptorProto *d,
+                           struct upb_string *base)
+{
+  /* TODO: re-enable when compiler sets this flag. */
+  //if(!d->set_flags.has.name) return false;
+
+  /* We own this and must free it on destruct. */
+  struct upb_string fqname = join(base, d->name);
+
+  /* Redefinition within a FileDescriptorProto is not allowed. */
+  if(upb_strtable_lookup(t, d->name)) {
+    free(fqname.ptr);
+    return false;
+  }
+
+  struct upb_symtab_entry e;
+  e.e.key = fqname;
+  e.type = UPB_SYM_MESSAGE;
+  e.ref.msg = malloc(sizeof(*e.ref.msg));
+  if(!upb_msg_init(e.ref.msg, d)) {
+    free(fqname.ptr);
+    return false;
+  }
+  upb_strtable_insert(t, &e.e);
+
+  /* Add nested messages and enums. */
+  //if(d->set_flags.has.nested_type)  (re-enable when protoc sets)
+  if(d->nested_type)
+    for(unsigned int i = 0; i < d->nested_type->len; i++)
+      if(!insert_message(t, d->nested_type->elements[i], &fqname))
+        return false;
+
+  //if(d->set_flags.has.enum_type)
+  if(d->enum_type)
+    for(unsigned int i = 0; i < d->enum_type->len; i++)
+      if(!insert_enum(t, d->enum_type->elements[i], &fqname))
+        return false;
+
+  return true;
+}
+
+bool addfd(struct upb_strtable *addto, struct upb_strtable *existingdefs,
+           google_protobuf_FileDescriptorProto *fd)
+{
+  struct upb_string package = {.byte_len=0};
+  if(fd->set_flags.has.package) package = *fd->package;
+
+  if(fd->set_flags.has.message_type)
+    for(unsigned int i = 0; i < fd->message_type->len; i++)
+      if(!insert_message(addto, fd->message_type->elements[i], &package))
+        return false;
+
+  if(fd->set_flags.has.enum_type)
+    for(unsigned int i = 0; i < fd->enum_type->len; i++)
+      if(!insert_enum(addto, fd->enum_type->elements[i], &package))
+        return false;
+
+  /* TODO: handle extensions and services. */
+
+  /* Attempt to resolve all references. */
+  struct upb_symtab_entry *e;
+  for(e = upb_strtable_begin(addto); e; e = upb_strtable_next(addto, &e->e)) {
+    if(upb_strtable_lookup(existingdefs, &e->e.key))
+      return false;  /* Redefinition prohibited. */
+    if(e->type == UPB_SYM_MESSAGE) {
+      struct upb_msg *m = e->ref.msg;
+      for(unsigned int i = 0; i < m->num_fields; i++) {
+        struct upb_msg_field *f = &m->fields[i];
+        google_protobuf_FieldDescriptorProto *fd = m->field_descriptors[i];
+        union upb_symbol_ref ref;
+        if(fd->type == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE ||
+           fd->type == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_GROUP)
+          ref = resolve2(existingdefs, addto, &e->e.key, fd->type_name,
+                         UPB_SYM_MESSAGE);
+        else if(fd->type == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_ENUM)
+          ref = resolve2(existingdefs, addto, &e->e.key, fd->type_name,
+                         UPB_SYM_ENUM);
+        else
+          continue;  /* No resolving necessary. */
+        if(!ref.msg) return false;  /* Ref. to undefined symbol. */
+        upb_msg_ref(m, f, ref);
+      }
+    }
+  }
+  return true;
+}
+
+bool upb_context_addfd(struct upb_context *c,
+                       google_protobuf_FileDescriptorProto *fd)
+{
+  struct upb_strtable tmp;
+  if(!addfd(&tmp, &c->symtab, fd)) {
+    free_symtab(&tmp);
+    return false;
+  }
+  upb_strtable_free(&tmp);
+  return true;
+}
+
+bool upb_context_parsefds(struct upb_context *c, struct upb_string *fds_str) {
+  google_protobuf_FileDescriptorSet *fds =
+      upb_alloc_and_parse(c->fds_msg, fds_str, true);
+  if(!fds) return false;
+
+  if(fds->set_flags.has.file) {
+    /* Insert new symbols into a temporary table until we have verified that
+     * the descriptor is valid. */
+    struct upb_strtable tmp;
+    upb_strtable_init(&tmp, 0, sizeof(struct upb_symtab_entry));
+    for(uint32_t i = 0; i < fds->file->len; i++) {
+      if(!addfd(&tmp, &c->symtab, fds->file->elements[i])) {
+        printf("Not added successfully!\n");
+        free_symtab(&tmp);
+        return false;
+      }
+    }
+    /* Everything was successfully added, copy from the tmp symtable. */
+    struct upb_symtab_entry *e;
+    for(e = upb_strtable_begin(&tmp); e; e = upb_strtable_next(&tmp, &e->e))
+      upb_strtable_insert(&c->symtab, &e->e);
+    upb_strtable_free(&tmp);
+  }
+
+  /* We own fds now, need to keep a ref so we can free it later. */
+  if(c->fds_size == c->fds_len) {
+    c->fds_size *= 2;
+    c->fds = realloc(c->fds, c->fds_size);
+  }
+  c->fds[c->fds_len++] = fds;
+  return true;
+}
diff --git a/src/upb_context.h b/src/upb_context.h
new file mode 100644
index 0000000..4ddaed6
--- /dev/null
+++ b/src/upb_context.h
@@ -0,0 +1,106 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * A context represents a namespace of proto definitions, sort of like an
+ * interpreter's symbol table.  It is empty when first constructed.  Clients
+ * add definitions to the context by supplying unserialized or serialized
+ * descriptors (as defined in descriptor.proto).
+ *
+ * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
+ */
+
+#ifndef UPB_CONTEXT_H_
+#define UPB_CONTEXT_H_
+
+#include "upb.h"
+#include "upb_table.h"
+
+struct google_protobuf_FileDescriptorProto;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Definitions. ***************************************************************/
+
+/* The symbol table maps names to various kinds of symbols. */
+enum upb_symbol_type {
+  UPB_SYM_MESSAGE,
+  UPB_SYM_ENUM,
+  UPB_SYM_SERVICE,
+  UPB_SYM_EXTENSION
+};
+
+struct upb_symtab_entry {
+  struct upb_strtable_entry e;
+  enum upb_symbol_type type;
+  union upb_symbol_ref ref;
+};
+
+struct upb_context {
+  struct upb_strtable symtab;   /* The context's symbol table. */
+  struct upb_strtable psymtab;  /* Private symbols, for internal use. */
+  struct upb_msg *fds_msg;   /* This is in psymtab, ptr here for convenience. */
+
+  /* A list of the FileDescriptorProtos we own (from having parsed them
+   * ourselves) and must free on destruction. */
+  size_t fds_size, fds_len;
+  struct google_protobuf_FileDescriptorSet **fds;
+};
+
+/* Initializes and frees a upb_context, respectively. */
+bool upb_context_init(struct upb_context *c);
+void upb_context_free(struct upb_context *c);
+
+/* Looking up symbols. ********************************************************/
+
+/* Resolves the given symbol using the rules described in descriptor.proto,
+ * namely:
+ *
+ *    If the name starts with a '.', it is fully-qualified.  Otherwise, C++-like
+ *    scoping rules are used to find the type (i.e. first the nested types
+ *    within this message are searched, then within the parent, on up to the
+ *    root namespace).
+ *
+ * Returns NULL if the symbol has not been defined. */
+struct upb_symtab_entry *upb_context_resolve(struct upb_context *c,
+                                             struct upb_string *base,
+                                             struct upb_string *symbol);
+
+/* Find an entry in the symbol table with this exact name.  Returns NULL if no
+ * such symbol name exists. */
+struct upb_symtab_entry *upb_context_lookup(struct upb_context *c,
+                                            struct upb_string *symbol);
+
+INLINE struct upb_symtab_entry *upb_context_symbegin(struct upb_context *c) {
+  return (struct upb_symtab_entry*)upb_strtable_begin(&c->symtab);
+}
+
+INLINE struct upb_symtab_entry *upb_context_symnext(
+    struct upb_context *c, struct upb_symtab_entry *cur) {
+  return (struct upb_symtab_entry*)upb_strtable_next(&c->symtab, &cur->e);
+}
+
+/* Adding symbols. ************************************************************/
+
+/* Adds the definitions in the given file descriptor to this context.  All
+ * types that are referenced from fd must have previously been defined (or be
+ * defined in fd).  fd may not attempt to define any names that are already
+ * defined in this context.
+ *
+ * Caller retains ownership of fd, but the context will contain references to
+ * it, so it must outlive the context.
+ *
+ * upb_context_addfd only returns true or false; it does not give any hint
+ * about what happened in the case of failure.  This is because the descriptor
+ * is expected to have been validated at the time it was parsed/generated. */
+bool upb_context_addfd(struct upb_context *c,
+                       struct google_protobuf_FileDescriptorProto *fd);
+
+bool upb_context_parsefds(struct upb_context *c, struct upb_string *fds);
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+#endif  /* UPB_PARSE_H_ */
diff --git a/src/upb_enum.c b/src/upb_enum.c
new file mode 100644
index 0000000..b599c9b
--- /dev/null
+++ b/src/upb_enum.c
@@ -0,0 +1,31 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
+ */
+
+#include "descriptor.h"
+#include "upb_enum.h"
+
+void upb_enum_init(struct upb_enum *e,
+                   struct google_protobuf_EnumDescriptorProto *ed) {
+  int num_values = ed->set_flags.has.value ? ed->value->len : 0;
+  e->descriptor = ed;
+  upb_strtable_init(&e->nametoint, num_values, sizeof(struct upb_enum_ntoi_entry));
+  upb_inttable_init(&e->inttoname, num_values, sizeof(struct upb_enum_iton_entry));
+
+  for(int i = 0; i < num_values; i++) {
+    google_protobuf_EnumValueDescriptorProto *value = ed->value->elements[i];
+    struct upb_enum_ntoi_entry ntoi_entry = {.e = {.key = *value->name},
+                                             .value = value->number};
+    struct upb_enum_iton_entry iton_entry = {.e = {.key = value->number},
+                                             .string = value->name};
+    upb_strtable_insert(&e->nametoint, &ntoi_entry.e);
+    upb_inttable_insert(&e->inttoname, &iton_entry.e);
+  }
+}
+
+void upb_enum_free(struct upb_enum *e) {
+  upb_strtable_free(&e->nametoint);
+  upb_inttable_free(&e->inttoname);
+}
diff --git a/src/upb_enum.h b/src/upb_enum.h
new file mode 100644
index 0000000..9fea3a4
--- /dev/null
+++ b/src/upb_enum.h
@@ -0,0 +1,41 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
+ *
+ * upb_enum is a simple object that allows run-time reflection over the values
+ * defined within an enum. */
+
+#ifndef UPB_ENUM_H_
+#define UPB_ENUM_H_
+
+#include <stdint.h>
+#include "upb_table.h"
+
+/* Forward declaration from descriptor.h. */
+struct google_protobuf_EnumDescriptorProto;
+struct google_protobuf_EnumValueDescriptorProto;
+
+struct upb_enum {
+  struct google_protobuf_EnumDescriptorProto *descriptor;
+  struct upb_strtable nametoint;
+  struct upb_inttable inttoname;
+};
+
+struct upb_enum_ntoi_entry {
+  struct upb_strtable_entry e;
+  uint32_t value;
+};
+
+struct upb_enum_iton_entry {
+  struct upb_inttable_entry e;
+  struct upb_string *string;
+};
+
+/* Initializes and frees an enum, respectively.  Caller retains ownership of
+ * ed, but it must outlive e. */
+void upb_enum_init(struct upb_enum *e,
+                   struct google_protobuf_EnumDescriptorProto *ed);
+void upb_enum_free(struct upb_enum *e);
+
+#endif  /* UPB_ENUM_H_ */
diff --git a/src/upb_inlinedefs.c b/src/upb_inlinedefs.c
new file mode 100644
index 0000000..60863c4
--- /dev/null
+++ b/src/upb_inlinedefs.c
@@ -0,0 +1,19 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * This file, if compiled, will contain standalone (non-inlined) versions of
+ * all inline functions defined in header files.  We don't generally use this
+ * file since we use "static inline" for inline functions (which will put a
+ * standalone version of the function in any .o file that needs it, but
+ * compiling this file and dumping the object file will let us inspect how
+ * inline functions are compiled, so we keep it around.
+ *
+ * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
+ */
+
+#define INLINE
+#include "upb_context.h"
+#include "upb_enum.h"
+#include "upb_msg.h"
+#include "upb_parse.h"
+#include "upb_table.h"
diff --git a/src/upb_msg.c b/src/upb_msg.c
new file mode 100644
index 0000000..47a8662
--- /dev/null
+++ b/src/upb_msg.c
@@ -0,0 +1,379 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ */
+
+#include <inttypes.h>
+#include <stdlib.h>
+#include "descriptor.h"
+#include "upb_msg.h"
+#include "upb_parse.h"
+
+/* Rounds p up to the next multiple of t. */
+#define ALIGN_UP(p, t) ((p) % (t) == 0 ? (p) : (p) + ((t) - ((p) % (t))))
+
+static int div_round_up(int numerator, int denominator) {
+  /* cf. http://stackoverflow.com/questions/17944/how-to-round-up-the-result-of-integer-division */
+  return numerator > 0 ? (numerator - 1) / denominator + 1 : 0;
+}
+
+/* Callback for sorting fields. */
+static int compare_fields(const void *e1, const void *e2) {
+  const google_protobuf_FieldDescriptorProto *fd1 = *(void**)e1;
+  const google_protobuf_FieldDescriptorProto *fd2 = *(void**)e2;
+  /* Required fields go before non-required. */
+  bool req1 = fd1->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REQUIRED;
+  bool req2 = fd2->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REQUIRED;
+  if(req1 != req2) {
+    return req2 - req1;
+  } else {
+    /* Within required and non-required field lists, list in number order.
+     * TODO: consider ordering by data size to reduce padding. */
+    return fd1->number - fd2->number;
+  }
+}
+
+bool upb_msg_init(struct upb_msg *m, struct google_protobuf_DescriptorProto *d)
+{
+  /* TODO: more complete validation.
+   * TODO: re-enable this check when we properly set this flag. */
+  //if(!d->set_flags.has.field) return false;
+
+  upb_inttable_init(&m->fields_by_num, d->field->len,
+                    sizeof(struct upb_fieldsbynum_entry));
+  upb_strtable_init(&m->fields_by_name, d->field->len,
+                    sizeof(struct upb_fieldsbyname_entry));
+
+  m->descriptor = d;
+  m->num_fields = d->field->len;
+  m->set_flags_bytes = div_round_up(m->num_fields, 8);
+  /* These are incremented in the loop. */
+  m->num_required_fields = 0;
+  m->size = m->set_flags_bytes;
+
+  m->fields = malloc(sizeof(*m->fields) * m->num_fields);
+  m->field_descriptors = malloc(sizeof(*m->field_descriptors) * m->num_fields);
+  for(unsigned int i = 0; i < m->num_fields; i++) {
+    /* We count on the caller to keep this pointer alive. */
+    m->field_descriptors[i] = d->field->elements[i];
+  }
+  /* TODO: re-enable proper sorting once the compiler is sorted out. */
+  //qsort(m->field_descriptors, m->num_fields, sizeof(void*), compare_fields);
+
+  size_t max_align = 0;
+  for(unsigned int i = 0; i < m->num_fields; i++) {
+    struct upb_msg_field *f = &m->fields[i];
+    google_protobuf_FieldDescriptorProto *fd = m->field_descriptors[i];
+    struct upb_type_info *type_info = &upb_type_info[fd->type];
+
+    /* General alignment rules are: each member must be at an address that is a
+     * multiple of that type's alignment.  Also, the size of the structure as
+     * a whole must be a multiple of the greatest alignment of any member. */
+    f->field_index = i;
+    f->byte_offset = ALIGN_UP(m->size, type_info->align);
+    f->type = fd->type;
+    f->label = fd->label;
+    m->size = f->byte_offset + type_info->size;
+    max_align = max(max_align, type_info->align);
+    if(fd->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REQUIRED)
+      m->num_required_fields++;
+
+    /* Insert into the tables.  Note that f->ref will be uninitialized, even in
+     * the tables' copies of *f, which is why we must update them separately
+     * in upb_msg_ref() below. */
+    struct upb_fieldsbynum_entry nument = {.e = {.key = fd->number}, .f = *f};
+    struct upb_fieldsbyname_entry strent = {.e = {.key = *fd->name}, .f = *f};
+    upb_inttable_insert(&m->fields_by_num, &nument.e);
+    upb_strtable_insert(&m->fields_by_name, &strent.e);
+  }
+
+  m->size = ALIGN_UP(m->size, max_align);
+  return true;
+}
+
+void upb_msg_free(struct upb_msg *m)
+{
+  upb_inttable_free(&m->fields_by_num);
+  upb_strtable_free(&m->fields_by_name);
+  free(m->fields);
+  free(m->field_descriptors);
+}
+
+void upb_msg_ref(struct upb_msg *m, struct upb_msg_field *f,
+                 union upb_symbol_ref ref) {
+  struct google_protobuf_FieldDescriptorProto *d =
+      upb_msg_field_descriptor(f, m);
+  struct upb_fieldsbynum_entry *int_e = upb_inttable_lookup(
+      &m->fields_by_num, d->number, sizeof(struct upb_fieldsbynum_entry));
+  struct upb_fieldsbyname_entry *str_e =
+      upb_strtable_lookup(&m->fields_by_name, d->name);
+  assert(int_e && str_e);
+  f->ref = ref;
+  int_e->f.ref = ref;
+  str_e->f.ref = ref;
+}
+
+/* Memory management  *********************************************************/
+
+/* Our memory management scheme is as follows:
+ *
+ * All pointers to dynamic memory (strings, arrays, and submessages) are
+ * expected to be good pointers if they are non-zero, *regardless* of whether
+ * that field's bit is set!  That way we can reuse the memory even if the field
+ * is unset and then set later. */
+
+/* For our memory-managed strings and arrays we store extra information
+ * (compared to a plain upb_string or upb_array).  But the data starts with
+ * a upb_string and upb_array, so we can overlay onto the regular types. */
+struct mm_upb_string {
+  struct upb_string s;
+  /* Track the allocated size, so we know when we need to reallocate. */
+  uint32_t size;
+  /* Our allocated data.  Stored separately so that clients can point s.ptr to
+   * a referenced string, but we can reuse this data later. */
+  char *data;
+};
+
+struct mm_upb_array {
+  struct upb_array a;
+  /* Track the allocated size, so we know when we need to reallocate. */
+  uint32_t size;
+};
+
+static uint32_t round_up_to_pow2(uint32_t v)
+{
+  /* cf. http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */
+  v--;
+  v |= v >> 1;
+  v |= v >> 2;
+  v |= v >> 4;
+  v |= v >> 8;
+  v |= v >> 16;
+  v++;
+  return v;
+}
+
+void *upb_msgdata_new(struct upb_msg *m)
+{
+  void *msg = malloc(m->size);
+  memset(msg, 0, m->size);  /* Clear all pointers, values, and set bits. */
+  return msg;
+}
+
+static void free_value(union upb_value_ptr p, struct upb_msg_field *f,
+                       bool free_submsgs)
+{
+  switch(f->type) {
+    case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_STRING:
+    case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_BYTES: {
+      struct mm_upb_string *mm_str = (void*)*p.string;
+      if(mm_str) {
+        free(mm_str->data);
+        free(mm_str);
+      }
+      break;
+    }
+    case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE:
+    case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_GROUP:
+      if(free_submsgs) upb_msgdata_free(*p.message, f->ref.msg, free_submsgs);
+      break;
+    default: break;  /* For non-dynamic types, do nothing. */
+  }
+}
+
+void upb_msgdata_free(void *data, struct upb_msg *m, bool free_submsgs)
+{
+  if(!data) return;  /* A very free-like thing to do. */
+  for(unsigned int i = 0; i < m->num_fields; i++) {
+    struct upb_msg_field *f = &m->fields[i];
+    union upb_value_ptr p = upb_msg_getptr(data, f);
+    if(f->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REPEATED) {
+      if(*p.array) {
+        for(uint32_t j = 0; j < (*p.array)->len; j++)
+          free_value(upb_array_getelementptr(*p.array, j, f->type),
+                     f, free_submsgs);
+        free((*p.array)->elements._void);
+        free(*p.array);
+      }
+    } else {
+      free_value(p, f, free_submsgs);
+    }
+  }
+  free(data);
+}
+
+void upb_msg_reuse_str(struct upb_string **str, uint32_t size)
+{
+  if(!*str) {
+    *str = malloc(sizeof(struct mm_upb_string));
+    memset(*str, 0, sizeof(struct mm_upb_string));
+  }
+  struct mm_upb_string *s = (void*)*str;
+  if(s->size < size) {
+    size = max(16, round_up_to_pow2(size));
+    s->data = realloc(s->data, size);
+    s->size = size;
+  }
+  s->s.ptr = s->data;
+}
+
+void upb_msg_reuse_array(struct upb_array **arr, uint32_t size, upb_field_type_t t)
+{
+  if(!*arr) {
+    *arr = malloc(sizeof(struct mm_upb_array));
+    memset(*arr, 0, sizeof(struct mm_upb_array));
+  }
+  struct mm_upb_array *a = (void*)*arr;
+  if(a->size < size) {
+    size = max(16, round_up_to_pow2(size));
+    size_t type_size = upb_type_info[t].size;
+    a->a.elements._void = realloc(a->a.elements._void, size * type_size);
+    /* Zero any newly initialized memory. */
+    memset(UPB_INDEX(a->a.elements._void, a->size, type_size), 0,
+           (size - a->size) * type_size);
+    a->size = size;
+  }
+}
+
+void upb_msg_reuse_strref(struct upb_string **str) { upb_msg_reuse_str(str, 0); }
+
+void upb_msg_reuse_submsg(void **msg, struct upb_msg *m)
+{
+  if(!*msg) *msg = upb_msgdata_new(m);
+  else upb_msg_clear(*msg, m); /* Clears set bits, leaves pointers. */
+}
+
+/* Serialization/Deserialization.  ********************************************/
+
+/* We use this as our "user_data" for each frame of the parsing stack. */
+struct parse_frame_data {
+  struct upb_msg *m;
+  void *data;
+};
+
+static void set_frame_data(struct upb_parse_state *s, struct upb_msg *m,
+                           void *data)
+{
+  struct parse_frame_data *frame = (void*)&s->top->user_data;
+  frame->m = m;
+  frame->data = data;
+}
+
+static upb_field_type_t tag_cb(struct upb_parse_state *s, struct upb_tag *tag,
+                               void **user_field_desc)
+{
+  struct parse_frame_data *frame = (void*)&s->top->user_data;
+  struct upb_msg_field *f = upb_msg_fieldbynum(frame->m, tag->field_number);
+  if(!f || !upb_check_type(tag->wire_type, f->type))
+    return 0;  /* Skip unknown or fields of the wrong type. */
+  *user_field_desc = f;
+  return f->type;
+}
+
+/* Returns a pointer to where the next value for field "f" should be stored,
+ * taking into account whether f is an array that may need to be reallocatd. */
+static union upb_value_ptr get_value_ptr(void *data, struct upb_msg_field *f)
+{
+  union upb_value_ptr p = upb_msg_getptr(data, f);
+  if(f->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REPEATED) {
+    size_t len = upb_msg_is_set(data, f) ? (*p.array)->len : 0;
+    upb_msg_reuse_array(p.array, len+1, f->type);
+    (*p.array)->len = len + 1;
+    assert(p._void);
+    p = upb_array_getelementptr(*p.array, len, f->type);
+    assert(p._void);
+  }
+  upb_msg_set(data, f);
+  assert(p._void);
+  return p;
+}
+
+static upb_status_t value_cb(struct upb_parse_state *s, void **buf, void *end,
+                             void *user_field_desc)
+{
+  struct parse_frame_data *frame = (void*)&s->top->user_data;
+  struct upb_msg_field *f = user_field_desc;
+  union upb_value_ptr p = get_value_ptr(frame->data, f);
+  UPB_CHECK(upb_parse_value(buf, end, f->type, p));
+  return UPB_STATUS_OK;
+}
+
+static upb_status_t str_cb(struct upb_parse_state *_s, struct upb_string *str,
+                           void *user_field_desc)
+{
+  struct upb_msg_parse_state *s = (void*)_s;
+  struct parse_frame_data *frame = (void*)&s->s.top->user_data;
+  struct upb_msg_field *f = user_field_desc;
+  union upb_value_ptr p = get_value_ptr(frame->data, f);
+  if(s->byref) {
+    upb_msg_reuse_strref(p.string);
+    **p.string = *str;
+  } else {
+    upb_msg_reuse_str(p.string, str->byte_len);
+    upb_strcpy(*p.string, str);
+  }
+  return UPB_STATUS_OK;
+}
+
+static void submsg_start_cb(struct upb_parse_state *_s, void *user_field_desc)
+{
+  struct upb_msg_parse_state *s = (void*)_s;
+  struct upb_msg_field *f = user_field_desc;
+  struct parse_frame_data *frame = (void*)&s->s.top->user_data;
+  // TODO: find a non-hacky way to get a pointer to the old frame.
+  struct parse_frame_data *oldframe = (void*)((char*)s->s.top - s->s.udata_size);
+  union upb_value_ptr p = get_value_ptr(oldframe->data, f);
+  assert(f->ref.msg);
+  upb_msg_reuse_submsg(p.message, f->ref.msg);
+  set_frame_data(&s->s, f->ref.msg, *p.message);
+  if(!s->merge) upb_msg_clear(frame->data, f->ref.msg);
+}
+
+void upb_msg_parse_reset(struct upb_msg_parse_state *s, void *msg,
+                         struct upb_msg *m, bool merge, bool byref)
+{
+  upb_parse_reset(&s->s);
+  s->merge = merge;
+  s->byref = byref;
+  if(!merge && msg == NULL) msg = upb_msgdata_new(m);
+  upb_msg_clear(msg, m);
+  set_frame_data(&s->s, m, msg);
+  s->s.tag_cb = tag_cb;
+  s->s.value_cb = value_cb;
+  s->s.str_cb = str_cb;
+  s->s.submsg_start_cb = submsg_start_cb;
+}
+
+void upb_msg_parse_init(struct upb_msg_parse_state *s, void *msg,
+                        struct upb_msg *m, bool merge, bool byref)
+{
+  upb_parse_init(&s->s, sizeof(struct parse_frame_data));
+  upb_msg_parse_reset(s, msg, m, merge, byref);
+}
+
+void upb_msg_parse_free(struct upb_msg_parse_state *s)
+{
+  upb_parse_free(&s->s);
+}
+
+upb_status_t upb_msg_parse(struct upb_msg_parse_state *s,
+                           void *data, size_t len, size_t *read)
+{
+  return upb_parse(&s->s, data, len, read);
+}
+
+void *upb_alloc_and_parse(struct upb_msg *m, struct upb_string *str, bool byref)
+{
+  struct upb_msg_parse_state s;
+  void *msg = upb_msgdata_new(m);
+  upb_msg_parse_init(&s, msg, m, false, byref);
+  size_t read;
+  upb_status_t status = upb_msg_parse(&s, str->ptr, str->byte_len, &read);
+  upb_msg_parse_free(&s);
+  if(status == UPB_STATUS_OK && read == str->byte_len) {
+    return msg;
+  } else {
+    upb_msg_free(msg);
+    return NULL;
+  }
+}
diff --git a/src/upb_msg.h b/src/upb_msg.h
new file mode 100644
index 0000000..8910505
--- /dev/null
+++ b/src/upb_msg.h
@@ -0,0 +1,369 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
+ *
+ * A upb_msg provides a full description of a message as defined in a .proto
+ * file.  It supports many features and operations for dealing with proto
+ * messages:
+ * - reflection over .proto types at runtime (list fields, get names, etc).
+ * - an in-memory byte-level format for efficiently storing and accessing msgs.
+ * - serializing and deserializing from the in-memory format to a protobuf.
+ * - optional memory management for handling strings, arrays, and submessages.
+ *
+ * Throughout this file, the following convention is used:
+ * - "struct upb_msg *m" describes a message type (name, list of fields, etc).
+ * - "void *data" is an actual message stored using the in-memory format.
+ *
+ * The in-memory format is very much like a C struct that you can define at
+ * run-time, but also supports reflection.  Like C structs it supports
+ * offset-based access, as opposed to the much slower name-based lookup.  The
+ * format stores both the values themselves and bits describing whether each
+ * field is set or not.  For example:
+ *
+ * parsed message Foo {
+ *   optional bool a = 1;
+ *   repeated uint32 b = 2;
+ *   optional Bar c = 3;
+ * }
+ *
+ * The in-memory layout for this message on a 32-bit machine will be something
+ * like:
+ *
+ *  Foo
+ * +------------------------+
+ * | set_flags a:1, b:1, c:1|
+ * +------------------------+
+ * | bool a (1 byte)        |
+ * +------------------------+
+ * | padding (3 bytes)      |
+ * +------------------------+         upb_array
+ * | upb_array* b (4 bytes) | ---->  +----------------------------+
+ * +------------------------+        | uint32* elements (4 bytes) | ---+
+ * | Bar* c (4 bytes)       |        +----------------------------+    |
+ * +------------------------+        | uint32 size (4 bytes)      |    |
+ *                                   +----------------------------+    |
+ *                                                                     |
+ *    -----------------------------------------------------------------+
+ *    |
+ *    V
+ *  uint32 array
+ * +----+----+----+----+----+----+
+ * | e1 | e2 | e3 | e4 | e5 | e6 |
+ * +----+----+----+----+----+----+
+ *
+ * And the corresponding C structure (as emitted by the proto compiler) would be:
+ *
+ * struct Foo {
+ *   union {
+ *     uint8_t bytes[1];
+ *     struct {
+ *       bool a:1;
+ *       bool b:1;
+ *       bool c:1;
+ *     } has;
+ *   } set_flags;
+ *   bool a;
+ *   upb_uint32_array *b;
+ *   Bar *c;
+ * }
+ *
+ * Because the C struct emitted by the upb compiler uses exactly the same
+ * byte-level format as the reflection interface, you can access the same hunk
+ * of memory either way.  The C struct provides maximum performance and static
+ * type safety; upb_msg provides flexibility.
+ *
+ * The in-memory format has no interoperability guarantees whatsoever, except
+ * that a single version of upb will interoperate with itself.  Don't even
+ * think about persisting the in-memory format or sending it anywhere.  That's
+ * what serialized protobufs are for!  The in-memory format is just that -- an
+ * in-memory representation that allows for fast access.
+ *
+ * The in-memory format is carefully designed to *not* mandate any particular
+ * memory management scheme.  This should make it easier to integrate with
+ * existing memory management schemes, or to perform advanced techniques like
+ * reference counting, garbage collection, and string references.  Different
+ * clients can read each others messages regardless of what memory management
+ * scheme each is using.
+ *
+ * A memory management scheme is provided for convenience, and it is used by
+ * default by the stock message parser.  Clients can substitute their own
+ * memory management scheme into this parser without any loss of generality
+ * or performance.
+ */
+
+#ifndef UPB_MSG_H_
+#define UPB_MSG_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "upb.h"
+#include "upb_table.h"
+#include "upb_parse.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Forward declarations from descriptor.h. */
+struct google_protobuf_DescriptorProto;
+struct google_protobuf_FieldDescriptorProto;
+
+/* Message definition. ********************************************************/
+
+/* Structure that describes a single field in a message.  This structure is very
+ * consciously designed to fit into 12/16 bytes (32/64 bit, respectively),
+ * because copies of this struct are in the hash table that is read in the
+ * critical path of parsing.  Minimizing the size of this struct increases
+ * cache-friendliness. */
+struct upb_msg_field {
+  union upb_symbol_ref ref;
+  uint32_t byte_offset;     /* Where to find the data. */
+  uint16_t field_index;     /* Indexes upb_msg.fields. Also indicates set bit */
+  upb_field_type_t type;    /* Copied from descriptor for cache-friendliness. */
+  upb_label_t label;
+};
+
+/* Structure that describes a single .proto message type. */
+struct upb_msg {
+  struct google_protobuf_DescriptorProto *descriptor;
+  size_t size;
+  uint32_t num_fields;
+  uint32_t set_flags_bytes;
+  uint32_t num_required_fields;  /* Required fields have the lowest set bytemasks. */
+  struct upb_inttable fields_by_num;
+  struct upb_strtable fields_by_name;
+  struct upb_msg_field *fields;
+  struct google_protobuf_FieldDescriptorProto **field_descriptors;
+};
+
+/* The num->field and name->field maps in upb_msg allow fast lookup of fields
+ * by number or name.  These lookups are in the critical path of parsing and
+ * field lookup, so they must be as fast as possible.  To make these more
+ * cache-friendly, we put the data in the table by value. */
+
+struct upb_fieldsbynum_entry {
+  struct upb_inttable_entry e;
+  struct upb_msg_field f;
+};
+
+struct upb_fieldsbyname_entry {
+  struct upb_strtable_entry e;
+  struct upb_msg_field f;
+};
+
+/* Can be used to retrieve a field descriptor given the upb_msg_field ref. */
+INLINE struct google_protobuf_FieldDescriptorProto *upb_msg_field_descriptor(
+    struct upb_msg_field *f, struct upb_msg *m) {
+  return m->field_descriptors[f->field_index];
+}
+
+/* Initializes/frees a upb_msg.  Caller retains ownership of d, but the msg
+ * will contain references to it, so it must outlive the msg.  Note that init
+ * does not resolve upb_msg_field.ref -- the caller should do that
+ * post-initialization by calling upb_msg_ref() below. */
+bool upb_msg_init(struct upb_msg *m, struct google_protobuf_DescriptorProto *d);
+void upb_msg_free(struct upb_msg *m);
+
+/* Clients use this function on a previously initialized upb_msg to resolve the
+ * "ref" field in the upb_msg_field.  Since messages can refer to each other in
+ * mutually-recursive ways, this step must be separated from initialization. */
+void upb_msg_ref(struct upb_msg *m, struct upb_msg_field *f, union upb_symbol_ref ref);
+
+/* Looks up a field by name or number.  While these are written to be as fast
+ * as possible, it will still be faster to cache the results of this lookup if
+ * possible.  These return NULL if no such field is found. */
+INLINE struct upb_msg_field *upb_msg_fieldbynum(struct upb_msg *m,
+                                                uint32_t number) {
+  struct upb_fieldsbynum_entry *e =
+      (struct upb_fieldsbynum_entry*)upb_inttable_lookup(
+          &m->fields_by_num, number, sizeof(struct upb_fieldsbynum_entry));
+  return e ? &e->f : NULL;
+}
+INLINE struct upb_msg_field *upb_msg_fieldbyname(struct upb_msg *m,
+                                                 struct upb_string *name) {
+  struct upb_fieldsbyname_entry *e =
+      (struct upb_fieldsbyname_entry*)upb_strtable_lookup(
+          &m->fields_by_name, name);
+  return e ? &e->f : NULL;
+}
+
+/* "Set" flag reading and writing.  *******************************************/
+
+INLINE size_t upb_isset_offset(uint32_t field_index) {
+  return field_index / 8;
+}
+
+INLINE uint8_t upb_isset_mask(uint32_t field_index) {
+  return 1 << (field_index % 8);
+}
+
+/* Functions for reading and writing the "set" flags in the msg.  Note that
+ * these do not perform memory management associated with any dynamic memory
+ * these fields may be referencing. These *only* set and test the flags. */
+INLINE void upb_msg_set(void *s, struct upb_msg_field *f)
+{
+  ((char*)s)[upb_isset_offset(f->field_index)] |= upb_isset_mask(f->field_index);
+}
+
+INLINE void upb_msg_unset(void *s, struct upb_msg_field *f)
+{
+  ((char*)s)[upb_isset_offset(f->field_index)] &= ~upb_isset_mask(f->field_index);
+}
+
+INLINE bool upb_msg_is_set(void *s, struct upb_msg_field *f)
+{
+  return ((char*)s)[upb_isset_offset(f->field_index)] & upb_isset_mask(f->field_index);
+}
+
+INLINE bool upb_msg_all_required_fields_set(void *s, struct upb_msg *m)
+{
+  int num_fields = m->num_required_fields;
+  int i = 0;
+  while(num_fields > 8) {
+    if(((uint8_t*)s)[i++] != 0xFF) return false;
+    num_fields -= 8;
+  }
+  if(((uint8_t*)s)[i] != (1 << num_fields) - 1) return false;
+  return true;
+}
+
+INLINE void upb_msg_clear(void *s, struct upb_msg *m)
+{
+  memset(s, 0, m->set_flags_bytes);
+}
+
+/* Scalar (non-array) data access. ********************************************/
+
+/* Returns a pointer to a specific field in a message. */
+INLINE union upb_value_ptr upb_msg_getptr(void *data, struct upb_msg_field *f) {
+  union upb_value_ptr p;
+  p._void = ((char*)data + f->byte_offset);
+  return p;
+}
+
+/* Arrays. ********************************************************************/
+
+/* Represents an array (a repeated field) of any type.  The interpretation of
+ * the data in the array depends on the type. */
+struct upb_array {
+  union upb_value_ptr elements;
+  uint32_t len;     /* Measured in elements. */
+};
+
+/* Returns a pointer to an array element. */
+INLINE union upb_value_ptr upb_array_getelementptr(
+    struct upb_array *arr, uint32_t n, upb_field_type_t type)
+{
+  union upb_value_ptr ptr;
+  ptr._void = (void*)((char*)arr->elements._void + n*upb_type_info[type].size);
+  return ptr;
+}
+
+/* These are all overlays on upb_array, pointers between them can be cast. */
+#define UPB_DEFINE_ARRAY_TYPE(name, type) \
+  struct name ## _array { \
+    type *elements; \
+    uint32_t len; \
+  };
+
+UPB_DEFINE_ARRAY_TYPE(upb_double, double)
+UPB_DEFINE_ARRAY_TYPE(upb_float,  float)
+UPB_DEFINE_ARRAY_TYPE(upb_int32,  int32_t)
+UPB_DEFINE_ARRAY_TYPE(upb_int64,  int64_t)
+UPB_DEFINE_ARRAY_TYPE(upb_uint32, uint32_t)
+UPB_DEFINE_ARRAY_TYPE(upb_uint64, uint64_t)
+UPB_DEFINE_ARRAY_TYPE(upb_bool,   bool)
+UPB_DEFINE_ARRAY_TYPE(upb_string, struct upb_string*)
+
+/* Defines an array of a specific message type. */
+#define UPB_MSG_ARRAY(msg_type) struct msg_type ## _array
+#define UPB_DEFINE_MSG_ARRAY(msg_type) \
+  UPB_MSG_ARRAY(msg_type) { \
+    msg_type **elements; \
+    uint32_t len; \
+  };
+
+/* Memory management  *********************************************************/
+
+/* One important note about these memory management routines: they must be used
+ * completely or not at all (for each message).  In other words, you can't
+ * allocate your own message and then free it with upb_msgdata_free.  As
+ * another example, you can't point a field to your own string and then call
+ * upb_msg_reuse_str. */
+
+/* Allocates and frees message data, respectively.  Newly allocated data is
+ * initialized to empty.  Freeing a message always frees string data, but
+ * the client can decide whether or not submessages should be deleted. */
+void *upb_msgdata_new(struct upb_msg *m);
+void upb_msgdata_free(void *data, struct upb_msg *m, bool free_submsgs);
+
+/* Given a pointer to the appropriate field of the message or array, these
+ * functions will lazily allocate memory for a string, array, or submessage.
+ * If the previously allocated memory is big enough, it will reuse it without
+ * re-allocating.  See upb_msg.c for example usage. */
+
+/* Reuse a string of at least the given size. */
+void upb_msg_reuse_str(struct upb_string **str, uint32_t size);
+/* Like the previous, but assumes that the string will be by reference, so
+ * doesn't allocate memory for the string itself. */
+void upb_msg_reuse_strref(struct upb_string **str);
+
+/* Reuse an array of at least the given size, with the given type. */
+void upb_msg_reuse_array(struct upb_array **arr, uint32_t size,
+                         upb_field_type_t t);
+
+/* Reuse a submessage of the given type. */
+void upb_msg_reuse_submsg(void **msg, struct upb_msg *m);
+
+/* Serialization/Deserialization.  ********************************************/
+
+/* This is all just a layer on top of the stream-oriented facility in
+ * upb_parse.h. */
+
+struct upb_msg_parse_state {
+  struct upb_parse_state s;
+  bool merge;
+  bool byref;
+  struct upb_msg *m;
+};
+
+/* Initializes/frees a message parser.  The parser will write the data to the
+ * message data "data", which the caller must have previously allocated (the
+ * parser will allocate submsgs, strings, and arrays as needed, however).
+ *
+ * "Merge" controls whether the parser will append to data instead of
+ * overwriting.  Merging concatenates arrays and merges submessages instead
+ * of clearing both.
+ *
+ * "Byref" controls whether the new message data copies or references strings
+ * it encounters.  If byref == true, then all strings supplied to upb_msg_parse
+ * must remain unchanged and must outlive data. */
+void upb_msg_parse_init(struct upb_msg_parse_state *s, void *data,
+                        struct upb_msg *m, bool merge, bool byref);
+void upb_msg_parse_reset(struct upb_msg_parse_state *s, void *data,
+                         struct upb_msg *m, bool merge, bool byref);
+void upb_msg_parse_free(struct upb_msg_parse_state *s);
+
+/* Parses a protobuf fragment, writing the data to the message that was passed
+ * to upb_msg_parse_init.  This function can be called multiple times as more
+ * data becomes available. */
+upb_status_t upb_msg_parse(struct upb_msg_parse_state *s,
+                           void *data, size_t len, size_t *read);
+
+/* Parses the protobuf in s (which is expected to be complete) and allocates
+ * new message data to hold it.  This is an alternative to the streaming API
+ * above.  "byref" works as in upb_msg_parse_init(). */
+void *upb_alloc_and_parse(struct upb_msg *m, struct upb_string *s, bool byref);
+
+
+/* Text dump  *****************************************************************/
+
+void upb_msg_print(void *data, struct upb_msg *m, FILE *stream);
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+#endif  /* UPB_MSG_H_ */
diff --git a/src/upb_parse.c b/src/upb_parse.c
new file mode 100644
index 0000000..3a011a6
--- /dev/null
+++ b/src/upb_parse.c
@@ -0,0 +1,383 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * Copyright (c) 2008-2009 Joshua Haberman.  See LICENSE for details.
+ */
+
+#include "upb_parse.h"
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include "descriptor.h"
+
+/* Lowest-level functions -- these read integers from the input buffer. */
+
+static void *check_end(uint8_t *buf, void *end, size_t maxlen,
+                       upb_status_t *bound_error)
+{
+  void *maxend = buf + maxlen;
+  if(end < maxend) {
+    *bound_error = UPB_STATUS_NEED_MORE_DATA;
+    return end;
+  } else {
+    *bound_error = UPB_ERROR_UNTERMINATED_VARINT;
+    return maxend;
+  }
+}
+
+static upb_status_t get_v_uint64_t(void *restrict *buf, void *end,
+                                   uint64_t *restrict val)
+{
+  uint8_t *b = *buf;
+  upb_status_t bound_error;
+  end = check_end(b, end, 10, &bound_error);  /* 2**64 is a 10-byte varint. */
+  uint8_t last = 0x80;
+  *val = 0;
+  for(int bitpos = 0; b < (uint8_t*)end && (last & 0x80); b++, bitpos += 7)
+    *val |= ((uint64_t)((last = *b) & 0x7F)) << bitpos;
+
+  if(last & 0x80) return bound_error;
+  *buf = b;
+  return UPB_STATUS_OK;
+}
+
+static upb_status_t skip_v_uint64_t(void **buf, void *end)
+{
+  uint8_t *b = *buf;
+  upb_status_t bound_error;
+  end = check_end(b, end, 10, &bound_error);  /* 2**64 is a 10-byte varint. */
+  uint8_t last = 0x80;
+  for(; b < (uint8_t*)end && (last & 0x80); b++)
+    last = *b;
+
+  if(last & 0x80) return bound_error;
+  *buf = b;
+  return UPB_STATUS_OK;
+}
+
+static upb_status_t get_v_uint32_t(void *restrict *buf, void *end,
+                                   uint32_t *restrict val)
+{
+  uint8_t *b = *buf, *dend;
+  upb_status_t bound_error;
+  dend = check_end(b, end, 5, &bound_error);  /* 2**32 is a 5-byte varint. */
+  end = check_end(b, end, 10, &bound_error);  /* May have to discard bytes. */
+  uint8_t last = 0x80;
+  *val = 0;
+  for(int bitpos = 0; b < dend && (last & 0x80); b++, bitpos += 7)
+    *val |= ((uint32_t)((last = *b) & 0x7F)) << bitpos;
+
+  /* Discard high bytes until varint ends. */
+  for(; b < (uint8_t*)end && (last & 0x80); b++)
+    last = *b;
+
+  if(last & 0x80)
+    return bound_error;
+  *buf = b;
+  return UPB_STATUS_OK;
+}
+
+static upb_status_t get_f_uint32_t(void *restrict *buf, void *end,
+                                   uint32_t *restrict val)
+{
+  uint8_t *b = *buf;
+  void *uint32_end = (uint8_t*)*buf + sizeof(uint32_t);
+  if(uint32_end > end) return UPB_STATUS_NEED_MORE_DATA;
+#if UPB_UNALIGNED_READS_OK
+  *val = *(uint32_t*)b;
+#else
+#define SHL(val, bits) ((uint32_t)val << bits)
+  *val = SHL(b[0], 0) | SHL(b[1], 8) | SHL(b[2], 16) | SHL(b[3], 24);
+#undef SHL
+#endif
+  *buf = uint32_end;
+  return UPB_STATUS_OK;
+}
+
+static upb_status_t get_f_uint64_t(void *restrict *buf, void *end,
+                                   uint64_t *restrict val)
+{
+  void *uint64_end = (uint8_t*)*buf + sizeof(uint64_t);
+  if(uint64_end > end) return UPB_STATUS_NEED_MORE_DATA;
+#if UPB_UNALIGNED_READS_OK
+  *val = *(uint64_t*)*buf;
+  *buf = uint64_end;
+#else
+  uint32_t lo32, hi32;
+  get_f_uint32_t(buf, &lo32, end);
+  get_f_uint32_t(buf, &hi32, end);
+  *val = lo32 | ((uint64_t)hi32 << 32);
+#endif
+  return UPB_STATUS_OK;
+}
+
+static upb_status_t skip_f_uint32_t(void **buf, void *end)
+{
+  void *uint32_end = (uint8_t*)*buf + sizeof(uint32_t);
+  if(uint32_end > end) return UPB_STATUS_NEED_MORE_DATA;
+  *buf = uint32_end;
+  return UPB_STATUS_OK;
+}
+
+static upb_status_t skip_f_uint64_t(void **buf, void *end)
+{
+  void *uint64_end = (uint8_t*)*buf + sizeof(uint64_t);
+  if(uint64_end > end) return UPB_STATUS_NEED_MORE_DATA;
+  *buf = uint64_end;
+  return UPB_STATUS_OK;
+}
+
+static int32_t zz_decode_32(uint32_t n) { return (n >> 1) ^ -(int32_t)(n & 1); }
+static int64_t zz_decode_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); }
+
+/* Functions for reading wire values and converting them to values.  These
+ * are generated with macros because they follow a higly consistent pattern. */
+
+#define WVTOV(type, wire_t, val_t) \
+  static void wvtov_ ## type(wire_t s, val_t *d)
+
+#define GET(type, v_or_f, wire_t, val_t, member_name) \
+  static upb_status_t get_ ## type(void **buf, void *end, val_t *d) { \
+    wire_t tmp; \
+    UPB_CHECK(get_ ## v_or_f ## _ ## wire_t(buf, end, &tmp)); \
+    wvtov_ ## type(tmp, d); \
+    return UPB_STATUS_OK; \
+  }
+
+#define T(type, v_or_f, wire_t, val_t, member_name) \
+  WVTOV(type, wire_t, val_t);  /* prototype for GET below */ \
+  GET(type, v_or_f, wire_t, val_t, member_name) \
+  WVTOV(type, wire_t, val_t)
+
+T(DOUBLE,   f, uint64_t, double,   _double) { memcpy(d, &s, sizeof(double)); }
+T(FLOAT,    f, uint32_t, float,    _float)  { memcpy(d, &s, sizeof(float));  }
+T(INT32,    v, uint32_t, int32_t,  int32)   { *d = (int32_t)s;               }
+T(INT64,    v, uint64_t, int64_t,  int64)   { *d = (int64_t)s;               }
+T(UINT32,   v, uint32_t, uint32_t, uint32)  { *d = s;                        }
+T(UINT64,   v, uint64_t, uint64_t, uint64)  { *d = s;                        }
+T(SINT32,   v, uint32_t, int32_t,  int32)   { *d = zz_decode_32(s);          }
+T(SINT64,   v, uint64_t, int64_t,  int64)   { *d = zz_decode_64(s);          }
+T(FIXED32,  f, uint32_t, uint32_t, uint32)  { *d = s;                        }
+T(FIXED64,  f, uint64_t, uint64_t, uint64)  { *d = s;                        }
+T(SFIXED32, f, uint32_t, int32_t,  int32)   { *d = (int32_t)s;               }
+T(SFIXED64, f, uint64_t, int64_t,  int64)   { *d = (int64_t)s;               }
+T(BOOL,     v, uint32_t, bool,     _bool)   { *d = (bool)s;                  }
+T(ENUM,     v, uint32_t, int32_t,  int32)   { *d = (int32_t)s;               }
+#undef WVTOV
+#undef GET
+#undef T
+
+#define alignof(t) offsetof(struct { char c; t x; }, x)
+
+/* May want to move this to upb.c if enough other things warrant it. */
+struct upb_type_info upb_type_info[] = {
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_DOUBLE]  = {alignof(double),   sizeof(double),   UPB_WIRE_TYPE_64BIT},
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_FLOAT]   = {alignof(float),    sizeof(float),    UPB_WIRE_TYPE_32BIT},
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_INT64]   = {alignof(int64_t),  sizeof(int64_t),  UPB_WIRE_TYPE_VARINT},
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_UINT64]  = {alignof(uint64_t), sizeof(uint64_t), UPB_WIRE_TYPE_VARINT},
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_INT32]   = {alignof(int32_t),  sizeof(int32_t),  UPB_WIRE_TYPE_VARINT},
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_FIXED64] = {alignof(uint64_t), sizeof(uint64_t), UPB_WIRE_TYPE_64BIT},
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_FIXED32] = {alignof(uint32_t), sizeof(uint32_t), UPB_WIRE_TYPE_32BIT},
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_BOOL]    = {alignof(bool),     sizeof(bool),     UPB_WIRE_TYPE_VARINT},
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE] = {alignof(void*),    sizeof(void*),    UPB_WIRE_TYPE_DELIMITED},
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_GROUP]   = {alignof(void*),    sizeof(void*),    UPB_WIRE_TYPE_START_GROUP},
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_UINT32]  = {alignof(uint32_t), sizeof(uint32_t), UPB_WIRE_TYPE_VARINT},
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_ENUM]    = {alignof(uint32_t), sizeof(uint32_t), UPB_WIRE_TYPE_VARINT},
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_SFIXED32]= {alignof(int32_t),  sizeof(int32_t),  UPB_WIRE_TYPE_32BIT},
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_SFIXED64]= {alignof(int64_t),  sizeof(int64_t),  UPB_WIRE_TYPE_64BIT},
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_SINT32]  = {alignof(int32_t),  sizeof(int32_t),  UPB_WIRE_TYPE_VARINT},
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_SINT64]  = {alignof(int64_t),  sizeof(int64_t),  UPB_WIRE_TYPE_VARINT},
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_STRING]  = {alignof(struct upb_string*), sizeof(struct upb_string*), UPB_WIRE_TYPE_DELIMITED},
+  [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_BYTES]   = {alignof(struct upb_string*), sizeof(struct upb_string*), UPB_WIRE_TYPE_DELIMITED},
+};
+
+static upb_status_t parse_tag(void **buf, void *end, struct upb_tag *tag)
+{
+  uint32_t tag_int;
+  UPB_CHECK(get_v_uint32_t(buf, end, &tag_int));
+  tag->wire_type    = (upb_wire_type_t)(tag_int & 0x07);
+  tag->field_number = tag_int >> 3;
+  return UPB_STATUS_OK;
+}
+
+upb_status_t upb_parse_wire_value(void **buf, void *end, upb_wire_type_t wt,
+                                  union upb_wire_value *wv)
+{
+  switch(wt) {
+    case UPB_WIRE_TYPE_VARINT: UPB_CHECK(get_v_uint64_t(buf, end, &wv->varint)); break;
+    case UPB_WIRE_TYPE_64BIT:  UPB_CHECK(get_f_uint64_t(buf, end, &wv->_64bit)); break;
+    case UPB_WIRE_TYPE_32BIT:  UPB_CHECK(get_f_uint32_t(buf, end, &wv->_32bit)); break;
+    default: return UPB_ERROR_ILLEGAL; /* Doesn't handle delimited, groups. */
+  }
+  return UPB_STATUS_OK;
+}
+
+static upb_status_t skip_wire_value(void **buf, void *end, upb_wire_type_t wt)
+{
+  switch(wt) {
+    case UPB_WIRE_TYPE_VARINT: UPB_CHECK(skip_v_uint64_t(buf, end)); break;
+    case UPB_WIRE_TYPE_64BIT:  UPB_CHECK(skip_f_uint64_t(buf, end)); break;
+    case UPB_WIRE_TYPE_32BIT:  UPB_CHECK(skip_f_uint32_t(buf, end)); break;
+    case UPB_WIRE_TYPE_START_GROUP: /* TODO: skip to matching end group. */
+    case UPB_WIRE_TYPE_END_GROUP: break;
+    default: return UPB_ERROR_ILLEGAL;
+  }
+  return UPB_STATUS_OK;
+}
+
+upb_status_t upb_parse_value(void **buf, void *end, upb_field_type_t ft,
+                             union upb_value_ptr v)
+{
+#define CASE(t, member_name) \
+  case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_ ## t: \
+    return get_ ## t(buf, end, v.member_name);
+  switch(ft) {
+    CASE(DOUBLE,   _double)
+    CASE(FLOAT,    _float)
+    CASE(INT32,    int32)
+    CASE(INT64,    int64)
+    CASE(UINT32,   uint32)
+    CASE(UINT64,   uint64)
+    CASE(SINT32,   int32)
+    CASE(SINT64,   int64)
+    CASE(FIXED32,  uint32)
+    CASE(FIXED64,  uint64)
+    CASE(SFIXED32, int32)
+    CASE(SFIXED64, int64)
+    CASE(BOOL,     _bool)
+    CASE(ENUM,     int32)
+    default: return UPB_ERROR_ILLEGAL;
+  }
+#undef CASE
+}
+
+void upb_parse_reset(struct upb_parse_state *state)
+{
+  state->offset = 0;
+  state->top = state->stack;
+  /* The top-level message is not delimited (we can keep receiving data for
+   * it indefinitely). */
+  state->top->end_offset = SIZE_MAX;
+}
+
+void upb_parse_init(struct upb_parse_state *state, size_t udata_size)
+{
+  memset(state, 0, sizeof(struct upb_parse_state));  /* Clear all callbacks. */
+  size_t stack_bytes = (sizeof(*state->stack) + udata_size) * UPB_MAX_NESTING;
+  state->stack = malloc(stack_bytes);
+  state->limit = (struct upb_parse_stack_frame*)((char*)state->stack + stack_bytes);
+  state->udata_size = udata_size;
+  upb_parse_reset(state);
+}
+
+void upb_parse_free(struct upb_parse_state *state)
+{
+  free(state->stack);
+}
+
+static void pop_stack_frame(struct upb_parse_state *s)
+{
+  if(s->submsg_end_cb) s->submsg_end_cb(s);
+  s->top--;
+  s->top = (struct upb_parse_stack_frame*)((char*)s->top - s->udata_size);
+}
+
+static upb_status_t push_stack_frame(struct upb_parse_state *s, size_t end,
+                                     void *user_field_desc)
+{
+  s->top++;
+  s->top = (struct upb_parse_stack_frame*)((char*)s->top + s->udata_size);
+  if(s->top > s->limit) return UPB_ERROR_STACK_OVERFLOW;
+  s->top->end_offset = end;
+  if(s->submsg_start_cb) s->submsg_start_cb(s, user_field_desc);
+  return UPB_STATUS_OK;
+}
+
+static upb_status_t parse_delimited(struct upb_parse_state *s,
+                                    struct upb_tag *tag,
+                                    void **buf, void *end,
+                                    size_t base_offset)
+{
+  int32_t delim_len;
+  void *user_field_desc;
+  void *bufstart = *buf;
+
+  /* Whether we are parsing or skipping the field, we always need to parse
+   * the length. */
+  UPB_CHECK(get_INT32(buf, end, &delim_len));
+  upb_field_type_t ft = s->tag_cb(s, tag, &user_field_desc);
+  if(*buf < bufstart) return UPB_ERROR_OVERFLOW;
+  if(*buf > end && ft != GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE) {
+    /* Streaming submessages is ok, but for other delimited types (string,
+     * bytes, and packed arrays) we require that all the delimited data is
+     * available.  This could be relaxed if desired. */
+    return UPB_STATUS_NEED_MORE_DATA;
+  }
+
+  if(ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE) {
+    base_offset += ((char*)*buf - (char*)bufstart);
+    UPB_CHECK(push_stack_frame(s, base_offset + delim_len, user_field_desc));
+  } else {
+    void *delim_end = (char*)*buf + delim_len;
+    if(ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_STRING ||
+       ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_BYTES) {
+      struct upb_string str = {.ptr = *buf, .byte_len = delim_len};
+      s->str_cb(s, &str, user_field_desc);
+      *buf = delim_end;
+    } else {
+      /* Packed Array. */
+      while(*buf < delim_end)
+        UPB_CHECK(s->value_cb(s, buf, end, user_field_desc));
+    }
+  }
+  return UPB_STATUS_OK;
+}
+
+static upb_status_t parse_nondelimited(struct upb_parse_state *s,
+                                       struct upb_tag *tag,
+                                       void **buf, void *end)
+{
+  /* Simple value or begin group. */
+  void *user_field_desc;
+  upb_field_type_t ft = s->tag_cb(s, tag, &user_field_desc);
+  if(ft == 0) {
+    UPB_CHECK(skip_wire_value(buf, end, tag->wire_type));
+  } else if(ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_GROUP) {
+    /* No length specified, an "end group" tag will mark the end. */
+    UPB_CHECK(push_stack_frame(s, UINT32_MAX, user_field_desc));
+  } else {
+    UPB_CHECK(s->value_cb(s, buf, end, user_field_desc));
+  }
+  return UPB_STATUS_OK;
+}
+
+upb_status_t upb_parse(struct upb_parse_state *restrict s, void *buf, size_t len,
+                       size_t *restrict read)
+{
+  void *end = (char*)buf + len;
+  *read = 0;
+  while(buf < end) {
+    struct upb_tag tag;
+    void *bufstart = buf;
+    UPB_CHECK(parse_tag(&buf, end, &tag));
+    if(tag.wire_type == UPB_WIRE_TYPE_END_GROUP) {
+      if(s->top->end_offset != UINT32_MAX)
+        return UPB_ERROR_SPURIOUS_END_GROUP;
+      pop_stack_frame(s);
+    } else if(tag.wire_type == UPB_WIRE_TYPE_DELIMITED) {
+      parse_delimited(s, &tag, &buf, end, s->offset + (char*)buf - (char*)bufstart);
+    } else {
+      parse_nondelimited(s, &tag, &buf, end);
+    }
+    size_t bytes_read = ((char*)buf - (char*)bufstart);
+    *read += bytes_read;
+    s->offset += bytes_read;
+    while(s->offset >= s->top->end_offset) {
+      if(s->offset != s->top->end_offset) return UPB_ERROR_BAD_SUBMESSAGE_END;
+      pop_stack_frame(s);
+    }
+  }
+  return UPB_STATUS_OK;
+}
diff --git a/src/upb_parse.h b/src/upb_parse.h
new file mode 100644
index 0000000..d9db85c
--- /dev/null
+++ b/src/upb_parse.h
@@ -0,0 +1,158 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * upb_parse implements a high performance, callback-based, stream-oriented
+ * parser (comparable to the SAX model in XML parsers).  For parsing protobufs
+ * into in-memory messages (a more DOM-like model), see the routines in
+ * upb_msg.h, which are layered on top of this parser.
+ *
+ * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
+ */
+
+#ifndef UPB_PARSE_H_
+#define UPB_PARSE_H_
+
+#include <stdint.h>
+#include <stdbool.h>
+#include "upb.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Definitions. ***************************************************************/
+
+/* A list of types as they are encoded on-the-wire. */
+enum upb_wire_type {
+  UPB_WIRE_TYPE_VARINT      = 0,
+  UPB_WIRE_TYPE_64BIT       = 1,
+  UPB_WIRE_TYPE_DELIMITED   = 2,
+  UPB_WIRE_TYPE_START_GROUP = 3,
+  UPB_WIRE_TYPE_END_GROUP   = 4,
+  UPB_WIRE_TYPE_32BIT       = 5
+};
+typedef uint8_t upb_wire_type_t;
+
+/* A value as it is encoded on-the-wire, except delimited, which is handled
+ * separately. */
+union upb_wire_value {
+  uint64_t varint;
+  uint64_t _64bit;
+  uint32_t _32bit;
+};
+
+/* A tag occurs before each value on-the-wire. */
+struct upb_tag {
+  upb_field_number_t field_number;
+  upb_wire_type_t wire_type;
+};
+
+/* High-level parsing interface. **********************************************/
+
+/* The general scheme is that the client registers callbacks that will be
+ * called at the appropriate times.  These callbacks provide the client with
+ * data and let the client make decisions (like whether to parse or to skip
+ * a value).
+ *
+ * After initializing the parse state, the client can repeatedly call upb_parse
+ * as data becomes available.  The parser is fully streaming-capable, so the
+ * data need not all be available at the same time. */
+
+struct upb_parse_state;
+
+/* Initialize and free (respectively) the given parse state, which must have
+ * been previously allocated.  udata_size specifies how much space will be
+ * available at parse_stack_frame.user_data in each frame for user data. */
+void upb_parse_init(struct upb_parse_state *state, size_t udata_size);
+void upb_parse_reset(struct upb_parse_state *state);
+void upb_parse_free(struct upb_parse_state *state);
+
+/* The callback that is called immediately after a tag has been parsed.  The
+ * client should determine whether it wants to parse or skip the corresponding
+ * value.  If it wants to parse it, it must discover and return the correct
+ * .proto type (the tag only contains the wire type) and check that the wire
+ * type is appropriate for the .proto type.  To skip the value (which means
+ * skipping all submessages, in the case of a submessage), the callback should
+ * return zero. */
+typedef upb_field_type_t (*upb_tag_cb)(struct upb_parse_state *s,
+                                       struct upb_tag *tag,
+                                       void **user_field_desc);
+
+/* The callback that is called when a regular value (ie. not a string or
+ * submessage) is encountered which the client has opted to parse (by not
+ * returning 0 from the tag_cb).  The client must parse the value and update
+ * buf accordingly, returning success or failure.
+ *
+ * Note that this callback can be called several times in a row for a single
+ * call to tag_cb in the case of packed arrays. */
+typedef upb_status_t (*upb_value_cb)(struct upb_parse_state *s,
+                                     void **buf, void *end,
+                                     void *user_field_desc);
+
+/* The callback that is called when a string is parsed. */
+typedef upb_status_t (*upb_str_cb)(struct upb_parse_state *s,
+                                   struct upb_string *str,
+                                   void *user_field_desc);
+
+/* Callbacks that are called when a submessage begins and ends, respectively.
+ * Both are called with the submessage's stack frame at the top of the stack. */
+typedef void (*upb_submsg_start_cb)(struct upb_parse_state *s,
+                                    void *user_field_desc);
+typedef void (*upb_submsg_end_cb)(struct upb_parse_state *s);
+
+/* Each stack frame (one for each level of submessages/groups) has this format,
+ * where user_data has as many bytes allocated as specified when initialized. */
+struct upb_parse_stack_frame {
+  size_t end_offset; /* 0 indicates that this is a group. */
+  char user_data[];
+};
+
+struct upb_parse_state {
+  size_t offset;
+  struct upb_parse_stack_frame *stack, *top, *limit;
+  size_t udata_size;  /* How many bytes the user gets in each frame. */
+  upb_tag_cb          tag_cb;
+  upb_value_cb        value_cb;
+  upb_str_cb          str_cb;
+  upb_submsg_start_cb submsg_start_cb;
+  upb_submsg_end_cb   submsg_end_cb;
+};
+
+/* Parses up to len bytes of protobuf data out of buf, calling cb as needed.
+ * The function returns how many bytes were consumed from buf.  Data is parsed
+ * until no more data can be read from buf, or the callback sets *done=true,
+ * or an error occured.  Sets *read to the number of bytes consumed. */
+upb_status_t upb_parse(struct upb_parse_state *s, void *buf, size_t len,
+                       size_t *read);
+
+extern upb_wire_type_t upb_expected_wire_types[];
+/* Returns true if wt is the correct on-the-wire type for ft. */
+INLINE bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) {
+  if(ft == 10) { // GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_GROUP)
+    return wt == UPB_WIRE_TYPE_START_GROUP;
+  } else {
+    /* With packed arrays, anything can be delimited (except groups). */
+    return wt == UPB_WIRE_TYPE_DELIMITED ||
+           upb_type_info[ft].expected_wire_type == wt;
+ }
+}
+
+/* Data-consuming functions (to be called from value cb). *********************/
+
+/* Parses and converts a value from the character data starting at buf.  The
+ * caller must have previously checked that the wire type is appropriate for
+ * this field type. */
+upb_status_t upb_parse_value(void **buf, void *end, upb_field_type_t ft,
+                             union upb_value_ptr v);
+
+/* Parses a wire value with the given type (which must have been obtained from
+ * a tag that was just parsed) and adds the number of bytes that were consumed
+ * to *offset. */
+upb_status_t upb_parse_wire_value(void **buf, void *end, upb_wire_type_t wt,
+                                  union upb_wire_value *wv);
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+#endif  /* UPB_PARSE_H_ */
diff --git a/src/upb_string.c b/src/upb_string.c
new file mode 100644
index 0000000..733bafe
--- /dev/null
+++ b/src/upb_string.c
@@ -0,0 +1,24 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
+ */
+
+#include "upb_string.h"
+
+bool upb_strreadfile(const char *filename, struct upb_string *data) {
+  FILE *f = fopen(filename, "rb");
+  if(!f) return false;
+  if(fseek(f, 0, SEEK_END) != 0) return false;
+  long size = ftell(f);
+  if(size < 0) return false;
+  if(fseek(f, 0, SEEK_SET) != 0) return false;
+  data->ptr = (char*)malloc(size);
+  data->byte_len = size;
+  if(fread(data->ptr, size, 1, f) != 1) {
+    free(data->ptr);
+    return false;
+  }
+  fclose(f);
+  return true;
+}
diff --git a/src/upb_string.h b/src/upb_string.h
new file mode 100644
index 0000000..7a05811
--- /dev/null
+++ b/src/upb_string.h
@@ -0,0 +1,82 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
+
+ * Defines a delimited (as opposed to null-terminated) string type and some
+ * library functions for manipulating them.
+ *
+ * There are two primary reasons upb uses delimited strings.  One is that they
+ * can be more efficient for some operations because they do not have to scan
+ * the string to find its length.  For example, streql can start by just
+ * comparing the lengths (very efficient) and scan the strings themselves only
+ * if the lengths are equal.
+ *
+ * More importantly, using delimited strings makes it possible for strings to
+ * reference substrings of other strings.  For example, if I am parsing a
+ * protobuf I can create a string that references the original protobuf's
+ * string data.  With NULL-termination I would be forced to write a NULL
+ * into the middle of the protobuf's data, which is less than ideal and in
+ * some cases not practical or possible.
+ */
+
+#ifndef UPB_STRING_H_
+#define UPB_STRING_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include "upb.h"
+
+struct upb_string {
+  /* We expect the data to be 8-bit clean (uint8_t), but char* is such an
+   * ingrained convention that we follow it. */
+  char *ptr;
+  uint32_t byte_len;
+};
+
+INLINE bool upb_streql(struct upb_string *s1, struct upb_string *s2) {
+  return s1->byte_len == s2->byte_len &&
+         memcmp(s1->ptr, s2->ptr, s1->byte_len) == 0;
+}
+
+INLINE void upb_strcpy(struct upb_string *dest, struct upb_string *src) {
+  memcpy(dest->ptr, src->ptr, dest->byte_len);
+  dest->byte_len = src->byte_len;
+}
+
+INLINE struct upb_string upb_strdup(struct upb_string s) {
+  struct upb_string copy;
+  copy.ptr = (char*)malloc(s.byte_len);
+  copy.byte_len = s.byte_len;
+  memcpy(copy.ptr, s.ptr, s.byte_len);
+  return copy;
+}
+
+INLINE void upb_strfree(struct upb_string s) {
+  free(s.ptr);
+}
+
+/* Reads an entire file into a newly-allocated string. */
+bool upb_strreadfile(const char *filename, struct upb_string *data);
+
+/* Allows defining upb_strings as literals, ie:
+ *   struct upb_string str = UPB_STRLIT("Hello, World!\n");
+ * Doesn't work with C++ due to lack of struct initializer syntax.
+ */
+#define UPB_STRLIT(strlit) {.ptr=strlit, .byte_len=sizeof(strlit)-1}
+
+/* Allows using upb_strings in printf, ie:
+ *   struct upb_string str = UPB_STRLIT("Hello, World!\n");
+ *   printf("String is: " UPB_STRFMT, UPB_STRARG(str)); */
+#define UPB_STRARG(str) (str).byte_len, (str).ptr
+#define UPB_STRFMT "%.*s"
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+#endif  /* UPB_H_ */
diff --git a/src/upb_table.c b/src/upb_table.c
new file mode 100644
index 0000000..3bbc7f7
--- /dev/null
+++ b/src/upb_table.c
@@ -0,0 +1,398 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
+ */
+
+#include "upb_table.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+static const upb_inttable_key_t EMPTYENT = 0;
+static const double MAX_LOAD = 0.85;
+
+static uint32_t MurmurHash2(const void *key, size_t len, uint32_t seed);
+
+/* We use 1-based indexes into the table so that 0 can be "NULL". */
+static struct upb_inttable_entry *intent(struct upb_inttable *t, int32_t i) {
+  return UPB_INDEX(t->t.entries, i-1, t->t.entry_size);
+}
+static struct upb_strtable_entry *strent(struct upb_strtable *t, int32_t i) {
+  return UPB_INDEX(t->t.entries, i-1, t->t.entry_size);
+}
+
+void upb_table_init(struct upb_table *t, uint32_t size, uint16_t entry_size)
+{
+  t->count = 0;
+  t->entry_size = entry_size;
+  t->size_lg2 = 1;
+  while(size >>= 1) t->size_lg2++;
+  size_t bytes = upb_table_size(t) * t->entry_size;
+  t->mask = upb_table_size(t) - 1;
+  t->entries = malloc(bytes);
+  memset(t->entries, 0, bytes);  /* Both tables consider 0's an empty entry. */
+}
+
+void upb_inttable_init(struct upb_inttable *t, uint32_t size, uint16_t entsize)
+{
+  upb_table_init(&t->t, size, entsize);
+}
+
+void upb_strtable_init(struct upb_strtable *t, uint32_t size, uint16_t entsize)
+{
+  upb_table_init(&t->t, size, entsize);
+}
+
+void upb_table_free(struct upb_table *t) { free(t->entries); }
+void upb_inttable_free(struct upb_inttable *t) { upb_table_free(&t->t); }
+void upb_strtable_free(struct upb_strtable *t) { upb_table_free(&t->t); }
+
+static uint32_t strtable_bucket(struct upb_strtable *t, struct upb_string *key)
+{
+  uint32_t hash = MurmurHash2(key->ptr, key->byte_len, 0);
+  return (hash & (upb_strtable_size(t)-1)) + 1;
+}
+
+void *upb_strtable_lookup(struct upb_strtable *t, struct upb_string *key)
+{
+  uint32_t bucket = strtable_bucket(t, key);
+  struct upb_strtable_entry *e;
+  do {
+    e = strent(t, bucket);
+    if(upb_streql(&e->key, key)) return e;
+  } while((bucket = e->next) != UPB_END_OF_CHAIN);
+  return NULL;
+}
+
+static uint32_t empty_intbucket(struct upb_inttable *table)
+{
+  /* TODO: does it matter that this is biased towards the front of the table? */
+  for(uint32_t i = 1; i <= upb_inttable_size(table); i++) {
+    struct upb_inttable_entry *e = intent(table, i);
+    if(e->key == EMPTYENT) return i;
+  }
+  assert(false);
+  return 0;
+}
+
+/* The insert routines have a lot more code duplication between int/string
+ * variants than I would like, but there's just a bit too much that varies to
+ * parameterize them. */
+static void intinsert(struct upb_inttable *t, struct upb_inttable_entry *e)
+{
+  assert(upb_inttable_lookup(t, e->key, t->t.entry_size) == NULL);
+  uint32_t bucket = upb_inttable_bucket(t, e->key);
+  struct upb_inttable_entry *table_e = intent(t, bucket);
+  if(table_e->key != EMPTYENT) {  /* Collision. */
+    if(bucket == upb_inttable_bucket(t, table_e->key)) {
+      /* Existing element is in its main posisiton.  Find an empty slot to
+       * place our new element and append it to this key's chain. */
+      uint32_t empty_bucket = empty_intbucket(t);
+      while (table_e->next != UPB_END_OF_CHAIN)
+        table_e = intent(t, table_e->next);
+      table_e->next = empty_bucket;
+      table_e = intent(t, empty_bucket);
+    } else {
+      /* Existing element is not in its main position.  Move it to an empty
+       * slot and put our element in its main position. */
+      uint32_t empty_bucket = empty_intbucket(t);
+      uint32_t evictee_bucket = upb_inttable_bucket(t, table_e->key);
+      memcpy(intent(t, empty_bucket), table_e, t->t.entry_size); /* copies next */
+      struct upb_inttable_entry *evictee_e = intent(t, evictee_bucket);
+      while(1) {
+        assert(evictee_e->key != UPB_EMPTY_ENTRY);
+        assert(evictee_e->next != UPB_END_OF_CHAIN);
+        if(evictee_e->next == bucket) {
+          evictee_e->next = empty_bucket;
+          break;
+        }
+      }
+      /* table_e remains set to our mainpos. */
+    }
+  }
+  memcpy(table_e, e, t->t.entry_size);
+  table_e->next = UPB_END_OF_CHAIN;
+  assert(upb_inttable_lookup(t, e->key, t->t.entry_size) == table_e);
+}
+
+void upb_inttable_insert(struct upb_inttable *t, struct upb_inttable_entry *e)
+{
+  assert(e->key != 0);
+  if((double)++t->t.count / upb_inttable_size(t) > MAX_LOAD) {
+    /* Need to resize.  New table of double the size, add old elements to it. */
+    struct upb_inttable new_table;
+    upb_inttable_init(&new_table, upb_inttable_size(t)*2, t->t.entry_size);
+    struct upb_inttable_entry *old_e;
+    for(old_e = upb_inttable_begin(t); old_e; old_e = upb_inttable_next(t, old_e))
+      intinsert(&new_table, old_e);
+    upb_inttable_free(t);
+    *t = new_table;
+  }
+  intinsert(t, e);
+}
+
+static uint32_t empty_strbucket(struct upb_strtable *table)
+{
+  /* TODO: does it matter that this is biased towards the front of the table? */
+  for(uint32_t i = 1; i <= upb_strtable_size(table); i++) {
+    struct upb_strtable_entry *e = strent(table, i);
+    if(e->key.byte_len == 0) return i;
+  }
+  assert(false);
+  return 0;
+}
+
+static void strinsert(struct upb_strtable *t, struct upb_strtable_entry *e)
+{
+  assert(upb_strtable_lookup(t, &e->key) == NULL);
+  uint32_t bucket = strtable_bucket(t, &e->key);
+  struct upb_strtable_entry *table_e = strent(t, bucket);
+  if(table_e->key.byte_len != 0) {  /* Collision. */
+    if(bucket == strtable_bucket(t, &table_e->key)) {
+      /* Existing element is in its main posisiton.  Find an empty slot to
+       * place our new element and append it to this key's chain. */
+      uint32_t empty_bucket = empty_strbucket(t);
+      while (table_e->next != UPB_END_OF_CHAIN)
+        table_e = strent(t, table_e->next);
+      table_e->next = empty_bucket;
+      table_e = strent(t, empty_bucket);
+    } else {
+      /* Existing element is not in its main position.  Move it to an empty
+       * slot and put our element in its main position. */
+      uint32_t empty_bucket = empty_strbucket(t);
+      uint32_t evictee_bucket = strtable_bucket(t, &table_e->key);
+      memcpy(strent(t, empty_bucket), table_e, t->t.entry_size); /* copies next */
+      struct upb_strtable_entry *evictee_e = strent(t, evictee_bucket);
+      while(1) {
+        assert(evictee_e->key.byte_len != 0);
+        assert(evictee_e->next != UPB_END_OF_CHAIN);
+        if(evictee_e->next == bucket) {
+          evictee_e->next = empty_bucket;
+          break;
+        }
+        evictee_e = strent(t, evictee_e->next);
+      }
+      /* table_e remains set to our mainpos. */
+    }
+  }
+  memcpy(table_e, e, t->t.entry_size);
+  table_e->next = UPB_END_OF_CHAIN;
+  assert(upb_strtable_lookup(t, &e->key) == table_e);
+}
+
+void upb_strtable_insert(struct upb_strtable *t, struct upb_strtable_entry *e)
+{
+  if((double)++t->t.count / upb_strtable_size(t) > MAX_LOAD) {
+    /* Need to resize.  New table of double the size, add old elements to it. */
+    struct upb_strtable new_table;
+    upb_strtable_init(&new_table, upb_strtable_size(t)*2, t->t.entry_size);
+    struct upb_strtable_entry *old_e;
+    for(old_e = upb_strtable_begin(t); old_e; old_e = upb_strtable_next(t, old_e))
+      strinsert(&new_table, old_e);
+    upb_strtable_free(t);
+    *t = new_table;
+  }
+  strinsert(t, e);
+}
+
+void *upb_inttable_begin(struct upb_inttable *t) {
+  return upb_inttable_next(t, intent(t, 0));
+}
+
+void *upb_inttable_next(struct upb_inttable *t, struct upb_inttable_entry *cur) {
+  struct upb_inttable_entry *end = intent(t, upb_inttable_size(t)+1);
+  do {
+    cur = (void*)((char*)cur + t->t.entry_size);
+    if(cur == end) return NULL;
+  } while(cur->key == UPB_EMPTY_ENTRY);
+  return cur;
+}
+
+void *upb_strtable_begin(struct upb_strtable *t) {
+  return upb_strtable_next(t, strent(t, 0));
+}
+
+void *upb_strtable_next(struct upb_strtable *t, struct upb_strtable_entry *cur) {
+  struct upb_strtable_entry *end = strent(t, upb_strtable_size(t)+1);
+  do {
+    cur = (void*)((char*)cur + t->t.entry_size);
+    if(cur == end) return NULL;
+  } while(cur->key.byte_len == 0);
+  return cur;
+}
+
+#ifdef UPB_UNALIGNED_READS_OK
+//-----------------------------------------------------------------------------
+// MurmurHash2, by Austin Appleby (released as public domain).
+// Reformatted and C99-ified by Joshua Haberman.
+// Note - This code makes a few assumptions about how your machine behaves -
+//   1. We can read a 4-byte value from any address without crashing
+//   2. sizeof(int) == 4 (in upb this limitation is removed by using uint32_t
+// And it has a few limitations -
+//   1. It will not work incrementally.
+//   2. It will not produce the same results on little-endian and big-endian
+//      machines.
+static uint32_t MurmurHash2(const void *key, size_t len, uint32_t seed)
+{
+  // 'm' and 'r' are mixing constants generated offline.
+  // They're not really 'magic', they just happen to work well.
+  const uint32_t m = 0x5bd1e995;
+  const int32_t r = 24;
+
+  // Initialize the hash to a 'random' value
+  uint32_t h = seed ^ len;
+
+  // Mix 4 bytes at a time into the hash
+  const uint8_t * data = (const uint8_t *)key;
+  while(len >= 4) {
+    uint32_t k = *(uint32_t *)data;
+
+    k *= m;
+    k ^= k >> r;
+    k *= m;
+
+    h *= m;
+    h ^= k;
+
+    data += 4;
+    len -= 4;
+  }
+
+  // Handle the last few bytes of the input array
+  switch(len) {
+    case 3: h ^= data[2] << 16;
+    case 2: h ^= data[1] << 8;
+    case 1: h ^= data[0]; h *= m;
+  };
+
+  // Do a few final mixes of the hash to ensure the last few
+  // bytes are well-incorporated.
+  h ^= h >> 13;
+  h *= m;
+  h ^= h >> 15;
+
+  return h;
+}
+
+#else // !UPB_UNALIGNED_READS_OK
+
+//-----------------------------------------------------------------------------
+// MurmurHashAligned2, by Austin Appleby
+// Same algorithm as MurmurHash2, but only does aligned reads - should be safer
+// on certain platforms.
+// Performance will be lower than MurmurHash2
+
+#define MIX(h,k,m) { k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; }
+
+static uint32_t MurmurHash2(const void * key, size_t len, uint32_t seed)
+{
+  const uint32_t m = 0x5bd1e995;
+  const int32_t r = 24;
+  const uint8_t * data = (const uint8_t *)key;
+  uint32_t h = seed ^ len;
+  uint8_t align = (uintptr_t)data & 3;
+
+  if(align && (len >= 4)) {
+    // Pre-load the temp registers
+    uint32_t t = 0, d = 0;
+
+    switch(align) {
+      case 1: t |= data[2] << 16;
+      case 2: t |= data[1] << 8;
+      case 3: t |= data[0];
+    }
+
+    t <<= (8 * align);
+
+    data += 4-align;
+    len -= 4-align;
+
+    int32_t sl = 8 * (4-align);
+    int32_t sr = 8 * align;
+
+    // Mix
+
+    while(len >= 4) {
+      d = *(uint32_t *)data;
+      t = (t >> sr) | (d << sl);
+
+      uint32_t k = t;
+
+      MIX(h,k,m);
+
+      t = d;
+
+      data += 4;
+      len -= 4;
+    }
+
+    // Handle leftover data in temp registers
+
+    d = 0;
+
+    if(len >= align) {
+      switch(align) {
+        case 3: d |= data[2] << 16;
+        case 2: d |= data[1] << 8;
+        case 1: d |= data[0];
+      }
+
+      uint32_t k = (t >> sr) | (d << sl);
+      MIX(h,k,m);
+
+      data += align;
+      len -= align;
+
+      //----------
+      // Handle tail bytes
+
+      switch(len) {
+        case 3: h ^= data[2] << 16;
+        case 2: h ^= data[1] << 8;
+        case 1: h ^= data[0]; h *= m;
+      };
+    } else {
+      switch(len) {
+        case 3: d |= data[2] << 16;
+        case 2: d |= data[1] << 8;
+        case 1: d |= data[0];
+        case 0: h ^= (t >> sr) | (d << sl); h *= m;
+      }
+    }
+
+    h ^= h >> 13;
+    h *= m;
+    h ^= h >> 15;
+
+    return h;
+  } else {
+    while(len >= 4) {
+      uint32_t k = *(uint32_t *)data;
+
+      MIX(h,k,m);
+
+      data += 4;
+      len -= 4;
+    }
+
+    //----------
+    // Handle tail bytes
+
+    switch(len) {
+      case 3: h ^= data[2] << 16;
+      case 2: h ^= data[1] << 8;
+      case 1: h ^= data[0]; h *= m;
+    };
+
+    h ^= h >> 13;
+    h *= m;
+    h ^= h >> 15;
+
+    return h;
+  }
+}
+#undef MIX
+
+#endif // UPB_UNALIGNED_READS_OK
diff --git a/src/upb_table.h b/src/upb_table.h
new file mode 100644
index 0000000..094ed48
--- /dev/null
+++ b/src/upb_table.h
@@ -0,0 +1,123 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
+ *
+ * This file defines very fast int->struct (inttable) and string->struct
+ * (strtable) hash tables.  The struct can be of any size, and it is stored
+ * in the table itself, for cache-friendly performance.
+ *
+ * The table uses internal chaining with Brent's variation (inspired by the
+ * Lua implementation of hash tables).  The hash function for strings is
+ * Austin Appleby's "MurmurHash."
+ */
+
+#ifndef UPB_TABLE_H_
+#define UPB_TABLE_H_
+
+#include <assert.h>
+#include "upb.h"
+#include "upb_string.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Note: the key cannot be zero!  Zero is used by the implementation. */
+typedef uint32_t upb_inttable_key_t;
+
+#define UPB_END_OF_CHAIN (uint32_t)0
+#define UPB_EMPTY_ENTRY (uint32_t)0
+
+struct upb_inttable_entry {
+  upb_inttable_key_t key;
+  uint32_t next;  /* Internal chaining. */
+};
+
+/* TODO: consider storing the hash in the entry.  This would avoid the need to
+ * rehash on table resizes, but more importantly could possibly improve lookup
+ * performance by letting us compare hashes before comparing lengths or the
+ * strings themselves. */
+struct upb_strtable_entry {
+  struct upb_string key;
+  uint32_t next;  /* Internal chaining. */
+};
+
+struct upb_table {
+  void *entries;
+  uint32_t count;       /* How many elements are currently in the table? */
+  uint16_t entry_size;  /* How big is each entry? */
+  uint8_t size_lg2;     /* The table is 2^size_lg2 in size. */
+  uint32_t mask;
+};
+
+struct upb_strtable {
+  struct upb_table t;
+};
+
+struct upb_inttable {
+  struct upb_table t;
+};
+
+/* Initialize and free a table, respectively.  Specify the initial size
+ * with 'size' (the size will be increased as necessary).  Entry size
+ * specifies how many bytes each entry in the table is. */
+void upb_inttable_init(struct upb_inttable *table,
+                       uint32_t size, uint16_t entry_size);
+void upb_inttable_free(struct upb_inttable *table);
+void upb_strtable_init(struct upb_strtable *table,
+                       uint32_t size, uint16_t entry_size);
+void upb_strtable_free(struct upb_strtable *table);
+
+INLINE uint32_t upb_table_size(struct upb_table *t) { return 1 << t->size_lg2; }
+INLINE uint32_t upb_inttable_size(struct upb_inttable *t) {
+  return upb_table_size(&t->t);
+}
+INLINE uint32_t upb_strtable_size(struct upb_strtable *t) {
+  return upb_table_size(&t->t);
+}
+
+/* Inserts the given key into the hashtable with the given value.  The key must
+ * not already exist in the hash table.  The data will be copied from e into
+ * the hashtable (the amount of data copied comes from entry_size when the
+ * table was constructed).  Therefore the data at val may be freed once the
+ * call returns. */
+void upb_inttable_insert(struct upb_inttable *t, struct upb_inttable_entry *e);
+void upb_strtable_insert(struct upb_strtable *t, struct upb_strtable_entry *e);
+
+INLINE uint32_t upb_inttable_bucket(struct upb_inttable *t, upb_inttable_key_t k) {
+  return (k & t->t.mask) + 1;  /* Identity hash for ints. */
+}
+
+/* Looks up key in this table.  Inlined because this is in the critical path
+ * of parsing.  We have the caller specify the entry_size because fixing
+ * this as a literal (instead of reading table->entry_size) gives the
+ * compiler more ability to optimize. */
+INLINE void *upb_inttable_lookup(struct upb_inttable *t,
+                                 uint32_t key, uint32_t entry_size) {
+  assert(key != 0);
+  uint32_t bucket = upb_inttable_bucket(t, key);
+  struct upb_inttable_entry *e;
+  do {
+    e = (struct upb_inttable_entry*)UPB_INDEX(t->t.entries, bucket-1, entry_size);
+    if(e->key == key) return e;
+  } while((bucket = e->next) != UPB_END_OF_CHAIN);
+  return NULL;  /* Not found. */
+}
+
+void *upb_strtable_lookup(struct upb_strtable *t, struct upb_string *key);
+
+/* Provides iteration over the table.  The order in which the entries are
+ * returned is undefined.  Insertions invalidate iterators.  The _next
+ * functions return NULL when the end has been reached. */
+void *upb_inttable_begin(struct upb_inttable *t);
+void *upb_inttable_next(struct upb_inttable *t, struct upb_inttable_entry *cur);
+
+void *upb_strtable_begin(struct upb_strtable *t);
+void *upb_strtable_next(struct upb_strtable *t, struct upb_strtable_entry *cur);
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+#endif  /* UPB_TABLE_H_ */
author	Joshua Haberman <joshua@reverberate.org>	2009-07-08 12:06:47 -0700
committer	Joshua Haberman <joshua@reverberate.org>	2009-07-08 12:06:47 -0700
commit	462b26c1cc041a8fa26deb62cf12f1f351a5b2f6 (patch)
tree	de5a58f8d66d11c13b349448a970f84d57d16cad /src
parent	c7ee14f8ef38a8bc90c0f1db1ad47b2e06612fa3 (diff)