summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJoshua Haberman <joshua@reverberate.org>2009-07-08 12:06:47 -0700
committerJoshua Haberman <joshua@reverberate.org>2009-07-08 12:06:47 -0700
commit462b26c1cc041a8fa26deb62cf12f1f351a5b2f6 (patch)
treede5a58f8d66d11c13b349448a970f84d57d16cad /src
parentc7ee14f8ef38a8bc90c0f1db1ad47b2e06612fa3 (diff)
Directory restructuring.
Diffstat (limited to 'src')
-rw-r--r--src/upb.h136
-rw-r--r--src/upb_context.c304
-rw-r--r--src/upb_context.h106
-rw-r--r--src/upb_enum.c31
-rw-r--r--src/upb_enum.h41
-rw-r--r--src/upb_inlinedefs.c19
-rw-r--r--src/upb_msg.c379
-rw-r--r--src/upb_msg.h369
-rw-r--r--src/upb_parse.c383
-rw-r--r--src/upb_parse.h158
-rw-r--r--src/upb_string.c24
-rw-r--r--src/upb_string.h82
-rw-r--r--src/upb_table.c398
-rw-r--r--src/upb_table.h123
14 files changed, 2553 insertions, 0 deletions
diff --git a/src/upb.h b/src/upb.h
new file mode 100644
index 0000000..64f2c54
--- /dev/null
+++ b/src/upb.h
@@ -0,0 +1,136 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+
+ * Copyright (c) 2009 Joshua Haberman. See LICENSE for details.
+ *
+ * This file contains shared definitions that are widely used across upb.
+ */
+
+#ifndef UPB_H_
+#define UPB_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h> /* for size_t. */
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* inline if possible, emit standalone code if required. */
+#ifndef INLINE
+#define INLINE static inline
+#endif
+
+/* The maximum that any submessages can be nested. Matches proto2's limit. */
+#define UPB_MAX_NESTING 64
+
+/* The maximum number of fields that any one .proto type can have. */
+#define UPB_MAX_FIELDS (1<<16)
+
+/* Nested type names are separated by periods. */
+#define UPB_SYMBOL_SEPARATOR '.'
+#define UPB_SYMBOL_MAX_LENGTH 256
+
+#define UPB_INDEX(base, i, m) (void*)((char*)(base) + ((i)*(m)))
+
+INLINE uint32_t max(uint32_t a, uint32_t b) { return a > b ? a : b; }
+
+/* Value type as defined in a .proto file. The values of this are defined by
+ * google_protobuf_FieldDescriptorProto_Type (from descriptor.proto).
+ * Note that descriptor.proto reserves "0" for errors, and we use it to
+ * represent exceptional circumstances. */
+typedef uint8_t upb_field_type_t;
+
+/* Label (optional, repeated, required) as defined in a .proto file. The values
+ * of this are defined by google.protobuf.FieldDescriptorProto.Label (from
+ * descriptor.proto). */
+typedef uint8_t upb_label_t;
+
+struct upb_type_info {
+ uint8_t align;
+ uint8_t size;
+ uint8_t expected_wire_type;
+};
+
+/* Contains information for all .proto types. Indexed by upb_field_type_t. */
+extern struct upb_type_info upb_type_info[];
+
+/* A pointer to a .proto value. The owner must have an out-of-band way of
+ * knowing the type, so it knows which union member to use. */
+union upb_value {
+ double _double;
+ float _float;
+ int32_t int32;
+ int64_t int64;
+ uint32_t uint32;
+ uint64_t uint64;
+ bool _bool;
+ struct upb_string **string;
+ struct upb_array **array;
+ void *message;
+};
+
+union upb_value_ptr {
+ double *_double;
+ float *_float;
+ int32_t *int32;
+ int64_t *int64;
+ uint32_t *uint32;
+ uint64_t *uint64;
+ bool *_bool;
+ struct upb_string **string;
+ struct upb_array **array;
+ void **message;
+ void *_void;
+};
+
+union upb_symbol_ref {
+ struct upb_msg *msg;
+ struct upb_enum *_enum;
+ struct upb_svc *svc;
+};
+
+/* The number of a field, eg. "optional string foo = 3". */
+typedef int32_t upb_field_number_t;
+
+/* Status codes used as a return value. Codes >0 are not fatal and can be
+ * resumed. */
+typedef enum upb_status {
+ UPB_STATUS_OK = 0,
+
+ // The input byte stream ended in the middle of a record.
+ UPB_STATUS_NEED_MORE_DATA = 1,
+
+ // The user value callback opted to stop parsing.
+ UPB_STATUS_USER_CANCELLED = 2,
+
+ // A varint did not terminate before hitting 64 bits.
+ UPB_ERROR_UNTERMINATED_VARINT = -1,
+
+ // A submessage or packed array ended in the middle of data.
+ UPB_ERROR_BAD_SUBMESSAGE_END = -2,
+
+ // Input was nested more than UPB_MAX_NESTING deep.
+ UPB_ERROR_STACK_OVERFLOW = -3,
+
+ // The input data caused the pb's offset (a size_t) to overflow.
+ UPB_ERROR_OVERFLOW = -4,
+
+ // An "end group" tag was encountered in an inappropriate place.
+ UPB_ERROR_SPURIOUS_END_GROUP = -5,
+
+ UPB_ERROR_ILLEGAL = -6
+} upb_status_t;
+
+#define UPB_CHECK(func) do { \
+ upb_status_t status = func; \
+ if(status != UPB_STATUS_OK) return status; \
+ } while (0)
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* UPB_H_ */
diff --git a/src/upb_context.c b/src/upb_context.c
new file mode 100644
index 0000000..d857d16
--- /dev/null
+++ b/src/upb_context.c
@@ -0,0 +1,304 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * Copyright (c) 2009 Joshua Haberman. See LICENSE for details.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include "descriptor.h"
+#include "upb_context.h"
+#include "upb_enum.h"
+#include "upb_msg.h"
+
+/* Search for a character in a string, in reverse. */
+static int memrchr(char *data, char c, size_t len)
+{
+ int off = len-1;
+ while(off > 0 && data[off] != c) --off;
+ return off;
+}
+
+bool addfd(struct upb_strtable *addto, struct upb_strtable *existingdefs,
+ google_protobuf_FileDescriptorProto *fd);
+
+bool upb_context_init(struct upb_context *c)
+{
+ upb_strtable_init(&c->symtab, 16, sizeof(struct upb_symtab_entry));
+ upb_strtable_init(&c->psymtab, 16, sizeof(struct upb_symtab_entry));
+ /* Add all the types in descriptor.proto so we can parse descriptors. */
+ if(!addfd(&c->psymtab, &c->symtab, &google_protobuf_filedescriptor)) {
+ assert(false);
+ return false; /* Indicates that upb is buggy or corrupt. */
+ }
+ struct upb_string name = UPB_STRLIT("google.protobuf.FileDescriptorSet");
+ struct upb_symtab_entry *e = upb_strtable_lookup(&c->psymtab, &name);
+ assert(e);
+ c->fds_msg = e->ref.msg;
+ c->fds_size = 16;
+ c->fds_len = 0;
+ c->fds = malloc(sizeof(*c->fds));
+ return true;
+}
+
+static void free_symtab(struct upb_strtable *t)
+{
+ struct upb_symtab_entry *e = upb_strtable_begin(t);
+ for(; e; e = upb_strtable_next(t, &e->e)) {
+ switch(e->type) {
+ case UPB_SYM_MESSAGE: upb_msg_free(e->ref.msg); break;
+ case UPB_SYM_ENUM: upb_enum_free(e->ref._enum); break;
+ default: break; /* TODO */
+ }
+ free(e->ref.msg); /* The pointer is the same for all. */
+ free(e->e.key.ptr);
+ }
+ upb_strtable_free(t);
+}
+
+void upb_context_free(struct upb_context *c)
+{
+ free_symtab(&c->symtab);
+ for(size_t i = 0; i < c->fds_len; i++)
+ upb_msgdata_free(c->fds[i], c->fds_msg, true);
+ free_symtab(&c->psymtab);
+ free(c->fds);
+}
+
+struct upb_symtab_entry *upb_context_lookup(struct upb_context *c,
+ struct upb_string *symbol)
+{
+ return upb_strtable_lookup(&c->symtab, symbol);
+}
+
+/* Given a symbol and the base symbol inside which it is defined, find the
+ * symbol's definition in t. */
+static struct upb_symtab_entry *resolve(struct upb_strtable *t,
+ struct upb_string *base,
+ struct upb_string *symbol)
+{
+ if(base->byte_len + symbol->byte_len + 1 >= UPB_SYMBOL_MAX_LENGTH ||
+ symbol->byte_len == 0) return NULL;
+
+ if(symbol->ptr[0] == UPB_SYMBOL_SEPARATOR) {
+ /* Symbols starting with '.' are absolute, so we do a single lookup. */
+ struct upb_string sym_str = {.ptr = symbol->ptr+1,
+ .byte_len = symbol->byte_len-1};
+ return upb_strtable_lookup(t, &sym_str);
+ } else {
+ /* Remove components from base until we find an entry or run out. */
+ char sym[UPB_SYMBOL_MAX_LENGTH+1];
+ struct upb_string sym_str = {.ptr = sym};
+ int baselen = base->byte_len;
+ while(1) {
+ /* sym_str = base[0...base_len] + UPB_SYMBOL_SEPARATOR + symbol */
+ memcpy(sym, base->ptr, baselen);
+ sym[baselen] = UPB_SYMBOL_SEPARATOR;
+ memcpy(sym + baselen + 1, symbol->ptr, symbol->byte_len);
+ sym_str.byte_len = baselen + symbol->byte_len + 1;
+
+ struct upb_symtab_entry *e = upb_strtable_lookup(t, &sym_str);
+ if (e) return e;
+ else if(baselen == 0) return NULL; /* No more scopes to try. */
+
+ baselen = memrchr(base->ptr, UPB_SYMBOL_SEPARATOR, baselen);
+ }
+ }
+}
+
+/* Tries to resolve a symbol in two different tables. */
+union upb_symbol_ref resolve2(struct upb_strtable *t1, struct upb_strtable *t2,
+ struct upb_string *base, struct upb_string *sym,
+ enum upb_symbol_type expected_type) {
+ union upb_symbol_ref nullref = {.msg = NULL};
+ struct upb_symtab_entry *e = resolve(t1, base, sym);
+ if(e == NULL) e = resolve(t2, base, sym);
+
+ if(e && e->type == expected_type) return e->ref;
+ else return nullref;
+}
+
+
+struct upb_symtab_entry *upb_context_resolve(struct upb_context *c,
+ struct upb_string *base,
+ struct upb_string *symbol) {
+ return resolve(&c->symtab, base, symbol);
+}
+
+/* Joins strings together, for example:
+ * join("Foo.Bar", "Baz") -> "Foo.Bar.Baz"
+ * join("", "Baz") -> "Baz"
+ * Caller owns the returned string and must free it. */
+static struct upb_string join(struct upb_string *base, struct upb_string *name) {
+ size_t len = base->byte_len + name->byte_len;
+ if(base->byte_len > 0) len++; /* For the separator. */
+ struct upb_string joined = {.byte_len=len, .ptr=malloc(len)};
+ if(base->byte_len > 0) {
+ /* nested_base = base + '.' + d->name */
+ memcpy(joined.ptr, base->ptr, base->byte_len);
+ joined.ptr[base->byte_len] = UPB_SYMBOL_SEPARATOR;
+ memcpy(&joined.ptr[base->byte_len+1], name->ptr, name->byte_len);
+ } else {
+ memcpy(joined.ptr, name->ptr, name->byte_len);
+ }
+ return joined;
+}
+
+static bool insert_enum(struct upb_strtable *t,
+ google_protobuf_EnumDescriptorProto *ed,
+ struct upb_string *base)
+{
+ // TODO: re-enable when compiler sets this flag
+ //if(!ed->set_flags.has.name) return false;
+
+ /* We own this and must free it on destruct. */
+ struct upb_string fqname = join(base, ed->name);
+
+ /* Redefinition within a FileDescriptorProto is not allowed. */
+ if(upb_strtable_lookup(t, &fqname)) {
+ free(fqname.ptr);
+ return false;
+ }
+
+ struct upb_symtab_entry e;
+ e.e.key = fqname;
+ e.type = UPB_SYM_ENUM;
+ e.ref._enum = malloc(sizeof(*e.ref._enum));
+ upb_enum_init(e.ref._enum, ed);
+ upb_strtable_insert(t, &e.e);
+
+ return true;
+}
+
+static bool insert_message(struct upb_strtable *t,
+ google_protobuf_DescriptorProto *d,
+ struct upb_string *base)
+{
+ /* TODO: re-enable when compiler sets this flag. */
+ //if(!d->set_flags.has.name) return false;
+
+ /* We own this and must free it on destruct. */
+ struct upb_string fqname = join(base, d->name);
+
+ /* Redefinition within a FileDescriptorProto is not allowed. */
+ if(upb_strtable_lookup(t, d->name)) {
+ free(fqname.ptr);
+ return false;
+ }
+
+ struct upb_symtab_entry e;
+ e.e.key = fqname;
+ e.type = UPB_SYM_MESSAGE;
+ e.ref.msg = malloc(sizeof(*e.ref.msg));
+ if(!upb_msg_init(e.ref.msg, d)) {
+ free(fqname.ptr);
+ return false;
+ }
+ upb_strtable_insert(t, &e.e);
+
+ /* Add nested messages and enums. */
+ //if(d->set_flags.has.nested_type) (re-enable when protoc sets)
+ if(d->nested_type)
+ for(unsigned int i = 0; i < d->nested_type->len; i++)
+ if(!insert_message(t, d->nested_type->elements[i], &fqname))
+ return false;
+
+ //if(d->set_flags.has.enum_type)
+ if(d->enum_type)
+ for(unsigned int i = 0; i < d->enum_type->len; i++)
+ if(!insert_enum(t, d->enum_type->elements[i], &fqname))
+ return false;
+
+ return true;
+}
+
+bool addfd(struct upb_strtable *addto, struct upb_strtable *existingdefs,
+ google_protobuf_FileDescriptorProto *fd)
+{
+ struct upb_string package = {.byte_len=0};
+ if(fd->set_flags.has.package) package = *fd->package;
+
+ if(fd->set_flags.has.message_type)
+ for(unsigned int i = 0; i < fd->message_type->len; i++)
+ if(!insert_message(addto, fd->message_type->elements[i], &package))
+ return false;
+
+ if(fd->set_flags.has.enum_type)
+ for(unsigned int i = 0; i < fd->enum_type->len; i++)
+ if(!insert_enum(addto, fd->enum_type->elements[i], &package))
+ return false;
+
+ /* TODO: handle extensions and services. */
+
+ /* Attempt to resolve all references. */
+ struct upb_symtab_entry *e;
+ for(e = upb_strtable_begin(addto); e; e = upb_strtable_next(addto, &e->e)) {
+ if(upb_strtable_lookup(existingdefs, &e->e.key))
+ return false; /* Redefinition prohibited. */
+ if(e->type == UPB_SYM_MESSAGE) {
+ struct upb_msg *m = e->ref.msg;
+ for(unsigned int i = 0; i < m->num_fields; i++) {
+ struct upb_msg_field *f = &m->fields[i];
+ google_protobuf_FieldDescriptorProto *fd = m->field_descriptors[i];
+ union upb_symbol_ref ref;
+ if(fd->type == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE ||
+ fd->type == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_GROUP)
+ ref = resolve2(existingdefs, addto, &e->e.key, fd->type_name,
+ UPB_SYM_MESSAGE);
+ else if(fd->type == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_ENUM)
+ ref = resolve2(existingdefs, addto, &e->e.key, fd->type_name,
+ UPB_SYM_ENUM);
+ else
+ continue; /* No resolving necessary. */
+ if(!ref.msg) return false; /* Ref. to undefined symbol. */
+ upb_msg_ref(m, f, ref);
+ }
+ }
+ }
+ return true;
+}
+
+bool upb_context_addfd(struct upb_context *c,
+ google_protobuf_FileDescriptorProto *fd)
+{
+ struct upb_strtable tmp;
+ if(!addfd(&tmp, &c->symtab, fd)) {
+ free_symtab(&tmp);
+ return false;
+ }
+ upb_strtable_free(&tmp);
+ return true;
+}
+
+bool upb_context_parsefds(struct upb_context *c, struct upb_string *fds_str) {
+ google_protobuf_FileDescriptorSet *fds =
+ upb_alloc_and_parse(c->fds_msg, fds_str, true);
+ if(!fds) return false;
+
+ if(fds->set_flags.has.file) {
+ /* Insert new symbols into a temporary table until we have verified that
+ * the descriptor is valid. */
+ struct upb_strtable tmp;
+ upb_strtable_init(&tmp, 0, sizeof(struct upb_symtab_entry));
+ for(uint32_t i = 0; i < fds->file->len; i++) {
+ if(!addfd(&tmp, &c->symtab, fds->file->elements[i])) {
+ printf("Not added successfully!\n");
+ free_symtab(&tmp);
+ return false;
+ }
+ }
+ /* Everything was successfully added, copy from the tmp symtable. */
+ struct upb_symtab_entry *e;
+ for(e = upb_strtable_begin(&tmp); e; e = upb_strtable_next(&tmp, &e->e))
+ upb_strtable_insert(&c->symtab, &e->e);
+ upb_strtable_free(&tmp);
+ }
+
+ /* We own fds now, need to keep a ref so we can free it later. */
+ if(c->fds_size == c->fds_len) {
+ c->fds_size *= 2;
+ c->fds = realloc(c->fds, c->fds_size);
+ }
+ c->fds[c->fds_len++] = fds;
+ return true;
+}
diff --git a/src/upb_context.h b/src/upb_context.h
new file mode 100644
index 0000000..4ddaed6
--- /dev/null
+++ b/src/upb_context.h
@@ -0,0 +1,106 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * A context represents a namespace of proto definitions, sort of like an
+ * interpreter's symbol table. It is empty when first constructed. Clients
+ * add definitions to the context by supplying unserialized or serialized
+ * descriptors (as defined in descriptor.proto).
+ *
+ * Copyright (c) 2009 Joshua Haberman. See LICENSE for details.
+ */
+
+#ifndef UPB_CONTEXT_H_
+#define UPB_CONTEXT_H_
+
+#include "upb.h"
+#include "upb_table.h"
+
+struct google_protobuf_FileDescriptorProto;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Definitions. ***************************************************************/
+
+/* The symbol table maps names to various kinds of symbols. */
+enum upb_symbol_type {
+ UPB_SYM_MESSAGE,
+ UPB_SYM_ENUM,
+ UPB_SYM_SERVICE,
+ UPB_SYM_EXTENSION
+};
+
+struct upb_symtab_entry {
+ struct upb_strtable_entry e;
+ enum upb_symbol_type type;
+ union upb_symbol_ref ref;
+};
+
+struct upb_context {
+ struct upb_strtable symtab; /* The context's symbol table. */
+ struct upb_strtable psymtab; /* Private symbols, for internal use. */
+ struct upb_msg *fds_msg; /* This is in psymtab, ptr here for convenience. */
+
+ /* A list of the FileDescriptorProtos we own (from having parsed them
+ * ourselves) and must free on destruction. */
+ size_t fds_size, fds_len;
+ struct google_protobuf_FileDescriptorSet **fds;
+};
+
+/* Initializes and frees a upb_context, respectively. */
+bool upb_context_init(struct upb_context *c);
+void upb_context_free(struct upb_context *c);
+
+/* Looking up symbols. ********************************************************/
+
+/* Resolves the given symbol using the rules described in descriptor.proto,
+ * namely:
+ *
+ * If the name starts with a '.', it is fully-qualified. Otherwise, C++-like
+ * scoping rules are used to find the type (i.e. first the nested types
+ * within this message are searched, then within the parent, on up to the
+ * root namespace).
+ *
+ * Returns NULL if the symbol has not been defined. */
+struct upb_symtab_entry *upb_context_resolve(struct upb_context *c,
+ struct upb_string *base,
+ struct upb_string *symbol);
+
+/* Find an entry in the symbol table with this exact name. Returns NULL if no
+ * such symbol name exists. */
+struct upb_symtab_entry *upb_context_lookup(struct upb_context *c,
+ struct upb_string *symbol);
+
+INLINE struct upb_symtab_entry *upb_context_symbegin(struct upb_context *c) {
+ return (struct upb_symtab_entry*)upb_strtable_begin(&c->symtab);
+}
+
+INLINE struct upb_symtab_entry *upb_context_symnext(
+ struct upb_context *c, struct upb_symtab_entry *cur) {
+ return (struct upb_symtab_entry*)upb_strtable_next(&c->symtab, &cur->e);
+}
+
+/* Adding symbols. ************************************************************/
+
+/* Adds the definitions in the given file descriptor to this context. All
+ * types that are referenced from fd must have previously been defined (or be
+ * defined in fd). fd may not attempt to define any names that are already
+ * defined in this context.
+ *
+ * Caller retains ownership of fd, but the context will contain references to
+ * it, so it must outlive the context.
+ *
+ * upb_context_addfd only returns true or false; it does not give any hint
+ * about what happened in the case of failure. This is because the descriptor
+ * is expected to have been validated at the time it was parsed/generated. */
+bool upb_context_addfd(struct upb_context *c,
+ struct google_protobuf_FileDescriptorProto *fd);
+
+bool upb_context_parsefds(struct upb_context *c, struct upb_string *fds);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* UPB_PARSE_H_ */
diff --git a/src/upb_enum.c b/src/upb_enum.c
new file mode 100644
index 0000000..b599c9b
--- /dev/null
+++ b/src/upb_enum.c
@@ -0,0 +1,31 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * Copyright (c) 2009 Joshua Haberman. See LICENSE for details.
+ */
+
+#include "descriptor.h"
+#include "upb_enum.h"
+
+void upb_enum_init(struct upb_enum *e,
+ struct google_protobuf_EnumDescriptorProto *ed) {
+ int num_values = ed->set_flags.has.value ? ed->value->len : 0;
+ e->descriptor = ed;
+ upb_strtable_init(&e->nametoint, num_values, sizeof(struct upb_enum_ntoi_entry));
+ upb_inttable_init(&e->inttoname, num_values, sizeof(struct upb_enum_iton_entry));
+
+ for(int i = 0; i < num_values; i++) {
+ google_protobuf_EnumValueDescriptorProto *value = ed->value->elements[i];
+ struct upb_enum_ntoi_entry ntoi_entry = {.e = {.key = *value->name},
+ .value = value->number};
+ struct upb_enum_iton_entry iton_entry = {.e = {.key = value->number},
+ .string = value->name};
+ upb_strtable_insert(&e->nametoint, &ntoi_entry.e);
+ upb_inttable_insert(&e->inttoname, &iton_entry.e);
+ }
+}
+
+void upb_enum_free(struct upb_enum *e) {
+ upb_strtable_free(&e->nametoint);
+ upb_inttable_free(&e->inttoname);
+}
diff --git a/src/upb_enum.h b/src/upb_enum.h
new file mode 100644
index 0000000..9fea3a4
--- /dev/null
+++ b/src/upb_enum.h
@@ -0,0 +1,41 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * Copyright (c) 2009 Joshua Haberman. See LICENSE for details.
+ *
+ * upb_enum is a simple object that allows run-time reflection over the values
+ * defined within an enum. */
+
+#ifndef UPB_ENUM_H_
+#define UPB_ENUM_H_
+
+#include <stdint.h>
+#include "upb_table.h"
+
+/* Forward declaration from descriptor.h. */
+struct google_protobuf_EnumDescriptorProto;
+struct google_protobuf_EnumValueDescriptorProto;
+
+struct upb_enum {
+ struct google_protobuf_EnumDescriptorProto *descriptor;
+ struct upb_strtable nametoint;
+ struct upb_inttable inttoname;
+};
+
+struct upb_enum_ntoi_entry {
+ struct upb_strtable_entry e;
+ uint32_t value;
+};
+
+struct upb_enum_iton_entry {
+ struct upb_inttable_entry e;
+ struct upb_string *string;
+};
+
+/* Initializes and frees an enum, respectively. Caller retains ownership of
+ * ed, but it must outlive e. */
+void upb_enum_init(struct upb_enum *e,
+ struct google_protobuf_EnumDescriptorProto *ed);
+void upb_enum_free(struct upb_enum *e);
+
+#endif /* UPB_ENUM_H_ */
diff --git a/src/upb_inlinedefs.c b/src/upb_inlinedefs.c
new file mode 100644
index 0000000..60863c4
--- /dev/null
+++ b/src/upb_inlinedefs.c
@@ -0,0 +1,19 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * This file, if compiled, will contain standalone (non-inlined) versions of
+ * all inline functions defined in header files. We don't generally use this
+ * file since we use "static inline" for inline functions (which will put a
+ * standalone version of the function in any .o file that needs it, but
+ * compiling this file and dumping the object file will let us inspect how
+ * inline functions are compiled, so we keep it around.
+ *
+ * Copyright (c) 2009 Joshua Haberman. See LICENSE for details.
+ */
+
+#define INLINE
+#include "upb_context.h"
+#include "upb_enum.h"
+#include "upb_msg.h"
+#include "upb_parse.h"
+#include "upb_table.h"
diff --git a/src/upb_msg.c b/src/upb_msg.c
new file mode 100644
index 0000000..47a8662
--- /dev/null
+++ b/src/upb_msg.c
@@ -0,0 +1,379 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ */
+
+#include <inttypes.h>
+#include <stdlib.h>
+#include "descriptor.h"
+#include "upb_msg.h"
+#include "upb_parse.h"
+
+/* Rounds p up to the next multiple of t. */
+#define ALIGN_UP(p, t) ((p) % (t) == 0 ? (p) : (p) + ((t) - ((p) % (t))))
+
+static int div_round_up(int numerator, int denominator) {
+ /* cf. http://stackoverflow.com/questions/17944/how-to-round-up-the-result-of-integer-division */
+ return numerator > 0 ? (numerator - 1) / denominator + 1 : 0;
+}
+
+/* Callback for sorting fields. */
+static int compare_fields(const void *e1, const void *e2) {
+ const google_protobuf_FieldDescriptorProto *fd1 = *(void**)e1;
+ const google_protobuf_FieldDescriptorProto *fd2 = *(void**)e2;
+ /* Required fields go before non-required. */
+ bool req1 = fd1->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REQUIRED;
+ bool req2 = fd2->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REQUIRED;
+ if(req1 != req2) {
+ return req2 - req1;
+ } else {
+ /* Within required and non-required field lists, list in number order.
+ * TODO: consider ordering by data size to reduce padding. */
+ return fd1->number - fd2->number;
+ }
+}
+
+bool upb_msg_init(struct upb_msg *m, struct google_protobuf_DescriptorProto *d)
+{
+ /* TODO: more complete validation.
+ * TODO: re-enable this check when we properly set this flag. */
+ //if(!d->set_flags.has.field) return false;
+
+ upb_inttable_init(&m->fields_by_num, d->field->len,
+ sizeof(struct upb_fieldsbynum_entry));
+ upb_strtable_init(&m->fields_by_name, d->field->len,
+ sizeof(struct upb_fieldsbyname_entry));
+
+ m->descriptor = d;
+ m->num_fields = d->field->len;
+ m->set_flags_bytes = div_round_up(m->num_fields, 8);
+ /* These are incremented in the loop. */
+ m->num_required_fields = 0;
+ m->size = m->set_flags_bytes;
+
+ m->fields = malloc(sizeof(*m->fields) * m->num_fields);
+ m->field_descriptors = malloc(sizeof(*m->field_descriptors) * m->num_fields);
+ for(unsigned int i = 0; i < m->num_fields; i++) {
+ /* We count on the caller to keep this pointer alive. */
+ m->field_descriptors[i] = d->field->elements[i];
+ }
+ /* TODO: re-enable proper sorting once the compiler is sorted out. */
+ //qsort(m->field_descriptors, m->num_fields, sizeof(void*), compare_fields);
+
+ size_t max_align = 0;
+ for(unsigned int i = 0; i < m->num_fields; i++) {
+ struct upb_msg_field *f = &m->fields[i];
+ google_protobuf_FieldDescriptorProto *fd = m->field_descriptors[i];
+ struct upb_type_info *type_info = &upb_type_info[fd->type];
+
+ /* General alignment rules are: each member must be at an address that is a
+ * multiple of that type's alignment. Also, the size of the structure as
+ * a whole must be a multiple of the greatest alignment of any member. */
+ f->field_index = i;
+ f->byte_offset = ALIGN_UP(m->size, type_info->align);
+ f->type = fd->type;
+ f->label = fd->label;
+ m->size = f->byte_offset + type_info->size;
+ max_align = max(max_align, type_info->align);
+ if(fd->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REQUIRED)
+ m->num_required_fields++;
+
+ /* Insert into the tables. Note that f->ref will be uninitialized, even in
+ * the tables' copies of *f, which is why we must update them separately
+ * in upb_msg_ref() below. */
+ struct upb_fieldsbynum_entry nument = {.e = {.key = fd->number}, .f = *f};
+ struct upb_fieldsbyname_entry strent = {.e = {.key = *fd->name}, .f = *f};
+ upb_inttable_insert(&m->fields_by_num, &nument.e);
+ upb_strtable_insert(&m->fields_by_name, &strent.e);
+ }
+
+ m->size = ALIGN_UP(m->size, max_align);
+ return true;
+}
+
+void upb_msg_free(struct upb_msg *m)
+{
+ upb_inttable_free(&m->fields_by_num);
+ upb_strtable_free(&m->fields_by_name);
+ free(m->fields);
+ free(m->field_descriptors);
+}
+
+void upb_msg_ref(struct upb_msg *m, struct upb_msg_field *f,
+ union upb_symbol_ref ref) {
+ struct google_protobuf_FieldDescriptorProto *d =
+ upb_msg_field_descriptor(f, m);
+ struct upb_fieldsbynum_entry *int_e = upb_inttable_lookup(
+ &m->fields_by_num, d->number, sizeof(struct upb_fieldsbynum_entry));
+ struct upb_fieldsbyname_entry *str_e =
+ upb_strtable_lookup(&m->fields_by_name, d->name);
+ assert(int_e && str_e);
+ f->ref = ref;
+ int_e->f.ref = ref;
+ str_e->f.ref = ref;
+}
+
+/* Memory management *********************************************************/
+
+/* Our memory management scheme is as follows:
+ *
+ * All pointers to dynamic memory (strings, arrays, and submessages) are
+ * expected to be good pointers if they are non-zero, *regardless* of whether
+ * that field's bit is set! That way we can reuse the memory even if the field
+ * is unset and then set later. */
+
+/* For our memory-managed strings and arrays we store extra information
+ * (compared to a plain upb_string or upb_array). But the data starts with
+ * a upb_string and upb_array, so we can overlay onto the regular types. */
+struct mm_upb_string {
+ struct upb_string s;
+ /* Track the allocated size, so we know when we need to reallocate. */
+ uint32_t size;
+ /* Our allocated data. Stored separately so that clients can point s.ptr to
+ * a referenced string, but we can reuse this data later. */
+ char *data;
+};
+
+struct mm_upb_array {
+ struct upb_array a;
+ /* Track the allocated size, so we know when we need to reallocate. */
+ uint32_t size;
+};
+
+static uint32_t round_up_to_pow2(uint32_t v)
+{
+ /* cf. http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */
+ v--;
+ v |= v >> 1;
+ v |= v >> 2;
+ v |= v >> 4;
+ v |= v >> 8;
+ v |= v >> 16;
+ v++;
+ return v;
+}
+
+void *upb_msgdata_new(struct upb_msg *m)
+{
+ void *msg = malloc(m->size);
+ memset(msg, 0, m->size); /* Clear all pointers, values, and set bits. */
+ return msg;
+}
+
+static void free_value(union upb_value_ptr p, struct upb_msg_field *f,
+ bool free_submsgs)
+{
+ switch(f->type) {
+ case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_STRING:
+ case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_BYTES: {
+ struct mm_upb_string *mm_str = (void*)*p.string;
+ if(mm_str) {
+ free(mm_str->data);
+ free(mm_str);
+ }
+ break;
+ }
+ case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE:
+ case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_GROUP:
+ if(free_submsgs) upb_msgdata_free(*p.message, f->ref.msg, free_submsgs);
+ break;
+ default: break; /* For non-dynamic types, do nothing. */
+ }
+}
+
+void upb_msgdata_free(void *data, struct upb_msg *m, bool free_submsgs)
+{
+ if(!data) return; /* A very free-like thing to do. */
+ for(unsigned int i = 0; i < m->num_fields; i++) {
+ struct upb_msg_field *f = &m->fields[i];
+ union upb_value_ptr p = upb_msg_getptr(data, f);
+ if(f->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REPEATED) {
+ if(*p.array) {
+ for(uint32_t j = 0; j < (*p.array)->len; j++)
+ free_value(upb_array_getelementptr(*p.array, j, f->type),
+ f, free_submsgs);
+ free((*p.array)->elements._void);
+ free(*p.array);
+ }
+ } else {
+ free_value(p, f, free_submsgs);
+ }
+ }
+ free(data);
+}
+
+void upb_msg_reuse_str(struct upb_string **str, uint32_t size)
+{
+ if(!*str) {
+ *str = malloc(sizeof(struct mm_upb_string));
+ memset(*str, 0, sizeof(struct mm_upb_string));
+ }
+ struct mm_upb_string *s = (void*)*str;
+ if(s->size < size) {
+ size = max(16, round_up_to_pow2(size));
+ s->data = realloc(s->data, size);
+ s->size = size;
+ }
+ s->s.ptr = s->data;
+}
+
+void upb_msg_reuse_array(struct upb_array **arr, uint32_t size, upb_field_type_t t)
+{
+ if(!*arr) {
+ *arr = malloc(sizeof(struct mm_upb_array));
+ memset(*arr, 0, sizeof(struct mm_upb_array));
+ }
+ struct mm_upb_array *a = (void*)*arr;
+ if(a->size < size) {
+ size = max(16, round_up_to_pow2(size));
+ size_t type_size = upb_type_info[t].size;
+ a->a.elements._void = realloc(a->a.elements._void, size * type_size);
+ /* Zero any newly initialized memory. */
+ memset(UPB_INDEX(a->a.elements._void, a->size, type_size), 0,
+ (size - a->size) * type_size);
+ a->size = size;
+ }
+}
+
+void upb_msg_reuse_strref(struct upb_string **str) { upb_msg_reuse_str(str, 0); }
+
+void upb_msg_reuse_submsg(void **msg, struct upb_msg *m)
+{
+ if(!*msg) *msg = upb_msgdata_new(m);
+ else upb_msg_clear(*msg, m); /* Clears set bits, leaves pointers. */
+}
+
+/* Serialization/Deserialization. ********************************************/
+
+/* We use this as our "user_data" for each frame of the parsing stack. */
+struct parse_frame_data {
+ struct upb_msg *m;
+ void *data;
+};
+
+static void set_frame_data(struct upb_parse_state *s, struct upb_msg *m,
+ void *data)
+{
+ struct parse_frame_data *frame = (void*)&s->top->user_data;
+ frame->m = m;
+ frame->data = data;
+}
+
+static upb_field_type_t tag_cb(struct upb_parse_state *s, struct upb_tag *tag,
+ void **user_field_desc)
+{
+ struct parse_frame_data *frame = (void*)&s->top->user_data;
+ struct upb_msg_field *f = upb_msg_fieldbynum(frame->m, tag->field_number);
+ if(!f || !upb_check_type(tag->wire_type, f->type))
+ return 0; /* Skip unknown or fields of the wrong type. */
+ *user_field_desc = f;
+ return f->type;
+}
+
+/* Returns a pointer to where the next value for field "f" should be stored,
+ * taking into account whether f is an array that may need to be reallocatd. */
+static union upb_value_ptr get_value_ptr(void *data, struct upb_msg_field *f)
+{
+ union upb_value_ptr p = upb_msg_getptr(data, f);
+ if(f->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REPEATED) {
+ size_t len = upb_msg_is_set(data, f) ? (*p.array)->len : 0;
+ upb_msg_reuse_array(p.array, len+1, f->type);
+ (*p.array)->len = len + 1;
+ assert(p._void);
+ p = upb_array_getelementptr(*p.array, len, f->type);
+ assert(p._void);
+ }
+ upb_msg_set(data, f);
+ assert(p._void);
+ return p;
+}
+
+static upb_status_t value_cb(struct upb_parse_state *s, void **buf, void *end,
+ void *user_field_desc)
+{
+ struct parse_frame_data *frame = (void*)&s->top->user_data;
+ struct upb_msg_field *f = user_field_desc;
+ union upb_value_ptr p = get_value_ptr(frame->data, f);
+ UPB_CHECK(upb_parse_value(buf, end, f->type, p));
+ return UPB_STATUS_OK;
+}
+
+static upb_status_t str_cb(struct upb_parse_state *_s, struct upb_string *str,
+ void *user_field_desc)
+{
+ struct upb_msg_parse_state *s = (void*)_s;
+ struct parse_frame_data *frame = (void*)&s->s.top->user_data;
+ struct upb_msg_field *f = user_field_desc;
+ union upb_value_ptr p = get_value_ptr(frame->data, f);
+ if(s->byref) {
+ upb_msg_reuse_strref(p.string);
+ **p.string = *str;
+ } else {
+ upb_msg_reuse_str(p.string, str->byte_len);
+ upb_strcpy(*p.string, str);
+ }
+ return UPB_STATUS_OK;
+}
+
+static void submsg_start_cb(struct upb_parse_state *_s, void *user_field_desc)
+{
+ struct upb_msg_parse_state *s = (void*)_s;
+ struct upb_msg_field *f = user_field_desc;
+ struct parse_frame_data *frame = (void*)&s->s.top->user_data;
+ // TODO: find a non-hacky way to get a pointer to the old frame.
+ struct parse_frame_data *oldframe = (void*)((char*)s->s.top - s->s.udata_size);
+ union upb_value_ptr p = get_value_ptr(oldframe->data, f);
+ assert(f->ref.msg);
+ upb_msg_reuse_submsg(p.message, f->ref.msg);
+ set_frame_data(&s->s, f->ref.msg, *p.message);
+ if(!s->merge) upb_msg_clear(frame->data, f->ref.msg);
+}
+
+void upb_msg_parse_reset(struct upb_msg_parse_state *s, void *msg,
+ struct upb_msg *m, bool merge, bool byref)
+{
+ upb_parse_reset(&s->s);
+ s->merge = merge;
+ s->byref = byref;
+ if(!merge && msg == NULL) msg = upb_msgdata_new(m);
+ upb_msg_clear(msg, m);
+ set_frame_data(&s->s, m, msg);
+ s->s.tag_cb = tag_cb;
+ s->s.value_cb = value_cb;
+ s->s.str_cb = str_cb;
+ s->s.submsg_start_cb = submsg_start_cb;
+}
+
+void upb_msg_parse_init(struct upb_msg_parse_state *s, void *msg,
+ struct upb_msg *m, bool merge, bool byref)
+{
+ upb_parse_init(&s->s, sizeof(struct parse_frame_data));
+ upb_msg_parse_reset(s, msg, m, merge, byref);
+}
+
+void upb_msg_parse_free(struct upb_msg_parse_state *s)
+{
+ upb_parse_free(&s->s);
+}
+
+upb_status_t upb_msg_parse(struct upb_msg_parse_state *s,
+ void *data, size_t len, size_t *read)
+{
+ return upb_parse(&s->s, data, len, read);
+}
+
+void *upb_alloc_and_parse(struct upb_msg *m, struct upb_string *str, bool byref)
+{
+ struct upb_msg_parse_state s;
+ void *msg = upb_msgdata_new(m);
+ upb_msg_parse_init(&s, msg, m, false, byref);
+ size_t read;
+ upb_status_t status = upb_msg_parse(&s, str->ptr, str->byte_len, &read);
+ upb_msg_parse_free(&s);
+ if(status == UPB_STATUS_OK && read == str->byte_len) {
+ return msg;
+ } else {
+ upb_msg_free(msg);
+ return NULL;
+ }
+}
diff --git a/src/upb_msg.h b/src/upb_msg.h
new file mode 100644
index 0000000..8910505
--- /dev/null
+++ b/src/upb_msg.h
@@ -0,0 +1,369 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * Copyright (c) 2009 Joshua Haberman. See LICENSE for details.
+ *
+ * A upb_msg provides a full description of a message as defined in a .proto
+ * file. It supports many features and operations for dealing with proto
+ * messages:
+ * - reflection over .proto types at runtime (list fields, get names, etc).
+ * - an in-memory byte-level format for efficiently storing and accessing msgs.
+ * - serializing and deserializing from the in-memory format to a protobuf.
+ * - optional memory management for handling strings, arrays, and submessages.
+ *
+ * Throughout this file, the following convention is used:
+ * - "struct upb_msg *m" describes a message type (name, list of fields, etc).
+ * - "void *data" is an actual message stored using the in-memory format.
+ *
+ * The in-memory format is very much like a C struct that you can define at
+ * run-time, but also supports reflection. Like C structs it supports
+ * offset-based access, as opposed to the much slower name-based lookup. The
+ * format stores both the values themselves and bits describing whether each
+ * field is set or not. For example:
+ *
+ * parsed message Foo {
+ * optional bool a = 1;
+ * repeated uint32 b = 2;
+ * optional Bar c = 3;
+ * }
+ *
+ * The in-memory layout for this message on a 32-bit machine will be something
+ * like:
+ *
+ * Foo
+ * +------------------------+
+ * | set_flags a:1, b:1, c:1|
+ * +------------------------+
+ * | bool a (1 byte) |
+ * +------------------------+
+ * | padding (3 bytes) |
+ * +------------------------+ upb_array
+ * | upb_array* b (4 bytes) | ----> +----------------------------+
+ * +------------------------+ | uint32* elements (4 bytes) | ---+
+ * | Bar* c (4 bytes) | +----------------------------+ |
+ * +------------------------+ | uint32 size (4 bytes) | |
+ * +----------------------------+ |
+ * |
+ * -----------------------------------------------------------------+
+ * |
+ * V
+ * uint32 array
+ * +----+----+----+----+----+----+
+ * | e1 | e2 | e3 | e4 | e5 | e6 |
+ * +----+----+----+----+----+----+
+ *
+ * And the corresponding C structure (as emitted by the proto compiler) would be:
+ *
+ * struct Foo {
+ * union {
+ * uint8_t bytes[1];
+ * struct {
+ * bool a:1;
+ * bool b:1;
+ * bool c:1;
+ * } has;
+ * } set_flags;
+ * bool a;
+ * upb_uint32_array *b;
+ * Bar *c;
+ * }
+ *
+ * Because the C struct emitted by the upb compiler uses exactly the same
+ * byte-level format as the reflection interface, you can access the same hunk
+ * of memory either way. The C struct provides maximum performance and static
+ * type safety; upb_msg provides flexibility.
+ *
+ * The in-memory format has no interoperability guarantees whatsoever, except
+ * that a single version of upb will interoperate with itself. Don't even
+ * think about persisting the in-memory format or sending it anywhere. That's
+ * what serialized protobufs are for! The in-memory format is just that -- an
+ * in-memory representation that allows for fast access.
+ *
+ * The in-memory format is carefully designed to *not* mandate any particular
+ * memory management scheme. This should make it easier to integrate with
+ * existing memory management schemes, or to perform advanced techniques like
+ * reference counting, garbage collection, and string references. Different
+ * clients can read each others messages regardless of what memory management
+ * scheme each is using.
+ *
+ * A memory management scheme is provided for convenience, and it is used by
+ * default by the stock message parser. Clients can substitute their own
+ * memory management scheme into this parser without any loss of generality
+ * or performance.
+ */
+
+#ifndef UPB_MSG_H_
+#define UPB_MSG_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "upb.h"
+#include "upb_table.h"
+#include "upb_parse.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Forward declarations from descriptor.h. */
+struct google_protobuf_DescriptorProto;
+struct google_protobuf_FieldDescriptorProto;
+
+/* Message definition. ********************************************************/
+
+/* Structure that describes a single field in a message. This structure is very
+ * consciously designed to fit into 12/16 bytes (32/64 bit, respectively),
+ * because copies of this struct are in the hash table that is read in the
+ * critical path of parsing. Minimizing the size of this struct increases
+ * cache-friendliness. */
+struct upb_msg_field {
+ union upb_symbol_ref ref;
+ uint32_t byte_offset; /* Where to find the data. */
+ uint16_t field_index; /* Indexes upb_msg.fields. Also indicates set bit */
+ upb_field_type_t type; /* Copied from descriptor for cache-friendliness. */
+ upb_label_t label;
+};
+
+/* Structure that describes a single .proto message type. */
+struct upb_msg {
+ struct google_protobuf_DescriptorProto *descriptor;
+ size_t size;
+ uint32_t num_fields;
+ uint32_t set_flags_bytes;
+ uint32_t num_required_fields; /* Required fields have the lowest set bytemasks. */
+ struct upb_inttable fields_by_num;
+ struct upb_strtable fields_by_name;
+ struct upb_msg_field *fields;
+ struct google_protobuf_FieldDescriptorProto **field_descriptors;
+};
+
+/* The num->field and name->field maps in upb_msg allow fast lookup of fields
+ * by number or name. These lookups are in the critical path of parsing and
+ * field lookup, so they must be as fast as possible. To make these more
+ * cache-friendly, we put the data in the table by value. */
+
+struct upb_fieldsbynum_entry {
+ struct upb_inttable_entry e;
+ struct upb_msg_field f;
+};
+
+struct upb_fieldsbyname_entry {
+ struct upb_strtable_entry e;
+ struct upb_msg_field f;
+};
+
+/* Can be used to retrieve a field descriptor given the upb_msg_field ref. */
+INLINE struct google_protobuf_FieldDescriptorProto *upb_msg_field_descriptor(
+ struct upb_msg_field *f, struct upb_msg *m) {
+ return m->field_descriptors[f->field_index];
+}
+
+/* Initializes/frees a upb_msg. Caller retains ownership of d, but the msg
+ * will contain references to it, so it must outlive the msg. Note that init
+ * does not resolve upb_msg_field.ref -- the caller should do that
+ * post-initialization by calling upb_msg_ref() below. */
+bool upb_msg_init(struct upb_msg *m, struct google_protobuf_DescriptorProto *d);
+void upb_msg_free(struct upb_msg *m);
+
+/* Clients use this function on a previously initialized upb_msg to resolve the
+ * "ref" field in the upb_msg_field. Since messages can refer to each other in
+ * mutually-recursive ways, this step must be separated from initialization. */
+void upb_msg_ref(struct upb_msg *m, struct upb_msg_field *f, union upb_symbol_ref ref);
+
+/* Looks up a field by name or number. While these are written to be as fast
+ * as possible, it will still be faster to cache the results of this lookup if
+ * possible. These return NULL if no such field is found. */
+INLINE struct upb_msg_field *upb_msg_fieldbynum(struct upb_msg *m,
+ uint32_t number) {
+ struct upb_fieldsbynum_entry *e =
+ (struct upb_fieldsbynum_entry*)upb_inttable_lookup(
+ &m->fields_by_num, number, sizeof(struct upb_fieldsbynum_entry));
+ return e ? &e->f : NULL;
+}
+INLINE struct upb_msg_field *upb_msg_fieldbyname(struct upb_msg *m,
+ struct upb_string *name) {
+ struct upb_fieldsbyname_entry *e =
+ (struct upb_fieldsbyname_entry*)upb_strtable_lookup(
+ &m->fields_by_name, name);
+ return e ? &e->f : NULL;
+}
+
+/* "Set" flag reading and writing. *******************************************/
+
+INLINE size_t upb_isset_offset(uint32_t field_index) {
+ return field_index / 8;
+}
+
+INLINE uint8_t upb_isset_mask(uint32_t field_index) {
+ return 1 << (field_index % 8);
+}
+
+/* Functions for reading and writing the "set" flags in the msg. Note that
+ * these do not perform memory management associated with any dynamic memory
+ * these fields may be referencing. These *only* set and test the flags. */
+INLINE void upb_msg_set(void *s, struct upb_msg_field *f)
+{
+ ((char*)s)[upb_isset_offset(f->field_index)] |= upb_isset_mask(f->field_index);
+}
+
+INLINE void upb_msg_unset(void *s, struct upb_msg_field *f)
+{
+ ((char*)s)[upb_isset_offset(f->field_index)] &= ~upb_isset_mask(f->field_index);
+}
+
+INLINE bool upb_msg_is_set(void *s, struct upb_msg_field *f)
+{
+ return ((char*)s)[upb_isset_offset(f->field_index)] & upb_isset_mask(f->field_index);
+}
+
+INLINE bool upb_msg_all_required_fields_set(void *s, struct upb_msg *m)
+{
+ int num_fields = m->num_required_fields;
+ int i = 0;
+ while(num_fields > 8) {
+ if(((uint8_t*)s)[i++] != 0xFF) return false;
+ num_fields -= 8;
+ }
+ if(((uint8_t*)s)[i] != (1 << num_fields) - 1) return false;
+ return true;
+}
+
+INLINE void upb_msg_clear(void *s, struct upb_msg *m)
+{
+ memset(s, 0, m->set_flags_bytes);
+}
+
+/* Scalar (non-array) data access. ********************************************/
+
+/* Returns a pointer to a specific field in a message. */
+INLINE union upb_value_ptr upb_msg_getptr(void *data, struct upb_msg_field *f) {
+ union upb_value_ptr p;
+ p._void = ((char*)data + f->byte_offset);
+ return p;
+}
+
+/* Arrays. ********************************************************************/
+
+/* Represents an array (a repeated field) of any type. The interpretation of
+ * the data in the array depends on the type. */
+struct upb_array {
+ union upb_value_ptr elements;
+ uint32_t len; /* Measured in elements. */
+};
+
+/* Returns a pointer to an array element. */
+INLINE union upb_value_ptr upb_array_getelementptr(
+ struct upb_array *arr, uint32_t n, upb_field_type_t type)
+{
+ union upb_value_ptr ptr;
+ ptr._void = (void*)((char*)arr->elements._void + n*upb_type_info[type].size);
+ return ptr;
+}
+
+/* These are all overlays on upb_array, pointers between them can be cast. */
+#define UPB_DEFINE_ARRAY_TYPE(name, type) \
+ struct name ## _array { \
+ type *elements; \
+ uint32_t len; \
+ };
+
+UPB_DEFINE_ARRAY_TYPE(upb_double, double)
+UPB_DEFINE_ARRAY_TYPE(upb_float, float)
+UPB_DEFINE_ARRAY_TYPE(upb_int32, int32_t)
+UPB_DEFINE_ARRAY_TYPE(upb_int64, int64_t)
+UPB_DEFINE_ARRAY_TYPE(upb_uint32, uint32_t)
+UPB_DEFINE_ARRAY_TYPE(upb_uint64, uint64_t)
+UPB_DEFINE_ARRAY_TYPE(upb_bool, bool)
+UPB_DEFINE_ARRAY_TYPE(upb_string, struct upb_string*)
+
+/* Defines an array of a specific message type. */
+#define UPB_MSG_ARRAY(msg_type) struct msg_type ## _array
+#define UPB_DEFINE_MSG_ARRAY(msg_type) \
+ UPB_MSG_ARRAY(msg_type) { \
+ msg_type **elements; \
+ uint32_t len; \
+ };
+
+/* Memory management *********************************************************/
+
+/* One important note about these memory management routines: they must be used
+ * completely or not at all (for each message). In other words, you can't
+ * allocate your own message and then free it with upb_msgdata_free. As
+ * another example, you can't point a field to your own string and then call
+ * upb_msg_reuse_str. */
+
+/* Allocates and frees message data, respectively. Newly allocated data is
+ * initialized to empty. Freeing a message always frees string data, but
+ * the client can decide whether or not submessages should be deleted. */
+void *upb_msgdata_new(struct upb_msg *m);
+void upb_msgdata_free(void *data, struct upb_msg *m, bool free_submsgs);
+
+/* Given a pointer to the appropriate field of the message or array, these
+ * functions will lazily allocate memory for a string, array, or submessage.
+ * If the previously allocated memory is big enough, it will reuse it without
+ * re-allocating. See upb_msg.c for example usage. */
+
+/* Reuse a string of at least the given size. */
+void upb_msg_reuse_str(struct upb_string **str, uint32_t size);
+/* Like the previous, but assumes that the string will be by reference, so
+ * doesn't allocate memory for the string itself. */
+void upb_msg_reuse_strref(struct upb_string **str);
+
+/* Reuse an array of at least the given size, with the given type. */
+void upb_msg_reuse_array(struct upb_array **arr, uint32_t size,
+ upb_field_type_t t);
+
+/* Reuse a submessage of the given type. */
+void upb_msg_reuse_submsg(void **msg, struct upb_msg *m);
+
+/* Serialization/Deserialization. ********************************************/
+
+/* This is all just a layer on top of the stream-oriented facility in
+ * upb_parse.h. */
+
+struct upb_msg_parse_state {
+ struct upb_parse_state s;
+ bool merge;
+ bool byref;
+ struct upb_msg *m;
+};
+
+/* Initializes/frees a message parser. The parser will write the data to the
+ * message data "data", which the caller must have previously allocated (the
+ * parser will allocate submsgs, strings, and arrays as needed, however).
+ *
+ * "Merge" controls whether the parser will append to data instead of
+ * overwriting. Merging concatenates arrays and merges submessages instead
+ * of clearing both.
+ *
+ * "Byref" controls whether the new message data copies or references strings
+ * it encounters. If byref == true, then all strings supplied to upb_msg_parse
+ * must remain unchanged and must outlive data. */
+void upb_msg_parse_init(struct upb_msg_parse_state *s, void *data,
+ struct upb_msg *m, bool merge, bool byref);
+void upb_msg_parse_reset(struct upb_msg_parse_state *s, void *data,
+ struct upb_msg *m, bool merge, bool byref);
+void upb_msg_parse_free(struct upb_msg_parse_state *s);
+
+/* Parses a protobuf fragment, writing the data to the message that was passed
+ * to upb_msg_parse_init. This function can be called multiple times as more
+ * data becomes available. */
+upb_status_t upb_msg_parse(struct upb_msg_parse_state *s,
+ void *data, size_t len, size_t *read);
+
+/* Parses the protobuf in s (which is expected to be complete) and allocates
+ * new message data to hold it. This is an alternative to the streaming API
+ * above. "byref" works as in upb_msg_parse_init(). */
+void *upb_alloc_and_parse(struct upb_msg *m, struct upb_string *s, bool byref);
+
+
+/* Text dump *****************************************************************/
+
+void upb_msg_print(void *data, struct upb_msg *m, FILE *stream);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* UPB_MSG_H_ */
diff --git a/src/upb_parse.c b/src/upb_parse.c
new file mode 100644
index 0000000..3a011a6
--- /dev/null
+++ b/src/upb_parse.c
@@ -0,0 +1,383 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * Copyright (c) 2008-2009 Joshua Haberman. See LICENSE for details.
+ */
+
+#include "upb_parse.h"
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include "descriptor.h"
+
+/* Lowest-level functions -- these read integers from the input buffer. */
+
+static void *check_end(uint8_t *buf, void *end, size_t maxlen,
+ upb_status_t *bound_error)
+{
+ void *maxend = buf + maxlen;
+ if(end < maxend) {
+ *bound_error = UPB_STATUS_NEED_MORE_DATA;
+ return end;
+ } else {
+ *bound_error = UPB_ERROR_UNTERMINATED_VARINT;
+ return maxend;
+ }
+}
+
+static upb_status_t get_v_uint64_t(void *restrict *buf, void *end,
+ uint64_t *restrict val)
+{
+ uint8_t *b = *buf;
+ upb_status_t bound_error;
+ end = check_end(b, end, 10, &bound_error); /* 2**64 is a 10-byte varint. */
+ uint8_t last = 0x80;
+ *val = 0;
+ for(int bitpos = 0; b < (uint8_t*)end && (last & 0x80); b++, bitpos += 7)
+ *val |= ((uint64_t)((last = *b) & 0x7F)) << bitpos;
+
+ if(last & 0x80) return bound_error;
+ *buf = b;
+ return UPB_STATUS_OK;
+}
+
+static upb_status_t skip_v_uint64_t(void **buf, void *end)
+{
+ uint8_t *b = *buf;
+ upb_status_t bound_error;
+ end = check_end(b, end, 10, &bound_error); /* 2**64 is a 10-byte varint. */
+ uint8_t last = 0x80;
+ for(; b < (uint8_t*)end && (last & 0x80); b++)
+ last = *b;
+
+ if(last & 0x80) return bound_error;
+ *buf = b;
+ return UPB_STATUS_OK;
+}
+
+static upb_status_t get_v_uint32_t(void *restrict *buf, void *end,
+ uint32_t *restrict val)
+{
+ uint8_t *b = *buf, *dend;
+ upb_status_t bound_error;
+ dend = check_end(b, end, 5, &bound_error); /* 2**32 is a 5-byte varint. */
+ end = check_end(b, end, 10, &bound_error); /* May have to discard bytes. */
+ uint8_t last = 0x80;
+ *val = 0;
+ for(int bitpos = 0; b < dend && (last & 0x80); b++, bitpos += 7)
+ *val |= ((uint32_t)((last = *b) & 0x7F)) << bitpos;
+
+ /* Discard high bytes until varint ends. */
+ for(; b < (uint8_t*)end && (last & 0x80); b++)
+ last = *b;
+
+ if(last & 0x80)
+ return bound_error;
+ *buf = b;
+ return UPB_STATUS_OK;
+}
+
+static upb_status_t get_f_uint32_t(void *restrict *buf, void *end,
+ uint32_t *restrict val)
+{
+ uint8_t *b = *buf;
+ void *uint32_end = (uint8_t*)*buf + sizeof(uint32_t);
+ if(uint32_end > end) return UPB_STATUS_NEED_MORE_DATA;
+#if UPB_UNALIGNED_READS_OK
+ *val = *(uint32_t*)b;
+#else
+#define SHL(val, bits) ((uint32_t)val << bits)
+ *val = SHL(b[0], 0) | SHL(b[1], 8) | SHL(b[2], 16) | SHL(b[3], 24);
+#undef SHL
+#endif
+ *buf = uint32_end;
+ return UPB_STATUS_OK;
+}
+
+static upb_status_t get_f_uint64_t(void *restrict *buf, void *end,
+ uint64_t *restrict val)
+{
+ void *uint64_end = (uint8_t*)*buf + sizeof(uint64_t);
+ if(uint64_end > end) return UPB_STATUS_NEED_MORE_DATA;
+#if UPB_UNALIGNED_READS_OK
+ *val = *(uint64_t*)*buf;
+ *buf = uint64_end;
+#else
+ uint32_t lo32, hi32;
+ get_f_uint32_t(buf, &lo32, end);
+ get_f_uint32_t(buf, &hi32, end);
+ *val = lo32 | ((uint64_t)hi32 << 32);
+#endif
+ return UPB_STATUS_OK;
+}
+
+static upb_status_t skip_f_uint32_t(void **buf, void *end)
+{
+ void *uint32_end = (uint8_t*)*buf + sizeof(uint32_t);
+ if(uint32_end > end) return UPB_STATUS_NEED_MORE_DATA;
+ *buf = uint32_end;
+ return UPB_STATUS_OK;
+}
+
+static upb_status_t skip_f_uint64_t(void **buf, void *end)
+{
+ void *uint64_end = (uint8_t*)*buf + sizeof(uint64_t);
+ if(uint64_end > end) return UPB_STATUS_NEED_MORE_DATA;
+ *buf = uint64_end;
+ return UPB_STATUS_OK;
+}
+
+static int32_t zz_decode_32(uint32_t n) { return (n >> 1) ^ -(int32_t)(n & 1); }
+static int64_t zz_decode_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); }
+
+/* Functions for reading wire values and converting them to values. These
+ * are generated with macros because they follow a higly consistent pattern. */
+
+#define WVTOV(type, wire_t, val_t) \
+ static void wvtov_ ## type(wire_t s, val_t *d)
+
+#define GET(type, v_or_f, wire_t, val_t, member_name) \
+ static upb_status_t get_ ## type(void **buf, void *end, val_t *d) { \
+ wire_t tmp; \
+ UPB_CHECK(get_ ## v_or_f ## _ ## wire_t(buf, end, &tmp)); \
+ wvtov_ ## type(tmp, d); \
+ return UPB_STATUS_OK; \
+ }
+
+#define T(type, v_or_f, wire_t, val_t, member_name) \
+ WVTOV(type, wire_t, val_t); /* prototype for GET below */ \
+ GET(type, v_or_f, wire_t, val_t, member_name) \
+ WVTOV(type, wire_t, val_t)
+
+T(DOUBLE, f, uint64_t, double, _double) { memcpy(d, &s, sizeof(double)); }
+T(FLOAT, f, uint32_t, float, _float) { memcpy(d, &s, sizeof(float)); }
+T(INT32, v, uint32_t, int32_t, int32) { *d = (int32_t)s; }
+T(INT64, v, uint64_t, int64_t, int64) { *d = (int64_t)s; }
+T(UINT32, v, uint32_t, uint32_t, uint32) { *d = s; }
+T(UINT64, v, uint64_t, uint64_t, uint64) { *d = s; }
+T(SINT32, v, uint32_t, int32_t, int32) { *d = zz_decode_32(s); }
+T(SINT64, v, uint64_t, int64_t, int64) { *d = zz_decode_64(s); }
+T(FIXED32, f, uint32_t, uint32_t, uint32) { *d = s; }
+T(FIXED64, f, uint64_t, uint64_t, uint64) { *d = s; }
+T(SFIXED32, f, uint32_t, int32_t, int32) { *d = (int32_t)s; }
+T(SFIXED64, f, uint64_t, int64_t, int64) { *d = (int64_t)s; }
+T(BOOL, v, uint32_t, bool, _bool) { *d = (bool)s; }
+T(ENUM, v, uint32_t, int32_t, int32) { *d = (int32_t)s; }
+#undef WVTOV
+#undef GET
+#undef T
+
+#define alignof(t) offsetof(struct { char c; t x; }, x)
+
+/* May want to move this to upb.c if enough other things warrant it. */
+struct upb_type_info upb_type_info[] = {
+ [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_DOUBLE] = {alignof(double), sizeof(double), UPB_WIRE_TYPE_64BIT},
+ [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_FLOAT] = {alignof(float), sizeof(float), UPB_WIRE_TYPE_32BIT},
+ [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_INT64] = {alignof(int64_t), sizeof(int64_t), UPB_WIRE_TYPE_VARINT},
+ [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_UINT64] = {alignof(uint64_t), sizeof(uint64_t), UPB_WIRE_TYPE_VARINT},
+ [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_INT32] = {alignof(int32_t), sizeof(int32_t), UPB_WIRE_TYPE_VARINT},
+ [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_FIXED64] = {alignof(uint64_t), sizeof(uint64_t), UPB_WIRE_TYPE_64BIT},
+ [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_FIXED32] = {alignof(uint32_t), sizeof(uint32_t), UPB_WIRE_TYPE_32BIT},
+ [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_BOOL] = {alignof(bool), sizeof(bool), UPB_WIRE_TYPE_VARINT},
+ [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE] = {alignof(void*), sizeof(void*), UPB_WIRE_TYPE_DELIMITED},
+ [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_GROUP] = {alignof(void*), sizeof(void*), UPB_WIRE_TYPE_START_GROUP},
+ [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_UINT32] = {alignof(uint32_t), sizeof(uint32_t), UPB_WIRE_TYPE_VARINT},
+ [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_ENUM] = {alignof(uint32_t), sizeof(uint32_t), UPB_WIRE_TYPE_VARINT},
+ [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_SFIXED32]= {alignof(int32_t), sizeof(int32_t), UPB_WIRE_TYPE_32BIT},
+ [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_SFIXED64]= {alignof(int64_t), sizeof(int64_t), UPB_WIRE_TYPE_64BIT},
+ [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_SINT32] = {alignof(int32_t), sizeof(int32_t), UPB_WIRE_TYPE_VARINT},
+ [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_SINT64] = {alignof(int64_t), sizeof(int64_t), UPB_WIRE_TYPE_VARINT},
+ [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_STRING] = {alignof(struct upb_string*), sizeof(struct upb_string*), UPB_WIRE_TYPE_DELIMITED},
+ [GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_BYTES] = {alignof(struct upb_string*), sizeof(struct upb_string*), UPB_WIRE_TYPE_DELIMITED},
+};
+
+static upb_status_t parse_tag(void **buf, void *end, struct upb_tag *tag)
+{
+ uint32_t tag_int;
+ UPB_CHECK(get_v_uint32_t(buf, end, &tag_int));
+ tag->wire_type = (upb_wire_type_t)(tag_int & 0x07);
+ tag->field_number = tag_int >> 3;
+ return UPB_STATUS_OK;
+}
+
+upb_status_t upb_parse_wire_value(void **buf, void *end, upb_wire_type_t wt,
+ union upb_wire_value *wv)
+{
+ switch(wt) {
+ case UPB_WIRE_TYPE_VARINT: UPB_CHECK(get_v_uint64_t(buf, end, &wv->varint)); break;
+ case UPB_WIRE_TYPE_64BIT: UPB_CHECK(get_f_uint64_t(buf, end, &wv->_64bit)); break;
+ case UPB_WIRE_TYPE_32BIT: UPB_CHECK(get_f_uint32_t(buf, end, &wv->_32bit)); break;
+ default: return UPB_ERROR_ILLEGAL; /* Doesn't handle delimited, groups. */
+ }
+ return UPB_STATUS_OK;
+}
+
+static upb_status_t skip_wire_value(void **buf, void *end, upb_wire_type_t wt)
+{
+ switch(wt) {
+ case UPB_WIRE_TYPE_VARINT: UPB_CHECK(skip_v_uint64_t(buf, end)); break;
+ case UPB_WIRE_TYPE_64BIT: UPB_CHECK(skip_f_uint64_t(buf, end)); break;
+ case UPB_WIRE_TYPE_32BIT: UPB_CHECK(skip_f_uint32_t(buf, end)); break;
+ case UPB_WIRE_TYPE_START_GROUP: /* TODO: skip to matching end group. */
+ case UPB_WIRE_TYPE_END_GROUP: break;
+ default: return UPB_ERROR_ILLEGAL;
+ }
+ return UPB_STATUS_OK;
+}
+
+upb_status_t upb_parse_value(void **buf, void *end, upb_field_type_t ft,
+ union upb_value_ptr v)
+{
+#define CASE(t, member_name) \
+ case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_ ## t: \
+ return get_ ## t(buf, end, v.member_name);
+ switch(ft) {
+ CASE(DOUBLE, _double)
+ CASE(FLOAT, _float)
+ CASE(INT32, int32)
+ CASE(INT64, int64)
+ CASE(UINT32, uint32)
+ CASE(UINT64, uint64)
+ CASE(SINT32, int32)
+ CASE(SINT64, int64)
+ CASE(FIXED32, uint32)
+ CASE(FIXED64, uint64)
+ CASE(SFIXED32, int32)
+ CASE(SFIXED64, int64)
+ CASE(BOOL, _bool)
+ CASE(ENUM, int32)
+ default: return UPB_ERROR_ILLEGAL;
+ }
+#undef CASE
+}
+
+void upb_parse_reset(struct upb_parse_state *state)
+{
+ state->offset = 0;
+ state->top = state->stack;
+ /* The top-level message is not delimited (we can keep receiving data for
+ * it indefinitely). */
+ state->top->end_offset = SIZE_MAX;
+}
+
+void upb_parse_init(struct upb_parse_state *state, size_t udata_size)
+{
+ memset(state, 0, sizeof(struct upb_parse_state)); /* Clear all callbacks. */
+ size_t stack_bytes = (sizeof(*state->stack) + udata_size) * UPB_MAX_NESTING;
+ state->stack = malloc(stack_bytes);
+ state->limit = (struct upb_parse_stack_frame*)((char*)state->stack + stack_bytes);
+ state->udata_size = udata_size;
+ upb_parse_reset(state);
+}
+
+void upb_parse_free(struct upb_parse_state *state)
+{
+ free(state->stack);
+}
+
+static void pop_stack_frame(struct upb_parse_state *s)
+{
+ if(s->submsg_end_cb) s->submsg_end_cb(s);
+ s->top--;
+ s->top = (struct upb_parse_stack_frame*)((char*)s->top - s->udata_size);
+}
+
+static upb_status_t push_stack_frame(struct upb_parse_state *s, size_t end,
+ void *user_field_desc)
+{
+ s->top++;
+ s->top = (struct upb_parse_stack_frame*)((char*)s->top + s->udata_size);
+ if(s->top > s->limit) return UPB_ERROR_STACK_OVERFLOW;
+ s->top->end_offset = end;
+ if(s->submsg_start_cb) s->submsg_start_cb(s, user_field_desc);
+ return UPB_STATUS_OK;
+}
+
+static upb_status_t parse_delimited(struct upb_parse_state *s,
+ struct upb_tag *tag,
+ void **buf, void *end,
+ size_t base_offset)
+{
+ int32_t delim_len;
+ void *user_field_desc;
+ void *bufstart = *buf;
+
+ /* Whether we are parsing or skipping the field, we always need to parse
+ * the length. */
+ UPB_CHECK(get_INT32(buf, end, &delim_len));
+ upb_field_type_t ft = s->tag_cb(s, tag, &user_field_desc);
+ if(*buf < bufstart) return UPB_ERROR_OVERFLOW;
+ if(*buf > end && ft != GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE) {
+ /* Streaming submessages is ok, but for other delimited types (string,
+ * bytes, and packed arrays) we require that all the delimited data is
+ * available. This could be relaxed if desired. */
+ return UPB_STATUS_NEED_MORE_DATA;
+ }
+
+ if(ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE) {
+ base_offset += ((char*)*buf - (char*)bufstart);
+ UPB_CHECK(push_stack_frame(s, base_offset + delim_len, user_field_desc));
+ } else {
+ void *delim_end = (char*)*buf + delim_len;
+ if(ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_STRING ||
+ ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_BYTES) {
+ struct upb_string str = {.ptr = *buf, .byte_len = delim_len};
+ s->str_cb(s, &str, user_field_desc);
+ *buf = delim_end;
+ } else {
+ /* Packed Array. */
+ while(*buf < delim_end)
+ UPB_CHECK(s->value_cb(s, buf, end, user_field_desc));
+ }
+ }
+ return UPB_STATUS_OK;
+}
+
+static upb_status_t parse_nondelimited(struct upb_parse_state *s,
+ struct upb_tag *tag,
+ void **buf, void *end)
+{
+ /* Simple value or begin group. */
+ void *user_field_desc;
+ upb_field_type_t ft = s->tag_cb(s, tag, &user_field_desc);
+ if(ft == 0) {
+ UPB_CHECK(skip_wire_value(buf, end, tag->wire_type));
+ } else if(ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_GROUP) {
+ /* No length specified, an "end group" tag will mark the end. */
+ UPB_CHECK(push_stack_frame(s, UINT32_MAX, user_field_desc));
+ } else {
+ UPB_CHECK(s->value_cb(s, buf, end, user_field_desc));
+ }
+ return UPB_STATUS_OK;
+}
+
+upb_status_t upb_parse(struct upb_parse_state *restrict s, void *buf, size_t len,
+ size_t *restrict read)
+{
+ void *end = (char*)buf + len;
+ *read = 0;
+ while(buf < end) {
+ struct upb_tag tag;
+ void *bufstart = buf;
+ UPB_CHECK(parse_tag(&buf, end, &tag));
+ if(tag.wire_type == UPB_WIRE_TYPE_END_GROUP) {
+ if(s->top->end_offset != UINT32_MAX)
+ return UPB_ERROR_SPURIOUS_END_GROUP;
+ pop_stack_frame(s);
+ } else if(tag.wire_type == UPB_WIRE_TYPE_DELIMITED) {
+ parse_delimited(s, &tag, &buf, end, s->offset + (char*)buf - (char*)bufstart);
+ } else {
+ parse_nondelimited(s, &tag, &buf, end);
+ }
+ size_t bytes_read = ((char*)buf - (char*)bufstart);
+ *read += bytes_read;
+ s->offset += bytes_read;
+ while(s->offset >= s->top->end_offset) {
+ if(s->offset != s->top->end_offset) return UPB_ERROR_BAD_SUBMESSAGE_END;
+ pop_stack_frame(s);
+ }
+ }
+ return UPB_STATUS_OK;
+}
diff --git a/src/upb_parse.h b/src/upb_parse.h
new file mode 100644
index 0000000..d9db85c
--- /dev/null
+++ b/src/upb_parse.h
@@ -0,0 +1,158 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * upb_parse implements a high performance, callback-based, stream-oriented
+ * parser (comparable to the SAX model in XML parsers). For parsing protobufs
+ * into in-memory messages (a more DOM-like model), see the routines in
+ * upb_msg.h, which are layered on top of this parser.
+ *
+ * Copyright (c) 2009 Joshua Haberman. See LICENSE for details.
+ */
+
+#ifndef UPB_PARSE_H_
+#define UPB_PARSE_H_
+
+#include <stdint.h>
+#include <stdbool.h>
+#include "upb.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Definitions. ***************************************************************/
+
+/* A list of types as they are encoded on-the-wire. */
+enum upb_wire_type {
+ UPB_WIRE_TYPE_VARINT = 0,
+ UPB_WIRE_TYPE_64BIT = 1,
+ UPB_WIRE_TYPE_DELIMITED = 2,
+ UPB_WIRE_TYPE_START_GROUP = 3,
+ UPB_WIRE_TYPE_END_GROUP = 4,
+ UPB_WIRE_TYPE_32BIT = 5
+};
+typedef uint8_t upb_wire_type_t;
+
+/* A value as it is encoded on-the-wire, except delimited, which is handled
+ * separately. */
+union upb_wire_value {
+ uint64_t varint;
+ uint64_t _64bit;
+ uint32_t _32bit;
+};
+
+/* A tag occurs before each value on-the-wire. */
+struct upb_tag {
+ upb_field_number_t field_number;
+ upb_wire_type_t wire_type;
+};
+
+/* High-level parsing interface. **********************************************/
+
+/* The general scheme is that the client registers callbacks that will be
+ * called at the appropriate times. These callbacks provide the client with
+ * data and let the client make decisions (like whether to parse or to skip
+ * a value).
+ *
+ * After initializing the parse state, the client can repeatedly call upb_parse
+ * as data becomes available. The parser is fully streaming-capable, so the
+ * data need not all be available at the same time. */
+
+struct upb_parse_state;
+
+/* Initialize and free (respectively) the given parse state, which must have
+ * been previously allocated. udata_size specifies how much space will be
+ * available at parse_stack_frame.user_data in each frame for user data. */
+void upb_parse_init(struct upb_parse_state *state, size_t udata_size);
+void upb_parse_reset(struct upb_parse_state *state);
+void upb_parse_free(struct upb_parse_state *state);
+
+/* The callback that is called immediately after a tag has been parsed. The
+ * client should determine whether it wants to parse or skip the corresponding
+ * value. If it wants to parse it, it must discover and return the correct
+ * .proto type (the tag only contains the wire type) and check that the wire
+ * type is appropriate for the .proto type. To skip the value (which means
+ * skipping all submessages, in the case of a submessage), the callback should
+ * return zero. */
+typedef upb_field_type_t (*upb_tag_cb)(struct upb_parse_state *s,
+ struct upb_tag *tag,
+ void **user_field_desc);
+
+/* The callback that is called when a regular value (ie. not a string or
+ * submessage) is encountered which the client has opted to parse (by not
+ * returning 0 from the tag_cb). The client must parse the value and update
+ * buf accordingly, returning success or failure.
+ *
+ * Note that this callback can be called several times in a row for a single
+ * call to tag_cb in the case of packed arrays. */
+typedef upb_status_t (*upb_value_cb)(struct upb_parse_state *s,
+ void **buf, void *end,
+ void *user_field_desc);
+
+/* The callback that is called when a string is parsed. */
+typedef upb_status_t (*upb_str_cb)(struct upb_parse_state *s,
+ struct upb_string *str,
+ void *user_field_desc);
+
+/* Callbacks that are called when a submessage begins and ends, respectively.
+ * Both are called with the submessage's stack frame at the top of the stack. */
+typedef void (*upb_submsg_start_cb)(struct upb_parse_state *s,
+ void *user_field_desc);
+typedef void (*upb_submsg_end_cb)(struct upb_parse_state *s);
+
+/* Each stack frame (one for each level of submessages/groups) has this format,
+ * where user_data has as many bytes allocated as specified when initialized. */
+struct upb_parse_stack_frame {
+ size_t end_offset; /* 0 indicates that this is a group. */
+ char user_data[];
+};
+
+struct upb_parse_state {
+ size_t offset;
+ struct upb_parse_stack_frame *stack, *top, *limit;
+ size_t udata_size; /* How many bytes the user gets in each frame. */
+ upb_tag_cb tag_cb;
+ upb_value_cb value_cb;
+ upb_str_cb str_cb;
+ upb_submsg_start_cb submsg_start_cb;
+ upb_submsg_end_cb submsg_end_cb;
+};
+
+/* Parses up to len bytes of protobuf data out of buf, calling cb as needed.
+ * The function returns how many bytes were consumed from buf. Data is parsed
+ * until no more data can be read from buf, or the callback sets *done=true,
+ * or an error occured. Sets *read to the number of bytes consumed. */
+upb_status_t upb_parse(struct upb_parse_state *s, void *buf, size_t len,
+ size_t *read);
+
+extern upb_wire_type_t upb_expected_wire_types[];
+/* Returns true if wt is the correct on-the-wire type for ft. */
+INLINE bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) {
+ if(ft == 10) { // GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_GROUP)
+ return wt == UPB_WIRE_TYPE_START_GROUP;
+ } else {
+ /* With packed arrays, anything can be delimited (except groups). */
+ return wt == UPB_WIRE_TYPE_DELIMITED ||
+ upb_type_info[ft].expected_wire_type == wt;
+ }
+}
+
+/* Data-consuming functions (to be called from value cb). *********************/
+
+/* Parses and converts a value from the character data starting at buf. The
+ * caller must have previously checked that the wire type is appropriate for
+ * this field type. */
+upb_status_t upb_parse_value(void **buf, void *end, upb_field_type_t ft,
+ union upb_value_ptr v);
+
+/* Parses a wire value with the given type (which must have been obtained from
+ * a tag that was just parsed) and adds the number of bytes that were consumed
+ * to *offset. */
+upb_status_t upb_parse_wire_value(void **buf, void *end, upb_wire_type_t wt,
+ union upb_wire_value *wv);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* UPB_PARSE_H_ */
diff --git a/src/upb_string.c b/src/upb_string.c
new file mode 100644
index 0000000..733bafe
--- /dev/null
+++ b/src/upb_string.c
@@ -0,0 +1,24 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * Copyright (c) 2009 Joshua Haberman. See LICENSE for details.
+ */
+
+#include "upb_string.h"
+
+bool upb_strreadfile(const char *filename, struct upb_string *data) {
+ FILE *f = fopen(filename, "rb");
+ if(!f) return false;
+ if(fseek(f, 0, SEEK_END) != 0) return false;
+ long size = ftell(f);
+ if(size < 0) return false;
+ if(fseek(f, 0, SEEK_SET) != 0) return false;
+ data->ptr = (char*)malloc(size);
+ data->byte_len = size;
+ if(fread(data->ptr, size, 1, f) != 1) {
+ free(data->ptr);
+ return false;
+ }
+ fclose(f);
+ return true;
+}
diff --git a/src/upb_string.h b/src/upb_string.h
new file mode 100644
index 0000000..7a05811
--- /dev/null
+++ b/src/upb_string.h
@@ -0,0 +1,82 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * Copyright (c) 2009 Joshua Haberman. See LICENSE for details.
+
+ * Defines a delimited (as opposed to null-terminated) string type and some
+ * library functions for manipulating them.
+ *
+ * There are two primary reasons upb uses delimited strings. One is that they
+ * can be more efficient for some operations because they do not have to scan
+ * the string to find its length. For example, streql can start by just
+ * comparing the lengths (very efficient) and scan the strings themselves only
+ * if the lengths are equal.
+ *
+ * More importantly, using delimited strings makes it possible for strings to
+ * reference substrings of other strings. For example, if I am parsing a
+ * protobuf I can create a string that references the original protobuf's
+ * string data. With NULL-termination I would be forced to write a NULL
+ * into the middle of the protobuf's data, which is less than ideal and in
+ * some cases not practical or possible.
+ */
+
+#ifndef UPB_STRING_H_
+#define UPB_STRING_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include "upb.h"
+
+struct upb_string {
+ /* We expect the data to be 8-bit clean (uint8_t), but char* is such an
+ * ingrained convention that we follow it. */
+ char *ptr;
+ uint32_t byte_len;
+};
+
+INLINE bool upb_streql(struct upb_string *s1, struct upb_string *s2) {
+ return s1->byte_len == s2->byte_len &&
+ memcmp(s1->ptr, s2->ptr, s1->byte_len) == 0;
+}
+
+INLINE void upb_strcpy(struct upb_string *dest, struct upb_string *src) {
+ memcpy(dest->ptr, src->ptr, dest->byte_len);
+ dest->byte_len = src->byte_len;
+}
+
+INLINE struct upb_string upb_strdup(struct upb_string s) {
+ struct upb_string copy;
+ copy.ptr = (char*)malloc(s.byte_len);
+ copy.byte_len = s.byte_len;
+ memcpy(copy.ptr, s.ptr, s.byte_len);
+ return copy;
+}
+
+INLINE void upb_strfree(struct upb_string s) {
+ free(s.ptr);
+}
+
+/* Reads an entire file into a newly-allocated string. */
+bool upb_strreadfile(const char *filename, struct upb_string *data);
+
+/* Allows defining upb_strings as literals, ie:
+ * struct upb_string str = UPB_STRLIT("Hello, World!\n");
+ * Doesn't work with C++ due to lack of struct initializer syntax.
+ */
+#define UPB_STRLIT(strlit) {.ptr=strlit, .byte_len=sizeof(strlit)-1}
+
+/* Allows using upb_strings in printf, ie:
+ * struct upb_string str = UPB_STRLIT("Hello, World!\n");
+ * printf("String is: " UPB_STRFMT, UPB_STRARG(str)); */
+#define UPB_STRARG(str) (str).byte_len, (str).ptr
+#define UPB_STRFMT "%.*s"
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* UPB_H_ */
diff --git a/src/upb_table.c b/src/upb_table.c
new file mode 100644
index 0000000..3bbc7f7
--- /dev/null
+++ b/src/upb_table.c
@@ -0,0 +1,398 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * Copyright (c) 2009 Joshua Haberman. See LICENSE for details.
+ */
+
+#include "upb_table.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+static const upb_inttable_key_t EMPTYENT = 0;
+static const double MAX_LOAD = 0.85;
+
+static uint32_t MurmurHash2(const void *key, size_t len, uint32_t seed);
+
+/* We use 1-based indexes into the table so that 0 can be "NULL". */
+static struct upb_inttable_entry *intent(struct upb_inttable *t, int32_t i) {
+ return UPB_INDEX(t->t.entries, i-1, t->t.entry_size);
+}
+static struct upb_strtable_entry *strent(struct upb_strtable *t, int32_t i) {
+ return UPB_INDEX(t->t.entries, i-1, t->t.entry_size);
+}
+
+void upb_table_init(struct upb_table *t, uint32_t size, uint16_t entry_size)
+{
+ t->count = 0;
+ t->entry_size = entry_size;
+ t->size_lg2 = 1;
+ while(size >>= 1) t->size_lg2++;
+ size_t bytes = upb_table_size(t) * t->entry_size;
+ t->mask = upb_table_size(t) - 1;
+ t->entries = malloc(bytes);
+ memset(t->entries, 0, bytes); /* Both tables consider 0's an empty entry. */
+}
+
+void upb_inttable_init(struct upb_inttable *t, uint32_t size, uint16_t entsize)
+{
+ upb_table_init(&t->t, size, entsize);
+}
+
+void upb_strtable_init(struct upb_strtable *t, uint32_t size, uint16_t entsize)
+{
+ upb_table_init(&t->t, size, entsize);
+}
+
+void upb_table_free(struct upb_table *t) { free(t->entries); }
+void upb_inttable_free(struct upb_inttable *t) { upb_table_free(&t->t); }
+void upb_strtable_free(struct upb_strtable *t) { upb_table_free(&t->t); }
+
+static uint32_t strtable_bucket(struct upb_strtable *t, struct upb_string *key)
+{
+ uint32_t hash = MurmurHash2(key->ptr, key->byte_len, 0);
+ return (hash & (upb_strtable_size(t)-1)) + 1;
+}
+
+void *upb_strtable_lookup(struct upb_strtable *t, struct upb_string *key)
+{
+ uint32_t bucket = strtable_bucket(t, key);
+ struct upb_strtable_entry *e;
+ do {
+ e = strent(t, bucket);
+ if(upb_streql(&e->key, key)) return e;
+ } while((bucket = e->next) != UPB_END_OF_CHAIN);
+ return NULL;
+}
+
+static uint32_t empty_intbucket(struct upb_inttable *table)
+{
+ /* TODO: does it matter that this is biased towards the front of the table? */
+ for(uint32_t i = 1; i <= upb_inttable_size(table); i++) {
+ struct upb_inttable_entry *e = intent(table, i);
+ if(e->key == EMPTYENT) return i;
+ }
+ assert(false);
+ return 0;
+}
+
+/* The insert routines have a lot more code duplication between int/string
+ * variants than I would like, but there's just a bit too much that varies to
+ * parameterize them. */
+static void intinsert(struct upb_inttable *t, struct upb_inttable_entry *e)
+{
+ assert(upb_inttable_lookup(t, e->key, t->t.entry_size) == NULL);
+ uint32_t bucket = upb_inttable_bucket(t, e->key);
+ struct upb_inttable_entry *table_e = intent(t, bucket);
+ if(table_e->key != EMPTYENT) { /* Collision. */
+ if(bucket == upb_inttable_bucket(t, table_e->key)) {
+ /* Existing element is in its main posisiton. Find an empty slot to
+ * place our new element and append it to this key's chain. */
+ uint32_t empty_bucket = empty_intbucket(t);
+ while (table_e->next != UPB_END_OF_CHAIN)
+ table_e = intent(t, table_e->next);
+ table_e->next = empty_bucket;
+ table_e = intent(t, empty_bucket);
+ } else {
+ /* Existing element is not in its main position. Move it to an empty
+ * slot and put our element in its main position. */
+ uint32_t empty_bucket = empty_intbucket(t);
+ uint32_t evictee_bucket = upb_inttable_bucket(t, table_e->key);
+ memcpy(intent(t, empty_bucket), table_e, t->t.entry_size); /* copies next */
+ struct upb_inttable_entry *evictee_e = intent(t, evictee_bucket);
+ while(1) {
+ assert(evictee_e->key != UPB_EMPTY_ENTRY);
+ assert(evictee_e->next != UPB_END_OF_CHAIN);
+ if(evictee_e->next == bucket) {
+ evictee_e->next = empty_bucket;
+ break;
+ }
+ }
+ /* table_e remains set to our mainpos. */
+ }
+ }
+ memcpy(table_e, e, t->t.entry_size);
+ table_e->next = UPB_END_OF_CHAIN;
+ assert(upb_inttable_lookup(t, e->key, t->t.entry_size) == table_e);
+}
+
+void upb_inttable_insert(struct upb_inttable *t, struct upb_inttable_entry *e)
+{
+ assert(e->key != 0);
+ if((double)++t->t.count / upb_inttable_size(t) > MAX_LOAD) {
+ /* Need to resize. New table of double the size, add old elements to it. */
+ struct upb_inttable new_table;
+ upb_inttable_init(&new_table, upb_inttable_size(t)*2, t->t.entry_size);
+ struct upb_inttable_entry *old_e;
+ for(old_e = upb_inttable_begin(t); old_e; old_e = upb_inttable_next(t, old_e))
+ intinsert(&new_table, old_e);
+ upb_inttable_free(t);
+ *t = new_table;
+ }
+ intinsert(t, e);
+}
+
+static uint32_t empty_strbucket(struct upb_strtable *table)
+{
+ /* TODO: does it matter that this is biased towards the front of the table? */
+ for(uint32_t i = 1; i <= upb_strtable_size(table); i++) {
+ struct upb_strtable_entry *e = strent(table, i);
+ if(e->key.byte_len == 0) return i;
+ }
+ assert(false);
+ return 0;
+}
+
+static void strinsert(struct upb_strtable *t, struct upb_strtable_entry *e)
+{
+ assert(upb_strtable_lookup(t, &e->key) == NULL);
+ uint32_t bucket = strtable_bucket(t, &e->key);
+ struct upb_strtable_entry *table_e = strent(t, bucket);
+ if(table_e->key.byte_len != 0) { /* Collision. */
+ if(bucket == strtable_bucket(t, &table_e->key)) {
+ /* Existing element is in its main posisiton. Find an empty slot to
+ * place our new element and append it to this key's chain. */
+ uint32_t empty_bucket = empty_strbucket(t);
+ while (table_e->next != UPB_END_OF_CHAIN)
+ table_e = strent(t, table_e->next);
+ table_e->next = empty_bucket;
+ table_e = strent(t, empty_bucket);
+ } else {
+ /* Existing element is not in its main position. Move it to an empty
+ * slot and put our element in its main position. */
+ uint32_t empty_bucket = empty_strbucket(t);
+ uint32_t evictee_bucket = strtable_bucket(t, &table_e->key);
+ memcpy(strent(t, empty_bucket), table_e, t->t.entry_size); /* copies next */
+ struct upb_strtable_entry *evictee_e = strent(t, evictee_bucket);
+ while(1) {
+ assert(evictee_e->key.byte_len != 0);
+ assert(evictee_e->next != UPB_END_OF_CHAIN);
+ if(evictee_e->next == bucket) {
+ evictee_e->next = empty_bucket;
+ break;
+ }
+ evictee_e = strent(t, evictee_e->next);
+ }
+ /* table_e remains set to our mainpos. */
+ }
+ }
+ memcpy(table_e, e, t->t.entry_size);
+ table_e->next = UPB_END_OF_CHAIN;
+ assert(upb_strtable_lookup(t, &e->key) == table_e);
+}
+
+void upb_strtable_insert(struct upb_strtable *t, struct upb_strtable_entry *e)
+{
+ if((double)++t->t.count / upb_strtable_size(t) > MAX_LOAD) {
+ /* Need to resize. New table of double the size, add old elements to it. */
+ struct upb_strtable new_table;
+ upb_strtable_init(&new_table, upb_strtable_size(t)*2, t->t.entry_size);
+ struct upb_strtable_entry *old_e;
+ for(old_e = upb_strtable_begin(t); old_e; old_e = upb_strtable_next(t, old_e))
+ strinsert(&new_table, old_e);
+ upb_strtable_free(t);
+ *t = new_table;
+ }
+ strinsert(t, e);
+}
+
+void *upb_inttable_begin(struct upb_inttable *t) {
+ return upb_inttable_next(t, intent(t, 0));
+}
+
+void *upb_inttable_next(struct upb_inttable *t, struct upb_inttable_entry *cur) {
+ struct upb_inttable_entry *end = intent(t, upb_inttable_size(t)+1);
+ do {
+ cur = (void*)((char*)cur + t->t.entry_size);
+ if(cur == end) return NULL;
+ } while(cur->key == UPB_EMPTY_ENTRY);
+ return cur;
+}
+
+void *upb_strtable_begin(struct upb_strtable *t) {
+ return upb_strtable_next(t, strent(t, 0));
+}
+
+void *upb_strtable_next(struct upb_strtable *t, struct upb_strtable_entry *cur) {
+ struct upb_strtable_entry *end = strent(t, upb_strtable_size(t)+1);
+ do {
+ cur = (void*)((char*)cur + t->t.entry_size);
+ if(cur == end) return NULL;
+ } while(cur->key.byte_len == 0);
+ return cur;
+}
+
+#ifdef UPB_UNALIGNED_READS_OK
+//-----------------------------------------------------------------------------
+// MurmurHash2, by Austin Appleby (released as public domain).
+// Reformatted and C99-ified by Joshua Haberman.
+// Note - This code makes a few assumptions about how your machine behaves -
+// 1. We can read a 4-byte value from any address without crashing
+// 2. sizeof(int) == 4 (in upb this limitation is removed by using uint32_t
+// And it has a few limitations -
+// 1. It will not work incrementally.
+// 2. It will not produce the same results on little-endian and big-endian
+// machines.
+static uint32_t MurmurHash2(const void *key, size_t len, uint32_t seed)
+{
+ // 'm' and 'r' are mixing constants generated offline.
+ // They're not really 'magic', they just happen to work well.
+ const uint32_t m = 0x5bd1e995;
+ const int32_t r = 24;
+
+ // Initialize the hash to a 'random' value
+ uint32_t h = seed ^ len;
+
+ // Mix 4 bytes at a time into the hash
+ const uint8_t * data = (const uint8_t *)key;
+ while(len >= 4) {
+ uint32_t k = *(uint32_t *)data;
+
+ k *= m;
+ k ^= k >> r;
+ k *= m;
+
+ h *= m;
+ h ^= k;
+
+ data += 4;
+ len -= 4;
+ }
+
+ // Handle the last few bytes of the input array
+ switch(len) {
+ case 3: h ^= data[2] << 16;
+ case 2: h ^= data[1] << 8;
+ case 1: h ^= data[0]; h *= m;
+ };
+
+ // Do a few final mixes of the hash to ensure the last few
+ // bytes are well-incorporated.
+ h ^= h >> 13;
+ h *= m;
+ h ^= h >> 15;
+
+ return h;
+}
+
+#else // !UPB_UNALIGNED_READS_OK
+
+//-----------------------------------------------------------------------------
+// MurmurHashAligned2, by Austin Appleby
+// Same algorithm as MurmurHash2, but only does aligned reads - should be safer
+// on certain platforms.
+// Performance will be lower than MurmurHash2
+
+#define MIX(h,k,m) { k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; }
+
+static uint32_t MurmurHash2(const void * key, size_t len, uint32_t seed)
+{
+ const uint32_t m = 0x5bd1e995;
+ const int32_t r = 24;
+ const uint8_t * data = (const uint8_t *)key;
+ uint32_t h = seed ^ len;
+ uint8_t align = (uintptr_t)data & 3;
+
+ if(align && (len >= 4)) {
+ // Pre-load the temp registers
+ uint32_t t = 0, d = 0;
+
+ switch(align) {
+ case 1: t |= data[2] << 16;
+ case 2: t |= data[1] << 8;
+ case 3: t |= data[0];
+ }
+
+ t <<= (8 * align);
+
+ data += 4-align;
+ len -= 4-align;
+
+ int32_t sl = 8 * (4-align);
+ int32_t sr = 8 * align;
+
+ // Mix
+
+ while(len >= 4) {
+ d = *(uint32_t *)data;
+ t = (t >> sr) | (d << sl);
+
+ uint32_t k = t;
+
+ MIX(h,k,m);
+
+ t = d;
+
+ data += 4;
+ len -= 4;
+ }
+
+ // Handle leftover data in temp registers
+
+ d = 0;
+
+ if(len >= align) {
+ switch(align) {
+ case 3: d |= data[2] << 16;
+ case 2: d |= data[1] << 8;
+ case 1: d |= data[0];
+ }
+
+ uint32_t k = (t >> sr) | (d << sl);
+ MIX(h,k,m);
+
+ data += align;
+ len -= align;
+
+ //----------
+ // Handle tail bytes
+
+ switch(len) {
+ case 3: h ^= data[2] << 16;
+ case 2: h ^= data[1] << 8;
+ case 1: h ^= data[0]; h *= m;
+ };
+ } else {
+ switch(len) {
+ case 3: d |= data[2] << 16;
+ case 2: d |= data[1] << 8;
+ case 1: d |= data[0];
+ case 0: h ^= (t >> sr) | (d << sl); h *= m;
+ }
+ }
+
+ h ^= h >> 13;
+ h *= m;
+ h ^= h >> 15;
+
+ return h;
+ } else {
+ while(len >= 4) {
+ uint32_t k = *(uint32_t *)data;
+
+ MIX(h,k,m);
+
+ data += 4;
+ len -= 4;
+ }
+
+ //----------
+ // Handle tail bytes
+
+ switch(len) {
+ case 3: h ^= data[2] << 16;
+ case 2: h ^= data[1] << 8;
+ case 1: h ^= data[0]; h *= m;
+ };
+
+ h ^= h >> 13;
+ h *= m;
+ h ^= h >> 15;
+
+ return h;
+ }
+}
+#undef MIX
+
+#endif // UPB_UNALIGNED_READS_OK
diff --git a/src/upb_table.h b/src/upb_table.h
new file mode 100644
index 0000000..094ed48
--- /dev/null
+++ b/src/upb_table.h
@@ -0,0 +1,123 @@
+/*
+ * upb - a minimalist implementation of protocol buffers.
+ *
+ * Copyright (c) 2009 Joshua Haberman. See LICENSE for details.
+ *
+ * This file defines very fast int->struct (inttable) and string->struct
+ * (strtable) hash tables. The struct can be of any size, and it is stored
+ * in the table itself, for cache-friendly performance.
+ *
+ * The table uses internal chaining with Brent's variation (inspired by the
+ * Lua implementation of hash tables). The hash function for strings is
+ * Austin Appleby's "MurmurHash."
+ */
+
+#ifndef UPB_TABLE_H_
+#define UPB_TABLE_H_
+
+#include <assert.h>
+#include "upb.h"
+#include "upb_string.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Note: the key cannot be zero! Zero is used by the implementation. */
+typedef uint32_t upb_inttable_key_t;
+
+#define UPB_END_OF_CHAIN (uint32_t)0
+#define UPB_EMPTY_ENTRY (uint32_t)0
+
+struct upb_inttable_entry {
+ upb_inttable_key_t key;
+ uint32_t next; /* Internal chaining. */
+};
+
+/* TODO: consider storing the hash in the entry. This would avoid the need to
+ * rehash on table resizes, but more importantly could possibly improve lookup
+ * performance by letting us compare hashes before comparing lengths or the
+ * strings themselves. */
+struct upb_strtable_entry {
+ struct upb_string key;
+ uint32_t next; /* Internal chaining. */
+};
+
+struct upb_table {
+ void *entries;
+ uint32_t count; /* How many elements are currently in the table? */
+ uint16_t entry_size; /* How big is each entry? */
+ uint8_t size_lg2; /* The table is 2^size_lg2 in size. */
+ uint32_t mask;
+};
+
+struct upb_strtable {
+ struct upb_table t;
+};
+
+struct upb_inttable {
+ struct upb_table t;
+};
+
+/* Initialize and free a table, respectively. Specify the initial size
+ * with 'size' (the size will be increased as necessary). Entry size
+ * specifies how many bytes each entry in the table is. */
+void upb_inttable_init(struct upb_inttable *table,
+ uint32_t size, uint16_t entry_size);
+void upb_inttable_free(struct upb_inttable *table);
+void upb_strtable_init(struct upb_strtable *table,
+ uint32_t size, uint16_t entry_size);
+void upb_strtable_free(struct upb_strtable *table);
+
+INLINE uint32_t upb_table_size(struct upb_table *t) { return 1 << t->size_lg2; }
+INLINE uint32_t upb_inttable_size(struct upb_inttable *t) {
+ return upb_table_size(&t->t);
+}
+INLINE uint32_t upb_strtable_size(struct upb_strtable *t) {
+ return upb_table_size(&t->t);
+}
+
+/* Inserts the given key into the hashtable with the given value. The key must
+ * not already exist in the hash table. The data will be copied from e into
+ * the hashtable (the amount of data copied comes from entry_size when the
+ * table was constructed). Therefore the data at val may be freed once the
+ * call returns. */
+void upb_inttable_insert(struct upb_inttable *t, struct upb_inttable_entry *e);
+void upb_strtable_insert(struct upb_strtable *t, struct upb_strtable_entry *e);
+
+INLINE uint32_t upb_inttable_bucket(struct upb_inttable *t, upb_inttable_key_t k) {
+ return (k & t->t.mask) + 1; /* Identity hash for ints. */
+}
+
+/* Looks up key in this table. Inlined because this is in the critical path
+ * of parsing. We have the caller specify the entry_size because fixing
+ * this as a literal (instead of reading table->entry_size) gives the
+ * compiler more ability to optimize. */
+INLINE void *upb_inttable_lookup(struct upb_inttable *t,
+ uint32_t key, uint32_t entry_size) {
+ assert(key != 0);
+ uint32_t bucket = upb_inttable_bucket(t, key);
+ struct upb_inttable_entry *e;
+ do {
+ e = (struct upb_inttable_entry*)UPB_INDEX(t->t.entries, bucket-1, entry_size);
+ if(e->key == key) return e;
+ } while((bucket = e->next) != UPB_END_OF_CHAIN);
+ return NULL; /* Not found. */
+}
+
+void *upb_strtable_lookup(struct upb_strtable *t, struct upb_string *key);
+
+/* Provides iteration over the table. The order in which the entries are
+ * returned is undefined. Insertions invalidate iterators. The _next
+ * functions return NULL when the end has been reached. */
+void *upb_inttable_begin(struct upb_inttable *t);
+void *upb_inttable_next(struct upb_inttable *t, struct upb_inttable_entry *cur);
+
+void *upb_strtable_begin(struct upb_strtable *t);
+void *upb_strtable_next(struct upb_strtable *t, struct upb_strtable_entry *cur);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* UPB_TABLE_H_ */
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback