Ditch abbreviated field business.

author: Joshua Haberman <joshua@reverberate.org> 2009-06-28 15:41:53 -0700
committer: Joshua Haberman <joshua@reverberate.org> 2009-06-28 15:41:53 -0700
commit: 5e2691460e9fb2ec9b77c1f9d133ae6b667afc3a (patch)
tree: f64dbba7d073ca6ee8b4c9e0c8ba77fa51a3b72d
parent: 03616c86ea3ba2d8da1e5e6b342d717165b71655 (diff)
4 files changed, 109 insertions, 55 deletions
diff --git a/upb_context.c b/upb_context.c
index e2c7aa9..79ce20d 100644
--- a/upb_context.c
+++ b/upb_context.c
@@ -185,7 +185,6 @@ bool upb_context_addfd(struct upb_context *c,
                  (fd->set_flags.has.service ? fd->service->len : 0);
   upb_strtable_init(&tmp, symcount, sizeof(struct upb_symtab_entry));
 
-  /* TODO: properly handle redefinitions and unresolvable symbols. */
   if(fd->set_flags.has.message_type)
     for(unsigned int i = 0; i < fd->message_type->len; i++)
       if(!insert_message(&tmp, fd->message_type->elements[i], &package))
@@ -207,7 +206,7 @@ bool upb_context_addfd(struct upb_context *c,
       struct upb_msg *m = e->ref.msg;
       for(unsigned int i = 0; i < m->num_fields; i++) {
         struct upb_msg_field *f = &m->fields[i];
-        google_protobuf_FieldDescriptorProto *fd = f->descriptor;
+        google_protobuf_FieldDescriptorProto *fd = m->field_descriptors[i];
         union upb_symbol_ref ref;
         if(fd->type == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE)
           ref = resolve2(&c->symtab, &tmp, &e->e.key, fd->name, UPB_SYM_MESSAGE);
@@ -215,7 +214,7 @@ bool upb_context_addfd(struct upb_context *c,
           ref = resolve2(&c->symtab, &tmp, &e->e.key, fd->name, UPB_SYM_ENUM);
         else
           continue;  /* No resolving necessary. */
-        if(!ref.msg) goto error;
+        if(!ref.msg) goto error;  /* Ref. to undefined symbol. */
         upb_msg_ref(m, f, ref);
       }
     }
diff --git a/upb_msg.c b/upb_msg.c
index 0517cf0..f2ad6c4 100644
--- a/upb_msg.c
+++ b/upb_msg.c
@@ -6,6 +6,7 @@
 #include <stdlib.h>
 #include "descriptor.h"
 #include "upb_msg.h"
+#include "upb_parse.h"
 
 #define ALIGN_UP(p, t) (t + ((p - 1) & (~t - 1)))
 
@@ -17,9 +18,7 @@ static int div_round_up(int numerator, int denominator) {
 }
 
 static int compare_fields(const void *e1, const void *e2) {
-  const struct upb_msg_field *f1 = e1, *f2 = e2;
-  const google_protobuf_FieldDescriptorProto *fd1  = f1->descriptor;
-  const google_protobuf_FieldDescriptorProto *fd2  = f2->descriptor;
+  const google_protobuf_FieldDescriptorProto *fd1 = e1, *fd2 = e2;
   /* Required fields go before non-required. */
   bool req1 = fd1->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REQUIRED;
   bool req2 = fd2->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REQUIRED;
@@ -48,18 +47,19 @@ bool upb_msg_init(struct upb_msg *m, struct google_protobuf_DescriptorProto *d)
   m->num_required_fields = 0;
   m->size = m->set_flags_bytes;
 
-  m->fields = malloc(sizeof(struct upb_msg_field) * m->num_fields);
+  m->fields = malloc(sizeof(*m->fields) * m->num_fields);
+  m->field_descriptors = malloc(sizeof(*m->field_descriptors) * m->num_fields);
   for(unsigned int i = 0; i < m->num_fields; i++) {
     /* We count on the caller to keep this pointer alive. */
-    m->fields[i].descriptor = d->field->elements[i];
+    m->field_descriptors[i] = d->field->elements[i];
   }
-  qsort(m->fields, m->num_fields, sizeof(struct upb_msg_field), compare_fields);
+  qsort(m->field_descriptors, m->num_fields, sizeof(void*), compare_fields);
 
   size_t max_align = 0;
 
   for(unsigned int i = 0; i < m->num_fields; i++) {
     struct upb_msg_field *f = &m->fields[i];
-    google_protobuf_FieldDescriptorProto *fd = f->descriptor;
+    google_protobuf_FieldDescriptorProto *fd = m->field_descriptors[i];
     struct upb_type_info *type_info = &upb_type_info[fd->type];
 
     /* General alignment rules are: each member must be at an address that is a
@@ -72,14 +72,11 @@ bool upb_msg_init(struct upb_msg *m, struct google_protobuf_DescriptorProto *d)
     if(fd->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REQUIRED)
       m->num_required_fields++;
 
-    /* Insert into the tables.  Note that af->ref will be uninitialized, even in
-     * the tables' copies of *af, which is why we must update them separately
+    /* Insert into the tables.  Note that f->ref will be uninitialized, even in
+     * the tables' copies of *f, which is why we must update them separately
      * when the references are resolved. */
-    struct upb_abbrev_msg_field af = {.byte_offset = f->byte_offset,
-                                      .field_index = f->field_index,
-                                      .type = fd->type};
-    struct upb_fieldsbynum_entry nument = {.e = {.key = fd->number}, .f = af};
-    struct upb_fieldsbyname_entry strent = {.e = {.key = *fd->name}, .f = af};
+    struct upb_fieldsbynum_entry nument = {.e = {.key = fd->number}, .f = *f};
+    struct upb_fieldsbyname_entry strent = {.e = {.key = *fd->name}, .f = *f};
     upb_inttable_insert(&m->fields_by_num, &nument.e);
     upb_strtable_insert(&m->fields_by_name, &strent.e);
   }
@@ -94,3 +91,46 @@ void upb_msg_free(struct upb_msg *m)
   upb_strtable_free(&m->fields_by_name);
   free(m->fields);
 }
+
+#if 0
+struct parse_frame_data {
+  struct upb_msg *m;
+  void *data;
+};
+
+static void set_frame_data(struct upb_parse_state *s, struct upb_msg *m)
+{
+}
+
+static upb_field_type_t tag_cb(struct upb_parse_state *s, struct upb_tag *tag,
+                               void **user_field_desc)
+{
+  struct upb_msg *m = (struct upb_msg*)s->top->user_data;
+  struct upb_msg_field *f = upb_msg_fieldbynum(m, tag->field_number);
+  if(!f || !upb_check_type(tag->wire_type, f->type))
+    return 0;  /* Skip unknown fields or fields of the wrong type. */
+  *user_field_desc = f->ref.msg;
+  return f->type;
+}
+
+static void value_cb(struct upb_parse_state *s, union upb_value *v,
+                     void *str, void *user_field_desc)
+{
+  *user_field_desc = f->ref.msg;
+}
+
+static void submsg_start_cb(struct upb_parse_state *s, void *user_field_desc)
+{
+  set_frame_data(s, user_field_desc);
+}
+
+void *upb_msg_parse(struct upb_msg *m, struct upb_string *str)
+{
+  struct upb_parse_state s;
+  upb_parse_state_init(&s, sizeof(struct parse_frame_data));
+  set_frame_data(&s, m);
+  s.tag_cb = tag_cb;
+  s.value_cb = value_cb;
+  s.submsg_start_cb = submsg_start_cb;
+}
+#endif
diff --git a/upb_msg.h b/upb_msg.h
index 70d4405..9608e64 100644
--- a/upb_msg.h
+++ b/upb_msg.h
@@ -4,8 +4,11 @@
  * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
  *
  * upb_msg contains a full description of a message as defined in a .proto file.
- * This allows for run-time reflection over .proto types, but also defines an
- * in-memory byte-level format for storing protobufs.
+ * It supports many features and operations for dealing with proto messages:
+ * - reflection over .proto types at runtime (list fields, get names, etc).
+ * - an in-memory byte-level format for efficiently storing and accessing msgs.
+ * - serializing and deserializing from the in-memory format to a protobuf.
+ * - optional memory management for handling strings, arrays, and submessages.
  *
  * The in-memory format is very much like a C struct that you can define at
  * run-time, but also supports reflection.  Like C structs it supports
@@ -25,9 +28,12 @@
  * this format.  This format is designed to allow the fastest possible random
  * access of individual fields.
  *
- * Note that no memory management is defined, which should make it easier to
- * integrate this format with existing memory-management schemes.  Any memory
- * management semantics can be used with the format as defined here.
+ * Note that clients need not use the memory management facilities defined here.
+ * They are for convenience only -- clients wishing to do their own memory
+ * management may do so (allowing clients to perform advanced techniques like
+ * reference-counting, garbage collection, and string references).  Different
+ * clients can read each other's messages regardless of what memory management
+ * scheme each is using.
  */
 
 #ifndef UPB_MSG_H_
@@ -49,13 +55,13 @@ extern "C" {
 struct google_protobuf_DescriptorProto;
 struct google_protobuf_FieldDescriptorProto;
 
-/* Structure definition. ******************************************************/
+/* Message definition. ********************************************************/
 
 /* Structure that describes a single field in a message. */
 struct upb_msg_field {
-  struct google_protobuf_FieldDescriptorProto *descriptor;
   uint32_t byte_offset;     /* Where to find the data. */
   uint16_t field_index;     /* Indexes upb_msg.fields. Also indicates set bit */
+  upb_field_type_t type;    /* Copied from descriptor for cache-friendliness. */
   union upb_symbol_ref ref;
 };
 
@@ -69,62 +75,53 @@ struct upb_msg {
   struct upb_inttable fields_by_num;
   struct upb_strtable fields_by_name;
   struct upb_msg_field *fields;
+  struct google_protobuf_FieldDescriptorProto **field_descriptors;
 };
 
 /* The num->field and name->field maps in upb_msg allow fast lookup of fields
  * by number or name.  These lookups are in the critical path of parsing and
  * field lookup, so they must be as fast as possible.  To make these more
- * cache-friendly, we put the data in the table by value, but use only an
- * abbreviated set of data (ie. not all the data in upb_msg_field).  Notably,
- * we don't include the pointer to the field descriptor.  But the upb_msg_field
- * can be retrieved in its entirety using the function below.*/
-
-struct upb_abbrev_msg_field {
-  uint32_t byte_offset;     /* Where to find the data. */
-  uint16_t field_index;     /* Indexes upb_msg.fields. Also indicates set bit */
-  upb_field_type_t type;    /* Copied from descriptor for cache-friendliness. */
-  union upb_symbol_ref ref;
-};
+ * cache-friendly, we put the data in the table by value. */
 
 struct upb_fieldsbynum_entry {
   struct upb_inttable_entry e;
-  struct upb_abbrev_msg_field f;
+  struct upb_msg_field f;
 };
 
 struct upb_fieldsbyname_entry {
   struct upb_strtable_entry e;
-  struct upb_abbrev_msg_field f;
+  struct upb_msg_field f;
 };
 
-INLINE struct upb_msg_field *upb_get_msg_field(
-    struct upb_abbrev_msg_field *f, struct upb_msg *m) {
-  return &m->fields[f->field_index];
+/* Can be used to retrieve a field descriptor given the upb_msg_field ref. */
+INLINE struct google_protobuf_FieldDescriptorProto *upb_msg_field_descriptor(
+    struct upb_msg_field *f, struct upb_msg *m) {
+  return m->field_descriptors[f->field_index];
 }
 
 /* Initialize and free a upb_msg.  Caller retains ownership of d, but the msg
  * will contain references to it, so it must outlive the msg.  Note that init
- * does not resolve upb_msg_field.ref -- that is left to the caller. */
+ * does not resolve upb_msg_field.ref -- the caller should do that
+ * post-initialization by calling upb_msg_ref() below. */
 bool upb_msg_init(struct upb_msg *m, struct google_protobuf_DescriptorProto *d);
 void upb_msg_free(struct upb_msg *m);
 
 /* Clients use this function on a previously initialized upb_msg to resolve the
- * "ref" field in the upb_msg_field and upb_abbrev_msg_field.  Since messages
- * can refer to each other in mutually-recursive ways, this step must be
- * separated from initialization.  The function is necessary because there are
- * multiple internal maps in which the ref appears. */
+ * "ref" field in the upb_msg_field.  Since messages can refer to each other in
+ * mutually-recursive ways, this step must be separated from initialization. */
 void upb_msg_ref(struct upb_msg *m, struct upb_msg_field *f, union upb_symbol_ref ref);
 
 /* While these are written to be as fast as possible, it will still be faster
  * to cache the results of this lookup if possible.  These return NULL if no
  * such field is found. */
-INLINE struct upb_abbrev_msg_field *upb_msg_fieldbynum(struct upb_msg *m,
-                                                       uint32_t number) {
+INLINE struct upb_msg_field *upb_msg_fieldbynum(struct upb_msg *m,
+                                                uint32_t number) {
   struct upb_fieldsbynum_entry *e = upb_inttable_lookup(
       &m->fields_by_num, number, sizeof(struct upb_fieldsbynum_entry));
   return e ? &e->f : NULL;
 }
-INLINE struct upb_abbrev_msg_field *upb_msg_fieldbyname(struct upb_msg *m,
-                                                        struct upb_string *name) {
+INLINE struct upb_msg_field *upb_msg_fieldbyname(struct upb_msg *m,
+                                                 struct upb_string *name) {
   struct upb_fieldsbyname_entry *e =
       upb_strtable_lookup(&m->fields_by_name, name);
   return e ? &e->f : NULL;
@@ -179,7 +176,7 @@ UPB_DEFINE_PRIMITIVE_ARRAY(bool,     bool)
 
 /* For each primitive type we define a set of six functions:
  *
- *  // For fetching out of a struct (s points to the raw struct data).
+ *  // For fetching out of a msg (s points to the raw msg data).
  *  int32_t *upb_msg_get_int32_ptr(void *s, struct upb_msg_field *f);
  *  int32_t upb_msg_get_int32(void *s, struct upb_msg_field *f);
  *  void upb_msg_set_int32(void *s, struct upb_msg_field *f, int32_t val);
@@ -232,9 +229,11 @@ UPB_DEFINE_ALL_ACCESSORS(uint64_t, uint64, INLINE)
 UPB_DEFINE_ALL_ACCESSORS(bool,     bool,   INLINE)
 UPB_DEFINE_ALL_ACCESSORS(struct upb_string*, bytes, INLINE)
 UPB_DEFINE_ALL_ACCESSORS(struct upb_string*, string, INLINE)
-UPB_DEFINE_ALL_ACCESSORS(void*, substruct, INLINE)
+UPB_DEFINE_ALL_ACCESSORS(void*,    submsg, INLINE)
 UPB_DEFINE_ACCESSORS(struct upb_array*, array, INLINE)
 
+/* "Set" flag reading and writing.  *******************************************/
+
 INLINE size_t upb_isset_offset(uint32_t field_index) {
   return field_index / 8;
 }
@@ -243,10 +242,9 @@ INLINE uint8_t upb_isset_mask(uint32_t field_index) {
   return 1 << (field_index % 8);
 }
 
-/* Functions for reading and writing the "set" flags in the pbstruct.  Note
- * that these do not perform any memory management associated with any dynamic
- * memory these fields may be referencing; that is the client's responsibility.
- * These *only* set and test the flags. */
+/* Functions for reading and writing the "set" flags in the msg.  Note that
+ * these do not perform memory management associated with any dynamic memory
+ * these fields may be referencing. These *only* set and test the flags. */
 INLINE void upb_msg_set(void *s, struct upb_msg_field *f)
 {
   ((char*)s)[upb_isset_offset(f->field_index)] |= upb_isset_mask(f->field_index);
diff --git a/upb_parse.c b/upb_parse.c
index 2b9c875..57cca2b 100644
--- a/upb_parse.c
+++ b/upb_parse.c
@@ -8,6 +8,7 @@
 
 #include <assert.h>
 #include <stddef.h>
+#include <stdlib.h>
 #include <string.h>
 #include "descriptor.h"
 
@@ -266,6 +267,22 @@ upb_status_t upb_parse_value(void **b, upb_field_type_t ft,
 #undef CASE
 }
 
+void upb_parse_state_init(struct upb_parse_state *state, size_t udata_size)
+{
+  state->offset = 0;
+  size_t stack_bytes = (sizeof(*state->stack) + udata_size) * UPB_MAX_NESTING;
+  state->stack = state->top = malloc(stack_bytes);
+  state->limit = (struct upb_parse_stack_frame*)((char*)state->stack + stack_bytes);
+  state->udata_size = udata_size;
+  state->done = false;
+  state->packed_end_offset = 0;
+}
+
+void upb_parse_state_free(struct upb_parse_state *state)
+{
+  free(state->stack);
+}
+
 static void pop_stack_frame(struct upb_parse_state *s)
 {
   s->submsg_end_cb(s);
author	Joshua Haberman <joshua@reverberate.org>	2009-06-28 15:41:53 -0700
committer	Joshua Haberman <joshua@reverberate.org>	2009-06-28 15:41:53 -0700
commit	5e2691460e9fb2ec9b77c1f9d133ae6b667afc3a (patch)
tree	f64dbba7d073ca6ee8b4c9e0c8ba77fa51a3b72d
parent	03616c86ea3ba2d8da1e5e6b342d717165b71655 (diff)