From 559e23c796f973a65d05c76e211835b126ee8ac8 Mon Sep 17 00:00:00 2001
From: Joshua Haberman <jhaberman@gmail.com>
Date: Fri, 17 Jun 2011 10:34:29 -0700
Subject: Major refactoring: abandon upb_msg, add upb_accessors.

Next on the chopping block is upb_string.
---
 src/upb_msg.h | 429 ++++++++++++++++++++++++++--------------------------------
 1 file changed, 189 insertions(+), 240 deletions(-)

(limited to 'src/upb_msg.h')

diff --git a/src/upb_msg.h b/src/upb_msg.h
index 4e1b4d5..b93037b 100644
--- a/src/upb_msg.h
+++ b/src/upb_msg.h
@@ -4,285 +4,122 @@
  * Copyright (c) 2010-2011 Google Inc.  See LICENSE for details.
  * Author: Josh Haberman <jhaberman@gmail.com>
  *
- * Data structure for storing a message of protobuf data.  Unlike Google's
- * protobuf, upb_msg and upb_array are reference counted instead of having
- * exclusive ownership of their fields.  This is a better match for dynamic
- * languages where statements like a.b = other_b are normal.
+ * Routines for reading and writing message data to an in-memory structure,
+ * similar to a C struct.
  *
- * upb's parsers and serializers could also be used to populate and serialize
- * other kinds of message objects (even one generated by Google's protobuf).
+ * upb does not define one single message object that everyone must use.
+ * Rather it defines an abstract interface for reading and writing members
+ * of a message object, and all of the parsers and serializers use this
+ * abstract interface.  This allows upb's parsers and serializers to be used
+ * regardless of what memory management scheme or synchronization model the
+ * application is using.
  *
- * TODO: consider properly supporting const instances.
+ * A standard set of accessors is provided for doing simple reads and writes at
+ * a known offset into the message.  These accessors should be used when
+ * possible, because they are specially optimized -- for example, the JIT can
+ * recognize them and emit specialized code instead of having to call the
+ * function at all.  The application can substitute its own accessors when the
+ * standard accessors are not suitable.
  */
 
 #ifndef UPB_MSG_H
 #define UPB_MSG_H
 
 #include <stdlib.h>
+#include "upb_def.h"
 #include "upb_handlers.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-// A pointer to a .proto value.  The owner must have an out-of-band way of
-// knowing the type, so it knows which union member to use.
-typedef union {
-  double *_double;
-  float *_float;
-  int32_t *int32;
-  int64_t *int64;
-  uint8_t *uint8;
-  uint32_t *uint32;
-  uint64_t *uint64;
-  bool *_bool;
-  upb_string **str;
-  upb_msg **msg;
-  upb_array **arr;
-  void *_void;
-} upb_valueptr;
-
-INLINE upb_valueptr upb_value_addrof(upb_value *val) {
-  upb_valueptr ptr = {&val->val._double};
-  return ptr;
-}
 
-// Reads or writes a upb_value from an address represented by a upb_value_ptr.
-// We need to know the value type to perform this operation, because we need to
-// know how much memory to copy (and for big-endian machines, we need to know
-// where in the upb_value the data goes).
-//
-// For little endian-machines where we didn't mind overreading, we could make
-// upb_value_read simply use memcpy().
-INLINE upb_value upb_value_read(upb_valueptr ptr, upb_fieldtype_t ft) {
-  upb_value val;
-
-#ifdef NDEBUG
-#define CASE(t, member_name) \
-  case UPB_TYPE(t): val.val.member_name = *ptr.member_name; break;
-#else
-#define CASE(t, member_name) \
-  case UPB_TYPE(t): val.val.member_name = *ptr.member_name; val.type = upb_types[ft].inmemory_type; break;
-#endif
+/* upb_accessor ***************************************************************/
 
-  switch(ft) {
-    CASE(DOUBLE,   _double)
-    CASE(FLOAT,    _float)
-    CASE(INT32,    int32)
-    CASE(INT64,    int64)
-    CASE(UINT32,   uint32)
-    CASE(UINT64,   uint64)
-    CASE(SINT32,   int32)
-    CASE(SINT64,   int64)
-    CASE(FIXED32,  uint32)
-    CASE(FIXED64,  uint64)
-    CASE(SFIXED32, int32)
-    CASE(SFIXED64, int64)
-    CASE(BOOL,     _bool)
-    CASE(ENUM,     int32)
-    CASE(STRING,   str)
-    CASE(BYTES,    str)
-    CASE(MESSAGE,  msg)
-    CASE(GROUP,    msg)
-    case UPB_VALUETYPE_ARRAY:
-      val.val.arr = *ptr.arr;
-#ifndef NDEBUG
-      val.type = UPB_VALUETYPE_ARRAY;
-#endif
-      break;
-    default: assert(false);
-  }
-  return val;
+// A upb_accessor is a table of function pointers for doing reads and writes
+// for one specific upb_fielddef.  Each field has a separate accessor, which
+// lives in the fielddef.
 
-#undef CASE
-}
+typedef bool upb_has_reader(void *m, upb_value fval);
+typedef upb_value upb_value_reader(void *m, upb_value fval);
 
-INLINE void upb_value_write(upb_valueptr ptr, upb_value val,
-                            upb_fieldtype_t ft) {
-#ifndef NDEBUG
-  if (ft == UPB_VALUETYPE_ARRAY) {
-    assert(val.type == UPB_VALUETYPE_ARRAY);
-  } else if (val.type != UPB_VALUETYPE_RAW) {
-    assert(val.type == upb_types[ft].inmemory_type);
-  }
-#endif
-#define CASE(t, member_name) \
-  case UPB_TYPE(t): *ptr.member_name = val.val.member_name; break;
-
-  switch(ft) {
-    CASE(DOUBLE,   _double)
-    CASE(FLOAT,    _float)
-    CASE(INT32,    int32)
-    CASE(INT64,    int64)
-    CASE(UINT32,   uint32)
-    CASE(UINT64,   uint64)
-    CASE(SINT32,   int32)
-    CASE(SINT64,   int64)
-    CASE(FIXED32,  uint32)
-    CASE(FIXED64,  uint64)
-    CASE(SFIXED32, int32)
-    CASE(SFIXED64, int64)
-    CASE(BOOL,     _bool)
-    CASE(ENUM,     int32)
-    CASE(STRING,   str)
-    CASE(BYTES,    str)
-    CASE(MESSAGE,  msg)
-    CASE(GROUP,    msg)
-    case UPB_VALUETYPE_ARRAY:
-      *ptr.arr = val.val.arr;
-      break;
-    default: assert(false);
-  }
-
-#undef CASE
-}
+typedef void *upb_seqbegin_handler(void *s);
+typedef void *upb_seqnext_handler(void *s, void *iter);
+typedef upb_value upb_seqget_handler(void *iter);
+INLINE bool upb_seq_done(void *iter) { return iter == NULL; }
 
+typedef struct _upb_accessor_vtbl {
+  // Writers.  These take an fval as a parameter because the callbacks are used
+  // as upb_handlers, but the fval is always the fielddef for that field.
+  upb_startfield_handler *appendseq;     // Repeated fields only.
+  upb_startfield_handler *appendsubmsg;  // Submsg fields (repeated or no).
+  upb_value_handler      *set;           // Scalar fields (repeated or no).
 
-/* upb_array ******************************************************************/
+  // Readers.
+  upb_has_reader         *has;
+  upb_value_reader       *get;
+  upb_seqbegin_handler   *seqbegin;
+  upb_seqnext_handler    *seqnext;
+  upb_seqget_handler     *seqget;
+} upb_accessor_vtbl;
 
-typedef uint32_t upb_arraylen_t;
-struct _upb_array {
-  upb_atomic_t refcount;
-  // "len" and "size" are measured in elements, not bytes.
-  int32_t len;
-  int32_t size;
-  char *ptr;
-};
-
-void _upb_array_free(upb_array *a, upb_fielddef *f);
-INLINE upb_valueptr _upb_array_getptrforsize(upb_array *a, size_t type_size,
-                                             int32_t elem) {
-  assert(elem >= 0);
-  upb_valueptr p;
-  p._void = &a->ptr[elem * type_size];
-  return p;
-}
+// Registers handlers for writing into a message of the given type.
+upb_mhandlers *upb_accessors_reghandlers(upb_handlers *h, upb_msgdef *m);
 
-INLINE upb_valueptr _upb_array_getptr(upb_array *a, upb_fielddef *f,
-                                      uint32_t elem) {
-  return _upb_array_getptrforsize(a, upb_types[f->type].size, elem);
-}
+// Returns an stdmsg accessor for the given fielddef.
+upb_accessor_vtbl *upb_stdmsg_accessor(upb_fielddef *f);
 
-upb_array *upb_array_new(void);
 
-INLINE void upb_array_unref(upb_array *a, upb_fielddef *f) {
-  if (a && upb_atomic_unref(&a->refcount)) _upb_array_free(a, f);
-}
+/* upb_msg/upb_seq ************************************************************/
 
-void upb_array_recycle(upb_array **arr);
-INLINE uint32_t upb_array_len(upb_array *a) {
-  return a->len;
-}
+// upb_msg and upb_seq allow for generic access to a message through its
+// accessor vtable.  Note that these do *not* allow you to create, destroy, or
+// take references on the objects -- these operations are specifically outside
+// the scope of what the accessors define.
 
-INLINE upb_value upb_array_get(upb_array *arr, upb_fielddef *f,
-                               upb_arraylen_t i) {
-  assert(i < upb_array_len(arr));
-  return upb_value_read(_upb_array_getptr(arr, f, i), f->type);
-}
+// Clears all hasbits.
+// TODO: Add a separate function for setting primitive values back to their
+// defaults (but not strings, submessages, or arrays).
+void upb_msg_clear(void *msg, upb_msgdef *md);
 
+// Could add a method that recursively clears submessages, strings, and
+// arrays if desired.  This could be a win if you wanted to merge without
+// needing hasbits, because during parsing you would never clear submessages
+// or arrays.  Also this could be desired to provide proto2 operations on
+// generated messages.
 
-/* upb_msg ********************************************************************/
-
-// upb_msg is not self-describing; the upb_msg does not contain a pointer to the
-// upb_msgdef.  While this makes the API a bit more cumbersome to use, this
-// choice was made for a few important reasons:
-//
-// 1. it would make every message 8 bytes larger on 64-bit platforms.  This is
-//    a high overhead for small messages.
-// 2. you would want the msg to own a ref on its msgdef, but this would require
-//    an atomic operation for every message create or destroy!
-struct _upb_msg {
-  upb_atomic_t refcount;
-  uint8_t data[4];  // We allocate the appropriate amount per message.
-};
-
-void _upb_msg_free(upb_msg *msg, upb_msgdef *md);
-
-INLINE upb_valueptr _upb_msg_getptr(upb_msg *msg, upb_fielddef *f) {
-  upb_valueptr p;
-  p._void = &msg->data[f->byte_offset];
-  return p;
+INLINE bool upb_msg_has(void *m, upb_fielddef *f) {
+  return f->accessor && f->accessor->has(m, f->fval);
 }
 
-// Creates a new msg of the given type.
-upb_msg *upb_msg_new(upb_msgdef *md);
-
-// Unrefs the given message.
-INLINE void upb_msg_unref(upb_msg *msg, upb_msgdef *md) {
-  if (msg && upb_atomic_unref(&msg->refcount)) _upb_msg_free(msg, md);
-}
-
-INLINE upb_msg *upb_msg_getref(upb_msg *msg) {
-  assert(msg);
-  upb_atomic_ref(&msg->refcount);
-  return msg;
+// May only be called for fields that are known to be set.
+INLINE upb_value upb_msg_get(void *m, upb_fielddef *f) {
+  assert(upb_msg_has(m, f));
+  return f->accessor->get(m, f->fval);
 }
 
-// Modifies *msg to point to a newly initialized msg instance.  If the msg had
-// no other referents, reuses the same msg, otherwise allocates a new one.
-// The caller *must* own a ref on the msg prior to calling this method!
-void upb_msg_recycle(upb_msg **msg, upb_msgdef *msgdef);
-
-// Tests whether the given field is explicitly set, or whether it will return a
-// default.
-INLINE bool upb_msg_has(upb_msg *msg, upb_fielddef *f) {
-  return (msg->data[f->set_bit_offset] & f->set_bit_mask) != 0;
+INLINE void *upb_seq_begin(void *s, upb_fielddef *f) {
+  assert(f->accessor);
+  return f->accessor->seqbegin(s);
 }
-
-// We have several options for handling default values:
-// 1. inside upb_msg_clear(), overwrite all values to be their defaults,
-//    overwriting submessage pointers to point to the default instance again.
-// 2. inside upb_msg_get(), test upb_msg_has() and return md->default_value
-//    if it is not set.  upb_msg_clear() only clears the set bits.
-//    We lazily clear objects if/when we reuse them.
-// 3. inside upb_msg_clear(), overwrite all values to be their default,
-//    and recurse into submessages to set all their values to defaults also.
-// 4. as a hybrid of (1) and (3), clear all set bits in upb_msg_clear()
-//    but also overwrite all primitive values to be their defaults.  Only
-//    accessors for non-primitive values (submessage, strings, and arrays)
-//    need to check the has-bits in their accessors -- primitive values can
-//    always be returned straight from the msg.
-//
-// (1) is undesirable, because it prevents us from caching sub-objects.
-// (2) makes clear() cheaper, but makes get() branchier.
-// (3) makes get() less branchy, but makes clear() traverse the message graph.
-// (4) is probably the best bang for the buck.
-//
-// For the moment upb does (2), but we should implement (4).  Google's protobuf
-// does (3), which is likely part of the reason that even our table-based
-// decoder beats it in some benchmarks.
-
-// For submessages and strings, the returned value is not owned.
-upb_value upb_msg_get(upb_msg *msg, upb_fielddef *f);
-
-// A specialized version of the previous that is cheaper because it doesn't
-// support submessages or arrays.
-INLINE upb_value upb_msg_getscalar(upb_msg *msg, upb_fielddef *f) {
-  if (upb_msg_has(msg, f)) {
-    return upb_value_read(_upb_msg_getptr(msg, f), upb_field_valuetype(f));
-  } else {
-    return f->default_value;
-  }
+INLINE void *upb_seq_next(void *s, void *iter, upb_fielddef *f) {
+  assert(f->accessor);
+  assert(!upb_seq_done(iter));
+  return f->accessor->seqnext(s, iter);
 }
-
-// Sets the given field to the given value.  If the field is a string, array,
-// or submessage, releases the ref on any object we may have been referencing
-// and takes a ref on the new object (if any).
-void upb_msg_set(upb_msg *msg, upb_fielddef *f, upb_value val);
-
-// Unsets all field values back to their defaults.
-INLINE void upb_msg_clear(upb_msg *msg, upb_msgdef *md) {
-  memset(msg->data, 0, md->set_flags_bytes);
+INLINE upb_value upb_seq_get(void *iter, upb_fielddef *f) {
+  assert(f->accessor);
+  assert(!upb_seq_done(iter));
+  return f->accessor->seqget(iter);
 }
 
-// Registers handlers for populating a msg for the given upb_msgdef.
-// The upb_msg itself must be passed as the param to the src.
-upb_mhandlers *upb_msg_reghandlers(upb_handlers *h, upb_msgdef *md);
-
 
 /* upb_msgvisitor *************************************************************/
 
-// Calls a set of upb_handlers with the contents of a upb_msg.
+// A upb_msgvisitor reads data from an in-memory structure using its accessors,
+// pushing the results to a given set of upb_handlers.
+// TODO: not yet implemented.
+
 typedef struct {
   upb_fhandlers *fh;
   upb_fielddef *f;
@@ -314,6 +151,118 @@ void upb_msgvisitor_uninit(upb_msgvisitor *v);
 void upb_msgvisitor_reset(upb_msgvisitor *v, upb_msg *m);
 void upb_msgvisitor_visit(upb_msgvisitor *v, upb_status *status);
 
+
+/* Standard writers. **********************************************************/
+
+// Allocates a new stdmsg.
+void *upb_stdmsg_new(upb_msgdef *md);
+
+// Recursively frees any strings or submessages that the message refers to.
+void upb_stdmsg_free(void *m, upb_msgdef *md);
+
+// "hasbit" must be <= UPB_MAX_FIELDS.  If it is <0, this field has no hasbit.
+upb_value upb_stdmsg_packfval(int16_t hasbit, uint16_t value_offset);
+upb_value upb_stdmsg_packfval_subm(int16_t hasbit, uint16_t value_offset,
+                                   uint16_t subm_size, uint8_t subm_setbytes);
+
+// Value writers for every in-memory type: write the data to a known offset
+// from the closure "c" and set the hasbit (if any).
+// TODO: can we get away with having only one for int64, uint64, double, etc?
+// The main thing in the way at the moment is that the upb_value is strongly
+// typed in debug mode.
+upb_flow_t upb_stdmsg_setint64(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setint32(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setuint64(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setuint32(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setdouble(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setfloat(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setbool(void *c, upb_value fval, upb_value val);
+
+// Value writers for repeated fields: the closure points to a standard array
+// struct; the writer appends the value to the end of the array, resizing it
+// with realloc() if necessary.
+typedef struct {
+  char *ptr;
+  int32_t len;   // Number of elements present.
+  int32_t size;  // Number of elements allocated.
+} upb_stdarray;
+
+upb_flow_t upb_stdmsg_setint64_r(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setint32_r(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setuint64_r(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setuint32_r(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setdouble_r(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setfloat_r(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setbool_r(void *c, upb_value fval, upb_value val);
+
+// Writers for C strings (NUL-terminated): we can find a char* at a known
+// offset from the closure "c".  Calls realloc() on the pointer to allocate
+// the memory (TODO: investigate whether checking malloc_usable_size() would
+// be cheaper than realloc()).  Also sets the hasbit, if any.
+//
+// Since the string is NUL-terminated and does not store an explicit length,
+// these are not suitable for binary data that can contain NUL bytes.
+upb_flow_t upb_stdmsg_setcstr(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setcstr_r(void *c, upb_value fval, upb_value val);
+
+// Writers for length-delimited strings: we explicitly store the length, so
+// the data can contain NUL bytes.  Stores the data in a upb_stdarray, which
+// is located at a known offset from the closure "c" (note that it is
+// included inline rather than pointed to).  Also sets the hasbit, if any.
+upb_flow_t upb_stdmsg_setstr(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setstr_r(void *c, upb_value fval, upb_value val);
+
+// Writers for startseq and startmsg which allocate (or reuse, if possible)
+// a sub data structure (upb_stdarray or a submessage, respectively),
+// setting the hasbit.  If the hasbit is already set, the existing data
+// structure is used verbatim.  If the hasbit is not already set, the pointer
+// is checked for NULL.  If it is NULL, a new substructure is allocated,
+// cleared, and used.  If it is not NULL, the existing substructure is
+// cleared and reused.
+//
+// If there is no hasbit, we always behave as if the hasbit was not set,
+// so any existing data for this array or submessage is cleared.  In most
+// cases this will be fine since each array or non-repeated submessage should
+// occur at most once in the stream.  But if the client is using "concatenation
+// as merging", it will want to make sure hasbits are allocated so merges can
+// happen appropriately.
+//
+// If there was a demand for the behavior that absence of a hasbit acts as if
+// the bit was always set, we could provide that also.  But Clear() would need
+// to act recursively, which is less efficient since it requires an extra pass
+// over the tree.
+upb_sflow_t upb_stdmsg_startseq(void *c, upb_value fval);
+upb_sflow_t upb_stdmsg_startsubmsg(void *c, upb_value fval);
+upb_sflow_t upb_stdmsg_startsubmsg_r(void *c, upb_value fval);
+
+
+/* Standard readers. **********************************************************/
+
+bool upb_stdmsg_has(void *c, upb_value fval);
+void *upb_stdmsg_seqbegin(void *c);
+
+upb_value upb_stdmsg_getint64(void *c, upb_value fval);
+upb_value upb_stdmsg_getint32(void *c, upb_value fval);
+upb_value upb_stdmsg_getuint64(void *c, upb_value fval);
+upb_value upb_stdmsg_getuint32(void *c, upb_value fval);
+upb_value upb_stdmsg_getdouble(void *c, upb_value fval);
+upb_value upb_stdmsg_getfloat(void *c, upb_value fval);
+upb_value upb_stdmsg_getbool(void *c, upb_value fval);
+upb_value upb_stdmsg_getptr(void *c, upb_value fval);
+
+void *upb_stdmsg_8byte_seqnext(void *c, void *iter);
+void *upb_stdmsg_4byte_seqnext(void *c, void *iter);
+void *upb_stdmsg_1byte_seqnext(void *c, void *iter);
+
+upb_value upb_stdmsg_seqgetint64(void *c);
+upb_value upb_stdmsg_seqgetint32(void *c);
+upb_value upb_stdmsg_seqgetuint64(void *c);
+upb_value upb_stdmsg_seqgetuint32(void *c);
+upb_value upb_stdmsg_seqgetdouble(void *c);
+upb_value upb_stdmsg_seqgetfloat(void *c);
+upb_value upb_stdmsg_seqgetbool(void *c);
+upb_value upb_stdmsg_seqgetptr(void *c);
+
 #ifdef __cplusplus
 }  /* extern "C" */
 #endif
-- 
cgit v1.2.3