summaryrefslogtreecommitdiff
path: root/src/upb_msg.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/upb_msg.h')
-rw-r--r--src/upb_msg.h429
1 files changed, 189 insertions, 240 deletions
diff --git a/src/upb_msg.h b/src/upb_msg.h
index 4e1b4d5..b93037b 100644
--- a/src/upb_msg.h
+++ b/src/upb_msg.h
@@ -4,285 +4,122 @@
* Copyright (c) 2010-2011 Google Inc. See LICENSE for details.
* Author: Josh Haberman <jhaberman@gmail.com>
*
- * Data structure for storing a message of protobuf data. Unlike Google's
- * protobuf, upb_msg and upb_array are reference counted instead of having
- * exclusive ownership of their fields. This is a better match for dynamic
- * languages where statements like a.b = other_b are normal.
+ * Routines for reading and writing message data to an in-memory structure,
+ * similar to a C struct.
*
- * upb's parsers and serializers could also be used to populate and serialize
- * other kinds of message objects (even one generated by Google's protobuf).
+ * upb does not define one single message object that everyone must use.
+ * Rather it defines an abstract interface for reading and writing members
+ * of a message object, and all of the parsers and serializers use this
+ * abstract interface. This allows upb's parsers and serializers to be used
+ * regardless of what memory management scheme or synchronization model the
+ * application is using.
*
- * TODO: consider properly supporting const instances.
+ * A standard set of accessors is provided for doing simple reads and writes at
+ * a known offset into the message. These accessors should be used when
+ * possible, because they are specially optimized -- for example, the JIT can
+ * recognize them and emit specialized code instead of having to call the
+ * function at all. The application can substitute its own accessors when the
+ * standard accessors are not suitable.
*/
#ifndef UPB_MSG_H
#define UPB_MSG_H
#include <stdlib.h>
+#include "upb_def.h"
#include "upb_handlers.h"
#ifdef __cplusplus
extern "C" {
#endif
-// A pointer to a .proto value. The owner must have an out-of-band way of
-// knowing the type, so it knows which union member to use.
-typedef union {
- double *_double;
- float *_float;
- int32_t *int32;
- int64_t *int64;
- uint8_t *uint8;
- uint32_t *uint32;
- uint64_t *uint64;
- bool *_bool;
- upb_string **str;
- upb_msg **msg;
- upb_array **arr;
- void *_void;
-} upb_valueptr;
-
-INLINE upb_valueptr upb_value_addrof(upb_value *val) {
- upb_valueptr ptr = {&val->val._double};
- return ptr;
-}
-// Reads or writes a upb_value from an address represented by a upb_value_ptr.
-// We need to know the value type to perform this operation, because we need to
-// know how much memory to copy (and for big-endian machines, we need to know
-// where in the upb_value the data goes).
-//
-// For little endian-machines where we didn't mind overreading, we could make
-// upb_value_read simply use memcpy().
-INLINE upb_value upb_value_read(upb_valueptr ptr, upb_fieldtype_t ft) {
- upb_value val;
-
-#ifdef NDEBUG
-#define CASE(t, member_name) \
- case UPB_TYPE(t): val.val.member_name = *ptr.member_name; break;
-#else
-#define CASE(t, member_name) \
- case UPB_TYPE(t): val.val.member_name = *ptr.member_name; val.type = upb_types[ft].inmemory_type; break;
-#endif
+/* upb_accessor ***************************************************************/
- switch(ft) {
- CASE(DOUBLE, _double)
- CASE(FLOAT, _float)
- CASE(INT32, int32)
- CASE(INT64, int64)
- CASE(UINT32, uint32)
- CASE(UINT64, uint64)
- CASE(SINT32, int32)
- CASE(SINT64, int64)
- CASE(FIXED32, uint32)
- CASE(FIXED64, uint64)
- CASE(SFIXED32, int32)
- CASE(SFIXED64, int64)
- CASE(BOOL, _bool)
- CASE(ENUM, int32)
- CASE(STRING, str)
- CASE(BYTES, str)
- CASE(MESSAGE, msg)
- CASE(GROUP, msg)
- case UPB_VALUETYPE_ARRAY:
- val.val.arr = *ptr.arr;
-#ifndef NDEBUG
- val.type = UPB_VALUETYPE_ARRAY;
-#endif
- break;
- default: assert(false);
- }
- return val;
+// A upb_accessor is a table of function pointers for doing reads and writes
+// for one specific upb_fielddef. Each field has a separate accessor, which
+// lives in the fielddef.
-#undef CASE
-}
+typedef bool upb_has_reader(void *m, upb_value fval);
+typedef upb_value upb_value_reader(void *m, upb_value fval);
-INLINE void upb_value_write(upb_valueptr ptr, upb_value val,
- upb_fieldtype_t ft) {
-#ifndef NDEBUG
- if (ft == UPB_VALUETYPE_ARRAY) {
- assert(val.type == UPB_VALUETYPE_ARRAY);
- } else if (val.type != UPB_VALUETYPE_RAW) {
- assert(val.type == upb_types[ft].inmemory_type);
- }
-#endif
-#define CASE(t, member_name) \
- case UPB_TYPE(t): *ptr.member_name = val.val.member_name; break;
-
- switch(ft) {
- CASE(DOUBLE, _double)
- CASE(FLOAT, _float)
- CASE(INT32, int32)
- CASE(INT64, int64)
- CASE(UINT32, uint32)
- CASE(UINT64, uint64)
- CASE(SINT32, int32)
- CASE(SINT64, int64)
- CASE(FIXED32, uint32)
- CASE(FIXED64, uint64)
- CASE(SFIXED32, int32)
- CASE(SFIXED64, int64)
- CASE(BOOL, _bool)
- CASE(ENUM, int32)
- CASE(STRING, str)
- CASE(BYTES, str)
- CASE(MESSAGE, msg)
- CASE(GROUP, msg)
- case UPB_VALUETYPE_ARRAY:
- *ptr.arr = val.val.arr;
- break;
- default: assert(false);
- }
-
-#undef CASE
-}
+typedef void *upb_seqbegin_handler(void *s);
+typedef void *upb_seqnext_handler(void *s, void *iter);
+typedef upb_value upb_seqget_handler(void *iter);
+INLINE bool upb_seq_done(void *iter) { return iter == NULL; }
+typedef struct _upb_accessor_vtbl {
+ // Writers. These take an fval as a parameter because the callbacks are used
+ // as upb_handlers, but the fval is always the fielddef for that field.
+ upb_startfield_handler *appendseq; // Repeated fields only.
+ upb_startfield_handler *appendsubmsg; // Submsg fields (repeated or no).
+ upb_value_handler *set; // Scalar fields (repeated or no).
-/* upb_array ******************************************************************/
+ // Readers.
+ upb_has_reader *has;
+ upb_value_reader *get;
+ upb_seqbegin_handler *seqbegin;
+ upb_seqnext_handler *seqnext;
+ upb_seqget_handler *seqget;
+} upb_accessor_vtbl;
-typedef uint32_t upb_arraylen_t;
-struct _upb_array {
- upb_atomic_t refcount;
- // "len" and "size" are measured in elements, not bytes.
- int32_t len;
- int32_t size;
- char *ptr;
-};
-
-void _upb_array_free(upb_array *a, upb_fielddef *f);
-INLINE upb_valueptr _upb_array_getptrforsize(upb_array *a, size_t type_size,
- int32_t elem) {
- assert(elem >= 0);
- upb_valueptr p;
- p._void = &a->ptr[elem * type_size];
- return p;
-}
+// Registers handlers for writing into a message of the given type.
+upb_mhandlers *upb_accessors_reghandlers(upb_handlers *h, upb_msgdef *m);
-INLINE upb_valueptr _upb_array_getptr(upb_array *a, upb_fielddef *f,
- uint32_t elem) {
- return _upb_array_getptrforsize(a, upb_types[f->type].size, elem);
-}
+// Returns an stdmsg accessor for the given fielddef.
+upb_accessor_vtbl *upb_stdmsg_accessor(upb_fielddef *f);
-upb_array *upb_array_new(void);
-INLINE void upb_array_unref(upb_array *a, upb_fielddef *f) {
- if (a && upb_atomic_unref(&a->refcount)) _upb_array_free(a, f);
-}
+/* upb_msg/upb_seq ************************************************************/
-void upb_array_recycle(upb_array **arr);
-INLINE uint32_t upb_array_len(upb_array *a) {
- return a->len;
-}
+// upb_msg and upb_seq allow for generic access to a message through its
+// accessor vtable. Note that these do *not* allow you to create, destroy, or
+// take references on the objects -- these operations are specifically outside
+// the scope of what the accessors define.
-INLINE upb_value upb_array_get(upb_array *arr, upb_fielddef *f,
- upb_arraylen_t i) {
- assert(i < upb_array_len(arr));
- return upb_value_read(_upb_array_getptr(arr, f, i), f->type);
-}
+// Clears all hasbits.
+// TODO: Add a separate function for setting primitive values back to their
+// defaults (but not strings, submessages, or arrays).
+void upb_msg_clear(void *msg, upb_msgdef *md);
+// Could add a method that recursively clears submessages, strings, and
+// arrays if desired. This could be a win if you wanted to merge without
+// needing hasbits, because during parsing you would never clear submessages
+// or arrays. Also this could be desired to provide proto2 operations on
+// generated messages.
-/* upb_msg ********************************************************************/
-
-// upb_msg is not self-describing; the upb_msg does not contain a pointer to the
-// upb_msgdef. While this makes the API a bit more cumbersome to use, this
-// choice was made for a few important reasons:
-//
-// 1. it would make every message 8 bytes larger on 64-bit platforms. This is
-// a high overhead for small messages.
-// 2. you would want the msg to own a ref on its msgdef, but this would require
-// an atomic operation for every message create or destroy!
-struct _upb_msg {
- upb_atomic_t refcount;
- uint8_t data[4]; // We allocate the appropriate amount per message.
-};
-
-void _upb_msg_free(upb_msg *msg, upb_msgdef *md);
-
-INLINE upb_valueptr _upb_msg_getptr(upb_msg *msg, upb_fielddef *f) {
- upb_valueptr p;
- p._void = &msg->data[f->byte_offset];
- return p;
+INLINE bool upb_msg_has(void *m, upb_fielddef *f) {
+ return f->accessor && f->accessor->has(m, f->fval);
}
-// Creates a new msg of the given type.
-upb_msg *upb_msg_new(upb_msgdef *md);
-
-// Unrefs the given message.
-INLINE void upb_msg_unref(upb_msg *msg, upb_msgdef *md) {
- if (msg && upb_atomic_unref(&msg->refcount)) _upb_msg_free(msg, md);
-}
-
-INLINE upb_msg *upb_msg_getref(upb_msg *msg) {
- assert(msg);
- upb_atomic_ref(&msg->refcount);
- return msg;
+// May only be called for fields that are known to be set.
+INLINE upb_value upb_msg_get(void *m, upb_fielddef *f) {
+ assert(upb_msg_has(m, f));
+ return f->accessor->get(m, f->fval);
}
-// Modifies *msg to point to a newly initialized msg instance. If the msg had
-// no other referents, reuses the same msg, otherwise allocates a new one.
-// The caller *must* own a ref on the msg prior to calling this method!
-void upb_msg_recycle(upb_msg **msg, upb_msgdef *msgdef);
-
-// Tests whether the given field is explicitly set, or whether it will return a
-// default.
-INLINE bool upb_msg_has(upb_msg *msg, upb_fielddef *f) {
- return (msg->data[f->set_bit_offset] & f->set_bit_mask) != 0;
+INLINE void *upb_seq_begin(void *s, upb_fielddef *f) {
+ assert(f->accessor);
+ return f->accessor->seqbegin(s);
}
-
-// We have several options for handling default values:
-// 1. inside upb_msg_clear(), overwrite all values to be their defaults,
-// overwriting submessage pointers to point to the default instance again.
-// 2. inside upb_msg_get(), test upb_msg_has() and return md->default_value
-// if it is not set. upb_msg_clear() only clears the set bits.
-// We lazily clear objects if/when we reuse them.
-// 3. inside upb_msg_clear(), overwrite all values to be their default,
-// and recurse into submessages to set all their values to defaults also.
-// 4. as a hybrid of (1) and (3), clear all set bits in upb_msg_clear()
-// but also overwrite all primitive values to be their defaults. Only
-// accessors for non-primitive values (submessage, strings, and arrays)
-// need to check the has-bits in their accessors -- primitive values can
-// always be returned straight from the msg.
-//
-// (1) is undesirable, because it prevents us from caching sub-objects.
-// (2) makes clear() cheaper, but makes get() branchier.
-// (3) makes get() less branchy, but makes clear() traverse the message graph.
-// (4) is probably the best bang for the buck.
-//
-// For the moment upb does (2), but we should implement (4). Google's protobuf
-// does (3), which is likely part of the reason that even our table-based
-// decoder beats it in some benchmarks.
-
-// For submessages and strings, the returned value is not owned.
-upb_value upb_msg_get(upb_msg *msg, upb_fielddef *f);
-
-// A specialized version of the previous that is cheaper because it doesn't
-// support submessages or arrays.
-INLINE upb_value upb_msg_getscalar(upb_msg *msg, upb_fielddef *f) {
- if (upb_msg_has(msg, f)) {
- return upb_value_read(_upb_msg_getptr(msg, f), upb_field_valuetype(f));
- } else {
- return f->default_value;
- }
+INLINE void *upb_seq_next(void *s, void *iter, upb_fielddef *f) {
+ assert(f->accessor);
+ assert(!upb_seq_done(iter));
+ return f->accessor->seqnext(s, iter);
}
-
-// Sets the given field to the given value. If the field is a string, array,
-// or submessage, releases the ref on any object we may have been referencing
-// and takes a ref on the new object (if any).
-void upb_msg_set(upb_msg *msg, upb_fielddef *f, upb_value val);
-
-// Unsets all field values back to their defaults.
-INLINE void upb_msg_clear(upb_msg *msg, upb_msgdef *md) {
- memset(msg->data, 0, md->set_flags_bytes);
+INLINE upb_value upb_seq_get(void *iter, upb_fielddef *f) {
+ assert(f->accessor);
+ assert(!upb_seq_done(iter));
+ return f->accessor->seqget(iter);
}
-// Registers handlers for populating a msg for the given upb_msgdef.
-// The upb_msg itself must be passed as the param to the src.
-upb_mhandlers *upb_msg_reghandlers(upb_handlers *h, upb_msgdef *md);
-
/* upb_msgvisitor *************************************************************/
-// Calls a set of upb_handlers with the contents of a upb_msg.
+// A upb_msgvisitor reads data from an in-memory structure using its accessors,
+// pushing the results to a given set of upb_handlers.
+// TODO: not yet implemented.
+
typedef struct {
upb_fhandlers *fh;
upb_fielddef *f;
@@ -314,6 +151,118 @@ void upb_msgvisitor_uninit(upb_msgvisitor *v);
void upb_msgvisitor_reset(upb_msgvisitor *v, upb_msg *m);
void upb_msgvisitor_visit(upb_msgvisitor *v, upb_status *status);
+
+/* Standard writers. **********************************************************/
+
+// Allocates a new stdmsg.
+void *upb_stdmsg_new(upb_msgdef *md);
+
+// Recursively frees any strings or submessages that the message refers to.
+void upb_stdmsg_free(void *m, upb_msgdef *md);
+
+// "hasbit" must be <= UPB_MAX_FIELDS. If it is <0, this field has no hasbit.
+upb_value upb_stdmsg_packfval(int16_t hasbit, uint16_t value_offset);
+upb_value upb_stdmsg_packfval_subm(int16_t hasbit, uint16_t value_offset,
+ uint16_t subm_size, uint8_t subm_setbytes);
+
+// Value writers for every in-memory type: write the data to a known offset
+// from the closure "c" and set the hasbit (if any).
+// TODO: can we get away with having only one for int64, uint64, double, etc?
+// The main thing in the way at the moment is that the upb_value is strongly
+// typed in debug mode.
+upb_flow_t upb_stdmsg_setint64(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setint32(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setuint64(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setuint32(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setdouble(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setfloat(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setbool(void *c, upb_value fval, upb_value val);
+
+// Value writers for repeated fields: the closure points to a standard array
+// struct (upb_stdarray); each writer appends the value to the end of the
+// array, resizing it with realloc() if necessary.
+typedef struct {
+ char *ptr;
+ int32_t len; // Number of elements present.
+ int32_t size; // Number of elements allocated.
+} upb_stdarray;
+
+upb_flow_t upb_stdmsg_setint64_r(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setint32_r(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setuint64_r(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setuint32_r(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setdouble_r(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setfloat_r(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setbool_r(void *c, upb_value fval, upb_value val);
+
+// Writers for C strings (NUL-terminated): we can find a char* at a known
+// offset from the closure "c". Calls realloc() on the pointer to allocate
+// the memory (TODO: investigate whether checking malloc_usable_size() would
+// be cheaper than realloc()). Also sets the hasbit, if any.
+//
+// Since the string is NUL-terminated and does not store an explicit length,
+// these are not suitable for binary data that can contain NUL bytes.
+upb_flow_t upb_stdmsg_setcstr(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setcstr_r(void *c, upb_value fval, upb_value val);
+
+// Writers for length-delimited strings: we explicitly store the length, so
+// the data can contain NUL bytes. Stores the data in a upb_stdarray
+// which is located at a known offset from the closure "c" (note that it
+// is included inline rather than pointed to). Also sets the hasbit, if any.
+upb_flow_t upb_stdmsg_setstr(void *c, upb_value fval, upb_value val);
+upb_flow_t upb_stdmsg_setstr_r(void *c, upb_value fval, upb_value val);
+
+// Writers for startseq and startmsg which allocate (or reuse, if possible)
+// a sub data structure (upb_stdarray or a submessage, respectively),
+// setting the hasbit. If the hasbit is already set, the existing data
+// structure is used verbatim. If the hasbit is not already set, the pointer
+// is checked for NULL. If it is NULL, a new substructure is allocated,
+// cleared, and used. If it is not NULL, the existing substructure is
+// cleared and reused.
+//
+// If there is no hasbit, we always behave as if the hasbit was not set,
+// so any existing data for this array or submessage is cleared. In most
+// cases this will be fine since each array or non-repeated submessage should
+// occur at most once in the stream. But if the client is using "concatenation
+// as merging", it will want to make sure hasbits are allocated so merges can
+// happen appropriately.
+//
+// If there was a demand for the behavior that absence of a hasbit acts as if
+// the bit was always set, we could provide that also. But Clear() would need
+// to act recursively, which is less efficient since it requires an extra pass
+// over the tree.
+upb_sflow_t upb_stdmsg_startseq(void *c, upb_value fval);
+upb_sflow_t upb_stdmsg_startsubmsg(void *c, upb_value fval);
+upb_sflow_t upb_stdmsg_startsubmsg_r(void *c, upb_value fval);
+
+
+/* Standard readers. **********************************************************/
+
+bool upb_stdmsg_has(void *c, upb_value fval);
+void *upb_stdmsg_seqbegin(void *c);
+
+upb_value upb_stdmsg_getint64(void *c, upb_value fval);
+upb_value upb_stdmsg_getint32(void *c, upb_value fval);
+upb_value upb_stdmsg_getuint64(void *c, upb_value fval);
+upb_value upb_stdmsg_getuint32(void *c, upb_value fval);
+upb_value upb_stdmsg_getdouble(void *c, upb_value fval);
+upb_value upb_stdmsg_getfloat(void *c, upb_value fval);
+upb_value upb_stdmsg_getbool(void *c, upb_value fval);
+upb_value upb_stdmsg_getptr(void *c, upb_value fval);
+
+void *upb_stdmsg_8byte_seqnext(void *c, void *iter);
+void *upb_stdmsg_4byte_seqnext(void *c, void *iter);
+void *upb_stdmsg_1byte_seqnext(void *c, void *iter);
+
+upb_value upb_stdmsg_seqgetint64(void *c);
+upb_value upb_stdmsg_seqgetint32(void *c);
+upb_value upb_stdmsg_seqgetuint64(void *c);
+upb_value upb_stdmsg_seqgetuint32(void *c);
+upb_value upb_stdmsg_seqgetdouble(void *c);
+upb_value upb_stdmsg_seqgetfloat(void *c);
+upb_value upb_stdmsg_seqgetbool(void *c);
+upb_value upb_stdmsg_seqgetptr(void *c);
+
#ifdef __cplusplus
} /* extern "C" */
#endif
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback