From 559e23c796f973a65d05c76e211835b126ee8ac8 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Fri, 17 Jun 2011 10:34:29 -0700 Subject: Major refactoring: abandon upb_msg, add upb_accessors. Next on the chopping block is upb_string. --- src/upb_msg.h | 429 ++++++++++++++++++++++++++-------------------------------- 1 file changed, 189 insertions(+), 240 deletions(-) (limited to 'src/upb_msg.h') diff --git a/src/upb_msg.h b/src/upb_msg.h index 4e1b4d5..b93037b 100644 --- a/src/upb_msg.h +++ b/src/upb_msg.h @@ -4,285 +4,122 @@ * Copyright (c) 2010-2011 Google Inc. See LICENSE for details. * Author: Josh Haberman * - * Data structure for storing a message of protobuf data. Unlike Google's - * protobuf, upb_msg and upb_array are reference counted instead of having - * exclusive ownership of their fields. This is a better match for dynamic - * languages where statements like a.b = other_b are normal. + * Routines for reading and writing message data to an in-memory structure, + * similar to a C struct. * - * upb's parsers and serializers could also be used to populate and serialize - * other kinds of message objects (even one generated by Google's protobuf). + * upb does not define one single message object that everyone must use. + * Rather it defines an abstract interface for reading and writing members + * of a message object, and all of the parsers and serializers use this + * abstract interface. This allows upb's parsers and serializers to be used + * regardless of what memory management scheme or synchronization model the + * application is using. * - * TODO: consider properly supporting const instances. + * A standard set of accessors is provided for doing simple reads and writes at + * a known offset into the message. These accessors should be used when + * possible, because they are specially optimized -- for example, the JIT can + * recognize them and emit specialized code instead of having to call the + * function at all. 
The application can substitute its own accessors when the + * standard accessors are not suitable. */ #ifndef UPB_MSG_H #define UPB_MSG_H #include +#include "upb_def.h" #include "upb_handlers.h" #ifdef __cplusplus extern "C" { #endif -// A pointer to a .proto value. The owner must have an out-of-band way of -// knowing the type, so it knows which union member to use. -typedef union { - double *_double; - float *_float; - int32_t *int32; - int64_t *int64; - uint8_t *uint8; - uint32_t *uint32; - uint64_t *uint64; - bool *_bool; - upb_string **str; - upb_msg **msg; - upb_array **arr; - void *_void; -} upb_valueptr; - -INLINE upb_valueptr upb_value_addrof(upb_value *val) { - upb_valueptr ptr = {&val->val._double}; - return ptr; -} -// Reads or writes a upb_value from an address represented by a upb_value_ptr. -// We need to know the value type to perform this operation, because we need to -// know how much memory to copy (and for big-endian machines, we need to know -// where in the upb_value the data goes). -// -// For little endian-machines where we didn't mind overreading, we could make -// upb_value_read simply use memcpy(). 
-INLINE upb_value upb_value_read(upb_valueptr ptr, upb_fieldtype_t ft) { - upb_value val; - -#ifdef NDEBUG -#define CASE(t, member_name) \ - case UPB_TYPE(t): val.val.member_name = *ptr.member_name; break; -#else -#define CASE(t, member_name) \ - case UPB_TYPE(t): val.val.member_name = *ptr.member_name; val.type = upb_types[ft].inmemory_type; break; -#endif +/* upb_accessor ***************************************************************/ - switch(ft) { - CASE(DOUBLE, _double) - CASE(FLOAT, _float) - CASE(INT32, int32) - CASE(INT64, int64) - CASE(UINT32, uint32) - CASE(UINT64, uint64) - CASE(SINT32, int32) - CASE(SINT64, int64) - CASE(FIXED32, uint32) - CASE(FIXED64, uint64) - CASE(SFIXED32, int32) - CASE(SFIXED64, int64) - CASE(BOOL, _bool) - CASE(ENUM, int32) - CASE(STRING, str) - CASE(BYTES, str) - CASE(MESSAGE, msg) - CASE(GROUP, msg) - case UPB_VALUETYPE_ARRAY: - val.val.arr = *ptr.arr; -#ifndef NDEBUG - val.type = UPB_VALUETYPE_ARRAY; -#endif - break; - default: assert(false); - } - return val; +// A upb_accessor is a table of function pointers for doing reads and writes +// for one specific upb_fielddef. Each field has a separate accessor, which +// lives in the fielddef. 
-#undef CASE -} +typedef bool upb_has_reader(void *m, upb_value fval); +typedef upb_value upb_value_reader(void *m, upb_value fval); -INLINE void upb_value_write(upb_valueptr ptr, upb_value val, - upb_fieldtype_t ft) { -#ifndef NDEBUG - if (ft == UPB_VALUETYPE_ARRAY) { - assert(val.type == UPB_VALUETYPE_ARRAY); - } else if (val.type != UPB_VALUETYPE_RAW) { - assert(val.type == upb_types[ft].inmemory_type); - } -#endif -#define CASE(t, member_name) \ - case UPB_TYPE(t): *ptr.member_name = val.val.member_name; break; - - switch(ft) { - CASE(DOUBLE, _double) - CASE(FLOAT, _float) - CASE(INT32, int32) - CASE(INT64, int64) - CASE(UINT32, uint32) - CASE(UINT64, uint64) - CASE(SINT32, int32) - CASE(SINT64, int64) - CASE(FIXED32, uint32) - CASE(FIXED64, uint64) - CASE(SFIXED32, int32) - CASE(SFIXED64, int64) - CASE(BOOL, _bool) - CASE(ENUM, int32) - CASE(STRING, str) - CASE(BYTES, str) - CASE(MESSAGE, msg) - CASE(GROUP, msg) - case UPB_VALUETYPE_ARRAY: - *ptr.arr = val.val.arr; - break; - default: assert(false); - } - -#undef CASE -} +typedef void *upb_seqbegin_handler(void *s); +typedef void *upb_seqnext_handler(void *s, void *iter); +typedef upb_value upb_seqget_handler(void *iter); +INLINE bool upb_seq_done(void *iter) { return iter == NULL; } +typedef struct _upb_accessor_vtbl { + // Writers. These take an fval as a parameter because the callbacks are used + // as upb_handlers, but the fval is always the fielddef for that field. + upb_startfield_handler *appendseq; // Repeated fields only. + upb_startfield_handler *appendsubmsg; // Submsg fields (repeated or no). + upb_value_handler *set; // Scalar fields (repeated or no). -/* upb_array ******************************************************************/ + // Readers. 
+ upb_has_reader *has; + upb_value_reader *get; + upb_seqbegin_handler *seqbegin; + upb_seqnext_handler *seqnext; + upb_seqget_handler *seqget; +} upb_accessor_vtbl; -typedef uint32_t upb_arraylen_t; -struct _upb_array { - upb_atomic_t refcount; - // "len" and "size" are measured in elements, not bytes. - int32_t len; - int32_t size; - char *ptr; -}; - -void _upb_array_free(upb_array *a, upb_fielddef *f); -INLINE upb_valueptr _upb_array_getptrforsize(upb_array *a, size_t type_size, - int32_t elem) { - assert(elem >= 0); - upb_valueptr p; - p._void = &a->ptr[elem * type_size]; - return p; -} +// Registers handlers for writing into a message of the given type. +upb_mhandlers *upb_accessors_reghandlers(upb_handlers *h, upb_msgdef *m); -INLINE upb_valueptr _upb_array_getptr(upb_array *a, upb_fielddef *f, - uint32_t elem) { - return _upb_array_getptrforsize(a, upb_types[f->type].size, elem); -} +// Returns an stdmsg accessor for the given fielddef. +upb_accessor_vtbl *upb_stdmsg_accessor(upb_fielddef *f); -upb_array *upb_array_new(void); -INLINE void upb_array_unref(upb_array *a, upb_fielddef *f) { - if (a && upb_atomic_unref(&a->refcount)) _upb_array_free(a, f); -} +/* upb_msg/upb_seq ************************************************************/ -void upb_array_recycle(upb_array **arr); -INLINE uint32_t upb_array_len(upb_array *a) { - return a->len; -} +// upb_msg and upb_seq allow for generic access to a message through its +// accessor vtable. Note that these do *not* allow you to create, destroy, or +// take references on the objects -- these operations are specifically outside +// the scope of what the accessors define. -INLINE upb_value upb_array_get(upb_array *arr, upb_fielddef *f, - upb_arraylen_t i) { - assert(i < upb_array_len(arr)); - return upb_value_read(_upb_array_getptr(arr, f, i), f->type); -} +// Clears all hasbits. +// TODO: Add a separate function for setting primitive values back to their +// defaults (but not strings, submessages, or arrays). 
+void upb_msg_clear(void *msg, upb_msgdef *md); +// Could add a method that recursively clears submessages, strings, and +// arrays if desired. This could be a win if you wanted to merge without +// needing hasbits, because during parsing you would never clear submessages +// or arrays. Also this could be desired to provide proto2 operations on +// generated messages. -/* upb_msg ********************************************************************/ - -// upb_msg is not self-describing; the upb_msg does not contain a pointer to the -// upb_msgdef. While this makes the API a bit more cumbersome to use, this -// choice was made for a few important reasons: -// -// 1. it would make every message 8 bytes larger on 64-bit platforms. This is -// a high overhead for small messages. -// 2. you would want the msg to own a ref on its msgdef, but this would require -// an atomic operation for every message create or destroy! -struct _upb_msg { - upb_atomic_t refcount; - uint8_t data[4]; // We allocate the appropriate amount per message. -}; - -void _upb_msg_free(upb_msg *msg, upb_msgdef *md); - -INLINE upb_valueptr _upb_msg_getptr(upb_msg *msg, upb_fielddef *f) { - upb_valueptr p; - p._void = &msg->data[f->byte_offset]; - return p; +INLINE bool upb_msg_has(void *m, upb_fielddef *f) { + return f->accessor && f->accessor->has(m, f->fval); } -// Creates a new msg of the given type. -upb_msg *upb_msg_new(upb_msgdef *md); - -// Unrefs the given message. -INLINE void upb_msg_unref(upb_msg *msg, upb_msgdef *md) { - if (msg && upb_atomic_unref(&msg->refcount)) _upb_msg_free(msg, md); -} - -INLINE upb_msg *upb_msg_getref(upb_msg *msg) { - assert(msg); - upb_atomic_ref(&msg->refcount); - return msg; +// May only be called for fields that are known to be set. +INLINE upb_value upb_msg_get(void *m, upb_fielddef *f) { + assert(upb_msg_has(m, f)); + return f->accessor->get(m, f->fval); } -// Modifies *msg to point to a newly initialized msg instance. 
If the msg had -// no other referents, reuses the same msg, otherwise allocates a new one. -// The caller *must* own a ref on the msg prior to calling this method! -void upb_msg_recycle(upb_msg **msg, upb_msgdef *msgdef); - -// Tests whether the given field is explicitly set, or whether it will return a -// default. -INLINE bool upb_msg_has(upb_msg *msg, upb_fielddef *f) { - return (msg->data[f->set_bit_offset] & f->set_bit_mask) != 0; +INLINE void *upb_seq_begin(void *s, upb_fielddef *f) { + assert(f->accessor); + return f->accessor->seqbegin(s); } - -// We have several options for handling default values: -// 1. inside upb_msg_clear(), overwrite all values to be their defaults, -// overwriting submessage pointers to point to the default instance again. -// 2. inside upb_msg_get(), test upb_msg_has() and return md->default_value -// if it is not set. upb_msg_clear() only clears the set bits. -// We lazily clear objects if/when we reuse them. -// 3. inside upb_msg_clear(), overwrite all values to be their default, -// and recurse into submessages to set all their values to defaults also. -// 4. as a hybrid of (1) and (3), clear all set bits in upb_msg_clear() -// but also overwrite all primitive values to be their defaults. Only -// accessors for non-primitive values (submessage, strings, and arrays) -// need to check the has-bits in their accessors -- primitive values can -// always be returned straight from the msg. -// -// (1) is undesirable, because it prevents us from caching sub-objects. -// (2) makes clear() cheaper, but makes get() branchier. -// (3) makes get() less branchy, but makes clear() traverse the message graph. -// (4) is probably the best bang for the buck. -// -// For the moment upb does (2), but we should implement (4). Google's protobuf -// does (3), which is likely part of the reason that even our table-based -// decoder beats it in some benchmarks. - -// For submessages and strings, the returned value is not owned. 
-upb_value upb_msg_get(upb_msg *msg, upb_fielddef *f); - -// A specialized version of the previous that is cheaper because it doesn't -// support submessages or arrays. -INLINE upb_value upb_msg_getscalar(upb_msg *msg, upb_fielddef *f) { - if (upb_msg_has(msg, f)) { - return upb_value_read(_upb_msg_getptr(msg, f), upb_field_valuetype(f)); - } else { - return f->default_value; - } +INLINE void *upb_seq_next(void *s, void *iter, upb_fielddef *f) { + assert(f->accessor); + assert(!upb_seq_done(iter)); + return f->accessor->seqnext(s, iter); } - -// Sets the given field to the given value. If the field is a string, array, -// or submessage, releases the ref on any object we may have been referencing -// and takes a ref on the new object (if any). -void upb_msg_set(upb_msg *msg, upb_fielddef *f, upb_value val); - -// Unsets all field values back to their defaults. -INLINE void upb_msg_clear(upb_msg *msg, upb_msgdef *md) { - memset(msg->data, 0, md->set_flags_bytes); +INLINE upb_value upb_seq_get(void *iter, upb_fielddef *f) { + assert(f->accessor); + assert(!upb_seq_done(iter)); + return f->accessor->seqget(iter); } -// Registers handlers for populating a msg for the given upb_msgdef. -// The upb_msg itself must be passed as the param to the src. -upb_mhandlers *upb_msg_reghandlers(upb_handlers *h, upb_msgdef *md); - /* upb_msgvisitor *************************************************************/ -// Calls a set of upb_handlers with the contents of a upb_msg. +// A upb_msgvisitor reads data from an in-memory structure using its accessors, +// pushing the results to a given set of upb_handlers. +// TODO: not yet implemented. + typedef struct { upb_fhandlers *fh; upb_fielddef *f; @@ -314,6 +151,118 @@ void upb_msgvisitor_uninit(upb_msgvisitor *v); void upb_msgvisitor_reset(upb_msgvisitor *v, upb_msg *m); void upb_msgvisitor_visit(upb_msgvisitor *v, upb_status *status); + +/* Standard writers. 
**********************************************************/ + +// Allocates a new stdmsg. +void *upb_stdmsg_new(upb_msgdef *md); + +// Recursively frees any strings or submessages that the message refers to. +void upb_stdmsg_free(void *m, upb_msgdef *md); + +// "hasbit" must be <= UPB_MAX_FIELDS. If it is <0, this field has no hasbit. +upb_value upb_stdmsg_packfval(int16_t hasbit, uint16_t value_offset); +upb_value upb_stdmsg_packfval_subm(int16_t hasbit, uint16_t value_offset, + uint16_t subm_size, uint8_t subm_setbytes); + +// Value writers for every in-memory type: write the data to a known offset +// from the closure "c" and set the hasbit (if any). +// TODO: can we get away with having only one for int64, uint64, double, etc? +// The main thing in the way atm is that the upb_value is strongly typed +// in debug mode. +upb_flow_t upb_stdmsg_setint64(void *c, upb_value fval, upb_value val); +upb_flow_t upb_stdmsg_setint32(void *c, upb_value fval, upb_value val); +upb_flow_t upb_stdmsg_setuint64(void *c, upb_value fval, upb_value val); +upb_flow_t upb_stdmsg_setuint32(void *c, upb_value fval, upb_value val); +upb_flow_t upb_stdmsg_setdouble(void *c, upb_value fval, upb_value val); +upb_flow_t upb_stdmsg_setfloat(void *c, upb_value fval, upb_value val); +upb_flow_t upb_stdmsg_setbool(void *c, upb_value fval, upb_value val); + +// Value writers for repeated fields: the closure points to a standard array +// struct, appends the value to the end of the array, resizing with realloc() +// if necessary. +typedef struct { + char *ptr; + int32_t len; // Number of elements present. + int32_t size; // Number of elements allocated. 
+} upb_stdarray; + +upb_flow_t upb_stdmsg_setint64_r(void *c, upb_value fval, upb_value val); +upb_flow_t upb_stdmsg_setint32_r(void *c, upb_value fval, upb_value val); +upb_flow_t upb_stdmsg_setuint64_r(void *c, upb_value fval, upb_value val); +upb_flow_t upb_stdmsg_setuint32_r(void *c, upb_value fval, upb_value val); +upb_flow_t upb_stdmsg_setdouble_r(void *c, upb_value fval, upb_value val); +upb_flow_t upb_stdmsg_setfloat_r(void *c, upb_value fval, upb_value val); +upb_flow_t upb_stdmsg_setbool_r(void *c, upb_value fval, upb_value val); + +// Writers for C strings (NUL-terminated): we can find a char* at a known +// offset from the closure "c". Calls realloc() on the pointer to allocate +// the memory (TODO: investigate whether checking malloc_usable_size() would +// be cheaper than realloc()). Also sets the hasbit, if any. +// +// Since the string is NUL-terminated and does not store an explicit length, +// these are not suitable for binary data that can contain NUL bytes. +upb_flow_t upb_stdmsg_setcstr(void *c, upb_value fval, upb_value val); +upb_flow_t upb_stdmsg_setcstr_r(void *c, upb_value fval, upb_value val); + +// Writers for length-delimited strings: we explicitly store the length, so +// the data can contain NUL bytes. Stores the data using upb_stdarray +// which is located at a known offset from the closure "c" (note that it +// is included inline rather than pointed to). Also sets the hasbit, if any. +upb_flow_t upb_stdmsg_setstr(void *c, upb_value fval, upb_value val); +upb_flow_t upb_stdmsg_setstr_r(void *c, upb_value fval, upb_value val); + +// Writers for startseq and startmsg which allocate (or reuse, if possible) +// a sub data structure (upb_stdarray or a submessage, respectively), +// setting the hasbit. If the hasbit is already set, the existing data +// structure is used verbatim. If the hasbit is not already set, the pointer +// is checked for NULL. If it is NULL, a new substructure is allocated, +// cleared, and used. 
If it is not NULL, the existing substructure is +// cleared and reused. +// +// If there is no hasbit, we always behave as if the hasbit was not set, +// so any existing data for this array or submessage is cleared. In most +// cases this will be fine since each array or non-repeated submessage should +// occur at most once in the stream. But if the client is using "concatenation +// as merging", it will want to make sure hasbits are allocated so merges can +// happen appropriately. +// +// If there was a demand for the behavior that absence of a hasbit acts as if +// the bit was always set, we could provide that also. But Clear() would need +// to act recursively, which is less efficient since it requires an extra pass +// over the tree. +upb_sflow_t upb_stdmsg_startseq(void *c, upb_value fval); +upb_sflow_t upb_stdmsg_startsubmsg(void *c, upb_value fval); +upb_sflow_t upb_stdmsg_startsubmsg_r(void *c, upb_value fval); + + +/* Standard readers. **********************************************************/ + +bool upb_stdmsg_has(void *c, upb_value fval); +void *upb_stdmsg_seqbegin(void *c); + +upb_value upb_stdmsg_getint64(void *c, upb_value fval); +upb_value upb_stdmsg_getint32(void *c, upb_value fval); +upb_value upb_stdmsg_getuint64(void *c, upb_value fval); +upb_value upb_stdmsg_getuint32(void *c, upb_value fval); +upb_value upb_stdmsg_getdouble(void *c, upb_value fval); +upb_value upb_stdmsg_getfloat(void *c, upb_value fval); +upb_value upb_stdmsg_getbool(void *c, upb_value fval); +upb_value upb_stdmsg_getptr(void *c, upb_value fval); + +void *upb_stdmsg_8byte_seqnext(void *c, void *iter); +void *upb_stdmsg_4byte_seqnext(void *c, void *iter); +void *upb_stdmsg_1byte_seqnext(void *c, void *iter); + +upb_value upb_stdmsg_seqgetint64(void *c); +upb_value upb_stdmsg_seqgetint32(void *c); +upb_value upb_stdmsg_seqgetuint64(void *c); +upb_value upb_stdmsg_seqgetuint32(void *c); +upb_value upb_stdmsg_seqgetdouble(void *c); +upb_value upb_stdmsg_seqgetfloat(void *c); 
+upb_value upb_stdmsg_seqgetbool(void *c); +upb_value upb_stdmsg_seqgetptr(void *c); + #ifdef __cplusplus } /* extern "C" */ #endif -- cgit v1.2.3