summaryrefslogtreecommitdiff
path: root/upb_msg.h
diff options
context:
space:
mode:
Diffstat (limited to 'upb_msg.h')
-rw-r--r--upb_msg.h339
1 files changed, 196 insertions, 143 deletions
diff --git a/upb_msg.h b/upb_msg.h
index ce32783..a3f8c92 100644
--- a/upb_msg.h
+++ b/upb_msg.h
@@ -3,46 +3,100 @@
*
* Copyright (c) 2009 Joshua Haberman. See LICENSE for details.
*
- * upb_msg contains a full description of a message as defined in a .proto file.
- * It supports many features and operations for dealing with proto messages:
+ * A upb_msg provides a full description of a message as defined in a .proto
+ * file. It supports many features and operations for dealing with proto
+ * messages:
* - reflection over .proto types at runtime (list fields, get names, etc).
* - an in-memory byte-level format for efficiently storing and accessing msgs.
* - serializing and deserializing from the in-memory format to a protobuf.
* - optional memory management for handling strings, arrays, and submessages.
*
+ * Throughout this file, the following convention is used:
+ * - "struct upb_msg *m" describes a message type (name, list of fields, etc).
+ * - "void *data" is an actual message stored using the in-memory format.
+ *
* The in-memory format is very much like a C struct that you can define at
* run-time, but also supports reflection. Like C structs it supports
* offset-based access, as opposed to the much slower name-based lookup. The
- * format represents both the values themselves and bits describing whether each
- * field is set or not.
+ * format stores both the values themselves and bits describing whether each
+ * field is set or not. For example:
+ *
+ * parsed message Foo {
+ * optional bool a = 1;
+ * repeated uint32 b = 2;
+ * optional Bar c = 3;
+ * }
+ *
+ * The in-memory layout for this message on a 32-bit machine will be something
+ * like:
+ *
+ * Foo
+ * +------------------------+
+ * | set_flags a:1, b:1, c:1|
+ * +------------------------+
+ * | bool a (1 byte) |
+ * +------------------------+
+ * | padding (3 bytes) |
+ * +------------------------+ upb_array
+ * | upb_array* b (4 bytes) | ----> +----------------------------+
+ * +------------------------+ | uint32* elements (4 bytes) | ---+
+ * | Bar* c (4 bytes) | +----------------------------+ |
+ * +------------------------+ | uint32 len (4 bytes) | |
+ * +----------------------------+ |
+ * |
+ * -----------------------------------------------------------------+
+ * |
+ * V
+ * uint32 array
+ * +----+----+----+----+----+----+
+ * | e1 | e2 | e3 | e4 | e5 | e6 |
+ * +----+----+----+----+----+----+
+ *
+ * And the corresponding C structure (as emitted by the proto compiler) would be:
*
- * The upb compiler emits C structs that mimic this definition exactly, so that
- * you can access the same hunk of memory using either this run-time
- * reflection-supporting interface or a C struct that was generated by the upb
- * compiler.
+ * struct Foo {
+ * union {
+ * uint8_t bytes[1];
+ * struct {
+ * bool a:1;
+ * bool b:1;
+ * bool c:1;
+ * } has;
+ * } set_flags;
+ * bool a;
+ * upb_uint32_array *b;
+ * Bar *c;
+ * };
*
- * Like C structs the format depends on the endianness of the host machine, so
- * it is not suitable for exchanging across machines of differing endianness.
- * But there is no reason to do that -- the protobuf serialization format is
- * designed already for serialization/deserialization, and is more compact than
- * this format. This format is designed to allow the fastest possible random
- * access of individual fields.
+ * Because the C struct emitted by the upb compiler uses exactly the same
+ * byte-level format as the reflection interface, you can access the same hunk
+ * of memory either way. The C struct provides maximum performance and static
+ * type safety; upb_msg provides flexibility.
*
- * Note that clients need not use the memory management facilities defined here.
- * They are for convenience only -- clients wishing to do their own memory
- * management may do so (allowing clients to perform advanced techniques like
- * reference-counting, garbage collection, and string references). Different
+ * The in-memory format has no interoperability guarantees whatsoever, except
+ * that a single version of upb will interoperate with itself. Don't even
+ * think about persisting the in-memory format or sending it anywhere. That's
+ * what serialized protobufs are for! The in-memory format is just that -- an
+ * in-memory representation that allows for fast access.
+ *
+ * The in-memory format is carefully designed to *not* mandate any particular
+ * memory management scheme. This should make it easier to integrate with
+ * existing memory management schemes, or to perform advanced techniques like
+ * reference counting, garbage collection, and string references. Different
* clients can read each other's messages regardless of what memory management
* scheme each is using.
+ *
+ * A memory management scheme is provided for convenience, and it is used by
+ * default by the stock message parser. Clients can substitute their own
+ * memory management scheme into this parser without any loss of generality
+ * or performance.
*/
#ifndef UPB_MSG_H_
#define UPB_MSG_H_
#include <stdbool.h>
-#include <stddef.h>
#include <stdint.h>
-#include <string.h>
#include "upb.h"
#include "upb_table.h"
@@ -59,7 +113,10 @@ struct google_protobuf_FieldDescriptorProto;
/* Message definition. ********************************************************/
/* Structure that describes a single field in a message. This structure is very
- * consciously designed to fit into 12/16 bytes (32/64 bit, respectively). */
+ * consciously designed to fit into 12/16 bytes (32/64 bit, respectively),
+ * because copies of this struct are in the hash table that is read in the
+ * critical path of parsing. Minimizing the size of this struct increases
+ * cache-friendliness. */
struct upb_msg_field {
union upb_symbol_ref ref;
uint32_t byte_offset; /* Where to find the data. */
@@ -102,7 +159,7 @@ INLINE struct google_protobuf_FieldDescriptorProto *upb_msg_field_descriptor(
return m->field_descriptors[f->field_index];
}
-/* Initialize and free a upb_msg. Caller retains ownership of d, but the msg
+/* Initializes/frees a upb_msg. Caller retains ownership of d, but the msg
* will contain references to it, so it must outlive the msg. Note that init
* does not resolve upb_msg_field.ref -- the caller should do that
* post-initialization by calling upb_msg_ref() below. */
@@ -114,9 +171,9 @@ void upb_msg_free(struct upb_msg *m);
* mutually-recursive ways, this step must be separated from initialization. */
void upb_msg_ref(struct upb_msg *m, struct upb_msg_field *f, union upb_symbol_ref ref);
-/* While these are written to be as fast as possible, it will still be faster
- * to cache the results of this lookup if possible. These return NULL if no
- * such field is found. */
+/* Looks up a field by name or number. While these are written to be as fast
+ * as possible, it will still be faster to cache the results of this lookup if
+ * possible. These return NULL if no such field is found. */
INLINE struct upb_msg_field *upb_msg_fieldbynum(struct upb_msg *m,
uint32_t number) {
struct upb_fieldsbynum_entry *e = upb_inttable_lookup(
@@ -130,33 +187,69 @@ INLINE struct upb_msg_field *upb_msg_fieldbyname(struct upb_msg *m,
return e ? &e->f : NULL;
}
+/* "Set" flag reading and writing. *******************************************/
+
+INLINE size_t upb_isset_offset(uint32_t field_index) {
+ return field_index / 8;
+}
+
+INLINE uint8_t upb_isset_mask(uint32_t field_index) {
+ return 1 << (field_index % 8);
+}
+
+/* Functions for reading and writing the "set" flags in the msg. Note that
+ * these do not perform memory management associated with any dynamic memory
+ * these fields may be referencing. These *only* set and test the flags. */
+INLINE void upb_msg_set(void *s, struct upb_msg_field *f)
+{
+ ((char*)s)[upb_isset_offset(f->field_index)] |= upb_isset_mask(f->field_index);
+}
+
+INLINE void upb_msg_unset(void *s, struct upb_msg_field *f)
+{
+ ((char*)s)[upb_isset_offset(f->field_index)] &= ~upb_isset_mask(f->field_index);
+}
+
+INLINE bool upb_msg_is_set(void *s, struct upb_msg_field *f)
+{
+ return ((char*)s)[upb_isset_offset(f->field_index)] & upb_isset_mask(f->field_index);
+}
+
+INLINE bool upb_msg_all_required_fields_set(void *s, struct upb_msg *m)
+{
+ int num_fields = m->num_required_fields;
+ int i = 0;
+ while(num_fields > 8) {
+ if(((uint8_t*)s)[i++] != 0xFF) return false;
+ num_fields -= 8;
+ }
+ if(((uint8_t*)s)[i] != (1 << num_fields) - 1) return false;
+ return true;
+}
+
+INLINE void upb_msg_clear(void *s, struct upb_msg *m)
+{
+ memset(s, 0, m->set_flags_bytes);
+}
+
+/* Scalar (non-array) data access. ********************************************/
+
+/* Returns a pointer to a specific field in a message. */
+INLINE union upb_value_ptr upb_msg_getptr(void *data, struct upb_msg_field *f) {
+ union upb_value_ptr p = {._void = ((char*)data + f->byte_offset)};
+ return p;
+}
+
/* Arrays. ********************************************************************/
/* Represents an array (a repeated field) of any type. The interpretation of
* the data in the array depends on the type. */
struct upb_array {
- union {
- double *_double;
- float *_float;
- int32_t *int32;
- int64_t *int64;
- uint32_t *uint32;
- uint64_t *uint64;
- bool *_bool;
- struct upb_string **string;
- void **submsg;
- void *_void;
- } elements;
+ union upb_value_ptr elements;
uint32_t len; /* Measured in elements. */
};
-/* These are all overlays on upb_array, pointers between them can be cast. */
-#define UPB_DEFINE_ARRAY_TYPE(name, type) \
- struct name ## _array { \
- type *elements; \
- uint32_t len; \
- };
-
+/* Returns a pointer to an array element. */
INLINE union upb_value_ptr upb_array_getelementptr(
struct upb_array *arr, uint32_t n, upb_field_type_t type)
{
@@ -166,6 +259,13 @@ INLINE union upb_value_ptr upb_array_getelementptr(
return ptr;
}
+/* These are all overlays on upb_array, pointers between them can be cast. */
+#define UPB_DEFINE_ARRAY_TYPE(name, type) \
+ struct name ## _array { \
+ type *elements; \
+ uint32_t len; \
+ };
+
UPB_DEFINE_ARRAY_TYPE(upb_double, double)
UPB_DEFINE_ARRAY_TYPE(upb_float, float)
UPB_DEFINE_ARRAY_TYPE(upb_int32, int32_t)
@@ -175,6 +275,7 @@ UPB_DEFINE_ARRAY_TYPE(upb_uint64, uint64_t)
UPB_DEFINE_ARRAY_TYPE(upb_bool, bool)
UPB_DEFINE_ARRAY_TYPE(upb_string, struct upb_string*)
+/* Defines an array of a specific message type. */
#define UPB_MSG_ARRAY(msg_type) struct msg_type ## _array
#define UPB_DEFINE_MSG_ARRAY(msg_type) \
UPB_MSG_ARRAY(msg_type) { \
@@ -182,52 +283,42 @@ UPB_DEFINE_ARRAY_TYPE(upb_string, struct upb_string*)
uint32_t len; \
};
-/* Accessors for primitive types. ********************************************/
+/* Memory management *********************************************************/
-/* For each primitive type we define a set of three functions:
- *
- * // For fetching out of a msg (s points to the raw msg data).
- * int32_t *upb_msg_get_int32_ptr(void *s, struct upb_msg_field *f);
- * int32_t upb_msg_get_int32(void *s, struct upb_msg_field *f);
- * void upb_msg_set_int32(void *s, struct upb_msg_field *f, int32_t val);
- *
- * These do no existence checks, bounds checks, or type checks. */
-
-#define UPB_DEFINE_ACCESSORS(INLINE, name, ctype) \
- INLINE ctype *upb_msg_get_ ## name ## _ptr( \
- void *s, struct upb_msg_field *f) { \
- return (ctype*)((char*)s + f->byte_offset); \
- } \
- INLINE ctype upb_msg_get_ ## name( \
- void *s, struct upb_msg_field *f) { \
- return *upb_msg_get_ ## name ## _ptr(s, f); \
- } \
- INLINE void upb_msg_set_ ## name( \
- void *s, struct upb_msg_field *f, ctype val) { \
- *upb_msg_get_ ## name ## _ptr(s, f) = val; \
- }
+/* One important note about these memory management routines: they must be used
+ * completely or not at all (for each message). In other words, you can't
+ * allocate your own message and then free it with upb_msgdata_free. As
+ * another example, you can't point a field to your own string and then call
+ * upb_msg_reuse_str. */
+
+/* Allocates and frees message data, respectively. Newly allocated data is
+ * initialized to empty. Freeing a message always frees string data, but
+ * the client can decide whether or not submessages should be deleted. */
+void *upb_msgdata_new(struct upb_msg *m);
+void upb_msgdata_free(void *data, struct upb_msg *m, bool free_submsgs);
+
+/* Given a pointer to the appropriate field of the message or array, these
+ * functions will lazily allocate memory for a string, array, or submessage.
+ * If the previously allocated memory is big enough, it will reuse it without
+ * re-allocating. See upb_msg.c for example usage. */
+
+/* Reuse a string of at least the given size. */
+void upb_msg_reuse_str(struct upb_string **str, uint32_t size);
+/* Like upb_msg_reuse_str(), but assumes that the string will be held by
+ * reference, so it doesn't allocate memory for the string data itself. */
+void upb_msg_reuse_strref(struct upb_string **str);
-UPB_DEFINE_ACCESSORS(INLINE, double, double)
-UPB_DEFINE_ACCESSORS(INLINE, float, float)
-UPB_DEFINE_ACCESSORS(INLINE, int32, int32_t)
-UPB_DEFINE_ACCESSORS(INLINE, int64, int64_t)
-UPB_DEFINE_ACCESSORS(INLINE, uint32, uint32_t)
-UPB_DEFINE_ACCESSORS(INLINE, uint64, uint64_t)
-UPB_DEFINE_ACCESSORS(INLINE, bool, bool)
-UPB_DEFINE_ACCESSORS(INLINE, bytes, struct upb_string*)
-UPB_DEFINE_ACCESSORS(INLINE, string, struct upb_string*)
-UPB_DEFINE_ACCESSORS(INLINE, submsg, void*)
-UPB_DEFINE_ACCESSORS(INLINE, array, struct upb_array*)
-
-INLINE union upb_value_ptr upb_msg_get_ptr(
- void *data, struct upb_msg_field *f) {
- union upb_value_ptr p = {._void = ((char*)data + f->byte_offset)};
- return p;
-}
+/* Reuse an array of at least the given size, with the given type. */
+void upb_msg_reuse_array(struct upb_array **arr, uint32_t size,
+ upb_field_type_t t);
-/* Memory management *********************************************************/
+/* Reuse a submessage of the given type. */
+void upb_msg_reuse_submsg(void **msg, struct upb_msg *m);
-void *upb_msg_new(struct upb_msg *m);
+/* Serialization/Deserialization. ********************************************/
+
+/* This is all just a layer on top of the stream-oriented facility in
+ * upb_parse.h. */
struct upb_msg_parse_state {
struct upb_parse_state s;
@@ -236,70 +327,32 @@ struct upb_msg_parse_state {
struct upb_msg *m;
};
-void upb_msg_parse_init(struct upb_msg_parse_state *s, void *msg,
+/* Initializes/frees a message parser. The parser will write the data to the
+ * message data "data", which the caller must have previously allocated (the
+ * parser will allocate submsgs, strings, and arrays as needed, however).
+ *
+ * "Merge" controls whether the parser will append to data instead of
+ * overwriting. Merging concatenates arrays and merges submessages instead
+ * of clearing both.
+ *
+ * "Byref" controls whether the new message data copies or references strings
+ * it encounters. If byref == true, then all strings supplied to upb_msg_parse
+ * must remain unchanged and must outlive data. */
+void upb_msg_parse_init(struct upb_msg_parse_state *s, void *data,
struct upb_msg *m, bool merge, bool byref);
void upb_msg_parse_free(struct upb_msg_parse_state *s);
+
+/* Parses a protobuf fragment, writing the data to the message that was passed
+ * to upb_msg_parse_init. This function can be called multiple times as more
+ * data becomes available. */
upb_status_t upb_msg_parse(struct upb_msg_parse_state *s,
void *data, size_t len, size_t *read);
+/* Parses the protobuf in s (which is expected to be complete) and allocates
+ * new message data to hold it. This is an alternative to the streaming API
+ * above. "byref" works as in upb_msg_parse_init(). */
void *upb_alloc_and_parse(struct upb_msg *m, struct upb_string *s, bool byref);
-/* Note! These two may not be use on a upb_string* that was initialized by
- * means other than these functions. */
-void upb_msg_reuse_str(struct upb_string **str, uint32_t len);
-void upb_msg_reuse_array(struct upb_array **arr, uint32_t n, upb_field_type_t t);
-void upb_msg_reuse_strref(struct upb_string **str);
-void upb_msg_reuse_submsg(void **msg, struct upb_msg *m);
-
-/* "Set" flag reading and writing. *******************************************/
-
-INLINE size_t upb_isset_offset(uint32_t field_index) {
- return field_index / 8;
-}
-
-INLINE uint8_t upb_isset_mask(uint32_t field_index) {
- return 1 << (field_index % 8);
-}
-
-/* Functions for reading and writing the "set" flags in the msg. Note that
- * these do not perform memory management associated with any dynamic memory
- * these fields may be referencing. These *only* set and test the flags. */
-INLINE void upb_msg_set(void *s, struct upb_msg_field *f)
-{
- ((char*)s)[upb_isset_offset(f->field_index)] |= upb_isset_mask(f->field_index);
-}
-
-INLINE void upb_msg_unset(void *s, struct upb_msg_field *f)
-{
- ((char*)s)[upb_isset_offset(f->field_index)] &= ~upb_isset_mask(f->field_index);
-}
-
-INLINE bool upb_msg_is_set(void *s, struct upb_msg_field *f)
-{
- return ((char*)s)[upb_isset_offset(f->field_index)] & upb_isset_mask(f->field_index);
-}
-
-INLINE bool upb_msg_all_required_fields_set(void *s, struct upb_msg *m)
-{
- int num_fields = m->num_required_fields;
- int i = 0;
- while(num_fields > 8) {
- if(((uint8_t*)s)[i++] != 0xFF) return false;
- num_fields -= 8;
- }
- if(((uint8_t*)s)[i] != (1 << num_fields) - 1) return false;
- return true;
-}
-
-INLINE void upb_msg_clear(void *s, struct upb_msg *m)
-{
- memset(s, 0, m->set_flags_bytes);
-}
-
-/* Serialization/Deserialization. ********************************************/
-
-/* Parses the string data in s according to the message description in m. */
-upb_status_t upb_msg_merge(void *data, struct upb_msg *m, struct upb_string *s);
/* Text dump *****************************************************************/
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback