diff options
Diffstat (limited to 'upb_msg.h')
-rw-r--r-- | upb_msg.h | 339 |
1 files changed, 196 insertions, 143 deletions
@@ -3,46 +3,100 @@ * * Copyright (c) 2009 Joshua Haberman. See LICENSE for details. * - * upb_msg contains a full description of a message as defined in a .proto file. - * It supports many features and operations for dealing with proto messages: + * A upb_msg provides a full description of a message as defined in a .proto + * file. It supports many features and operations for dealing with proto + * messages: * - reflection over .proto types at runtime (list fields, get names, etc). * - an in-memory byte-level format for efficiently storing and accessing msgs. * - serializing and deserializing from the in-memory format to a protobuf. * - optional memory management for handling strings, arrays, and submessages. * + * Throughout this file, the following convention is used: + * - "struct upb_msg *m" describes a message type (name, list of fields, etc). + * - "void *data" is an actual message stored using the in-memory format. + * * The in-memory format is very much like a C struct that you can define at * run-time, but also supports reflection. Like C structs it supports * offset-based access, as opposed to the much slower name-based lookup. The - * format represents both the values themselves and bits describing whether each - * field is set or not. + * format stores both the values themselves and bits describing whether each + * field is set or not. For example: + * + * parsed message Foo { + * optional bool a = 1; + * repeated uint32 b = 2; + * optional Bar c = 3; + * } + * + * The in-memory layout for this message on a 32-bit machine will be something + * like: + * + * Foo + * +------------------------+ + * | set_flags a:1, b:1, c:1| + * +------------------------+ + * | bool a (1 byte) | + * +------------------------+ + * | padding (3 bytes) | + * +------------------------+ upb_array + * | upb_array* b (4 bytes) | ----> +----------------------------+ + * +------------------------+ | uint32* elements (4 bytes) | ---+ + * | Bar* c (4 bytes) | +----------------------------+ | + * +------------------------+ | uint32 size (4 bytes) | | + * +----------------------------+ | + * | + * -----------------------------------------------------------------+ + * | + * V + * uint32 array + * +----+----+----+----+----+----+ + * | e1 | e2 | e3 | e4 | e5 | e6 | + * +----+----+----+----+----+----+ + * + * And the corresponding C structure (as emitted by the proto compiler) would be: * - * The upb compiler emits C structs that mimic this definition exactly, so that - * you can access the same hunk of memory using either this run-time - * reflection-supporting interface or a C struct that was generated by the upb - * compiler. + * struct Foo { + * union { + * uint8_t bytes[1]; + * struct { + * bool a:1; + * bool b:1; + * bool c:1; + * } has; + * } set_flags; + * bool a; + * upb_uint32_array *b; + * Bar *c; + * } * - * Like C structs the format depends on the endianness of the host machine, so - * it is not suitable for exchanging across machines of differing endianness. - * But there is no reason to do that -- the protobuf serialization format is - * designed already for serialization/deserialization, and is more compact than - * this format. This format is designed to allow the fastest possible random - * access of individual fields. + * Because the C struct emitted by the upb compiler uses exactly the same + * byte-level format as the reflection interface, you can access the same hunk + * of memory either way. The C struct provides maximum performance and static + * type safety; upb_msg provides flexibility. * - * Note that clients need not use the memory management facilities defined here. - * They are for convenience only -- clients wishing to do their own memory - * management may do so (allowing clients to perform advanced techniques like - * reference-counting, garbage collection, and string references). Different + * The in-memory format has no interoperability guarantees whatsoever, except + * that a single version of upb will interoperate with itself. Don't even + * think about persisting the in-memory format or sending it anywhere. That's + * what serialized protobufs are for! The in-memory format is just that -- an + * in-memory representation that allows for fast access. + * + * The in-memory format is carefully designed to *not* mandate any particular + * memory management scheme. This should make it easier to integrate with + * existing memory management schemes, or to perform advanced techniques like + * reference counting, garbage collection, and string references. Different * clients can read each others messages regardless of what memory management * scheme each is using. + * + * A memory management scheme is provided for convenience, and it is used by + * default by the stock message parser. Clients can substitute their own + * memory management scheme into this parser without any loss of generality + * or performance. */ #ifndef UPB_MSG_H_ #define UPB_MSG_H_ #include <stdbool.h> -#include <stddef.h> #include <stdint.h> -#include <string.h> #include "upb.h" #include "upb_table.h" @@ -59,7 +113,10 @@ struct google_protobuf_FieldDescriptorProto; /* Message definition. ********************************************************/ /* Structure that describes a single field in a message. This structure is very - * consciously designed to fit into 12/16 bytes (32/64 bit, respectively). */ + * consciously designed to fit into 12/16 bytes (32/64 bit, respectively), + * because copies of this struct are in the hash table that is read in the + * critical path of parsing. Minimizing the size of this struct increases + * cache-friendliness. */ struct upb_msg_field { union upb_symbol_ref ref; uint32_t byte_offset; /* Where to find the data. */ @@ -102,7 +159,7 @@ INLINE struct google_protobuf_FieldDescriptorProto *upb_msg_field_descriptor( return m->field_descriptors[f->field_index]; } -/* Initialize and free a upb_msg. Caller retains ownership of d, but the msg +/* Initializes/frees a upb_msg. Caller retains ownership of d, but the msg * will contain references to it, so it must outlive the msg. Note that init * does not resolve upb_msg_field.ref -- the caller should do that * post-initialization by calling upb_msg_ref() below. */ @@ -114,9 +171,9 @@ void upb_msg_free(struct upb_msg *m); * mutually-recursive ways, this step must be separated from initialization. */ void upb_msg_ref(struct upb_msg *m, struct upb_msg_field *f, union upb_symbol_ref ref); -/* While these are written to be as fast as possible, it will still be faster - * to cache the results of this lookup if possible. These return NULL if no - * such field is found. */ +/* Looks up a field by name or number. While these are written to be as fast + * as possible, it will still be faster to cache the results of this lookup if + * possible. These return NULL if no such field is found. */ INLINE struct upb_msg_field *upb_msg_fieldbynum(struct upb_msg *m, uint32_t number) { struct upb_fieldsbynum_entry *e = upb_inttable_lookup( @@ -130,33 +187,69 @@ INLINE struct upb_msg_field *upb_msg_fieldbyname(struct upb_msg *m, return e ? &e->f : NULL; } +/* "Set" flag reading and writing. *******************************************/ + +INLINE size_t upb_isset_offset(uint32_t field_index) { + return field_index / 8; +} + +INLINE uint8_t upb_isset_mask(uint32_t field_index) { + return 1 << (field_index % 8); +} + +/* Functions for reading and writing the "set" flags in the msg. Note that + * these do not perform memory management associated with any dynamic memory + * these fields may be referencing. These *only* set and test the flags. */ +INLINE void upb_msg_set(void *s, struct upb_msg_field *f) +{ + ((char*)s)[upb_isset_offset(f->field_index)] |= upb_isset_mask(f->field_index); +} + +INLINE void upb_msg_unset(void *s, struct upb_msg_field *f) +{ + ((char*)s)[upb_isset_offset(f->field_index)] &= ~upb_isset_mask(f->field_index); +} + +INLINE bool upb_msg_is_set(void *s, struct upb_msg_field *f) +{ + return ((char*)s)[upb_isset_offset(f->field_index)] & upb_isset_mask(f->field_index); +} + +INLINE bool upb_msg_all_required_fields_set(void *s, struct upb_msg *m) +{ + int num_fields = m->num_required_fields; + int i = 0; + while(num_fields > 8) { + if(((uint8_t*)s)[i++] != 0xFF) return false; + num_fields -= 8; + } + if(((uint8_t*)s)[i] != (1 << num_fields) - 1) return false; + return true; +} + +INLINE void upb_msg_clear(void *s, struct upb_msg *m) +{ + memset(s, 0, m->set_flags_bytes); +} + +/* Scalar (non-array) data access. ********************************************/ + +/* Returns a pointer to a specific field in a message. */ +INLINE union upb_value_ptr upb_msg_getptr(void *data, struct upb_msg_field *f) { + union upb_value_ptr p = {._void = ((char*)data + f->byte_offset)}; + return p; +} + /* Arrays. ********************************************************************/ /* Represents an array (a repeated field) of any type. The interpretation of * the data in the array depends on the type. */ struct upb_array { - union { - double *_double; - float *_float; - int32_t *int32; - int64_t *int64; - uint32_t *uint32; - uint64_t *uint64; - bool *_bool; - struct upb_string **string; - void **submsg; - void *_void; - } elements; + union upb_value_ptr elements; uint32_t len; /* Measured in elements. */ }; -/* These are all overlays on upb_array, pointers between them can be cast. */ -#define UPB_DEFINE_ARRAY_TYPE(name, type) \ - struct name ## _array { \ - type *elements; \ - uint32_t len; \ - }; - +/* Returns a pointer to an array element. */ INLINE union upb_value_ptr upb_array_getelementptr( struct upb_array *arr, uint32_t n, upb_field_type_t type) { @@ -166,6 +259,13 @@ INLINE union upb_value_ptr upb_array_getelementptr( return ptr; } +/* These are all overlays on upb_array, pointers between them can be cast. */ +#define UPB_DEFINE_ARRAY_TYPE(name, type) \ + struct name ## _array { \ + type *elements; \ + uint32_t len; \ + }; + UPB_DEFINE_ARRAY_TYPE(upb_double, double) UPB_DEFINE_ARRAY_TYPE(upb_float, float) UPB_DEFINE_ARRAY_TYPE(upb_int32, int32_t) @@ -175,6 +275,7 @@ UPB_DEFINE_ARRAY_TYPE(upb_uint64, uint64_t) UPB_DEFINE_ARRAY_TYPE(upb_bool, bool) UPB_DEFINE_ARRAY_TYPE(upb_string, struct upb_string*) +/* Defines an array of a specific message type. */ #define UPB_MSG_ARRAY(msg_type) struct msg_type ## _array #define UPB_DEFINE_MSG_ARRAY(msg_type) \ UPB_MSG_ARRAY(msg_type) { \ @@ -182,52 +283,42 @@ UPB_DEFINE_ARRAY_TYPE(upb_string, struct upb_string*) uint32_t len; \ }; -/* Accessors for primitive types. ********************************************/ +/* Memory management *********************************************************/ -/* For each primitive type we define a set of three functions: - * - * // For fetching out of a msg (s points to the raw msg data). - * int32_t *upb_msg_get_int32_ptr(void *s, struct upb_msg_field *f); - * int32_t upb_msg_get_int32(void *s, struct upb_msg_field *f); - * void upb_msg_set_int32(void *s, struct upb_msg_field *f, int32_t val); - * - * These do no existence checks, bounds checks, or type checks. */ - -#define UPB_DEFINE_ACCESSORS(INLINE, name, ctype) \ - INLINE ctype *upb_msg_get_ ## name ## _ptr( \ - void *s, struct upb_msg_field *f) { \ - return (ctype*)((char*)s + f->byte_offset); \ - } \ - INLINE ctype upb_msg_get_ ## name( \ - void *s, struct upb_msg_field *f) { \ - return *upb_msg_get_ ## name ## _ptr(s, f); \ - } \ - INLINE void upb_msg_set_ ## name( \ - void *s, struct upb_msg_field *f, ctype val) { \ - *upb_msg_get_ ## name ## _ptr(s, f) = val; \ - } +/* One important note about these memory management routines: they must be used + * completely or not at all (for each message). In other words, you can't + * allocate your own message and then free it with upb_msgdata_free. As + * another example, you can't point a field to your own string and then call + * upb_msg_reuse_str. */ + +/* Allocates and frees message data, respectively. Newly allocated data is + * initialized to empty. Freeing a message always frees string data, but + * the client can decide whether or not submessages should be deleted. */ +void *upb_msgdata_new(struct upb_msg *m); +void upb_msgdata_free(void *data, struct upb_msg *m, bool free_submsgs); + +/* Given a pointer to the appropriate field of the message or array, these + * functions will lazily allocate memory for a string, array, or submessage. + * If the previously allocated memory is big enough, it will reuse it without + * re-allocating. See upb_msg.c for example usage. */ + +/* Reuse a string of at least the given size. */ +void upb_msg_reuse_str(struct upb_string **str, uint32_t size); +/* Like the previous, but assumes that the string will be by reference, so + * doesn't allocate memory for the string itself. */ +void upb_msg_reuse_strref(struct upb_string **str); -UPB_DEFINE_ACCESSORS(INLINE, double, double) -UPB_DEFINE_ACCESSORS(INLINE, float, float) -UPB_DEFINE_ACCESSORS(INLINE, int32, int32_t) -UPB_DEFINE_ACCESSORS(INLINE, int64, int64_t) -UPB_DEFINE_ACCESSORS(INLINE, uint32, uint32_t) -UPB_DEFINE_ACCESSORS(INLINE, uint64, uint64_t) -UPB_DEFINE_ACCESSORS(INLINE, bool, bool) -UPB_DEFINE_ACCESSORS(INLINE, bytes, struct upb_string*) -UPB_DEFINE_ACCESSORS(INLINE, string, struct upb_string*) -UPB_DEFINE_ACCESSORS(INLINE, submsg, void*) -UPB_DEFINE_ACCESSORS(INLINE, array, struct upb_array*) - -INLINE union upb_value_ptr upb_msg_get_ptr( - void *data, struct upb_msg_field *f) { - union upb_value_ptr p = {._void = ((char*)data + f->byte_offset)}; - return p; -} +/* Reuse an array of at least the given size, with the given type. */ +void upb_msg_reuse_array(struct upb_array **arr, uint32_t size, + upb_field_type_t t); -/* Memory management *********************************************************/ +/* Reuse a submessage of the given type. */ +void upb_msg_reuse_submsg(void **msg, struct upb_msg *m); -void *upb_msg_new(struct upb_msg *m); +/* Serialization/Deserialization. ********************************************/ + +/* This is all just a layer on top of the stream-oriented facility in + * upb_parse.h. */ struct upb_msg_parse_state { struct upb_parse_state s; @@ -236,70 +327,32 @@ struct upb_msg_parse_state { struct upb_msg *m; }; -void upb_msg_parse_init(struct upb_msg_parse_state *s, void *msg, +/* Initializes/frees a message parser. The parser will write the data to the + * message data "data", which the caller must have previously allocated (the + * parser will allocate submsgs, strings, and arrays as needed, however). + * + * "Merge" controls whether the parser will append to data instead of + * overwriting. Merging concatenates arrays and merges submessages instead + * of clearing both. + * + * "Byref" controls whether the new message data copies or references strings + * it encounters. If byref == true, then all strings supplied to upb_msg_parse + * must remain unchanged and must outlive data. */ +void upb_msg_parse_init(struct upb_msg_parse_state *s, void *data, struct upb_msg *m, bool merge, bool byref); void upb_msg_parse_free(struct upb_msg_parse_state *s); + +/* Parses a protobuf fragment, writing the data to the message that was passed + * to upb_msg_parse_init. This function can be called multiple times as more + * data becomes available. */ upb_status_t upb_msg_parse(struct upb_msg_parse_state *s, void *data, size_t len, size_t *read); +/* Parses the protobuf in s (which is expected to be complete) and allocates + * new message data to hold it. This is an alternative to the streaming API + * above. "byref" works as in upb_msg_parse_init(). */ void *upb_alloc_and_parse(struct upb_msg *m, struct upb_string *s, bool byref); -/* Note! These two may not be use on a upb_string* that was initialized by - * means other than these functions. */ -void upb_msg_reuse_str(struct upb_string **str, uint32_t len); -void upb_msg_reuse_array(struct upb_array **arr, uint32_t n, upb_field_type_t t); -void upb_msg_reuse_strref(struct upb_string **str); -void upb_msg_reuse_submsg(void **msg, struct upb_msg *m); - -/* "Set" flag reading and writing. *******************************************/ - -INLINE size_t upb_isset_offset(uint32_t field_index) { - return field_index / 8; -} - -INLINE uint8_t upb_isset_mask(uint32_t field_index) { - return 1 << (field_index % 8); -} - -/* Functions for reading and writing the "set" flags in the msg. Note that - * these do not perform memory management associated with any dynamic memory - * these fields may be referencing. These *only* set and test the flags. */ -INLINE void upb_msg_set(void *s, struct upb_msg_field *f) -{ - ((char*)s)[upb_isset_offset(f->field_index)] |= upb_isset_mask(f->field_index); -} - -INLINE void upb_msg_unset(void *s, struct upb_msg_field *f) -{ - ((char*)s)[upb_isset_offset(f->field_index)] &= ~upb_isset_mask(f->field_index); -} - -INLINE bool upb_msg_is_set(void *s, struct upb_msg_field *f) -{ - return ((char*)s)[upb_isset_offset(f->field_index)] & upb_isset_mask(f->field_index); -} - -INLINE bool upb_msg_all_required_fields_set(void *s, struct upb_msg *m) -{ - int num_fields = m->num_required_fields; - int i = 0; - while(num_fields > 8) { - if(((uint8_t*)s)[i++] != 0xFF) return false; - num_fields -= 8; - } - if(((uint8_t*)s)[i] != (1 << num_fields) - 1) return false; - return true; -} - -INLINE void upb_msg_clear(void *s, struct upb_msg *m) -{ - memset(s, 0, m->set_flags_bytes); -} - -/* Serialization/Deserialization. ********************************************/ - -/* Parses the string data in s according to the message description in m. */ -upb_status_t upb_msg_merge(void *data, struct upb_msg *m, struct upb_string *s); /* Text dump *****************************************************************/ |