/*
 * upb - a minimalist implementation of protocol buffers.
 *
 * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
 *
 * A upb_msg provides a full description of a message as defined in a .proto
 * file.  It supports many features and operations for dealing with proto
 * messages:
 * - reflection over .proto types at runtime (list fields, get names, etc).
 * - an in-memory byte-level format for efficiently storing and accessing msgs.
 * - serializing and deserializing from the in-memory format to a protobuf.
 * - optional memory management for handling strings, arrays, and submessages.
 *
 * Throughout this file, the following convention is used:
 * - "struct upb_msg *m" describes a message type (name, list of fields, etc).
 * - "void *data" is an actual message stored using the in-memory format.
 *
 * The in-memory format is very much like a C struct that you can define at
 * run-time, but also supports reflection.  Like C structs it supports
 * offset-based access, as opposed to the much slower name-based lookup.  The
 * format stores both the values themselves and bits describing whether each
 * field is set or not.  For example:
 *
 *   parsed message Foo {
 *     optional bool a = 1;
 *     repeated uint32 b = 2;
 *     optional Bar c = 3;
 *   }
 *
 * The in-memory layout for this message on a 32-bit machine will be something
 * like:
 *
 *  Foo
 * +------------------------+
 * | set_flags a:1, b:1, c:1|
 * +------------------------+
 * | bool a (1 byte)        |
 * +------------------------+
 * | padding (3 bytes)      |
 * +------------------------+         upb_array
 * | upb_array* b (4 bytes) | ----> +----------------------------+
 * +------------------------+       | uint32* elements (4 bytes) | ---+
 * | Bar* c (4 bytes)       |       +----------------------------+    |
 * +------------------------+       | uint32 size (4 bytes)      |    |
 *                                  +----------------------------+    |
 *                                                                    |
 *    ----------------------------------------------------------------+
 *    |
 *    V
 *  uint32 array
 * +----+----+----+----+----+----+
 * | e1 | e2 | e3 | e4 | e5 | e6 |
 * +----+----+----+----+----+----+
 *
 * And the corresponding C structure (as emitted by the proto compiler) would be:
 *
 *   struct Foo {
 *     union {
 *       uint8_t bytes[1];
 *       struct {
 *         bool a:1;
 *         bool b:1;
 *         bool c:1;
 *       } has;
 *     } set_flags;
 *     bool a;
 *     upb_uint32_array *b;
 *     Bar *c;
 *   };
 *
 * Because the C struct emitted by the upb compiler uses exactly the same
 * byte-level format as the reflection interface, you can access the same hunk
 * of memory either way.  The C struct provides maximum performance and static
 * type safety; upb_msg provides flexibility.
 *
 * The in-memory format has no interoperability guarantees whatsoever, except
 * that a single version of upb will interoperate with itself.  Don't even
 * think about persisting the in-memory format or sending it anywhere.  That's
 * what serialized protobufs are for!  The in-memory format is just that -- an
 * in-memory representation that allows for fast access.
 *
 * The in-memory format is carefully designed to *not* mandate any particular
 * memory management scheme.
This should make it easier to integrate with * existing memory management schemes, or to perform advanced techniques like * reference counting, garbage collection, and string references. Different * clients can read each others messages regardless of what memory management * scheme each is using. * * A memory management scheme is provided for convenience, and it is used by * default by the stock message parser. Clients can substitute their own * memory management scheme into this parser without any loss of generality * or performance. */ #ifndef UPB_MSG_H_ #define UPB_MSG_H_ #include #include #include "upb.h" #include "upb_table.h" #include "upb_parse.h" #ifdef __cplusplus extern "C" { #endif /* Message definition. ********************************************************/ /* Structure that describes a single field in a message. This structure is very * consciously designed to fit into 12/16 bytes (32/64 bit, respectively), * because copies of this struct are in the hash table that is read in the * critical path of parsing. Minimizing the size of this struct increases * cache-friendliness. */ struct upb_msg_field { union upb_symbol_ref ref; uint32_t byte_offset; /* Where to find the data. */ uint16_t field_index; /* Indexes upb_msg.fields. Also indicates set bit */ upb_field_type_t type; /* Copied from descriptor for cache-friendliness. */ upb_label_t label; }; /* Structure that describes a single .proto message type. */ struct upb_msg { struct google_protobuf_DescriptorProto *descriptor; struct upb_string fqname; /* Fully qualified. */ size_t size; uint32_t num_fields; uint32_t set_flags_bytes; uint32_t num_required_fields; /* Required fields have the lowest set bytemasks. */ struct upb_inttable fields_by_num; struct upb_strtable fields_by_name; struct upb_msg_field *fields; struct google_protobuf_FieldDescriptorProto **field_descriptors; }; /* The num->field and name->field maps in upb_msg allow fast lookup of fields * by number or name. 
These lookups are in the critical path of parsing and * field lookup, so they must be as fast as possible. To make these more * cache-friendly, we put the data in the table by value. */ struct upb_fieldsbynum_entry { struct upb_inttable_entry e; struct upb_msg_field f; }; struct upb_fieldsbyname_entry { struct upb_strtable_entry e; struct upb_msg_field f; }; /* Can be used to retrieve a field descriptor given the upb_msg_field ref. */ INLINE struct google_protobuf_FieldDescriptorProto *upb_msg_field_descriptor( struct upb_msg_field *f, struct upb_msg *m) { return m->field_descriptors[f->field_index]; } /* Initializes/frees a upb_msg. Usually this will be called by upb_context, and * clients will not have to construct one directly. * * Caller retains ownership of d, but the msg will contain references to it, so * it must outlive the msg. Note that init does not resolve upb_msg_field.ref * the caller should do that post-initialization by calling upb_msg_ref() * below. * * fqname indicates the fully-qualified name of this message. Ownership of * fqname passes to the msg, but the msg will contain references to it, so it * must outlive the msg. * * sort indicates whether or not it is safe to reorder the fields from the order * they appear in d. This should be false if code has been compiled against a * header for this type that expects the given order. */ bool upb_msg_init(struct upb_msg *m, struct google_protobuf_DescriptorProto *d, struct upb_string fqname, bool sort); void upb_msg_free(struct upb_msg *m); /* Clients use this function on a previously initialized upb_msg to resolve the * "ref" field in the upb_msg_field. Since messages can refer to each other in * mutually-recursive ways, this step must be separated from initialization. */ void upb_msg_ref(struct upb_msg *m, struct upb_msg_field *f, union upb_symbol_ref ref); /* Looks up a field by name or number. 
While these are written to be as fast * as possible, it will still be faster to cache the results of this lookup if * possible. These return NULL if no such field is found. */ INLINE struct upb_msg_field *upb_msg_fieldbynum(struct upb_msg *m, uint32_t number) { struct upb_fieldsbynum_entry *e = (struct upb_fieldsbynum_entry*)upb_inttable_fast_lookup( &m->fields_by_num, number, sizeof(struct upb_fieldsbynum_entry)); return e ? &e->f : NULL; } INLINE struct upb_msg_field *upb_msg_fieldbyname(struct upb_msg *m, struct upb_string *name) { struct upb_fieldsbyname_entry *e = (struct upb_fieldsbyname_entry*)upb_strtable_lookup( &m->fields_by_name, name); return e ? &e->f : NULL; } INLINE bool upb_issubmsg(struct upb_msg_field *f) { return upb_issubmsgtype(f->type); } INLINE bool upb_isstring(struct upb_msg_field *f) { return upb_isstringtype(f->type); } INLINE bool upb_isarray(struct upb_msg_field *f) { return f->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REPEATED; } /* "Set" flag reading and writing. *******************************************/ INLINE size_t upb_isset_offset(uint32_t field_index) { return field_index / 8; } INLINE uint8_t upb_isset_mask(uint32_t field_index) { return 1 << (field_index % 8); } /* Functions for reading and writing the "set" flags in the msg. Note that * these do not perform memory management associated with any dynamic memory * these fields may be referencing. These *only* set and test the flags. 
*/ INLINE void upb_msg_set(void *s, struct upb_msg_field *f) { ((char*)s)[upb_isset_offset(f->field_index)] |= upb_isset_mask(f->field_index); } INLINE void upb_msg_unset(void *s, struct upb_msg_field *f) { ((char*)s)[upb_isset_offset(f->field_index)] &= ~upb_isset_mask(f->field_index); } INLINE bool upb_msg_is_set(void *s, struct upb_msg_field *f) { return ((char*)s)[upb_isset_offset(f->field_index)] & upb_isset_mask(f->field_index); } INLINE bool upb_msg_all_required_fields_set(void *s, struct upb_msg *m) { int num_fields = m->num_required_fields; int i = 0; while(num_fields > 8) { if(((uint8_t*)s)[i++] != 0xFF) return false; num_fields -= 8; } if(((uint8_t*)s)[i] != (1 << num_fields) - 1) return false; return true; } INLINE void upb_msg_clear(void *s, struct upb_msg *m) { memset(s, 0, m->set_flags_bytes); } /* Scalar (non-array) data access. ********************************************/ /* Returns a pointer to a specific field in a message. */ INLINE union upb_value_ptr upb_msg_getptr(void *data, struct upb_msg_field *f) { union upb_value_ptr p; p._void = ((char*)data + f->byte_offset); return p; } /* Memory management *********************************************************/ /* One important note about these memory management routines: they must be used * completely or not at all (for each message). In other words, you can't * allocate your own message and then free it with upb_msgdata_free. As * another example, you can't point a field to your own string and then call * upb_msg_reuse_str. */ /* Allocates and frees message data, respectively. Newly allocated data is * initialized to empty. Freeing a message always frees string data, but * the client can decide whether or not submessages should be deleted. 
*/ void *upb_msgdata_new(struct upb_msg *m); void upb_msgdata_free(void *data, struct upb_msg *m, bool free_submsgs); /* Given a pointer to the appropriate field of the message or array, these * functions will lazily allocate memory for a string, array, or submessage. * If the previously allocated memory is big enough, it will reuse it without * re-allocating. See upb_msg.c for example usage. */ /* Reuse a string of at least the given size. */ void upb_msg_reuse_str(struct upb_string **str, uint32_t size); /* Like the previous, but assumes that the string will be by reference, so * doesn't allocate memory for the string itself. */ void upb_msg_reuse_strref(struct upb_string **str); /* Reuse an array of at least the given size, with the given type. */ void upb_msg_reuse_array(struct upb_array **arr, uint32_t size, upb_field_type_t t); /* Reuse a submessage of the given type. */ void upb_msg_reuse_submsg(void **msg, struct upb_msg *m); /* Serialization/Deserialization. ********************************************/ /* This is all just a layer on top of the stream-oriented facility in * upb_parse.h. */ struct upb_msg_parse_frame { struct upb_msg *m; void *data; }; //#include "upb_text.h" struct upb_msg_parse_state { struct upb_parse_state s; bool merge; bool byref; struct upb_msg *m; struct upb_msg_parse_frame stack[UPB_MAX_NESTING], *top; //struct upb_text_printer p; }; /* Initializes/frees a message parser. The parser will write the data to the * message data "data", which the caller must have previously allocated (the * parser will allocate submsgs, strings, and arrays as needed, however). * * "Merge" controls whether the parser will append to data instead of * overwriting. Merging concatenates arrays and merges submessages instead * of clearing both. * * "Byref" controls whether the new message data copies or references strings * it encounters. If byref == true, then all strings supplied to upb_msg_parse * must remain unchanged and must outlive data. 
*/ void upb_msg_parse_init(struct upb_msg_parse_state *s, void *data, struct upb_msg *m, bool merge, bool byref); void upb_msg_parse_reset(struct upb_msg_parse_state *s, void *data, struct upb_msg *m, bool merge, bool byref); void upb_msg_parse_free(struct upb_msg_parse_state *s); /* Parses a protobuf fragment, writing the data to the message that was passed * to upb_msg_parse_init. This function can be called multiple times as more * data becomes available. */ upb_status_t upb_msg_parse(struct upb_msg_parse_state *s, void *data, size_t len, size_t *read); /* Parses the protobuf in s (which is expected to be complete) and allocates * new message data to hold it. This is an alternative to the streaming API * above. "byref" works as in upb_msg_parse_init(). */ void *upb_alloc_and_parse(struct upb_msg *m, struct upb_string *s, bool byref); /* Text dump *****************************************************************/ bool upb_msg_eql(void *data1, void *data2, struct upb_msg *m, bool recursive); void upb_msg_print(void *data, struct upb_msg *m, FILE *stream); #ifdef __cplusplus } /* extern "C" */ #endif #endif /* UPB_MSG_H_ */