/*
 * upb - a minimalist implementation of protocol buffers.
 *
 * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
 *
 * A upb_msgdef provides a full description of a message type as defined in a
 * .proto file.  Using a upb_msgdef, it is possible to treat an arbitrary hunk
 * of memory (a void*) as a protobuf of the given type.  We will call this
 * void* a upb_msg in the context of this interface.
 *
 * Clients generally do not construct or destruct upb_msgdef objects directly.
 * They are managed by upb_contexts, and clients can obtain upb_msgdef pointers
 * directly from a upb_context.
 *
 * A upb_msg is READ-ONLY, and the upb_msgdef functions in this file provide
 * read-only access.  For a mutable message, or for a message that you can take
 * a reference to in order to prevent its destruction, see upb_mm_msg.h, which
 * is a layer on top of upb_msg that adds memory management semantics.
 *
 * upb_msgdef supports many features and operations for dealing with proto
 * messages:
 * - reflection over .proto types at runtime (list fields, get names, etc).
 * - an in-memory byte-level format for efficiently storing and accessing msgs.
 * - serializing from the in-memory format to a protobuf.
 * - parsing from a protobuf to an in-memory data structure (you either
 *   supply callbacks for allocating/repurposing memory or use a simplified
 *   version that parses into newly-allocated memory).
 *
 * The in-memory format is very much like a C struct that you can define at
 * run-time, but also supports reflection.  Like C structs it supports
 * offset-based access, as opposed to the much slower name-based lookup.  The
 * format stores both the values themselves and bits describing whether each
 * field is set or not.
* * For a more in-depth description of the in-memory format, see: * http://wiki.github.com/haberman/upb/inmemoryformat * * Because the C struct emitted by the upb compiler uses exactly the same * byte-level format as the reflection interface, you can access the same hunk * of memory either way. The C struct provides maximum performance and static * type safety; upb_msg_def provides flexibility. * * The in-memory format has no interoperability guarantees whatsoever, except * that a single version of upb will interoperate with itself. Don't even * think about persisting the in-memory format or sending it anywhere. That's * what serialized protobufs are for! The in-memory format is just that -- an * in-memory representation that allows for fast access. */ #ifndef UPB_MSG_H_ #define UPB_MSG_H_ #include #include #include "upb.h" #include "upb_atomic.h" #include "upb_context.h" #include "upb_parse.h" #include "upb_table.h" #ifdef __cplusplus extern "C" { #endif /* Message definition. ********************************************************/ struct upb_msg_fielddef; /* Structure that describes a single .proto message type. */ struct upb_msgdef { upb_atomic_refcount_t refcount; struct upb_context *context; struct google_protobuf_DescriptorProto *descriptor; struct upb_string fqname; /* Fully qualified. */ size_t size; uint32_t num_fields; uint32_t set_flags_bytes; uint32_t num_required_fields; /* Required fields have the lowest set bytemasks. */ struct upb_inttable fields_by_num; struct upb_strtable fields_by_name; struct upb_msg_fielddef *fields; struct google_protobuf_FieldDescriptorProto **field_descriptors; }; /* Structure that describes a single field in a message. This structure is very * consciously designed to fit into 12/16 bytes (32/64 bit, respectively), * because copies of this struct are in the hash table that is read in the * critical path of parsing. Minimizing the size of this struct increases * cache-friendliness. 
*/ struct upb_msg_fielddef { union upb_symbol_ref ref; uint32_t byte_offset; /* Where to find the data. */ uint16_t field_index; /* Indexes upb_msgdef.fields and indicates set bit */ upb_field_type_t type; /* Copied from descriptor for cache-friendliness. */ upb_label_t label; }; INLINE void upb_msgdef_ref(struct upb_msgdef *m) { if(upb_atomic_ref(&m->refcount)) upb_context_ref(m->context); } INLINE void upb_msgdef_unref(struct upb_msgdef *m) { if(upb_atomic_unref(&m->refcount)) upb_context_unref(m->context); } INLINE bool upb_issubmsg(struct upb_msg_fielddef *f) { return upb_issubmsgtype(f->type); } INLINE bool upb_isstring(struct upb_msg_fielddef *f) { return upb_isstringtype(f->type); } INLINE bool upb_isarray(struct upb_msg_fielddef *f) { return f->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REPEATED; } /* Can be used to retrieve a field descriptor given the upb_msg_fielddef. */ INLINE struct google_protobuf_FieldDescriptorProto *upb_msg_field_descriptor( struct upb_msg_fielddef *f, struct upb_msgdef *m) { return m->field_descriptors[f->field_index]; } /* Message structure. *********************************************************/ struct upb_msg { struct upb_msgdef *def; void *gptr; /* Generic pointer for use by subclasses. */ uint8_t data[1]; }; INLINE void *upb_msg_gptr(struct upb_msg *msg) { return msg->gptr; } /* Field access. **************************************************************/ /* Note that these only provide access to fields that are directly in the msg * itself. For dynamic fields (strings, arrays, and submessages) it will be * necessary to dereference the returned values. */ /* Returns a pointer to a specific field in a message. */ INLINE union upb_value_ptr upb_msg_getptr(struct upb_msg *msg, struct upb_msg_fielddef *f) { union upb_value_ptr p; p._void = &msg->data[f->byte_offset]; return p; } /* Returns a a specific field in a message. 
*/ INLINE union upb_value upb_msg_get(struct upb_msg *msg, struct upb_msg_fielddef *f) { return upb_deref(upb_msg_getptr(msg, f), f->type); } /* "Set" flag reading and writing. *******************************************/ /* All upb code and code using upb should guarantee that the set flags are * always valid. It should always be the case that if a flag's field is set * for a dynamic field that the pointer is valid. * * Clients should never set fields on a plain upb_msg, only on a upb_mm_msg. */ /* Returns the byte offset where we store whether this field is set. */ INLINE size_t upb_isset_offset(uint32_t field_index) { return field_index / 8; } /* Returns the mask within the appropriate byte that selects the set bit. */ INLINE uint8_t upb_isset_mask(uint32_t field_index) { return 1 << (field_index % 8); } /* Returns true if the given field is set, false otherwise. */ INLINE void upb_msg_set(struct upb_msg *msg, struct upb_msg_fielddef *f) { msg->data[upb_isset_offset(f->field_index)] |= upb_isset_mask(f->field_index); } /* Clears the set bit for this field in the given message. */ INLINE void upb_msg_unset(struct upb_msg *msg, struct upb_msg_fielddef *f) { msg->data[upb_isset_offset(f->field_index)] &= ~upb_isset_mask(f->field_index); } /* Tests whether the given field is set. */ INLINE bool upb_msg_isset(struct upb_msg *msg, struct upb_msg_fielddef *f) { return msg->data[upb_isset_offset(f->field_index)] & upb_isset_mask(f->field_index); } /* Returns true if *all* required fields are set, false otherwise. */ INLINE bool upb_msg_all_required_fields_set(struct upb_msg *msg, struct upb_msgdef *m) { int num_fields = m->num_required_fields; int i = 0; while(num_fields > 8) { if(msg->data[i++] != 0xFF) return false; num_fields -= 8; } if(msg->data[i] != (1 << num_fields) - 1) return false; return true; } /* Clears the set bit for all fields. 
*/ INLINE void upb_msg_clear(struct upb_msg *msg) { memset(msg->data, 0, msg->def->set_flags_bytes); } /* Number->field and name->field lookup. *************************************/ /* The num->field and name->field maps in upb_msgdef allow fast lookup of fields * by number or name. These lookups are in the critical path of parsing and * field lookup, so they must be as fast as possible. To make these more * cache-friendly, we put the data in the table by value. */ struct upb_fieldsbynum_entry { struct upb_inttable_entry e; struct upb_msg_fielddef f; }; struct upb_fieldsbyname_entry { struct upb_strtable_entry e; struct upb_msg_fielddef f; }; /* Looks up a field by name or number. While these are written to be as fast * as possible, it will still be faster to cache the results of this lookup if * possible. These return NULL if no such field is found. */ INLINE struct upb_msg_fielddef *upb_msg_fieldbynum(struct upb_msgdef *m, uint32_t number) { struct upb_fieldsbynum_entry *e = (struct upb_fieldsbynum_entry*)upb_inttable_fast_lookup( &m->fields_by_num, number, sizeof(struct upb_fieldsbynum_entry)); return e ? &e->f : NULL; } INLINE struct upb_msg_fielddef *upb_msg_fieldbyname(struct upb_msgdef *m, struct upb_string *name) { struct upb_fieldsbyname_entry *e = (struct upb_fieldsbyname_entry*)upb_strtable_lookup( &m->fields_by_name, name); return e ? &e->f : NULL; } /* Simple, one-shot parsing ***************************************************/ /* A simple interface for parsing into a newly-allocated message. This * interface should only be used when the message will be read-only with * respect to memory management (eg. won't add or remove internal references to * dynamic memory). For more flexible (but also more complicated) interfaces, * see below and in upb_mm_msg.h. */ /* Parses the protobuf in s (which is expected to be complete) and allocates * new message data to hold it. 
If byref is set, strings in the returned * upb_msg will reference s instead of copying from it, but this requires that * s will live for as long as the returned message does. */ struct upb_msg *upb_msg_parsenew(struct upb_msgdef *m, struct upb_string *s); /* This function should be used to free messages that were parsed with * upb_msg_parsenew. It will free the message appropriately (including all * submessages). */ void upb_msg_free(struct upb_msg *msg); /* Parsing with (re)allocation callbacks. *************************************/ /* This interface parses protocol buffers into upb_msgs, but allows the client * to supply allocation callbacks whenever the parser needs to obtain a string, * array, or submsg (a "dynamic field"). If the parser sees that a dynamic * field is already present (its "set bit" is set) it will use that, resizing * it if necessary in the case of an array. Otherwise it will call the * allocation callback to obtain one. * * This may seem trivial (since nearly all clients will use malloc and free for * memory management), but the allocation callback can be used for more than * just allocation. If we are parsing data into an existing upb_msg, the * allocation callback can examine any existing memory that is allocated for * the dynamic field and determine whether it can reuse it. It can also * perform memory management like refing the new field. * * This parser is layered on top of the event-based parser in upb_parse.h. The * parser is upb_mm_msg.h is layered on top of this parser. * * This parser is fully streaming-capable. */ /* Should return an initialized array. */ typedef struct upb_array *(*upb_msg_getandref_array_cb_t)( void *from_gptr, struct upb_array *existingval, struct upb_msg_fielddef *f); /* Callback to allocate a string. If byref is true, the client should assume * that the string will be referencing the input data. 
*/ typedef struct upb_string *(*upb_msg_getandref_string_cb_t)( void *from_gptr, struct upb_string *existingval, struct upb_msg_fielddef *f, bool byref); /* Should return a cleared message. */ typedef struct upb_msg *(*upb_msg_getandref_msg_cb_t)( void *from_gptr, struct upb_msg *existingval, struct upb_msg_fielddef *f); struct upb_msg_parser_frame { struct upb_msg *msg; }; struct upb_msg_parser { struct upb_stream_parser s; bool merge; bool byref; struct upb_msg_parser_frame stack[UPB_MAX_NESTING], *top; upb_msg_getandref_array_cb_t getarray_cb; upb_msg_getandref_string_cb_t getstring_cb; upb_msg_getandref_msg_cb_t getmsg_cb; }; void upb_msg_parser_reset(struct upb_msg_parser *p, struct upb_msg *msg, bool byref); /* Parses protocol buffer data out of data which has length of len. The data * need not be a complete protocol buffer. The number of bytes parsed is * returned in *read, and the next call to upb_msg_parse must supply data that * is *read bytes past data in the logical stream. */ upb_status_t upb_msg_parser_parse(struct upb_msg_parser *p, void *data, size_t len, size_t *read); /* Serialization *************************************************************/ /* For messages that contain any submessages, we must do a pre-pass on the * message tree to discover the size of all submessages. This is necessary * because when serializing, the message length has to precede the message data * itself. * * We can calculate these sizes once and reuse them as long as the message is * known not to have changed. */ struct upb_msgsizes; /* Initialize/free a upb_msgsizes for the given message. */ void upb_msgsizes_init(struct upb_msgsizes *sizes); void upb_msgsizes_free(struct upb_msgsizes *sizes); /* Given a previously initialized sizes, recurse over the message and store its * sizes in 'sizes'. */ void upb_msgsizes_read(struct upb_msgsizes *sizes, void *msg, struct upb_msgdef *m); /* Returns the total size of the serialized message given in sizes. 
Must be * preceeded by a call to upb_msgsizes_read. */ size_t upb_msgsizes_totalsize(struct upb_msgsizes *sizes); struct upb_msg_serialize_state; /* Initializes the state of serialization. The provided message must not * change between the upb_msgsizes_read() call that was used to construct * "sizes" and the parse being fully completed. */ void upb_msg_serialize_alloc(struct upb_msg_serialize_state *s); void upb_msg_serialize_free(struct upb_msg_serialize_state *s); void upb_msg_serialize_init(struct upb_msg_serialize_state *s, void *msg, struct upb_msgdef *m, struct upb_msgsizes *sizes); /* Serializes the next set of bytes into buf (which has size len). Returns * UPB_STATUS_OK if serialization is complete, or UPB_STATUS_NEED_MORE_DATA * if there is more data from the message left to be serialized. * * The number of bytes written to buf is returned in *written. This will be * equal to len unless we finished serializing. */ upb_status_t upb_msg_serialize(struct upb_msg_serialize_state *s, void *buf, size_t len, size_t *written); /* Text dump *****************************************************************/ bool upb_msg_eql(struct upb_msg *msg1, struct upb_msg *msg2, bool recursive); void upb_msg_print(struct upb_msg *data, bool single_line, FILE *stream); /* Internal functions. ********************************************************/ /* Initializes/frees a upb_msgdef. Usually this will be called by upb_context, * and clients will not have to construct one directly. * * Caller retains ownership of d, but the msg will contain references to it, so * it must outlive the msg. Note that init does not resolve * upb_msg_fielddef.ref the caller should do that post-initialization by * calling upb_msg_ref() below. * * fqname indicates the fully-qualified name of this message. Ownership of * fqname passes to the msg, but the msg will contain references to it, so it * must outlive the msg. 
* * sort indicates whether or not it is safe to reorder the fields from the order * they appear in d. This should be false if code has been compiled against a * header for this type that expects the given order. */ bool upb_msgdef_init(struct upb_msgdef *m, struct google_protobuf_DescriptorProto *d, struct upb_string fqname, bool sort, struct upb_context *c); void upb_msgdef_free(struct upb_msgdef *m); /* Sort the given field descriptors in-place, according to what we think is an * optimal ordering of fields. This can change from upb release to upb * release. */ void upb_msgdef_sortfds(google_protobuf_FieldDescriptorProto **fds, size_t num); /* Clients use this function on a previously initialized upb_msgdef to resolve * the "ref" field in the upb_msg_fielddef. Since messages can refer to each * other in mutually-recursive ways, this step must be separated from * initialization. */ void upb_msgdef_setref(struct upb_msgdef *m, struct upb_msg_fielddef *f, union upb_symbol_ref ref); #ifdef __cplusplus } /* extern "C" */ #endif #endif /* UPB_MSG_H_ */