From 8fa6a92f534cb01b6f5b4f48e3982f686d6c3123 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Fri, 7 Aug 2009 20:47:26 -0700 Subject: Major refactoring of upb_msg. Temporary functionality regression. There is significant refactoring here, as well as some more trivial name changes. upb_msg has become upb_msgdef, to reflect the fact that a upb_msg is not *itself* a message, it describes a message. There are other renamings, such as upb_parse_state -> upb_stream_parser. More significantly, the upb_msg class and parser have been refactored to reflect my recent realization about how memory management should work. upb_msg now has no memory management, and a memory management scheme (that works beautifully with multiple language runtimes) will be layered on top of it. This iteration has the new, read-only upb_msg. upb_mm_msg (a memory-managed message class) will come in the next change. --- src/upb.h | 2 +- src/upb_array.h | 92 +++++++++-- src/upb_context.c | 14 +- src/upb_context.h | 2 +- src/upb_msg.c | 422 ++++++++++++++++++++++++------------------------- src/upb_msg.h | 457 ++++++++++++++++++++++++++---------------------------- src/upb_parse.c | 21 +-- src/upb_parse.h | 17 +- src/upb_table.c | 4 +- 9 files changed, 525 insertions(+), 506 deletions(-) (limited to 'src') diff --git a/src/upb.h b/src/upb.h index 0db5369..83917af 100644 --- a/src/upb.h +++ b/src/upb.h @@ -133,7 +133,7 @@ INLINE union upb_value upb_deref(union upb_value_ptr ptr, upb_field_type_t t) { } union upb_symbol_ref { - struct upb_msg *msg; + struct upb_msgdef *msg; struct upb_enum *_enum; struct upb_svc *svc; }; diff --git a/src/upb_array.h b/src/upb_array.h index 0e5178f..d48aa17 100644 --- a/src/upb_array.h +++ b/src/upb_array.h @@ -2,32 +2,63 @@ * upb - a minimalist implementation of protocol buffers. * * Copyright (c) 2009 Joshua Haberman. See LICENSE for details. - - * Defines an in-memory array type. TODO: more documentation. + * + * Defines an in-memory, polymorphic array type. 
The array does not know its + * own type -- its owner must know that information out-of-band. + * + * upb_arrays are memory-managed in the sense that they contain a pointer + * ("mem") to memory that is "owned" by the array (which may be NULL if the + * array owns no memory). There is a separate pointer ("elements") that points + * to the array's currently "effective" memory, which is either equal to + * mem (if the array's current value is memory we own) or not (if the array is + * referencing other memory). + * + * If the array is referencing other memory, it is up to the array's owner to + * ensure that the other memory remains valid for as long as the array is + * referencing it. * */ #ifndef UPB_ARRAY_H_ #define UPB_ARRAY_H_ +#include <stdlib.h> +#include "upb.h" + #ifdef __cplusplus extern "C" { #endif struct upb_string; -#include "upb.h" +/* upb_arrays can be at most 2**32 elements long. */ +typedef uint32_t upb_arraylen_t; /* Represents an array (a repeated field) of any type. The interpretation of * the data in the array depends on the type. */ struct upb_array { union upb_value_ptr elements; - uint32_t len; /* Measured in elements. */ + void *mem; + upb_arraylen_t len; /* Number of elements in "elements". */ + upb_arraylen_t size; /* Memory allocated in "mem" (measured in elements) */ }; -/* Returns a pointer to an array element. */ +INLINE void upb_array_init(struct upb_array *arr) +{ + arr->elements._void = NULL; + arr->mem = NULL; + arr->len = 0; + arr->size = 0; +} + +INLINE void upb_array_free(struct upb_array *arr) +{ + free(arr->mem); +} + +/* Returns a pointer to an array element. Does not perform a bounds check! 
*/ INLINE union upb_value_ptr upb_array_getelementptr( - struct upb_array *arr, uint32_t n, upb_field_type_t type) + struct upb_array *arr, upb_arraylen_t n, upb_field_type_t type) { union upb_value_ptr ptr; ptr._void = (void*)((char*)arr->elements._void + n*upb_type_info[type].size); @@ -35,16 +66,55 @@ INLINE union upb_value_ptr upb_array_getelementptr( } INLINE union upb_value upb_array_getelement( - struct upb_array *arr, uint32_t n, upb_field_type_t type) + struct upb_array *arr, upb_arraylen_t n, upb_field_type_t type) { return upb_deref(upb_array_getelementptr(arr, n, type), type); } +INLINE uint32_t upb_round_up_to_pow2(uint32_t v) +{ + /* cf. http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */ + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + return v; +} + +/* Resizes array to be "len" elements long and ensures we have write access + * to the array (reallocating if necessary). Returns true iff we were + * referencing memory for the array and dropped the reference. */ +INLINE bool upb_array_resize(struct upb_array *arr, upb_arraylen_t newlen, + upb_field_type_t type) +{ + size_t type_size = upb_type_info[type].size; + bool dropped = false; + bool ref = arr->elements._void != arr->mem; /* Ref'ing external memory. */ + if(arr->size < newlen) { + /* Need to resize. */ + arr->size = max(4, upb_round_up_to_pow2(newlen)); + arr->mem = realloc(arr->mem, arr->size * type_size); + } + if(ref) { + /* Need to take referenced data and copy it to memory we own. */ + memcpy(arr->mem, arr->elements._void, UPB_MIN(arr->len, newlen) * type_size); + dropped = true; + } + arr->elements._void = arr->mem; + arr->len = newlen; + return dropped; +} + /* These are all overlays on upb_array, pointers between them can be cast. 
*/ #define UPB_DEFINE_ARRAY_TYPE(name, type) \ struct name ## _array { \ type *elements; \ - uint32_t len; \ + type *mem; \ + upb_arraylen_t len; \ + upb_arraylen_t size; \ }; UPB_DEFINE_ARRAY_TYPE(upb_double, double) @@ -57,12 +127,14 @@ UPB_DEFINE_ARRAY_TYPE(upb_bool, bool) UPB_DEFINE_ARRAY_TYPE(upb_string, struct upb_string*) UPB_DEFINE_ARRAY_TYPE(upb_msg, void*) -/* Defines an array of a specific message type. */ +/* Defines an array of a specific message type (an overlay of upb_array). */ #define UPB_MSG_ARRAY(msg_type) struct msg_type ## _array #define UPB_DEFINE_MSG_ARRAY(msg_type) \ UPB_MSG_ARRAY(msg_type) { \ msg_type **elements; \ - uint32_t len; \ + msg_type **mem; \ + upb_arraylen_t len; \ + upb_arraylen_t size; \ }; #ifdef __cplusplus diff --git a/src/upb_context.c b/src/upb_context.c index 46d8f05..5e1833e 100644 --- a/src/upb_context.c +++ b/src/upb_context.c @@ -46,7 +46,7 @@ static void free_symtab(struct upb_strtable *t) struct upb_symtab_entry *e = upb_strtable_begin(t); for(; e; e = upb_strtable_next(t, &e->e)) { switch(e->type) { - case UPB_SYM_MESSAGE: upb_msg_free(e->ref.msg); break; + case UPB_SYM_MESSAGE: upb_msgdef_free(e->ref.msg); break; case UPB_SYM_ENUM: upb_enum_free(e->ref._enum); break; default: break; /* TODO */ } @@ -60,7 +60,7 @@ void upb_context_free(struct upb_context *c) { free_symtab(&c->symtab); for(size_t i = 0; i < c->fds_len; i++) - upb_msgdata_free(c->fds[i], c->fds_msg, true); + upb_msg_free(c->fds[i], c->fds_msg); free_symtab(&c->psymtab); free(c->fds); } @@ -188,7 +188,7 @@ static bool insert_message(struct upb_strtable *t, e.e.key = fqname; e.type = UPB_SYM_MESSAGE; e.ref.msg = malloc(sizeof(*e.ref.msg)); - if(!upb_msg_init(e.ref.msg, d, fqname, sort)) { + if(!upb_msgdef_init(e.ref.msg, d, fqname, sort)) { free(fqname.ptr); return false; } @@ -232,9 +232,9 @@ bool addfd(struct upb_strtable *addto, struct upb_strtable *existingdefs, if(upb_strtable_lookup(existingdefs, &e->e.key)) return false; /* Redefinition 
prohibited. */ if(e->type == UPB_SYM_MESSAGE) { - struct upb_msg *m = e->ref.msg; + struct upb_msgdef *m = e->ref.msg; for(unsigned int i = 0; i < m->num_fields; i++) { - struct upb_msg_field *f = &m->fields[i]; + struct upb_msg_fielddef *f = &m->fields[i]; google_protobuf_FieldDescriptorProto *fd = m->field_descriptors[i]; union upb_symbol_ref ref; if(fd->type == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE || @@ -247,7 +247,7 @@ bool addfd(struct upb_strtable *addto, struct upb_strtable *existingdefs, else continue; /* No resolving necessary. */ if(!ref.msg) return false; /* Ref. to undefined symbol. */ - upb_msg_ref(m, f, ref); + upb_msgdef_ref(m, f, ref); } } } @@ -280,7 +280,7 @@ bool upb_context_addfds(struct upb_context *c, bool upb_context_parsefds(struct upb_context *c, struct upb_string *fds_str) { google_protobuf_FileDescriptorSet *fds = - upb_alloc_and_parse(c->fds_msg, fds_str, false); + upb_msg_parsenew(c->fds_msg, fds_str); if(!fds) return false; if(!upb_context_addfds(c, fds)) return false; diff --git a/src/upb_context.h b/src/upb_context.h index 3031124..e802454 100644 --- a/src/upb_context.h +++ b/src/upb_context.h @@ -40,7 +40,7 @@ struct upb_symtab_entry { struct upb_context { struct upb_strtable symtab; /* The context's symbol table. */ struct upb_strtable psymtab; /* Private symbols, for internal use. */ - struct upb_msg *fds_msg; /* This is in psymtab, ptr here for convenience. */ + struct upb_msgdef *fds_msg; /* In psymtab, ptr here for convenience. */ /* A list of the FileDescriptorProtos we own (from having parsed them * ourselves) and must free on destruction. */ diff --git a/src/upb_msg.c b/src/upb_msg.c index 2e5fdd5..c58b470 100644 --- a/src/upb_msg.c +++ b/src/upb_msg.c @@ -10,6 +10,7 @@ #include "upb_msg.h" #include "upb_parse.h" #include "upb_serialize.h" +#include "upb_text.h" /* Rounds p up to the next multiple of t. */ #define ALIGN_UP(p, t) ((p) % (t) == 0 ? 
(p) : (p) + ((t) - ((p) % (t)))) @@ -35,13 +36,13 @@ static int compare_fields(const void *e1, const void *e2) { } } -void upb_msg_sortfds(google_protobuf_FieldDescriptorProto **fds, size_t num) +void upb_msgdef_sortfds(google_protobuf_FieldDescriptorProto **fds, size_t num) { qsort(fds, num, sizeof(void*), compare_fields); } -bool upb_msg_init(struct upb_msg *m, google_protobuf_DescriptorProto *d, - struct upb_string fqname, bool sort) +bool upb_msgdef_init(struct upb_msgdef *m, google_protobuf_DescriptorProto *d, + struct upb_string fqname, bool sort) { /* TODO: more complete validation. */ if(!d->set_flags.has.field) return false; @@ -65,11 +66,11 @@ bool upb_msg_init(struct upb_msg *m, google_protobuf_DescriptorProto *d, /* We count on the caller to keep this pointer alive. */ m->field_descriptors[i] = d->field->elements[i]; } - if(sort) upb_msg_sortfds(m->field_descriptors, m->num_fields); + if(sort) upb_msgdef_sortfds(m->field_descriptors, m->num_fields); size_t max_align = 0; for(unsigned int i = 0; i < m->num_fields; i++) { - struct upb_msg_field *f = &m->fields[i]; + struct upb_msg_fielddef *f = &m->fields[i]; google_protobuf_FieldDescriptorProto *fd = m->field_descriptors[i]; struct upb_type_info *type_info = &upb_type_info[fd->type]; @@ -98,7 +99,7 @@ bool upb_msg_init(struct upb_msg *m, google_protobuf_DescriptorProto *d, return true; } -void upb_msg_free(struct upb_msg *m) +void upb_msgdef_free(struct upb_msgdef *m) { upb_inttable_free(&m->fields_by_num); upb_strtable_free(&m->fields_by_name); @@ -106,8 +107,8 @@ void upb_msg_free(struct upb_msg *m) free(m->field_descriptors); } -void upb_msg_ref(struct upb_msg *m, struct upb_msg_field *f, - union upb_symbol_ref ref) { +void upb_msgdef_ref(struct upb_msgdef *m, struct upb_msg_fielddef *f, + union upb_symbol_ref ref) { struct google_protobuf_FieldDescriptorProto *d = upb_msg_field_descriptor(f, m); struct upb_fieldsbynum_entry *int_e = upb_inttable_fast_lookup( @@ -120,175 +121,146 @@ void 
upb_msg_ref(struct upb_msg *m, struct upb_msg_field *f, str_e->f.ref = ref; } -/* Memory management *********************************************************/ +/* Simple, one-shot parsing ***************************************************/ -/* Our memory management scheme is as follows: - * - * All pointers to dynamic memory (strings, arrays, and submessages) are - * expected to be good pointers if they are non-zero, *regardless* of whether - * that field's bit is set! That way we can reuse the memory even if the field - * is unset and then set later. */ - -/* For our memory-managed strings and arrays we store extra information - * (compared to a plain upb_string or upb_array). But the data starts with - * a upb_string and upb_array, so we can overlay onto the regular types. */ -struct mm_upb_string { - struct upb_string s; - /* Track the allocated size, so we know when we need to reallocate. */ - uint32_t size; - /* Our allocated data. Stored separately so that clients can point s.ptr to - * a referenced string, but we can reuse this data later. */ - char *data; -}; +void *upb_msg_new(struct upb_msgdef *md) +{ + void *msg = malloc(md->size); + memset(msg, 0, md->size); + return msg; +} -struct mm_upb_array { - struct upb_array a; - /* Track the allocated size, so we know when we need to reallocate. */ - uint32_t size; -}; +/* Allocation callbacks. */ +static struct upb_array *getarray_cb(void *msg, struct upb_msgdef *md, + struct upb_array *existingval, + struct upb_msg_fielddef *f, + upb_arraylen_t len) +{ + (void)msg; + (void)md; + (void)existingval; /* Don't care -- always zero. */ + (void)len; + struct upb_array *arr = existingval; + if(!arr) { + arr = malloc(sizeof(*arr)); + upb_array_init(arr); + } + upb_array_resize(arr, len, f->type); + return arr; +} -static uint32_t round_up_to_pow2(uint32_t v) +static struct upb_string *getstring_cb(void *msg, struct upb_msgdef *md, + struct upb_string *existingval, + struct upb_msg_fielddef *f, size_t len) { - /* cf. 
http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */ - v--; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - v++; - return v; + (void)msg; + (void)md; + (void)existingval; /* Don't care -- always zero. */ + (void)f; + struct upb_string *str = malloc(sizeof(*str)); + str->ptr = malloc(len); + return str; } -void *upb_msgdata_new(struct upb_msg *m) +static void *getmsg_cb(void *msg, struct upb_msgdef *md, + void *existingval, struct upb_msg_fielddef *f) { - void *msg = malloc(m->size); - memset(msg, 0, m->size); /* Clear all pointers, values, and set bits. */ - return msg; + (void)msg; + (void)md; + (void)existingval; /* Don't care -- always zero. */ + return upb_msg_new(f->ref.msg); } -static void free_value(union upb_value_ptr p, struct upb_msg_field *f, - bool free_submsgs) +void *upb_msg_parsenew(struct upb_msgdef *md, struct upb_string *s) { - switch(f->type) { - case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_STRING: - case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_BYTES: { - struct mm_upb_string *mm_str = (void*)*p.str; - if(mm_str) { - free(mm_str->data); - free(mm_str); - } - break; - } - case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE: - case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_GROUP: - if(free_submsgs) upb_msgdata_free(*p.msg, f->ref.msg, free_submsgs); - break; - default: break; /* For non-dynamic types, do nothing. 
*/ + struct upb_msg_parser mp; + void *msg = upb_msg_new(md); + upb_msg_parser_reset(&mp, msg, md, false); + mp.getarray_cb = getarray_cb; + mp.getstring_cb = getstring_cb; + mp.getmsg_cb = getmsg_cb; + size_t read; + upb_status_t status = upb_msg_parser_parse(&mp, s->ptr, s->byte_len, &read); + if(status == UPB_STATUS_OK && read == s->byte_len) { + return msg; + } else { + upb_msg_free(msg, md); + return NULL; } } -void upb_msgdata_free(void *data, struct upb_msg *m, bool free_submsgs) +/* For simple, one-shot parsing we assume that a dynamic field exists (and + * needs to be freed) iff its set bit is set. */ +static void free_value(union upb_value_ptr p, struct upb_msg_fielddef *f) +{ + if(upb_isarray(f)) { + free((*p.str)->ptr); + free(*p.str); + } else if(upb_issubmsg(f)) { + upb_msg_free(*p.msg, f->ref.msg); + } +} + +void upb_msg_free(void *data, struct upb_msgdef *m) { if(!data) return; /* A very free-like thing to do. */ for(unsigned int i = 0; i < m->num_fields; i++) { - struct upb_msg_field *f = &m->fields[i]; + struct upb_msg_fielddef *f = &m->fields[i]; + if(!upb_msg_isset(data, f)) continue; union upb_value_ptr p = upb_msg_getptr(data, f); - if(f->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REPEATED) { - if(*p.arr) { - for(uint32_t j = 0; j < (*p.arr)->len; j++) - free_value(upb_array_getelementptr(*p.arr, j, f->type), - f, free_submsgs); - free((*p.arr)->elements._void); - free(*p.arr); - } + if(upb_isarray(f)) { + assert(*p.arr); + for(upb_arraylen_t j = 0; j < (*p.arr)->len; j++) + free_value(upb_array_getelementptr(*p.arr, j, f->type), f); + free((*p.arr)->elements._void); + free(*p.arr); } else { - free_value(p, f, free_submsgs); + free_value(p, f); } } free(data); } -void upb_msg_reuse_str(struct upb_string **str, uint32_t size) -{ - if(!*str) { - *str = malloc(sizeof(struct mm_upb_string)); - memset(*str, 0, sizeof(struct mm_upb_string)); - } - struct mm_upb_string *s = (void*)*str; - if(s->size < size) { - size = max(16, 
round_up_to_pow2(size)); - s->data = realloc(s->data, size); - s->size = size; - } - s->s.ptr = s->data; -} +/* Parsing. ******************************************************************/ -void upb_msg_reuse_array(struct upb_array **arr, uint32_t size, upb_field_type_t t) +/* Helper function that returns a pointer to where the next value for field "f" + * should be stored, taking into account whether f is an array that may need to + * be allocated or resized. */ +static union upb_value_ptr get_value_ptr(void *data, struct upb_msgdef *m, + struct upb_msg_fielddef *f, + upb_msg_getarray_cb_t getarray_cb) { - if(!*arr) { - *arr = malloc(sizeof(struct mm_upb_array)); - memset(*arr, 0, sizeof(struct mm_upb_array)); - } - struct mm_upb_array *a = (void*)*arr; - if(a->size < size) { - size = max(4, round_up_to_pow2(size)); - size_t type_size = upb_type_info[t].size; - a->a.elements._void = realloc(a->a.elements._void, size * type_size); - /* Zero any newly initialized memory. */ - memset(UPB_INDEX(a->a.elements._void, a->size, type_size), 0, - (size - a->size) * type_size); - a->size = size; + union upb_value_ptr p = upb_msg_getptr(data, f); + if(upb_isarray(f)) { + size_t len = upb_msg_isset(data, f) ? (*p.arr)->len : 0; + *p.arr = getarray_cb(data, m, *p.arr, f, len + 1); + p = upb_array_getelementptr(*p.arr, len, f->type); } + return p; } -void upb_msg_reuse_strref(struct upb_string **str) { upb_msg_reuse_str(str, 0); } - -void upb_msg_reuse_submsg(void **msg, struct upb_msg *m) -{ - if(!*msg) *msg = upb_msgdata_new(m); -} - -/* Parsing. ******************************************************************/ +/* Callbacks for the stream parser. 
*/ static upb_field_type_t tag_cb(void *udata, struct upb_tag *tag, void **user_field_desc) { - struct upb_msg_parse_state *s = udata; - struct upb_msg_field *f = upb_msg_fieldbynum(s->top->m, tag->field_number); + struct upb_msg_parser *mp = udata; + struct upb_msg_fielddef *f = upb_msg_fieldbynum(mp->top->m, tag->field_number); if(!f || !upb_check_type(tag->wire_type, f->type)) return 0; /* Skip unknown or fields of the wrong type. */ *user_field_desc = f; return f->type; } -/* Returns a pointer to where the next value for field "f" should be stored, - * taking into account whether f is an array that may need to be reallocatd. */ -static union upb_value_ptr get_value_ptr(void *data, struct upb_msg_field *f) -{ - union upb_value_ptr p = upb_msg_getptr(data, f); - if(upb_isarray(f)) { - size_t len = upb_msg_isset(data, f) ? (*p.arr)->len : 0; - upb_msg_reuse_array(p.arr, len+1, f->type); - (*p.arr)->len = len + 1; - assert(p._void); - p = upb_array_getelementptr(*p.arr, len, f->type); - assert(p._void); - } - assert(p._void); - return p; -} - static upb_status_t value_cb(void *udata, uint8_t *buf, uint8_t *end, void *user_field_desc, uint8_t **outbuf) { - struct upb_msg_parse_state *s = udata; - struct upb_msg_field *f = user_field_desc; - union upb_value_ptr p = get_value_ptr(s->top->data, f); - upb_msg_set(s->top->data, f); + struct upb_msg_parser *mp = udata; + struct upb_msg_fielddef *f = user_field_desc; + void *msg = mp->top->msg; + union upb_value_ptr p = get_value_ptr(msg, mp->top->m, f, mp->getarray_cb); + upb_msg_set(msg, f); UPB_CHECK(upb_parse_value(buf, end, f->type, p, outbuf)); - //google_protobuf_FieldDescriptorProto *fd = upb_msg_field_descriptor(f, s->top->m); - //upb_text_printfield(&s->p, *fd->name, f->type, upb_deref(p, f->type), stdout); return UPB_STATUS_OK; } @@ -296,60 +268,53 @@ static void str_cb(void *udata, uint8_t *str, size_t avail_len, size_t total_len, void *udesc) { - struct upb_msg_parse_state *s = udata; - struct upb_msg_field *f 
= udesc; - union upb_value_ptr p = get_value_ptr(s->top->data, f); - upb_msg_set(s->top->data, f); + struct upb_msg_parser *mp = udata; + struct upb_msg_fielddef *f = udesc; + void *msg = mp->top->msg; + union upb_value_ptr p = get_value_ptr(msg, mp->top->m, f, mp->getarray_cb); + upb_msg_set(msg, f); if(avail_len != total_len) abort(); /* TODO: support streaming. */ - if(s->byref) { - upb_msg_reuse_strref(p.str); + if(avail_len == total_len && mp->byref) { + *p.str = mp->getstring_cb(msg, mp->top->m, *p.str, f, 0); (*p.str)->ptr = (char*)str; (*p.str)->byte_len = avail_len; } else { - upb_msg_reuse_str(p.str, avail_len); + *p.str = mp->getstring_cb(msg, mp->top->m, *p.str, f, total_len); memcpy((*p.str)->ptr, str, avail_len); (*p.str)->byte_len = avail_len; } - //google_protobuf_FieldDescriptorProto *fd = upb_msg_field_descriptor(f, s->top->m); - //upb_text_printfield(&s->p, *fd->name, f->type, upb_deref(p, fd->type), stdout); } static void submsg_start_cb(void *udata, void *user_field_desc) { - struct upb_msg_parse_state *s = udata; - struct upb_msg_field *f = user_field_desc; - struct upb_msg *m = f->ref.msg; - void *data = s->top->data; /* The message from the existing frame. 
*/ - union upb_value_ptr p = get_value_ptr(data, f); - upb_msg_reuse_submsg(p.msg, m); - if(!upb_msg_isset(data, f) || !s->merge) - upb_msg_clear(*p.msg, m); - upb_msg_set(data, f); - s->top++; - s->top->m = m; - s->top->data = *p.msg; - //upb_text_push(&s->p, *s->top->m->descriptor->name, stdout); + struct upb_msg_parser *mp = udata; + struct upb_msg_fielddef *f = user_field_desc; + struct upb_msgdef *oldmsgdef = mp->top->m; + void *oldmsg = mp->top->msg; + union upb_value_ptr p = get_value_ptr(oldmsg, oldmsgdef, f, mp->getarray_cb); + upb_msg_set(oldmsg, f); + *p.msg = mp->getmsg_cb(oldmsg, oldmsgdef, *p.msg, f); + mp->top++; + mp->top->m = f->ref.msg; + mp->top->msg = *p.msg; } static void submsg_end_cb(void *udata) { - struct upb_msg_parse_state *s = udata; - s->top--; - //upb_text_pop(&s->p, stdout); + struct upb_msg_parser *mp = udata; + mp->top--; } -void upb_msg_parse_reset(struct upb_msg_parse_state *s, void *msg, - struct upb_msg *m, bool merge, bool byref) +/* Externally-visible functions for the msg parser. 
*/ + +void upb_msg_parser_reset(struct upb_msg_parser *s, void *msg, + struct upb_msgdef *m, bool byref) { - upb_parse_reset(&s->s, s); - upb_text_printer_init(&s->p, false); - s->merge = merge; + upb_stream_parser_reset(&s->s, s); s->byref = byref; - if(!merge && msg == NULL) msg = upb_msgdata_new(m); - upb_msg_clear(msg, m); s->top = s->stack; s->top->m = m; - s->top->data = msg; + s->top->msg = msg; s->s.tag_cb = tag_cb; s->s.value_cb = value_cb; s->s.str_cb = str_cb; @@ -357,38 +322,10 @@ void upb_msg_parse_reset(struct upb_msg_parse_state *s, void *msg, s->s.submsg_end_cb = submsg_end_cb; } -void upb_msg_parse_init(struct upb_msg_parse_state *s, void *msg, - struct upb_msg *m, bool merge, bool byref) -{ - upb_parse_init(&s->s, s); - upb_msg_parse_reset(s, msg, m, merge, byref); -} - -void upb_msg_parse_free(struct upb_msg_parse_state *s) -{ - upb_parse_free(&s->s); -} - -upb_status_t upb_msg_parse(struct upb_msg_parse_state *s, - void *data, size_t len, size_t *read) +upb_status_t upb_msg_parser_parse(struct upb_msg_parser *s, + void *data, size_t len, size_t *read) { - return upb_parse(&s->s, data, len, read); -} - -void *upb_alloc_and_parse(struct upb_msg *m, struct upb_string *str, bool byref) -{ - struct upb_msg_parse_state s; - void *msg = upb_msgdata_new(m); - upb_msg_parse_init(&s, msg, m, false, byref); - size_t read; - upb_status_t status = upb_msg_parse(&s, str->ptr, str->byte_len, &read); - upb_msg_parse_free(&s); - if(status == UPB_STATUS_OK && read == str->byte_len) { - return msg; - } else { - upb_msg_free(msg); - return NULL; - } + return upb_stream_parser_parse(&s->s, data, len, read); } /* Serialization. ************************************************************/ @@ -405,12 +342,12 @@ struct upb_msgsizes { /* Declared below -- this and get_valuesize are mutually recursive. */ static size_t get_msgsize(struct upb_msgsizes *sizes, void *data, - struct upb_msg *m); + struct upb_msgdef *m); /* Returns a size of a value as it will be serialized. 
Does *not* include * the size of the tag -- that is already accounted for. */ static size_t get_valuesize(struct upb_msgsizes *sizes, union upb_value_ptr p, - struct upb_msg_field *f, + struct upb_msg_fielddef *f, google_protobuf_FieldDescriptorProto *fd) { switch(f->type) { @@ -448,12 +385,12 @@ static size_t get_valuesize(struct upb_msgsizes *sizes, union upb_value_ptr p, * message. However it also stores the results of each level of the recursion * in sizes, because we need all of this intermediate information later. */ static size_t get_msgsize(struct upb_msgsizes *sizes, void *data, - struct upb_msg *m) + struct upb_msgdef *m) { size_t size = 0; /* We iterate over fields and arrays in reverse order. */ for(int32_t i = m->num_fields - 1; i >= 0; i--) { - struct upb_msg_field *f = &m->fields[i]; + struct upb_msg_fielddef *f = &m->fields[i]; google_protobuf_FieldDescriptorProto *fd = upb_msg_field_descriptor(f, m); if(!upb_msg_isset(data, f)) continue; union upb_value_ptr p = upb_msg_getptr(data, f); @@ -480,7 +417,7 @@ static size_t get_msgsize(struct upb_msgsizes *sizes, void *data, return size; } -void upb_msgsizes_read(struct upb_msgsizes *sizes, void *data, struct upb_msg *m) +void upb_msgsizes_read(struct upb_msgsizes *sizes, void *data, struct upb_msgdef *m) { get_msgsize(sizes, data, m); } @@ -507,7 +444,7 @@ struct upb_msg_serialize_state { struct { int field_iter; int elem_iter; - struct upb_msg *m; + struct upb_msgdef *m; void *msg; } stack[UPB_MAX_NESTING], *top, *limit; }; @@ -523,7 +460,7 @@ void upb_msg_serialize_free(struct upb_msg_serialize_state *s) } void upb_msg_serialize_init(struct upb_msg_serialize_state *s, void *data, - struct upb_msg *m, struct upb_msgsizes *sizes) + struct upb_msgdef *m, struct upb_msgsizes *sizes) { (void)s; (void)data; @@ -532,7 +469,7 @@ void upb_msg_serialize_init(struct upb_msg_serialize_state *s, void *data, } static upb_status_t serialize_tag(uint8_t *buf, uint8_t *end, - struct upb_msg_field *f, uint8_t 
**outptr) + struct upb_msg_fielddef *f, uint8_t **outptr) { /* TODO: need to have the field number also. */ UPB_CHECK(upb_put_UINT32(buf, end, f->type, outptr)); @@ -554,10 +491,10 @@ upb_status_t upb_msg_serialize(struct upb_msg_serialize_state *s, int i = s->top->field_iter; //int j = s->top->elem_iter; void *msg = s->top->msg; - struct upb_msg *m = s->top->m; + struct upb_msgdef *m = s->top->m; while(buf < end) { - struct upb_msg_field *f = &m->fields[i]; + struct upb_msg_fielddef *f = &m->fields[i]; union upb_value_ptr p = upb_msg_getptr(msg, f); serialize_tag(buf, end, f, &buf); if(f->type == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE) { @@ -571,6 +508,7 @@ upb_status_t upb_msg_serialize(struct upb_msg_serialize_state *s, return UPB_STATUS_OK; } + /* Comparison. ***************************************************************/ bool upb_value_eql(union upb_value_ptr p1, union upb_value_ptr p2, @@ -607,7 +545,7 @@ bool upb_value_eql(union upb_value_ptr p1, union upb_value_ptr p2, } bool upb_array_eql(struct upb_array *arr1, struct upb_array *arr2, - struct upb_msg_field *f, bool recursive) + struct upb_msg_fielddef *f, bool recursive) { if(arr1->len != arr2->len) return false; if(upb_issubmsg(f)) { @@ -628,7 +566,7 @@ bool upb_array_eql(struct upb_array *arr1, struct upb_array *arr2, return true; } -bool upb_msg_eql(void *data1, void *data2, struct upb_msg *m, bool recursive) +bool upb_msg_eql(void *data1, void *data2, struct upb_msgdef *m, bool recursive) { /* Must have the same fields set. TODO: is this wrong? Should we also * consider absent defaults equal to explicitly set defaults? */ @@ -640,20 +578,66 @@ bool upb_msg_eql(void *data1, void *data2, struct upb_msg *m, bool recursive) * padding) and memcmp the masked messages. 
*/ for(uint32_t i = 0; i < m->num_fields; i++) { - struct upb_msg_field *f = &m->fields[i]; + struct upb_msg_fielddef *f = &m->fields[i]; if(!upb_msg_isset(data1, f)) continue; union upb_value_ptr p1 = upb_msg_getptr(data1, f); union upb_value_ptr p2 = upb_msg_getptr(data2, f); - if(f->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REPEATED) { + if(upb_isarray(f)) { if(!upb_array_eql(*p1.arr, *p2.arr, f, recursive)) return false; - } else { - if(upb_issubmsg(f)) { - if(recursive && !upb_msg_eql(p1.msg, p2.msg, f->ref.msg, recursive)) - return false; - } else if(!upb_value_eql(p1, p2, f->type)) { + } else if(upb_issubmsg(f)) { + if(recursive && !upb_msg_eql(p1.msg, p2.msg, f->ref.msg, recursive)) return false; - } + } else if(!upb_value_eql(p1, p2, f->type)) { + return false; } } return true; } + + +static void printval(struct upb_text_printer *printer, union upb_value_ptr p, + struct upb_msg_fielddef *f, + google_protobuf_FieldDescriptorProto *fd, + FILE *stream); + +static void printmsg(struct upb_text_printer *printer, void *msg, + struct upb_msgdef *m, FILE *stream) +{ + for(uint32_t i = 0; i < m->num_fields; i++) { + struct upb_msg_fielddef *f = &m->fields[i]; + google_protobuf_FieldDescriptorProto *fd = upb_msg_field_descriptor(f, m); + if(!upb_msg_isset(msg, f)) continue; + union upb_value_ptr p = upb_msg_getptr(msg, f); + if(upb_isarray(f)) { + struct upb_array *arr = *p.arr; + for(uint32_t j = 0; j < arr->len; j++) { + union upb_value_ptr elem_p = upb_array_getelementptr(arr, j, f->type); + printval(printer, elem_p, f, fd, stream); + } + } else { + printval(printer, p, f, fd, stream); + } + } +} + +static void printval(struct upb_text_printer *printer, union upb_value_ptr p, + struct upb_msg_fielddef *f, + google_protobuf_FieldDescriptorProto *fd, + FILE *stream) +{ + if(upb_issubmsg(f)) { + upb_text_push(printer, *fd->name, stream); + printmsg(printer, *p.msg, f->ref.msg, stream); + upb_text_pop(printer, stream); + } else { + 
upb_text_printfield(printer, *fd->name, f->type, upb_deref(p, f->type), stream); + } +} + +void upb_msg_print(void *data, struct upb_msgdef *m, bool single_line, + FILE *stream) +{ + struct upb_text_printer printer; + upb_text_printer_init(&printer, single_line); + printmsg(&printer, data, m, stream); +} diff --git a/src/upb_msg.h b/src/upb_msg.h index 043af23..6a0568a 100644 --- a/src/upb_msg.h +++ b/src/upb_msg.h @@ -3,93 +3,48 @@ * * Copyright (c) 2009 Joshua Haberman. See LICENSE for details. * - * A upb_msg provides a full description of a message as defined in a .proto - * file. It supports many features and operations for dealing with proto + * A upb_msgdef provides a full description of a message type as defined in a + * .proto file. Using a upb_msgdef, it is possible to treat an arbitrary hunk + * of memory (a void*) as a protobuf of the given type. We will call this + * void* a upb_msg in the context of this interface. + * + * Clients generally do not construct or destruct upb_msgdef objects directly. + * They are managed by upb_contexts, and clients can obtain upb_msgdef pointers + * directly from a upb_context. + * + * A upb_msg is READ-ONLY, and the upb_msgdef functions in this file provide + * read-only access. For a mutable message, or for a message that you can take + * a reference to to prevent its destruction, see upb_mm_msg.h, which is a + * layer on top of upb_msg that adds memory management semantics. + * + * upb_msgdef supports many features and operations for dealing with proto * messages: * - reflection over .proto types at runtime (list fields, get names, etc). * - an in-memory byte-level format for efficiently storing and accessing msgs. - * - serializing and deserializing from the in-memory format to a protobuf. - * - optional memory management for handling strings, arrays, and submessages. - * - * Throughout this file, the following convention is used: - * - "struct upb_msg *m" describes a message type (name, list of fields, etc). 
- * - "void *data" is an actual message stored using the in-memory format. + * - serializing from the in-memory format to a protobuf. + * - parsing from a protobuf to an in-memory data structure (you either + * supply callbacks for allocating/repurposing memory or use a simplified + * version that parses into newly-allocated memory). * * The in-memory format is very much like a C struct that you can define at * run-time, but also supports reflection. Like C structs it supports * offset-based access, as opposed to the much slower name-based lookup. The * format stores both the values themselves and bits describing whether each - * field is set or not. For example: - * - * parsed message Foo { - * optional bool a = 1; - * repeated uint32 b = 2; - * optional Bar c = 3; - * } + * field is set or not. * - * The in-memory layout for this message on a 32-bit machine will be something - * like: - * - * Foo - * +------------------------+ - * | set_flags a:1, b:1, c:1| - * +------------------------+ - * | bool a (1 byte) | - * +------------------------+ - * | padding (3 bytes) | - * +------------------------+ upb_array - * | upb_array* b (4 bytes) | ----> +----------------------------+ - * +------------------------+ | uint32* elements (4 bytes) | ---+ - * | Bar* c (4 bytes) | +----------------------------+ | - * +------------------------+ | uint32 size (4 bytes) | | - * +----------------------------+ | - * | - * -----------------------------------------------------------------+ - * | - * V - * uint32 array - * +----+----+----+----+----+----+ - * | e1 | e2 | e3 | e4 | e5 | e6 | - * +----+----+----+----+----+----+ - * - * And the corresponding C structure (as emitted by the proto compiler) would be: - * - * struct Foo { - * union { - * uint8_t bytes[1]; - * struct { - * bool a:1; - * bool b:1; - * bool c:1; - * } has; - * } set_flags; - * bool a; - * upb_uint32_array *b; - * Bar *c; - * } + * For a more in-depth description of the in-memory format, see: + * 
http://wiki.github.com/haberman/upb/inmemoryformat * * Because the C struct emitted by the upb compiler uses exactly the same * byte-level format as the reflection interface, you can access the same hunk * of memory either way. The C struct provides maximum performance and static - * type safety; upb_msg provides flexibility. + * type safety; upb_msgdef provides flexibility. * * The in-memory format has no interoperability guarantees whatsoever, except * that a single version of upb will interoperate with itself. Don't even * think about persisting the in-memory format or sending it anywhere. That's * what serialized protobufs are for! The in-memory format is just that -- an * in-memory representation that allows for fast access. - * - * The in-memory format is carefully designed to *not* mandate any particular - * memory management scheme. This should make it easier to integrate with - * existing memory management schemes, or to perform advanced techniques like - * reference counting, garbage collection, and string references. Different - * clients can read each others messages regardless of what memory management - * scheme each is using. - * - * A memory management scheme is provided for convenience, and it is used by - * default by the stock message parser. Clients can substitute their own - * memory management scheme into this parser without any loss of generality - * or performance. */ #ifndef UPB_MSG_H_ @@ -108,21 +63,9 @@ extern "C" { /* Message definition. ********************************************************/ -/* Structure that describes a single field in a message. This structure is very - * consciously designed to fit into 12/16 bytes (32/64 bit, respectively), - * because copies of this struct are in the hash table that is read in the - * critical path of parsing. Minimizing the size of this struct increases - * cache-friendliness. */ -struct upb_msg_field { - union upb_symbol_ref ref; - uint32_t byte_offset; /* Where to find the data.
*/ - uint16_t field_index; /* Indexes upb_msg.fields. Also indicates set bit */ - upb_field_type_t type; /* Copied from descriptor for cache-friendliness. */ - upb_label_t label; -}; - +struct upb_msg_fielddef; /* Structure that describes a single .proto message type. */ -struct upb_msg { +struct upb_msgdef { struct google_protobuf_DescriptorProto *descriptor; struct upb_string fqname; /* Fully qualified. */ size_t size; @@ -131,93 +74,65 @@ struct upb_msg { uint32_t num_required_fields; /* Required fields have the lowest set bytemasks. */ struct upb_inttable fields_by_num; struct upb_strtable fields_by_name; - struct upb_msg_field *fields; + struct upb_msg_fielddef *fields; struct google_protobuf_FieldDescriptorProto **field_descriptors; }; -/* The num->field and name->field maps in upb_msg allow fast lookup of fields - * by number or name. These lookups are in the critical path of parsing and - * field lookup, so they must be as fast as possible. To make these more - * cache-friendly, we put the data in the table by value. */ -struct upb_fieldsbynum_entry { - struct upb_inttable_entry e; - struct upb_msg_field f; +/* Structure that describes a single field in a message. This structure is very + * consciously designed to fit into 12/16 bytes (32/64 bit, respectively), + * because copies of this struct are in the hash table that is read in the + * critical path of parsing. Minimizing the size of this struct increases + * cache-friendliness. */ +struct upb_msg_fielddef { + union upb_symbol_ref ref; + uint32_t byte_offset; /* Where to find the data. */ + uint16_t field_index; /* Indexes upb_msgdef.fields and indicates set bit */ + upb_field_type_t type; /* Copied from descriptor for cache-friendliness. 
*/ + upb_label_t label; }; -struct upb_fieldsbyname_entry { - struct upb_strtable_entry e; - struct upb_msg_field f; -}; +INLINE bool upb_issubmsg(struct upb_msg_fielddef *f) { + return upb_issubmsgtype(f->type); +} +INLINE bool upb_isstring(struct upb_msg_fielddef *f) { + return upb_isstringtype(f->type); +} +INLINE bool upb_isarray(struct upb_msg_fielddef *f) { + return f->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REPEATED; +} -/* Can be used to retrieve a field descriptor given the upb_msg_field ref. */ +/* Can be used to retrieve a field descriptor given the upb_msg_fielddef. */ INLINE struct google_protobuf_FieldDescriptorProto *upb_msg_field_descriptor( - struct upb_msg_field *f, struct upb_msg *m) { + struct upb_msg_fielddef *f, struct upb_msgdef *m) { return m->field_descriptors[f->field_index]; } -/* Initializes/frees a upb_msg. Usually this will be called by upb_context, and - * clients will not have to construct one directly. - * - * Caller retains ownership of d, but the msg will contain references to it, so - * it must outlive the msg. Note that init does not resolve upb_msg_field.ref - * the caller should do that post-initialization by calling upb_msg_ref() - * below. - * - * fqname indicates the fully-qualified name of this message. Ownership of - * fqname passes to the msg, but the msg will contain references to it, so it - * must outlive the msg. - * - * sort indicates whether or not it is safe to reorder the fields from the order - * they appear in d. This should be false if code has been compiled against a - * header for this type that expects the given order. */ -bool upb_msg_init(struct upb_msg *m, struct google_protobuf_DescriptorProto *d, - struct upb_string fqname, bool sort); -void upb_msg_free(struct upb_msg *m); - -/* Sort the given field descriptors in-place, according to what we think is an - * optimal ordering of fields. This can change from upb release to upb release. - * This is meant for internal use. 
*/ -void upb_msg_sortfds(google_protobuf_FieldDescriptorProto **fds, size_t num); +/* Field access. **************************************************************/ -/* Clients use this function on a previously initialized upb_msg to resolve the - * "ref" field in the upb_msg_field. Since messages can refer to each other in - * mutually-recursive ways, this step must be separated from initialization. */ -void upb_msg_ref(struct upb_msg *m, struct upb_msg_field *f, union upb_symbol_ref ref); +/* Note that these only provide access to fields that are directly in the msg + * itself. For dynamic fields (strings, arrays, and submessages) it will be + * necessary to dereference the returned values. */ -/* Looks up a field by name or number. While these are written to be as fast - * as possible, it will still be faster to cache the results of this lookup if - * possible. These return NULL if no such field is found. */ -INLINE struct upb_msg_field *upb_msg_fieldbynum(struct upb_msg *m, - uint32_t number) { - struct upb_fieldsbynum_entry *e = - (struct upb_fieldsbynum_entry*)upb_inttable_fast_lookup( - &m->fields_by_num, number, sizeof(struct upb_fieldsbynum_entry)); - return e ? &e->f : NULL; -} -INLINE struct upb_msg_field *upb_msg_fieldbyname(struct upb_msg *m, - struct upb_string *name) { - struct upb_fieldsbyname_entry *e = - (struct upb_fieldsbyname_entry*)upb_strtable_lookup( - &m->fields_by_name, name); - return e ? &e->f : NULL; +/* Returns a pointer to a specific field in a message. 
*/ +INLINE union upb_value_ptr upb_msg_getptr(void *msg, struct upb_msg_fielddef *f) { + union upb_value_ptr p; + p._void = ((char*)msg + f->byte_offset); + return p; } -INLINE bool upb_issubmsg(struct upb_msg_field *f) { - return upb_issubmsgtype(f->type); -} -INLINE bool upb_isstring(struct upb_msg_field *f) { - return upb_isstringtype(f->type); -} -INLINE bool upb_isarray(struct upb_msg_field *f) { - return f->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REPEATED; +/* Returns a specific field in a message. */ +INLINE union upb_value upb_msg_get(void *msg, struct upb_msg_fielddef *f) { + return upb_deref(upb_msg_getptr(msg, f), f->type); } /* "Set" flag reading and writing. *******************************************/ -/* Please note that these functions do not perform any memory management or in - * any way ensure that the fields are valid. They *only* test/set/clear a bit - * that indicates whether the field is set or not. */ +/* All upb code and code using upb should guarantee that the set flags are + * always valid. It should always be the case that if a flag's field is set + * for a dynamic field that the pointer is valid. + * + * Clients should never set fields on a plain upb_msg, only on a upb_mm_msg. */ /* Returns the byte offset where we store whether this field is set. */ INLINE size_t upb_isset_offset(uint32_t field_index) { @@ -230,135 +145,162 @@ INLINE uint8_t upb_isset_mask(uint32_t field_index) { } /* Returns true if the given field is set, false otherwise. */ -INLINE void upb_msg_set(void *s, struct upb_msg_field *f) +INLINE void upb_msg_set(void *msg, struct upb_msg_fielddef *f) { - ((char*)s)[upb_isset_offset(f->field_index)] |= upb_isset_mask(f->field_index); + ((char*)msg)[upb_isset_offset(f->field_index)] |= upb_isset_mask(f->field_index); } /* Clears the set bit for this field in the given message.
*/ -INLINE void upb_msg_unset(void *s, struct upb_msg_field *f) +INLINE void upb_msg_unset(void *msg, struct upb_msg_fielddef *f) { - ((char*)s)[upb_isset_offset(f->field_index)] &= ~upb_isset_mask(f->field_index); + ((char*)msg)[upb_isset_offset(f->field_index)] &= ~upb_isset_mask(f->field_index); } /* Tests whether the given field is set. */ -INLINE bool upb_msg_isset(void *s, struct upb_msg_field *f) +INLINE bool upb_msg_isset(void *msg, struct upb_msg_fielddef *f) { - return ((char*)s)[upb_isset_offset(f->field_index)] & upb_isset_mask(f->field_index); + return ((char*)msg)[upb_isset_offset(f->field_index)] & upb_isset_mask(f->field_index); } /* Returns true if *all* required fields are set, false otherwise. */ -INLINE bool upb_msg_all_required_fields_set(void *s, struct upb_msg *m) +INLINE bool upb_msg_all_required_fields_set(void *msg, struct upb_msgdef *m) { int num_fields = m->num_required_fields; int i = 0; while(num_fields > 8) { - if(((uint8_t*)s)[i++] != 0xFF) return false; + if(((uint8_t*)msg)[i++] != 0xFF) return false; num_fields -= 8; } - if(((uint8_t*)s)[i] != (1 << num_fields) - 1) return false; + if(((uint8_t*)msg)[i] != (1 << num_fields) - 1) return false; return true; } /* Clears the set bit for all fields. */ -INLINE void upb_msg_clear(void *s, struct upb_msg *m) +INLINE void upb_msg_clear(void *msg, struct upb_msgdef *m) { - memset(s, 0, m->set_flags_bytes); + memset(msg, 0, m->set_flags_bytes); } -/* Scalar (non-array) data access. ********************************************/ +/* Number->field and name->field lookup. *************************************/ -/* Returns a pointer to a specific field in a message. */ -INLINE union upb_value_ptr upb_msg_getptr(void *data, struct upb_msg_field *f) { - union upb_value_ptr p; - p._void = ((char*)data + f->byte_offset); - return p; -} +/* The num->field and name->field maps in upb_msgdef allow fast lookup of fields + * by number or name. 
These lookups are in the critical path of parsing and + * field lookup, so they must be as fast as possible. To make these more + * cache-friendly, we put the data in the table by value. */ -/* Returns a a specific field in a message. */ -INLINE union upb_value upb_msg_get(void *data, struct upb_msg_field *f) { - return upb_deref(upb_msg_getptr(data, f), f->type); -} +struct upb_fieldsbynum_entry { + struct upb_inttable_entry e; + struct upb_msg_fielddef f; +}; -/* Memory management *********************************************************/ +struct upb_fieldsbyname_entry { + struct upb_strtable_entry e; + struct upb_msg_fielddef f; +}; -/* One important note about these memory management routines: they must be used - * completely or not at all (for each message). In other words, you can't - * allocate your own message and then free it with upb_msgdata_free. As - * another example, you can't point a field to your own string and then call - * upb_msg_reuse_str. */ +/* Looks up a field by name or number. While these are written to be as fast + * as possible, it will still be faster to cache the results of this lookup if + * possible. These return NULL if no such field is found. */ +INLINE struct upb_msg_fielddef *upb_msg_fieldbynum(struct upb_msgdef *m, + uint32_t number) { + struct upb_fieldsbynum_entry *e = + (struct upb_fieldsbynum_entry*)upb_inttable_fast_lookup( + &m->fields_by_num, number, sizeof(struct upb_fieldsbynum_entry)); + return e ? &e->f : NULL; +} + +INLINE struct upb_msg_fielddef *upb_msg_fieldbyname(struct upb_msgdef *m, + struct upb_string *name) { + struct upb_fieldsbyname_entry *e = + (struct upb_fieldsbyname_entry*)upb_strtable_lookup( + &m->fields_by_name, name); + return e ? &e->f : NULL; +} -/* Allocates and frees message data, respectively. Newly allocated data is - * initialized to empty. Freeing a message always frees string data, but - * the client can decide whether or not submessages should be deleted. 
*/ -void *upb_msgdata_new(struct upb_msg *m); -void upb_msgdata_free(void *data, struct upb_msg *m, bool free_submsgs); -/* Given a pointer to the appropriate field of the message or array, these - * functions will lazily allocate memory for a string, array, or submessage. - * If the previously allocated memory is big enough, it will reuse it without - * re-allocating. See upb_msg.c for example usage. */ +/* Simple, one-shot parsing ***************************************************/ -/* Reuse a string of at least the given size. */ -void upb_msg_reuse_str(struct upb_string **str, uint32_t size); -/* Like the previous, but assumes that the string will be by reference, so - * doesn't allocate memory for the string itself. */ -void upb_msg_reuse_strref(struct upb_string **str); +/* A simple interface for parsing into a newly-allocated message. This + * interface should only be used when the message will be read-only with + * respect to memory management (eg. won't add or remove internal references to + * dynamic memory). For more flexible (but also more complicated) interfaces, + * see below and in upb_mm_msg.h. */ -/* Reuse an array of at least the given size, with the given type. */ -void upb_msg_reuse_array(struct upb_array **arr, uint32_t size, - upb_field_type_t t); +/* Parses the protobuf in s (which is expected to be complete) and allocates + * new message data to hold it. If byref is set, strings in the returned + * upb_msg will reference s instead of copying from it, but this requires that + * s will live for as long as the returned message does. */ +void *upb_msg_parsenew(struct upb_msgdef *m, struct upb_string *s); -/* Reuse a submessage of the given type. */ -void upb_msg_reuse_submsg(void **msg, struct upb_msg *m); +/* This function should be used to free messages that were parsed with + * upb_msg_parsenew. It will free the message appropriately (including all + * submessages). */ +void upb_msg_free(void *msg, struct upb_msgdef *m); -/* Parsing. 
******************************************************************/ -/* This is all just a layer on top of the stream-oriented facility in - * upb_parse.h. */ +/* Parsing with (re)allocation callbacks. *************************************/ -struct upb_msg_parse_frame { - struct upb_msg *m; - void *data; +/* This interface parses protocol buffers into upb_msgs, but allows the client + * to supply allocation callbacks whenever the parser needs to obtain a string, + * array, or submsg (a "dynamic field"). If the parser sees that a dynamic + * field is already present (its "set bit" is set) it will use that, otherwise + * it will call the allocation callback to obtain one. + * + * This may seem trivial (since nearly all clients will use malloc and free for + * memory management), but the allocation callback can be used for more than + * just allocation. If we are parsing data into an existing upb_msg, the + * allocation callback can examine any existing memory that is allocated for + * the dynamic field and determine whether it can reuse it. It can also + * perform memory management like unrefing the existing field or refing the new. + * + * This parser is layered on top of the event-based parser in upb_parse.h. The + * parser in upb_mm_msg.h is layered on top of this parser. + * + * This parser is fully streaming-capable. */ + +typedef struct upb_array *(*upb_msg_getarray_cb_t)( + void *msg, struct upb_msgdef *m, + struct upb_array *existingval, struct upb_msg_fielddef *f, + upb_arraylen_t size); + +/* Callback to allocate a string of size >=len. If len==0 then the client can + * assume that the parser intends to reference the memory instead of copying + * it.
*/ +typedef struct upb_string *(*upb_msg_getstring_cb_t)( + void *msg, struct upb_msgdef *m, + struct upb_string *existingval, struct upb_msg_fielddef *f, size_t len); + +typedef void *(*upb_msg_getmsg_cb_t)( + void *msg, struct upb_msgdef *m, + void *existingval, struct upb_msg_fielddef *f); + +struct upb_msg_parser_frame { + struct upb_msgdef *m; + void *msg; }; -#include "upb_text.h" -struct upb_msg_parse_state { - struct upb_parse_state s; +struct upb_msg_parser { + struct upb_stream_parser s; bool merge; bool byref; struct upb_msg *m; - struct upb_msg_parse_frame stack[UPB_MAX_NESTING], *top; - struct upb_text_printer p; + struct upb_msg_parser_frame stack[UPB_MAX_NESTING], *top; + upb_msg_getarray_cb_t getarray_cb; + upb_msg_getstring_cb_t getstring_cb; + upb_msg_getmsg_cb_t getmsg_cb; }; -/* Initializes/frees a message parser. The parser will write the data to the - * message data "data", which the caller must have previously allocated (the - * parser will allocate submsgs, strings, and arrays as needed, however). - * - * "Merge" controls whether the parser will append to data instead of - * overwriting. Merging concatenates arrays and merges submessages instead - * of clearing both. - * - * "Byref" controls whether the new message data copies or references strings - * it encounters. If byref == true, then all strings supplied to upb_msg_parse - * must remain unchanged and must outlive data. */ -void upb_msg_parse_init(struct upb_msg_parse_state *s, void *data, - struct upb_msg *m, bool merge, bool byref); -void upb_msg_parse_reset(struct upb_msg_parse_state *s, void *data, - struct upb_msg *m, bool merge, bool byref); -void upb_msg_parse_free(struct upb_msg_parse_state *s); - -/* Parses a protobuf fragment, writing the data to the message that was passed - * to upb_msg_parse_init. This function can be called multiple times as more - * data becomes available. 
*/ -upb_status_t upb_msg_parse(struct upb_msg_parse_state *s, - void *data, size_t len, size_t *read); +void upb_msg_parser_reset(struct upb_msg_parser *p, + void *msg, struct upb_msgdef *m, + bool byref); + +/* Parses protocol buffer data out of data which has length of len. The data + * need not be a complete protocol buffer. The number of bytes parsed is + * returned in *read, and the next call to upb_msg_parse must supply data that + * is *read bytes past data in the logical stream. */ +upb_status_t upb_msg_parser_parse(struct upb_msg_parser *p, + void *data, size_t len, size_t *read); -/* Parses the protobuf in s (which is expected to be complete) and allocates - * new message data to hold it. This is an alternative to the streaming API - * above. "byref" works as in upb_msg_parse_init(). */ -void *upb_alloc_and_parse(struct upb_msg *m, struct upb_string *s, bool byref); /* Serialization *************************************************************/ @@ -377,8 +319,8 @@ void upb_msgsizes_free(struct upb_msgsizes *sizes); /* Given a previously initialized sizes, recurse over the message and store its * sizes in 'sizes'. */ -void upb_msgsizes_read(struct upb_msgsizes *sizes, void *data, - struct upb_msg *m); +void upb_msgsizes_read(struct upb_msgsizes *sizes, void *msg, + struct upb_msgdef *m); /* Returns the total size of the serialized message given in sizes. Must be * preceeded by a call to upb_msgsizes_read. */ @@ -391,8 +333,8 @@ struct upb_msg_serialize_state; * "sizes" and the parse being fully completed. */ void upb_msg_serialize_alloc(struct upb_msg_serialize_state *s); void upb_msg_serialize_free(struct upb_msg_serialize_state *s); -void upb_msg_serialize_init(struct upb_msg_serialize_state *s, void *data, - struct upb_msg *m, struct upb_msgsizes *sizes); +void upb_msg_serialize_init(struct upb_msg_serialize_state *s, void *msg, + struct upb_msgdef *m, struct upb_msgsizes *sizes); /* Serializes the next set of bytes into buf (which has size len). 
Returns * UPB_STATUS_OK if serialization is complete, or UPB_STATUS_NEED_MORE_DATA @@ -405,8 +347,43 @@ upb_status_t upb_msg_serialize(struct upb_msg_serialize_state *s, /* Text dump *****************************************************************/ -bool upb_msg_eql(void *data1, void *data2, struct upb_msg *m, bool recursive); -void upb_msg_print(void *data, struct upb_msg *m, FILE *stream); +bool upb_msg_eql(void *data1, void *data2, struct upb_msgdef *m, bool recursive); +void upb_msg_print(void *data, struct upb_msgdef *m, bool single_line, + FILE *stream); + +/* Internal functions. ********************************************************/ + +/* Initializes/frees a upb_msgdef. Usually this will be called by upb_context, + * and clients will not have to construct one directly. + * + * Caller retains ownership of d, but the msg will contain references to it, so + * it must outlive the msg. Note that init does not resolve + * upb_msg_fielddef.ref the caller should do that post-initialization by + * calling upb_msg_ref() below. + * + * fqname indicates the fully-qualified name of this message. Ownership of + * fqname passes to the msg, but the msg will contain references to it, so it + * must outlive the msg. + * + * sort indicates whether or not it is safe to reorder the fields from the order + * they appear in d. This should be false if code has been compiled against a + * header for this type that expects the given order. */ +bool upb_msgdef_init(struct upb_msgdef *m, + struct google_protobuf_DescriptorProto *d, + struct upb_string fqname, bool sort); +void upb_msgdef_free(struct upb_msgdef *m); + +/* Sort the given field descriptors in-place, according to what we think is an + * optimal ordering of fields. This can change from upb release to upb + * release. 
*/ +void upb_msgdef_sortfds(google_protobuf_FieldDescriptorProto **fds, size_t num); + +/* Clients use this function on a previously initialized upb_msgdef to resolve + * the "ref" field in the upb_msg_fielddef. Since messages can refer to each + * other in mutually-recursive ways, this step must be separated from + * initialization. */ +void upb_msgdef_ref(struct upb_msgdef *m, struct upb_msg_fielddef *f, + union upb_symbol_ref ref); #ifdef __cplusplus } /* extern "C" */ diff --git a/src/upb_parse.c b/src/upb_parse.c index fd76051..b7f3832 100644 --- a/src/upb_parse.c +++ b/src/upb_parse.c @@ -100,7 +100,7 @@ upb_status_t upb_parse_value(uint8_t *buf, uint8_t *end, upb_field_type_t ft, #undef CASE } -void upb_parse_reset(struct upb_parse_state *state, void *udata) +void upb_stream_parser_reset(struct upb_stream_parser *state, void *udata) { state->top = state->stack; state->limit = &state->stack[UPB_MAX_NESTING]; @@ -111,18 +111,7 @@ void upb_parse_reset(struct upb_parse_state *state, void *udata) state->udata = udata; } -void upb_parse_init(struct upb_parse_state *state, void *udata) -{ - memset(state, 0, sizeof(struct upb_parse_state)); /* Clear all callbacks. */ - upb_parse_reset(state, udata); -} - -void upb_parse_free(struct upb_parse_state *state) -{ - (void)state; -} - -static void *pop_stack_frame(struct upb_parse_state *s, uint8_t *buf) +static void *pop_stack_frame(struct upb_stream_parser *s, uint8_t *buf) { if(s->submsg_end_cb) s->submsg_end_cb(s->udata); s->top--; @@ -130,7 +119,7 @@ static void *pop_stack_frame(struct upb_parse_state *s, uint8_t *buf) } /* Returns the next end offset. 
*/ -static upb_status_t push_stack_frame(struct upb_parse_state *s, +static upb_status_t push_stack_frame(struct upb_stream_parser *s, uint8_t *buf, uint32_t len, void *user_field_desc, uint8_t **submsg_end) { @@ -142,8 +131,8 @@ static upb_status_t push_stack_frame(struct upb_parse_state *s, return UPB_STATUS_OK; } -upb_status_t upb_parse(struct upb_parse_state *s, void *_buf, size_t len, - size_t *read) +upb_status_t upb_stream_parser_parse(struct upb_stream_parser *s, + void *_buf, size_t len, size_t *read) { uint8_t *buf = _buf; uint8_t *completed = buf; diff --git a/src/upb_parse.h b/src/upb_parse.h index f0ec5e2..de4cb2c 100644 --- a/src/upb_parse.h +++ b/src/upb_parse.h @@ -42,14 +42,11 @@ INLINE bool upb_isstringtype(upb_field_type_t type) { * as data becomes available. The parser is fully streaming-capable, so the * data need not all be available at the same time. */ -struct upb_parse_state; +struct upb_stream_parser; -/* Initialize and free (respectively) the given parse state, which must have - * been previously allocated. udata_size specifies how much space will be - * available at parse_stack_frame.user_data in each frame for user data. */ -void upb_parse_init(struct upb_parse_state *state, void *udata); -void upb_parse_reset(struct upb_parse_state *state, void *udata); -void upb_parse_free(struct upb_parse_state *state); +/* Resets the internal state of an already-allocated parser. udata will be + * passed to callbacks as appropriate. */ +void upb_stream_parser_reset(struct upb_stream_parser *p, void *udata); /* The callback that is called immediately after a tag has been parsed. The * client should determine whether it wants to parse or skip the corresponding @@ -86,7 +83,7 @@ typedef void (*upb_submsg_start_cb)(void *udata, void *user_field_desc); typedef void (*upb_submsg_end_cb)(void *udata); -struct upb_parse_state { +struct upb_stream_parser { /* For delimited submsgs, counts from the submsg len down to zero. 
* For group submsgs, counts from zero down to the negative len. */ uint32_t stack[UPB_MAX_NESTING], *top, *limit; @@ -115,8 +112,8 @@ struct upb_parse_state { * * TODO: see if we can provide the following guarantee efficiently: * *read will always be >= len. */ -upb_status_t upb_parse(struct upb_parse_state *s, void *buf, size_t len, - size_t *read); +upb_status_t upb_stream_parser_parse(struct upb_stream_parser *p, + void *buf, size_t len, size_t *read); extern upb_wire_type_t upb_expected_wire_types[]; /* Returns true if wt is the correct on-the-wire type for ft. */ diff --git a/src/upb_table.c b/src/upb_table.c index 036d175..314594a 100644 --- a/src/upb_table.c +++ b/src/upb_table.c @@ -13,7 +13,7 @@ static const upb_inttable_key_t EMPTYENT = 0; static const double MAX_LOAD = 0.85; -static uint32_t MurmurHash2(const void *key, size_t len, uint32_t seed); +uint32_t MurmurHash2(const void *key, size_t len, uint32_t seed); /* We use 1-based indexes into the table so that 0 can be "NULL". */ static struct upb_inttable_entry *intent(struct upb_inttable *t, int32_t i) { @@ -238,7 +238,7 @@ void *upb_strtable_next(struct upb_strtable *t, struct upb_strtable_entry *cur) // 1. It will not work incrementally. // 2. It will not produce the same results on little-endian and big-endian // machines. -static uint32_t MurmurHash2(const void *key, size_t len, uint32_t seed) +uint32_t MurmurHash2(const void *key, size_t len, uint32_t seed) { // 'm' and 'r' are mixing constants generated offline. // They're not really 'magic', they just happen to work well. -- cgit v1.2.3