From 5235966ed5f369969c6ba0a558453ff22097a722 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Mon, 6 Jul 2009 13:34:40 -0700 Subject: Lots of documentation, cleanup, and fixed memory leaks. --- Makefile | 2 +- upb.h | 81 ++++++-------- upb_context.c | 17 +-- upb_context.h | 20 ++-- upb_msg.c | 102 +++++++++++++----- upb_msg.h | 339 +++++++++++++++++++++++++++++++++------------------------- upb_parse.h | 27 +++++ upbc.c | 5 +- 8 files changed, 352 insertions(+), 241 deletions(-) diff --git a/Makefile b/Makefile index 3e60c72..a06ff0c 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ CC=gcc CXX=g++ CFLAGS=-std=c99 -CPPFLAGS=-O0 -Wall -Wextra -pedantic -g -DUPB_UNALIGNED_READS_OK -fomit-frame-pointer +CPPFLAGS=-Wall -Wextra -pedantic -g -DUPB_UNALIGNED_READS_OK -fomit-frame-pointer OBJ=upb_parse.o upb_table.o upb_msg.o upb_enum.o upb_context.o descriptor.o all: $(OBJ) test_table tests upbc clean: diff --git a/upb.h b/upb.h index b7f1f16..eecdba0 100644 --- a/upb.h +++ b/upb.h @@ -1,7 +1,9 @@ /* * upb - a minimalist implementation of protocol buffers. - * + * Copyright (c) 2009 Joshua Haberman. See LICENSE for details. + * + * This file contains shared definitions that are widely used across upb. */ #ifndef UPB_H_ @@ -36,29 +38,14 @@ extern "C" { /* The maximum number of fields that any one .proto type can have. */ #define UPB_MAX_FIELDS (1<<16) +/* Nested type names are separated by periods. */ +#define UPB_SYMBOL_SEPARATOR '.' +#define UPB_SYMBOL_MAX_LENGTH 256 + #define UPB_INDEX(base, i, m) (void*)((char*)(base) + ((i)*(m))) INLINE uint32_t max(uint32_t a, uint32_t b) { return a > b ? a : b; } -/* A list of types as they are encoded on-the-wire. */ -enum upb_wire_type { - UPB_WIRE_TYPE_VARINT = 0, - UPB_WIRE_TYPE_64BIT = 1, - UPB_WIRE_TYPE_DELIMITED = 2, - UPB_WIRE_TYPE_START_GROUP = 3, - UPB_WIRE_TYPE_END_GROUP = 4, - UPB_WIRE_TYPE_32BIT = 5 -}; -typedef uint8_t upb_wire_type_t; - -/* A value as it is encoded on-the-wire, except delimited, which is handled - * separately. */ -union upb_wire_value { - uint64_t varint; - uint64_t _64bit; - uint32_t _32bit; -}; - /* Value type as defined in a .proto file. The values of this are defined by * google_protobuf_FieldDescriptorProto_Type (from descriptor.proto). * Note that descriptor.proto reserves "0" for errors, and we use it to @@ -76,48 +63,36 @@ struct upb_type_info { uint8_t expected_wire_type; }; -/* This array is indexed by upb_field_type_t. */ +/* Contains information for all .proto types. Indexed by upb_field_type_t. */ extern struct upb_type_info upb_type_info[]; -/* A scalar value as described in a .proto file */ +/* A pointer to a .proto value. The owner must have an out-of-band way of + * knowing the type, so it knows which union member to use. */ union upb_value { - double _double; - float _float; - int32_t int32; - int64_t int64; + double _double; + float _float; + int32_t int32; + int64_t int64; uint32_t uint32; uint64_t uint64; - bool _bool; + bool _bool; + struct upb_string **string; + struct upb_array **array; + void *message; }; union upb_value_ptr { - double *_double; - float *_float; - int32_t *int32; - int64_t *int64; + double *_double; + float *_float; + int32_t *int32; + int64_t *int64; uint32_t *uint32; uint64_t *uint64; - bool *_bool; + bool *_bool; struct upb_string **string; struct upb_array **array; - void **message; - void *_void; -}; - -/* The number of a field, eg. "optional string foo = 3". */ -typedef int32_t upb_field_number_t; - -/* A tag occurs before each value on-the-wire. */ -struct upb_tag { - upb_field_number_t field_number; - upb_wire_type_t wire_type; -}; - -enum upb_symbol_type { - UPB_SYM_MESSAGE, - UPB_SYM_ENUM, - UPB_SYM_SERVICE, - UPB_SYM_EXTENSION + void **message; + void *_void; }; union upb_symbol_ref { @@ -126,7 +101,11 @@ union upb_symbol_ref { struct upb_svc *svc; }; -/* Status codes used as a return value. */ +/* The number of a field, eg. "optional string foo = 3". */ +typedef int32_t upb_field_number_t; + +/* Status codes used as a return value. Codes >0 are not fatal and can be + * resumed. */ typedef enum upb_status { UPB_STATUS_OK = 0, diff --git a/upb_context.c b/upb_context.c index fc558be..e92f9e3 100644 --- a/upb_context.c +++ b/upb_context.c @@ -58,8 +58,9 @@ static void free_symtab(struct upb_strtable *t) void upb_context_free(struct upb_context *c) { free_symtab(&c->symtab); + for(size_t i = 0; i < c->fds_len; i++) + upb_msgdata_free(c->fds[i], c->fds_msg, true); free_symtab(&c->psymtab); - for(size_t i = 0; i < c->fds_len; i++) free(c->fds[i]); free(c->fds); } @@ -73,23 +74,23 @@ static struct upb_symtab_entry *resolve(struct upb_strtable *t, struct upb_string *base, struct upb_string *symbol) { - if(base->byte_len + symbol->byte_len + 1 >= UPB_SYM_MAX_LENGTH || + if(base->byte_len + symbol->byte_len + 1 >= UPB_SYMBOL_MAX_LENGTH || symbol->byte_len == 0) return NULL; - if(symbol->ptr[0] == UPB_CONTEXT_SEPARATOR) { + if(symbol->ptr[0] == UPB_SYMBOL_SEPARATOR) { /* Symbols starting with '.' are absolute, so we do a single lookup. */ struct upb_string sym_str = {.ptr = symbol->ptr+1, .byte_len = symbol->byte_len-1}; return upb_strtable_lookup(t, &sym_str); } else { /* Remove components from base until we find an entry or run out. */ - char sym[UPB_SYM_MAX_LENGTH+1]; + char sym[UPB_SYMBOL_MAX_LENGTH+1]; struct upb_string sym_str = {.ptr = sym}; int baselen = base->byte_len; while(1) { - /* sym_str = base[0...base_len] + UPB_CONTEXT_SEPARATOR + symbol */ + /* sym_str = base[0...base_len] + UPB_SYMBOL_SEPARATOR + symbol */ memcpy(sym, base->ptr, baselen); - sym[baselen] = UPB_CONTEXT_SEPARATOR; + sym[baselen] = UPB_SYMBOL_SEPARATOR; memcpy(sym + baselen + 1, symbol->ptr, symbol->byte_len); sym_str.byte_len = baselen + symbol->byte_len + 1; @@ -97,7 +98,7 @@ static struct upb_symtab_entry *resolve(struct upb_strtable *t, if (e) return e; else if(baselen == 0) return NULL; /* No more scopes to try. */ - baselen = memrchr(base->ptr, UPB_CONTEXT_SEPARATOR, baselen); + baselen = memrchr(base->ptr, UPB_SYMBOL_SEPARATOR, baselen); } } } @@ -130,7 +131,7 @@ static struct upb_string join(struct upb_string *base, struct upb_string *name) if(base->byte_len > 0) { /* nested_base = base + '.' + d->name */ memcpy(joined.ptr, base->ptr, base->byte_len); - joined.ptr[base->byte_len] = UPB_CONTEXT_SEPARATOR; + joined.ptr[base->byte_len] = UPB_SYMBOL_SEPARATOR; memcpy(&joined.ptr[base->byte_len+1], name->ptr, name->byte_len); } else { memcpy(joined.ptr, name->ptr, name->byte_len); diff --git a/upb_context.h b/upb_context.h index 51afed1..ffff214 100644 --- a/upb_context.h +++ b/upb_context.h @@ -2,8 +2,7 @@ * upb - a minimalist implementation of protocol buffers. * * A context represents a namespace of proto definitions, sort of like an - * interpreter's symbol table. It is empty when first constructed, with the - * exception of built-in types (those defined in descriptor.proto). Clients + * interpreter's symbol table. It is empty when first constructed. Clients * add definitions to the context by supplying unserialized or serialized * descriptors (as defined in descriptor.proto). * @@ -22,6 +21,16 @@ struct google_protobuf_FileDescriptorProto; extern "C" { #endif +/* Definitions. ***************************************************************/ + +/* The symbol table maps names to various kinds of symbols. */ +enum upb_symbol_type { + UPB_SYM_MESSAGE, + UPB_SYM_ENUM, + UPB_SYM_SERVICE, + UPB_SYM_EXTENSION +}; + struct upb_symtab_entry { struct upb_strtable_entry e; enum upb_symbol_type type; @@ -39,17 +48,12 @@ struct upb_context { struct google_protobuf_FileDescriptorSet **fds; }; -/* Initializes and frees a upb_context, respectively. Newly initialized - * contexts will always have the types in descriptor.proto defined. */ +/* Initializes and frees a upb_context, respectively. */ bool upb_context_init(struct upb_context *c); void upb_context_free(struct upb_context *c); /* Looking up symbols. ********************************************************/ -/* Nested type names are separated by periods. */ -#define UPB_CONTEXT_SEPARATOR '.' -#define UPB_SYM_MAX_LENGTH 256 - /* Resolves the given symbol using the rules described in descriptor.proto, * namely: * diff --git a/upb_msg.c b/upb_msg.c index 65df800..48804e7 100644 --- a/upb_msg.c +++ b/upb_msg.c @@ -9,6 +9,7 @@ #include "upb_msg.h" #include "upb_parse.h" +/* Rounds p up to the next multiple of t. */ #define ALIGN_UP(p, t) (p % t == 0 ? p : p + (t - (p % t))) static int div_round_up(int numerator, int denominator) { @@ -55,6 +56,7 @@ bool upb_msg_init(struct upb_msg *m, struct google_protobuf_DescriptorProto *d) /* We count on the caller to keep this pointer alive. */ m->field_descriptors[i] = d->field->elements[i]; } + /* TODO: re-enable proper sorting once the compiler is sorted out. */ //qsort(m->field_descriptors, m->num_fields, sizeof(void*), compare_fields); size_t max_align = 0; @@ -77,7 +79,7 @@ bool upb_msg_init(struct upb_msg *m, struct google_protobuf_DescriptorProto *d) /* Insert into the tables. Note that f->ref will be uninitialized, even in * the tables' copies of *f, which is why we must update them separately - * when the references are resolved. */ + * in upb_msg_ref() below. */ struct upb_fieldsbynum_entry nument = {.e = {.key = fd->number}, .f = *f}; struct upb_fieldsbyname_entry strent = {.e = {.key = *fd->name}, .f = *f}; upb_inttable_insert(&m->fields_by_num, &nument.e); @@ -96,15 +98,6 @@ void upb_msg_free(struct upb_msg *m) free(m->field_descriptors); } -void *upb_msg_new(struct upb_msg *m) -{ - void *msg = malloc(m->size); - memset(msg, 0, m->size); /* Clear all pointers, values, and set bits. */ - return msg; -} - -//void upb_msg_free(void *msg, struct upb_msg *m, bool free_submsgs); - void upb_msg_ref(struct upb_msg *m, struct upb_msg_field *f, union upb_symbol_ref ref) { struct google_protobuf_FieldDescriptorProto *d = @@ -119,23 +112,35 @@ void upb_msg_ref(struct upb_msg *m, struct upb_msg_field *f, str_e->f.ref = ref; } +/* Memory management *********************************************************/ + +/* Our memory management scheme is as follows: + * + * All pointers to dynamic memory (strings, arrays, and submessages) are + * expected to be good pointers if they are non-zero, *regardless* of whether + * that field's bit is set! That way we can reuse the memory even if the field + * is unset and then set later. */ + +/* For our memory-managed strings and arrays we store extra information + * (compared to a plain upb_string or upb_array). But the data starts with + * a upb_string and upb_array, so we can overlay onto the regular types. */ struct mm_upb_string { struct upb_string s; + /* Track the allocated size, so we know when we need to reallocate. */ uint32_t size; + /* Our allocated data. Stored separately so that clients can point s.ptr to + * a referenced string, but we can reuse this data later. */ char *data; }; struct mm_upb_array { struct upb_array a; + /* Track the allocated size, so we know when we need to reallocate. */ uint32_t size; - char *data; }; static uint32_t round_up_to_pow2(uint32_t v) { -#if 0 // __GNUC__ - return (1U<<31) >> (__builtin_clz(v-1)+1); -#else /* cf. http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */ v--; v |= v >> 1; @@ -145,7 +150,54 @@ static uint32_t round_up_to_pow2(uint32_t v) v |= v >> 16; v++; return v; -#endif +} + +void *upb_msgdata_new(struct upb_msg *m) +{ + void *msg = malloc(m->size); + memset(msg, 0, m->size); /* Clear all pointers, values, and set bits. */ + return msg; +} + +static void free_value(union upb_value_ptr p, struct upb_msg_field *f, + bool free_submsgs) +{ + switch(f->type) { + case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_STRING: + case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_BYTES: { + struct mm_upb_string *mm_str = (void*)*p.string; + if(mm_str) { + free(mm_str->data); + free(mm_str); + } + break; + } + case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE: + if(free_submsgs) upb_msgdata_free(*p.message, f->ref.msg, free_submsgs); + break; + default: break; /* For non-dynamic types, do nothing. */ + } +} + +void upb_msgdata_free(void *data, struct upb_msg *m, bool free_submsgs) +{ + if(!data) return; /* A very free-like thing to do. */ + for(unsigned int i = 0; i < m->num_fields; i++) { + struct upb_msg_field *f = &m->fields[i]; + union upb_value_ptr p = upb_msg_getptr(data, f); + if(f->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REPEATED) { + if(*p.array) { + for(uint32_t j = 0; j < (*p.array)->len; j++) + free_value(upb_array_getelementptr(*p.array, j, f->type), + f, free_submsgs); + free((*p.array)->elements._void); + free(*p.array); + } + } else { + free_value(p, f, free_submsgs); + } + } + free(data); } void upb_msg_reuse_str(struct upb_string **str, uint32_t size) @@ -185,11 +237,11 @@ void upb_msg_reuse_strref(struct upb_string **str) { upb_msg_reuse_str(str, 0); void upb_msg_reuse_submsg(void **msg, struct upb_msg *m) { - if(!*msg) *msg = upb_msg_new(m); + if(!*msg) *msg = upb_msgdata_new(m); else upb_msg_clear(*msg, m); /* Clears set bits, leaves pointers. */ } -/* Parser. */ +/* Serialization/Deserialization. ********************************************/ struct parse_frame_data { struct upb_msg *m; @@ -217,7 +269,7 @@ static upb_field_type_t tag_cb(struct upb_parse_state *s, struct upb_tag *tag, static union upb_value_ptr get_value_ptr(void *data, struct upb_msg_field *f) { - union upb_value_ptr p = upb_msg_get_ptr(data, f); + union upb_value_ptr p = upb_msg_getptr(data, f); if(f->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REPEATED) { size_t len = upb_msg_is_set(data, f) ? (*p.array)->len : 0; upb_msg_reuse_array(p.array, len+1, f->type); @@ -270,25 +322,18 @@ static void submsg_start_cb(struct upb_parse_state *_s, void *user_field_desc) if(!s->merge) upb_msg_clear(frame->data, f->ref.msg); } -static void submsg_end_cb(struct upb_parse_state *s) -{ - struct parse_frame_data *frame = (void*)&s->top->user_data; -} - - void upb_msg_parse_init(struct upb_msg_parse_state *s, void *msg, struct upb_msg *m, bool merge, bool byref) { upb_parse_init(&s->s, sizeof(struct parse_frame_data)); s->merge = merge; s->byref = byref; - if(!merge && msg == NULL) msg = upb_msg_new(m); + if(!merge && msg == NULL) msg = upb_msgdata_new(m); set_frame_data(&s->s, m, msg); s->s.tag_cb = tag_cb; s->s.value_cb = value_cb; s->s.str_cb = str_cb; s->s.submsg_start_cb = submsg_start_cb; - s->s.submsg_end_cb = submsg_end_cb; } void upb_msg_parse_free(struct upb_msg_parse_state *s) @@ -305,10 +350,11 @@ upb_status_t upb_msg_parse(struct upb_msg_parse_state *s, void *upb_alloc_and_parse(struct upb_msg *m, struct upb_string *str, bool byref) { struct upb_msg_parse_state s; - void *msg = upb_msg_new(m); + void *msg = upb_msgdata_new(m); upb_msg_parse_init(&s, msg, m, false, byref); size_t read; upb_status_t status = upb_msg_parse(&s, str->ptr, str->byte_len, &read); + upb_msg_parse_free(&s); if(status == UPB_STATUS_OK && read == str->byte_len) { return msg; } else { @@ -370,7 +416,7 @@ void upb_msg_print(void *data, struct upb_msg *m, FILE *stream) if(upb_msg_is_set(data, f)) fputs(" (set): ", stream); else fputs(" (NOT set): ", stream); - union upb_value_ptr p = upb_msg_get_ptr(data, f); + union upb_value_ptr p = upb_msg_getptr(data, f); if(f->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REPEATED) { if(*p.array) { fputc('[', stream); diff --git a/upb_msg.h b/upb_msg.h index ce32783..a3f8c92 100644 --- a/upb_msg.h +++ b/upb_msg.h @@ -3,46 +3,100 @@ * * Copyright (c) 2009 Joshua Haberman. See LICENSE for details. * - * upb_msg contains a full description of a message as defined in a .proto file. - * It supports many features and operations for dealing with proto messages: + * A upb_msg provides a full description of a message as defined in a .proto + * file. It supports many features and operations for dealing with proto + * messages: * - reflection over .proto types at runtime (list fields, get names, etc). * - an in-memory byte-level format for efficiently storing and accessing msgs. * - serializing and deserializing from the in-memory format to a protobuf. * - optional memory management for handling strings, arrays, and submessages. * + * Throughout this file, the following convention is used: + * - "struct upb_msg *m" describes a message type (name, list of fields, etc). + * - "void *data" is an actual message stored using the in-memory format. + * * The in-memory format is very much like a C struct that you can define at * run-time, but also supports reflection. Like C structs it supports * offset-based access, as opposed to the much slower name-based lookup. The - * format represents both the values themselves and bits describing whether each - * field is set or not. + * format stores both the values themselves and bits describing whether each + * field is set or not. For example: + * + * parsed message Foo { + * optional bool a = 1; + * repeated uint32 b = 2; + * optional Bar c = 3; + * } + * + * The in-memory layout for this message on a 32-bit machine will be something + * like: + * + * Foo + * +------------------------+ + * | set_flags a:1, b:1, c:1| + * +------------------------+ + * | bool a (1 byte) | + * +------------------------+ + * | padding (3 bytes) | + * +------------------------+ upb_array + * | upb_array* b (4 bytes) | ----> +----------------------------+ + * +------------------------+ | uint32* elements (4 bytes) | ---+ + * | Bar* c (4 bytes) | +----------------------------+ | + * +------------------------+ | uint32 size (4 bytes) | | + * +----------------------------+ | + * | + * -----------------------------------------------------------------+ + * | + * V + * uint32 array + * +----+----+----+----+----+----+ + * | e1 | e2 | e3 | e4 | e5 | e6 | + * +----+----+----+----+----+----+ + * + * And the corresponding C structure (as emitted by the proto compiler) would be: * - * The upb compiler emits C structs that mimic this definition exactly, so that - * you can access the same hunk of memory using either this run-time - * reflection-supporting interface or a C struct that was generated by the upb - * compiler. + * struct Foo { + * union { + * uint8_t bytes[1]; + * struct { + * bool a:1; + * bool b:1; + * bool c:1; + * } has; + * } set_flags; + * bool a; + * upb_uint32_array *b; + * Bar *c; + * } * - * Like C structs the format depends on the endianness of the host machine, so - * it is not suitable for exchanging across machines of differing endianness. - * But there is no reason to do that -- the protobuf serialization format is - * designed already for serialization/deserialization, and is more compact than - * this format. This format is designed to allow the fastest possible random - * access of individual fields. + * Because the C struct emitted by the upb compiler uses exactly the same + * byte-level format as the reflection interface, you can access the same hunk + * of memory either way. The C struct provides maximum performance and static + * type safety; upb_msg provides flexibility. * - * Note that clients need not use the memory management facilities defined here. - * They are for convenience only -- clients wishing to do their own memory - * management may do so (allowing clients to perform advanced techniques like - * reference-counting, garbage collection, and string references). Different + * The in-memory format has no interoperability guarantees whatsoever, except + * that a single version of upb will interoperate with itself. Don't even + * think about persisting the in-memory format or sending it anywhere. That's + * what serialized protobufs are for! The in-memory format is just that -- an + * in-memory representation that allows for fast access. + * + * The in-memory format is carefully designed to *not* mandate any particular + * memory management scheme. This should make it easier to integrate with + * existing memory management schemes, or to perform advanced techniques like + * reference counting, garbage collection, and string references. Different * clients can read each others messages regardless of what memory management * scheme each is using. + * + * A memory management scheme is provided for convenience, and it is used by + * default by the stock message parser. Clients can substitute their own + * memory management scheme into this parser without any loss of generality + * or performance. */ #ifndef UPB_MSG_H_ #define UPB_MSG_H_ #include -#include #include -#include #include "upb.h" #include "upb_table.h" @@ -59,7 +113,10 @@ struct google_protobuf_FieldDescriptorProto; /* Message definition. ********************************************************/ /* Structure that describes a single field in a message. This structure is very - * consciously designed to fit into 12/16 bytes (32/64 bit, respectively). */ + * consciously designed to fit into 12/16 bytes (32/64 bit, respectively), + * because copies of this struct are in the hash table that is read in the + * critical path of parsing. Minimizing the size of this struct increases + * cache-friendliness. */ struct upb_msg_field { union upb_symbol_ref ref; uint32_t byte_offset; /* Where to find the data. */ @@ -102,7 +159,7 @@ INLINE struct google_protobuf_FieldDescriptorProto *upb_msg_field_descriptor( return m->field_descriptors[f->field_index]; } -/* Initialize and free a upb_msg. Caller retains ownership of d, but the msg +/* Initializes/frees a upb_msg. Caller retains ownership of d, but the msg * will contain references to it, so it must outlive the msg. Note that init * does not resolve upb_msg_field.ref -- the caller should do that * post-initialization by calling upb_msg_ref() below. */ @@ -114,9 +171,9 @@ void upb_msg_free(struct upb_msg *m); * mutually-recursive ways, this step must be separated from initialization. */ void upb_msg_ref(struct upb_msg *m, struct upb_msg_field *f, union upb_symbol_ref ref); -/* While these are written to be as fast as possible, it will still be faster - * to cache the results of this lookup if possible. These return NULL if no - * such field is found. */ +/* Looks up a field by name or number. While these are written to be as fast + * as possible, it will still be faster to cache the results of this lookup if + * possible. These return NULL if no such field is found. */ INLINE struct upb_msg_field *upb_msg_fieldbynum(struct upb_msg *m, uint32_t number) { struct upb_fieldsbynum_entry *e = upb_inttable_lookup( @@ -130,33 +187,69 @@ INLINE struct upb_msg_field *upb_msg_fieldbyname(struct upb_msg *m, return e ? &e->f : NULL; } +/* "Set" flag reading and writing. *******************************************/ + +INLINE size_t upb_isset_offset(uint32_t field_index) { + return field_index / 8; +} + +INLINE uint8_t upb_isset_mask(uint32_t field_index) { + return 1 << (field_index % 8); +} + +/* Functions for reading and writing the "set" flags in the msg. Note that + * these do not perform memory management associated with any dynamic memory + * these fields may be referencing. These *only* set and test the flags. */ +INLINE void upb_msg_set(void *s, struct upb_msg_field *f) +{ + ((char*)s)[upb_isset_offset(f->field_index)] |= upb_isset_mask(f->field_index); +} + +INLINE void upb_msg_unset(void *s, struct upb_msg_field *f) +{ + ((char*)s)[upb_isset_offset(f->field_index)] &= ~upb_isset_mask(f->field_index); +} + +INLINE bool upb_msg_is_set(void *s, struct upb_msg_field *f) +{ + return ((char*)s)[upb_isset_offset(f->field_index)] & upb_isset_mask(f->field_index); +} + +INLINE bool upb_msg_all_required_fields_set(void *s, struct upb_msg *m) +{ + int num_fields = m->num_required_fields; + int i = 0; + while(num_fields > 8) { + if(((uint8_t*)s)[i++] != 0xFF) return false; + num_fields -= 8; + } + if(((uint8_t*)s)[i] != (1 << num_fields) - 1) return false; + return true; +} + +INLINE void upb_msg_clear(void *s, struct upb_msg *m) +{ + memset(s, 0, m->set_flags_bytes); +} + +/* Scalar (non-array) data access. ********************************************/ + +/* Returns a pointer to a specific field in a message. */ +INLINE union upb_value_ptr upb_msg_getptr(void *data, struct upb_msg_field *f) { + union upb_value_ptr p = {._void = ((char*)data + f->byte_offset)}; + return p; +} + /* Arrays. ********************************************************************/ /* Represents an array (a repeated field) of any type. The interpretation of * the data in the array depends on the type. */ struct upb_array { - union { - double *_double; - float *_float; - int32_t *int32; - int64_t *int64; - uint32_t *uint32; - uint64_t *uint64; - bool *_bool; - struct upb_string **string; - void **submsg; - void *_void; - } elements; + union upb_value_ptr elements; uint32_t len; /* Measured in elements. */ }; -/* These are all overlays on upb_array, pointers between them can be cast. */ -#define UPB_DEFINE_ARRAY_TYPE(name, type) \ - struct name ## _array { \ - type *elements; \ - uint32_t len; \ - }; - +/* Returns a pointer to an array element. */ INLINE union upb_value_ptr upb_array_getelementptr( struct upb_array *arr, uint32_t n, upb_field_type_t type) { @@ -166,6 +259,13 @@ INLINE union upb_value_ptr upb_array_getelementptr( return ptr; } +/* These are all overlays on upb_array, pointers between them can be cast. */ +#define UPB_DEFINE_ARRAY_TYPE(name, type) \ + struct name ## _array { \ + type *elements; \ + uint32_t len; \ + }; + UPB_DEFINE_ARRAY_TYPE(upb_double, double) UPB_DEFINE_ARRAY_TYPE(upb_float, float) UPB_DEFINE_ARRAY_TYPE(upb_int32, int32_t) @@ -175,6 +275,7 @@ UPB_DEFINE_ARRAY_TYPE(upb_uint64, uint64_t) UPB_DEFINE_ARRAY_TYPE(upb_bool, bool) UPB_DEFINE_ARRAY_TYPE(upb_string, struct upb_string*) +/* Defines an array of a specific message type. */ #define UPB_MSG_ARRAY(msg_type) struct msg_type ## _array #define UPB_DEFINE_MSG_ARRAY(msg_type) \ UPB_MSG_ARRAY(msg_type) { \ @@ -182,52 +283,42 @@ UPB_DEFINE_ARRAY_TYPE(upb_string, struct upb_string*) uint32_t len; \ }; -/* Accessors for primitive types. ********************************************/ +/* Memory management *********************************************************/ -/* For each primitive type we define a set of three functions: - * - * // For fetching out of a msg (s points to the raw msg data). - * int32_t *upb_msg_get_int32_ptr(void *s, struct upb_msg_field *f); - * int32_t upb_msg_get_int32(void *s, struct upb_msg_field *f); - * void upb_msg_set_int32(void *s, struct upb_msg_field *f, int32_t val); - * - * These do no existence checks, bounds checks, or type checks. */ - -#define UPB_DEFINE_ACCESSORS(INLINE, name, ctype) \ - INLINE ctype *upb_msg_get_ ## name ## _ptr( \ - void *s, struct upb_msg_field *f) { \ - return (ctype*)((char*)s + f->byte_offset); \ - } \ - INLINE ctype upb_msg_get_ ## name( \ - void *s, struct upb_msg_field *f) { \ - return *upb_msg_get_ ## name ## _ptr(s, f); \ - } \ - INLINE void upb_msg_set_ ## name( \ - void *s, struct upb_msg_field *f, ctype val) { \ - *upb_msg_get_ ## name ## _ptr(s, f) = val; \ - } +/* One important note about these memory management routines: they must be used + * completely or not at all (for each message). In other words, you can't + * allocate your own message and then free it with upb_msgdata_free. As + * another example, you can't point a field to your own string and then call + * upb_msg_reuse_str. */ + +/* Allocates and frees message data, respectively. Newly allocated data is + * initialized to empty. Freeing a message always frees string data, but + * the client can decide whether or not submessages should be deleted. */ +void *upb_msgdata_new(struct upb_msg *m); +void upb_msgdata_free(void *data, struct upb_msg *m, bool free_submsgs); + +/* Given a pointer to the appropriate field of the message or array, these + * functions will lazily allocate memory for a string, array, or submessage. + * If the previously allocated memory is big enough, it will reuse it without + * re-allocating. See upb_msg.c for example usage. */ + +/* Reuse a string of at least the given size. */ +void upb_msg_reuse_str(struct upb_string **str, uint32_t size); +/* Like the previous, but assumes that the string will be by reference, so + * doesn't allocate memory for the string itself. */ +void upb_msg_reuse_strref(struct upb_string **str); -UPB_DEFINE_ACCESSORS(INLINE, double, double) -UPB_DEFINE_ACCESSORS(INLINE, float, float) -UPB_DEFINE_ACCESSORS(INLINE, int32, int32_t) -UPB_DEFINE_ACCESSORS(INLINE, int64, int64_t) -UPB_DEFINE_ACCESSORS(INLINE, uint32, uint32_t) -UPB_DEFINE_ACCESSORS(INLINE, uint64, uint64_t) -UPB_DEFINE_ACCESSORS(INLINE, bool, bool) -UPB_DEFINE_ACCESSORS(INLINE, bytes, struct upb_string*) -UPB_DEFINE_ACCESSORS(INLINE, string, struct upb_string*) -UPB_DEFINE_ACCESSORS(INLINE, submsg, void*) -UPB_DEFINE_ACCESSORS(INLINE, array, struct upb_array*) - -INLINE union upb_value_ptr upb_msg_get_ptr( - void *data, struct upb_msg_field *f) { - union upb_value_ptr p = {._void = ((char*)data + f->byte_offset)}; - return p; -} +/* Reuse an array of at least the given size, with the given type. */ +void upb_msg_reuse_array(struct upb_array **arr, uint32_t size, + upb_field_type_t t); -/* Memory management *********************************************************/ +/* Reuse a submessage of the given type. */ +void upb_msg_reuse_submsg(void **msg, struct upb_msg *m); -void *upb_msg_new(struct upb_msg *m); +/* Serialization/Deserialization. ********************************************/ + +/* This is all just a layer on top of the stream-oriented facility in + * upb_parse.h. */ struct upb_msg_parse_state { struct upb_parse_state s; @@ -236,70 +327,32 @@ struct upb_msg_parse_state { struct upb_msg *m; }; -void upb_msg_parse_init(struct upb_msg_parse_state *s, void *msg, +/* Initializes/frees a message parser. The parser will write the data to the + * message data "data", which the caller must have previously allocated (the + * parser will allocate submsgs, strings, and arrays as needed, however). + * + * "Merge" controls whether the parser will append to data instead of + * overwriting. Merging concatenates arrays and merges submessages instead + * of clearing both. + * + * "Byref" controls whether the new message data copies or references strings + * it encounters. If byref == true, then all strings supplied to upb_msg_parse + * must remain unchanged and must outlive data. */ +void upb_msg_parse_init(struct upb_msg_parse_state *s, void *data, struct upb_msg *m, bool merge, bool byref); void upb_msg_parse_free(struct upb_msg_parse_state *s); + +/* Parses a protobuf fragment, writing the data to the message that was passed + * to upb_msg_parse_init. This function can be called multiple times as more + * data becomes available. */ upb_status_t upb_msg_parse(struct upb_msg_parse_state *s, void *data, size_t len, size_t *read); +/* Parses the protobuf in s (which is expected to be complete) and allocates + * new message data to hold it. This is an alternative to the streaming API + * above. "byref" works as in upb_msg_parse_init(). */ void *upb_alloc_and_parse(struct upb_msg *m, struct upb_string *s, bool byref); -/* Note! These two may not be use on a upb_string* that was initialized by - * means other than these functions. */ -void upb_msg_reuse_str(struct upb_string **str, uint32_t len); -void upb_msg_reuse_array(struct upb_array **arr, uint32_t n, upb_field_type_t t); -void upb_msg_reuse_strref(struct upb_string **str); -void upb_msg_reuse_submsg(void **msg, struct upb_msg *m); - -/* "Set" flag reading and writing. *******************************************/ - -INLINE size_t upb_isset_offset(uint32_t field_index) { - return field_index / 8; -} - -INLINE uint8_t upb_isset_mask(uint32_t field_index) { - return 1 << (field_index % 8); -} - -/* Functions for reading and writing the "set" flags in the msg. Note that - * these do not perform memory management associated with any dynamic memory - * these fields may be referencing. These *only* set and test the flags. */ -INLINE void upb_msg_set(void *s, struct upb_msg_field *f) -{ - ((char*)s)[upb_isset_offset(f->field_index)] |= upb_isset_mask(f->field_index); -} - -INLINE void upb_msg_unset(void *s, struct upb_msg_field *f) -{ - ((char*)s)[upb_isset_offset(f->field_index)] &= ~upb_isset_mask(f->field_index); -} - -INLINE bool upb_msg_is_set(void *s, struct upb_msg_field *f) -{ - return ((char*)s)[upb_isset_offset(f->field_index)] & upb_isset_mask(f->field_index); -} - -INLINE bool upb_msg_all_required_fields_set(void *s, struct upb_msg *m) -{ - int num_fields = m->num_required_fields; - int i = 0; - while(num_fields > 8) { - if(((uint8_t*)s)[i++] != 0xFF) return false; - num_fields -= 8; - } - if(((uint8_t*)s)[i] != (1 << num_fields) - 1) return false; - return true; -} - -INLINE void upb_msg_clear(void *s, struct upb_msg *m) -{ - memset(s, 0, m->set_flags_bytes); -} - -/* Serialization/Deserialization. ********************************************/ - -/* Parses the string data in s according to the message description in m. */ -upb_status_t upb_msg_merge(void *data, struct upb_msg *m, struct upb_string *s); /* Text dump *****************************************************************/ diff --git a/upb_parse.h b/upb_parse.h index 182cb9e..c5640f1 100644 --- a/upb_parse.h +++ b/upb_parse.h @@ -18,6 +18,33 @@ extern "C" { #endif +/* Definitions. ***************************************************************/ + +/* A list of types as they are encoded on-the-wire. */ +enum upb_wire_type { + UPB_WIRE_TYPE_VARINT = 0, + UPB_WIRE_TYPE_64BIT = 1, + UPB_WIRE_TYPE_DELIMITED = 2, + UPB_WIRE_TYPE_START_GROUP = 3, + UPB_WIRE_TYPE_END_GROUP = 4, + UPB_WIRE_TYPE_32BIT = 5 +}; +typedef uint8_t upb_wire_type_t; + +/* A value as it is encoded on-the-wire, except delimited, which is handled + * separately. */ +union upb_wire_value { + uint64_t varint; + uint64_t _64bit; + uint32_t _32bit; +}; + +/* A tag occurs before each value on-the-wire. */ +struct upb_tag { + upb_field_number_t field_number; + upb_wire_type_t wire_type; +}; + /* High-level parsing interface. **********************************************/ struct upb_parse_state; diff --git a/upbc.c b/upbc.c index 7b649af..4177576 100644 --- a/upbc.c +++ b/upbc.c @@ -65,7 +65,7 @@ static void write_header(struct upb_symtab_entry entries[], int num_entries, struct upb_string enum_val_prefix = upb_strdup(entry->e.key); enum_val_prefix.byte_len = memrchr(enum_val_prefix.ptr, - UPB_CONTEXT_SEPARATOR, + UPB_SYMBOL_SEPARATOR, enum_val_prefix.byte_len); enum_val_prefix.byte_len++; to_preproc(enum_val_prefix); @@ -135,7 +135,7 @@ static void write_header(struct upb_symtab_entry entries[], int num_entries, /* Submessages get special treatment, since we have to use the message * name directly. */ struct upb_string type_name_ref = *fd->type_name; - if(type_name_ref.ptr[0] == UPB_CONTEXT_SEPARATOR) { + if(type_name_ref.ptr[0] == UPB_SYMBOL_SEPARATOR) { /* Omit leading '.'. */ type_name_ref.ptr++; type_name_ref.byte_len--; @@ -207,5 +207,6 @@ int main() struct upb_string name = UPB_STRLIT("descriptor.proto"); write_header(entries, symcount, name, stdout); upb_context_free(&c); + upb_strfree(fds); } -- cgit v1.2.3