From 4a8b9be46c9485a35383b52d400ba086d3f40ace Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Wed, 31 Aug 2011 20:03:59 -0700 Subject: Header cleanup, clarify/correct comments for interfaces. --- upb/atomic.h | 2 +- upb/bytestream.h | 68 +++++++++++++++++++++++++++++++++++++++++++------------- upb/def.c | 9 ++++++++ upb/def.h | 63 ++++++++++++++++++++++++++------------------------- upb/descriptor.c | 6 ----- upb/descriptor.h | 8 ++----- upb/pb/decoder.c | 9 +------- upb/pb/decoder.h | 28 ++++++++--------------- upb/pb/glue.c | 7 ++++-- upb/table.h | 8 +++---- 10 files changed, 114 insertions(+), 94 deletions(-) (limited to 'upb') diff --git a/upb/atomic.h b/upb/atomic.h index 87a0141..e82ad8f 100644 --- a/upb/atomic.h +++ b/upb/atomic.h @@ -7,7 +7,7 @@ * Only a very small part of upb is thread-safe. Notably, individual * messages, arrays, and strings are *not* thread safe for mutating. * However, we do make message *metadata* such as upb_msgdef and - * upb_context thread-safe, and their ownership is tracked via atomic + * upb_symtab thread-safe, and their ownership is tracked via atomic * refcounting. This header implements the small number of atomic * primitives required to support this. The primitives we implement * are: diff --git a/upb/bytestream.h b/upb/bytestream.h index 83976a3..0a744f6 100644 --- a/upb/bytestream.h +++ b/upb/bytestream.h @@ -10,6 +10,7 @@ * can get the data from a fd, a string, a cord, etc. * * Byte streams are NOT thread-safe! (Like f{read,write}_unlocked()) + * This may change (in particular, bytesrc objects may be better thread-safe). */ #ifndef UPB_BYTESTREAM_H @@ -28,7 +29,12 @@ extern "C" { /* upb_bytesrc ****************************************************************/ // A upb_bytesrc allows the consumer of a stream of bytes to obtain buffers as -// they become available, and to preserve some trailing amount of data. +// they become available, and to preserve some trailing amount of data, which +// is useful for lazy parsing (among other things). If there is a submessage +// that we want to parse later we can take a reference on that region of the +// input buffer. This will guarantee that the bytesrc keeps the submessage +// data around for later use, without requiring a copy out of the input +// buffers. typedef size_t upb_bytesrc_fetch_func(void*, uint64_t, upb_status*); typedef void upb_bytesrc_read_func(void*, uint64_t, size_t, char*); typedef const char *upb_bytesrc_getptr_func(void*, uint64_t, size_t*); @@ -53,10 +59,15 @@ INLINE void upb_bytesrc_init(upb_bytesrc *src, upb_bytesrc_vtbl *vtbl) { } // Fetches at least one byte starting at ofs, returning the actual number of -// bytes fetched (or 0 on error: see "s" for details). Gives caller a ref on -// the fetched region. It is safe to re-fetch existing regions but only if -// they are ref'd. "ofs" may not greater than the end of the region that was -// previously fetched. +// bytes fetched (or 0 on error: see "s" for details). A successful return +// gives caller a ref on the fetched region. +// +// If "ofs" may be greater or equal than the end of the already-fetched region. +// It may also be less than the end of the already-fetch region *if* either of +// the following is true: +// +// * the region is ref'd (this implies that the data is still in-memory) +// * the bytesrc is seekable (this implies that the data can be fetched again). INLINE size_t upb_bytesrc_fetch(upb_bytesrc *src, uint64_t ofs, upb_status *s) { return src->vtbl->fetch(src, ofs, s); } @@ -68,20 +79,21 @@ INLINE void upb_bytesrc_read(upb_bytesrc *src, uint64_t src_ofs, size_t len, src->vtbl->read(src, src_ofs, len, dst); } -// Returns a pointer to the bytesrc's internal buffer, returning how much data -// was actually returned (which may be less than "len" if the given region is -// not contiguous). The caller must own refs on the entire region from [ofs, -// ofs+len]. The returned buffer is valid for as long as the region remains -// ref'd. +// Returns a pointer to the bytesrc's internal buffer, storing in *len how much +// data is available. The caller must own refs on the given region. The +// returned buffer is valid for as long as the region remains ref'd. // -// TODO: is "len" really required here? +// TODO: if more data is available than the caller has ref'd is it ok for the +// caller to read *len bytes? INLINE const char *upb_bytesrc_getptr(upb_bytesrc *src, uint64_t ofs, size_t *len) { return src->vtbl->getptr(src, ofs, len); } // Gives the caller a ref on the given region. The caller must know that the -// given region is already ref'd. +// given region is already ref'd (for example, inside a upb_handlers callback +// that receives a upb_strref, the region is guaranteed to be ref'd -- this +// function allows that handler to take its own ref). INLINE void upb_bytesrc_refregion(upb_bytesrc *src, uint64_t ofs, size_t len) { src->vtbl->refregion(src, ofs, len); } @@ -110,16 +122,17 @@ INLINE void upb_bytesrc_unref(upb_bytesrc *src) { src->vtbl->unref(src); } + /* upb_strref *****************************************************************/ -// The structure we pass for a string. +// The structure we pass to upb_handlers for a string value. typedef struct _upb_strref { // Pointer to the string data. NULL if the string spans multiple input // buffers (in which case upb_bytesrc_getptr() must be called to obtain // the actual pointers). const char *ptr; - // Length of the string. + // Total length of the string. uint32_t len; // Offset in the bytesrc that represents the beginning of this string. @@ -139,12 +152,21 @@ typedef struct _upb_strref { char *upb_strref_dup(struct _upb_strref *r); INLINE void upb_strref_read(struct _upb_strref *r, char *buf) { - upb_bytesrc_read(r->bytesrc, r->stream_offset, r->len, buf); + if (r->ptr) { + memcpy(buf, r->ptr, r->len); + } else { + assert(r->bytesrc); + upb_bytesrc_read(r->bytesrc, r->stream_offset, r->len, buf); + } } /* upb_bytesink ***************************************************************/ +// A bytesink is an interface that allows the caller to push byte-wise data. +// It is very simple -- the only special capability is the ability to "rewind" +// the stream, which is really only a mechanism of having the bytesink ignore +// some subsequent calls. typedef int upb_bytesink_write_func(void*, const void*, int); typedef int upb_bytesink_vprintf_func(void*, const char *fmt, va_list args); @@ -194,6 +216,22 @@ INLINE uint64_t upb_bytesink_getoffset(upb_bytesink *sink) { return sink->offset; } +// Rewinds the stream to the given offset. This cannot actually "unput" any +// data, it is for situations like: +// +// // If false is returned (because of error), call again later to resume. +// bool write_some_data(upb_bytesink *sink, int indent) { +// uint64_t start_offset = upb_bytesink_getoffset(sink); +// if (upb_bytesink_writestr(sink, "Some data") < 0) goto err; +// if (upb_bytesink_putrepeated(sink, ' ', indent) < 0) goto err; +// return true; +// err: +// upb_bytesink_rewind(sink, start_offset); +// return false; +// } +// +// The subsequent bytesink writes *must* be identical to the writes that were +// rewinded past. INLINE void upb_bytesink_rewind(upb_bytesink *sink, uint64_t offset) { // TODO (void)sink; diff --git a/upb/def.c b/upb/def.c index f83f5eb..ee793a9 100644 --- a/upb/def.c +++ b/upb/def.c @@ -523,6 +523,13 @@ upb_msg_iter upb_msg_next(upb_msgdef *m, upb_msg_iter iter) { /* upb_symtab *****************************************************************/ +struct _upb_symtab { + upb_atomic_t refcount; + upb_rwlock_t lock; // Protects all members except the refcount. + upb_strtable symtab; // The symbol table. + upb_deflist olddefs; +}; + typedef struct { upb_def *def; } upb_symtab_ent; @@ -568,6 +575,8 @@ static void upb_symtab_free(upb_symtab *s) { free(s); } +void upb_symtab_ref(upb_symtab *s) { upb_atomic_ref(&s->refcount); } + void upb_symtab_unref(upb_symtab *s) { if(s && upb_atomic_unref(&s->refcount)) { upb_symtab_free(s); diff --git a/upb/def.h b/upb/def.h index 86f84ce..4dc9c16 100644 --- a/upb/def.h +++ b/upb/def.h @@ -123,12 +123,12 @@ INLINE const char *upb_fielddef_typename(upb_fielddef *f) { return f->def ? f->def->fqname : NULL; } -// Only meaningful once the def is in a symtab (returns NULL otherwise, or for -// a fielddef where !upb_hassubdef(f)). +// The enum or submessage def for this field, if any. Only meaningful for +// submessage, group, and enum fields (ie. when upb_hassubdef(f) is true). +// Since defs are not linked together until they are in a symtab, this +// will return NULL until the msgdef is in a symtab. upb_def *upb_fielddef_subdef(upb_fielddef *f); -// NULL until the fielddef has been added to a msgdef. - // Write accessors. "Number" and "name" must be set before the fielddef is // added to a msgdef. For the moment we do not allow these to be set once // the fielddef is added to a msgdef -- this could be relaxed in the future. @@ -214,11 +214,12 @@ void upb_msgdef_setsize(upb_msgdef *m, uint16_t size); void upb_msgdef_sethasbit_bytes(upb_msgdef *m, uint16_t bytes); bool upb_msgdef_setextrange(upb_msgdef *m, uint32_t start, uint32_t end); -// Adds a fielddef to a msgdef. Caller retains its ref on the fielddef. -// May only be done before the msgdef is in a symtab. The fielddef's name and -// number must be set, and the message may not already contain any field with -// this name or number, and this fielddef may not be part of another message, -// otherwise false is returned and no action is performed. +// Adds a fielddef to a msgdef. Caller retains its ref on the fielddef. May +// only be done before the msgdef is in a symtab (requires upb_def_ismutable(m) +// for the msgdef). The fielddef's name and number must be set, and the +// message may not already contain any field with this name or number, and this +// fielddef may not be part of another message, otherwise false is returned and +// no action is performed. bool upb_msgdef_addfields(upb_msgdef *m, upb_fielddef **f, int n); INLINE bool upb_msgdef_addfield(upb_msgdef *m, upb_fielddef *f) { return upb_msgdef_addfields(m, &f, 1); @@ -252,6 +253,7 @@ INLINE int upb_msgdef_numfields(upb_msgdef *m) { } // Iteration over fields. The order is undefined. +// TODO: the iteration should be in field order. // Iterators are invalidated when a field is added or removed. // upb_msg_iter i; // for(i = upb_msg_begin(m); !upb_msg_done(i); i = upb_msg_next(m, i)) { @@ -296,7 +298,7 @@ upb_enumdef *upb_enumdef_dup(upb_enumdef *e); INLINE int32_t upb_enumdef_default(upb_enumdef *e) { return e->defaultval; } -// May only be set before the enumdef is in a symtab. +// May only be set if upb_def_ismutable(e). void upb_enumdef_setdefault(upb_enumdef *e, int32_t val); // Adds a value to the enumdef. Requires that no existing val has this @@ -334,30 +336,12 @@ INLINE int32_t upb_enum_iter_number(upb_enum_iter iter) { /* upb_symtab *****************************************************************/ -// A SymbolTable is where upb_defs live. It is empty when first constructed. -// Clients add definitions to the symtab (or replace existing definitions) by -// using a upb_symtab_commit() or calling upb_symtab_add(). - -// upb_deflist: A little dynamic array for storing a growing list of upb_defs. -typedef struct { - upb_def **defs; - uint32_t len; - uint32_t size; -} upb_deflist; - -void upb_deflist_init(upb_deflist *l); -void upb_deflist_uninit(upb_deflist *l); -void upb_deflist_push(upb_deflist *l, upb_def *d); - -struct _upb_symtab { - upb_atomic_t refcount; - upb_rwlock_t lock; // Protects all members except the refcount. - upb_strtable symtab; // The symbol table. - upb_deflist olddefs; -}; +// A symtab (symbol table) is where upb_defs live. It is empty when first +// constructed. Clients add definitions to the symtab (or replace existing +// definitions) by calling upb_symtab_add(). upb_symtab *upb_symtab_new(void); -INLINE void upb_symtab_ref(upb_symtab *s) { upb_atomic_ref(&s->refcount); } +void upb_symtab_ref(upb_symtab *s); void upb_symtab_unref(upb_symtab *s); // Resolves the given symbol using the rules described in descriptor.proto, @@ -427,6 +411,21 @@ UPB_DOWNCAST_DEF(svcdef, SERVICE); UPB_DOWNCAST_DEF(unresolveddef, UNRESOLVED); #undef UPB_DOWNCAST_DEF + +/* upb_deflist ****************************************************************/ + +// upb_deflist is an internal-only dynamic array for storing a growing list of +// upb_defs. +typedef struct { + upb_def **defs; + uint32_t len; + uint32_t size; +} upb_deflist; + +void upb_deflist_init(upb_deflist *l); +void upb_deflist_uninit(upb_deflist *l); +void upb_deflist_push(upb_deflist *l, upb_def *d); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/upb/descriptor.c b/upb/descriptor.c index 90ba4f7..26cfef5 100644 --- a/upb/descriptor.c +++ b/upb/descriptor.c @@ -29,10 +29,6 @@ static char *upb_join(char *base, char *name) { /* upb_descreader ************************************************************/ -// A upb_descreader builds a list of defs by handling a parse of a protobuf in -// the format defined in descriptor.proto. The output of a upb_descreader is -// a upb_symtabtxn. - static upb_def *upb_deflist_last(upb_deflist *l) { return l->defs[l->len-1]; } @@ -480,8 +476,6 @@ static void upb_msgdef_endmsg(void *_r, upb_status *status) { upb_status_seterrliteral(status, "Encountered message with no name."); return; } - - upb_msgdef_layout(m); upb_descreader_endcontainer(r); } diff --git a/upb/descriptor.h b/upb/descriptor.h index 2c279a5..21099b3 100644 --- a/upb/descriptor.h +++ b/upb/descriptor.h @@ -21,8 +21,6 @@ extern "C" { /* upb_descreader ************************************************************/ -// upb_descreader reads a descriptor and puts defs in a upb_symtabtxn. - // We keep a stack of all the messages scopes we are currently in, as well as // the top-level file scope. This is necessary to correctly qualify the // definitions that are contained inside. "name" tracks the name of the @@ -50,13 +48,11 @@ typedef struct { upb_fielddef *f; } upb_descreader; -// Creates a new descriptor builder that will add defs to the given txn. void upb_descreader_init(upb_descreader *r); void upb_descreader_uninit(upb_descreader *r); -// Registers handlers that will load descriptor data into a symtabtxn. -// Pass the descreader as the closure. The messages will have -// upb_msgdef_layout() called on them before adding to the txn. +// Registers handlers that will build the defs. Pass the descreader as the +// closure. upb_mhandlers *upb_descreader_reghandlers(upb_handlers *h); // Gets the array of defs that have been parsed and removes them from the diff --git a/upb/pb/decoder.c b/upb/pb/decoder.c index fd04efc..ff39685 100644 --- a/upb/pb/decoder.c +++ b/upb/pb/decoder.c @@ -400,7 +400,7 @@ static void upb_decoder_skip(void *_d, upb_dispatcher_frame *top, #endif } -void upb_decoder_initforhandlers(upb_decoder *d, upb_handlers *handlers) { +void upb_decoder_init(upb_decoder *d, upb_handlers *handlers) { upb_dispatcher_init( &d->dispatcher, handlers, upb_decoder_skip, upb_decoder_exit2, d); #ifdef UPB_USE_JIT_X64 @@ -423,13 +423,6 @@ void upb_decoder_initforhandlers(upb_decoder *d, upb_handlers *handlers) { } } -void upb_decoder_initformsgdef(upb_decoder *d, upb_msgdef *m) { - upb_handlers *h = upb_handlers_new(); - upb_accessors_reghandlers(h, m); - upb_decoder_initforhandlers(d, h); - upb_handlers_unref(h); -} - void upb_decoder_reset(upb_decoder *d, upb_bytesrc *bytesrc, uint64_t start_ofs, uint64_t end_ofs, void *closure) { upb_dispatcher_frame *f = upb_dispatcher_reset(&d->dispatcher, closure); diff --git a/upb/pb/decoder.h b/upb/pb/decoder.h index 921697a..2232c52 100644 --- a/upb/pb/decoder.h +++ b/upb/pb/decoder.h @@ -5,13 +5,8 @@ * Author: Josh Haberman * * upb_decoder implements a high performance, streaming decoder for protobuf - * data that works by implementing upb_src and getting its data from a - * upb_bytesrc. - * - * The decoder does not currently support non-blocking I/O, in the sense that - * if the bytesrc returns UPB_STATUS_TRYAGAIN it is not possible to resume the - * decoder when data becomes available again. Support for this could be added, - * but it would add complexity and perhaps cost efficiency also. + * data that works by getting its input data from a upb_bytesrc and calling + * into a upb_handlers. */ #ifndef UPB_DECODER_H_ @@ -36,18 +31,18 @@ typedef struct _upb_decoder { upb_status *status; // Where we will store any errors that occur. upb_strref strref; // For passing string data to callbacks. - // Offsets for the region we currently have ref'd. + // Offsets for the bytesrc region we currently have ref'd. uint64_t refstart_ofs, refend_ofs; - // Current buffer and its stream offset. + // Current input buffer and its stream offset. const char *buf, *ptr, *end; uint64_t bufstart_ofs, bufend_ofs; // Stream offset for the end of the top-level message, if any. uint64_t end_ofs; - // Buf offset as of which we've delivered calbacks; needed for rollback on - // UPB_TRYAGAIN (or in the future, UPB_SUSPEND). + // Buf offset as of which we've delivered calbacks; needed for rollback if + // a callback returns UPB_BREAK. const char *completed_ptr; // End of the delimited region, relative to ptr, or NULL if not in this buf. @@ -66,6 +61,7 @@ typedef struct _upb_decoder { struct dasm_State *dynasm; #endif + // For exiting the decoder on error. sigjmp_buf exitjmp; } upb_decoder; @@ -75,14 +71,8 @@ typedef struct _upb_decoder { // Initializes/uninitializes a decoder for calling into the given handlers // or to write into the given msgdef, given its accessors). Takes a ref -// on the handlers or msgdef. -void upb_decoder_initforhandlers(upb_decoder *d, upb_handlers *h); - -// Equivalent to: -// upb_accessors_reghandlers(m, h); -// upb_decoder_initforhandlers(d, h); -// except possibly more efficient, by using cached state in the msgdef. -void upb_decoder_initformsgdef(upb_decoder *d, upb_msgdef *m); +// on the handlers. +void upb_decoder_init(upb_decoder *d, upb_handlers *h); void upb_decoder_uninit(upb_decoder *d); // Resets the internal state of an already-allocated decoder. This puts it in a diff --git a/upb/pb/glue.c b/upb/pb/glue.c index 8034c54..6981aa2 100644 --- a/upb/pb/glue.c +++ b/upb/pb/glue.c @@ -19,7 +19,10 @@ void upb_strtomsg(const char *str, size_t len, void *msg, upb_msgdef *md, upb_stringsrc_reset(&strsrc, str, len); upb_decoder d; - upb_decoder_initformsgdef(&d, md); + upb_handlers *h = upb_handlers_new(); + upb_accessors_reghandlers(h, md); + upb_decoder_init(&d, h); + upb_handlers_unref(h); upb_decoder_reset(&d, upb_stringsrc_bytesrc(&strsrc), 0, UINT64_MAX, msg); upb_decoder_decode(&d, status); @@ -63,7 +66,7 @@ upb_def **upb_load_descriptor(const char *str, size_t len, int *n, upb_descreader_reghandlers(h); upb_decoder d; - upb_decoder_initforhandlers(&d, h); + upb_decoder_init(&d, h); upb_handlers_unref(h); upb_descreader r; upb_descreader_init(&r); diff --git a/upb/table.h b/upb/table.h index bd9a5d4..4a44a6f 100644 --- a/upb/table.h +++ b/upb/table.h @@ -11,6 +11,9 @@ * The table uses internal chaining with Brent's variation (inspired by the * Lua implementation of hash tables). The hash function for strings is * Austin Appleby's "MurmurHash." + * + * This header is internal to upb; its interface should not be considered + * public or stable. */ #ifndef UPB_TABLE_H_ @@ -128,21 +131,16 @@ INLINE void *_upb_inttable_fastlookup(upb_inttable *t, uint32_t key, upb_inttable_value *arrval = (upb_inttable_value*)UPB_INDEX(t->array, key, value_size); if (_upb_inttable_isarrkey(t, key)) { - //DEBUGPRINTF("array lookup for key %d, &val=%p, has_entry=%d\n", key, val, val->has_entry); return (arrval->has_entry) ? arrval : NULL; } uint32_t bucket = _upb_inttable_bucket(t, key); upb_inttable_entry *e = (upb_inttable_entry*)UPB_INDEX(t->t.entries, bucket, entry_size); - //DEBUGPRINTF("looking in first bucket %d, entry size=%zd, addr=%p\n", bucket, entry_size, e); while (1) { - //DEBUGPRINTF("%d, %d, %d\n", e->val.has_entry, e->hdr.key, key); if (e->hdr.key == key) { - //DEBUGPRINTF("returning val from hash part\n"); return &e->val; } if ((bucket = e->hdr.next) == UPB_END_OF_CHAIN) return NULL; - //DEBUGPRINTF("looking in bucket %d\n", bucket); e = (upb_inttable_entry*)UPB_INDEX(t->t.entries, bucket, entry_size); } } -- cgit v1.2.3