From 868f118797969cd0178d38207330e410267e6c46 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Sat, 14 Nov 2009 21:59:31 -0800 Subject: Changed parse API to know about msgdefs. This should make it both easier to use and easier to optimize, in exchange for a small amount of generality. In practice, any remotely normal case is still very natural. --- src/upb.h | 11 ++-- src/upb_msg.c | 38 ++++-------- src/upb_parse.c | 189 ++++++++++++++++++++++++++++++++++---------------------- src/upb_parse.h | 107 ++++++++++---------------------- 4 files changed, 165 insertions(+), 180 deletions(-) (limited to 'src') diff --git a/src/upb.h b/src/upb.h index e8ec001..cc09ab1 100644 --- a/src/upb.h +++ b/src/upb.h @@ -140,13 +140,10 @@ union upb_value_ptr { void *_void; }; -// Unfortunately there is no way to define this so that it can be used as a -// generic expression, a la: -// foo(UPB_VALUE_ADDROF(bar)); -// ...you have to use it as the initializer of a upb_value_ptr: -// union upb_value_ptr p = UPB_VALUE_ADDROF(bar); -// foo(p); -#define UPB_VALUE_ADDROF(val) {(void*)&val._double} +INLINE union upb_value_ptr upb_value_addrof(union upb_value *val) { + union upb_value_ptr ptr = {&val->_double}; + return ptr; +} /** * Converts upb_value_ptr -> upb_value by reading from the pointer. We need to diff --git a/src/upb_msg.c b/src/upb_msg.c index 926eda0..3786a63 100644 --- a/src/upb_msg.c +++ b/src/upb_msg.c @@ -50,35 +50,24 @@ static union upb_value_ptr get_value_ptr(struct upb_msg *msg, /* Callbacks for the stream parser. */ -static upb_field_type_t tag_cb(void *udata, struct upb_tag *tag, - void **user_field_desc) +static bool value_cb(void *udata, struct upb_msgdef *msgdef, + struct upb_fielddef *f, union upb_value val) { + (void)msgdef; struct upb_msgparser *mp = udata; - struct upb_fielddef *f = - upb_msg_fieldbynum(mp->top->msg->def, tag->field_number); - if(!f || !upb_check_type(tag->wire_type, f->type)) - return 0; /* Skip unknown or fields of the wrong type. */ - *user_field_desc = f; - return f->type; -} - -static void *value_cb(void *udata, uint8_t *buf, uint8_t *end, - void *user_field_desc, struct upb_status *status) -{ - struct upb_msgparser *mp = udata; - struct upb_fielddef *f = user_field_desc; struct upb_msg *msg = mp->top->msg; union upb_value_ptr p = get_value_ptr(msg, f); upb_msg_set(msg, f); - return upb_parse_value(buf, end, f->type, p, status); + upb_value_write(p, val, f->type); + return true; } -static void str_cb(void *udata, uint8_t *str, - size_t avail_len, size_t total_len, - void *udesc) +static bool str_cb(void *udata, struct upb_msgdef *msgdef, + struct upb_fielddef *f, uint8_t *str, size_t avail_len, + size_t total_len) { + (void)msgdef; struct upb_msgparser *mp = udata; - struct upb_fielddef *f = udesc; struct upb_msg *msg = mp->top->msg; union upb_value_ptr p = get_value_ptr(msg, f); upb_msg_set(msg, f); @@ -98,12 +87,12 @@ static void str_cb(void *udata, uint8_t *str, memcpy((*p.str)->ptr, str, avail_len); (*p.str)->byte_len = avail_len; //} + return true; } -static void start_cb(void *udata, void *user_field_desc) +static void start_cb(void *udata, struct upb_fielddef *f) { struct upb_msgparser *mp = udata; - struct upb_fielddef *f = user_field_desc; struct upb_msg *oldmsg = mp->top->msg; union upb_value_ptr p = get_value_ptr(oldmsg, f); @@ -131,15 +120,14 @@ static void end_cb(void *udata) struct upb_msgparser *upb_msgparser_new(struct upb_msgdef *def) { - (void)def; // Not used atm. struct upb_msgparser *mp = malloc(sizeof(struct upb_msgparser)); - mp->s = upb_cbparser_new(); + mp->s = upb_cbparser_new(def, value_cb, str_cb, start_cb, end_cb); return mp; } void upb_msgparser_reset(struct upb_msgparser *s, struct upb_msg *msg, bool byref) { - upb_cbparser_reset(s->s, s, tag_cb, value_cb, str_cb, start_cb, end_cb); + upb_cbparser_reset(s->s, s); s->byref = byref; s->top = s->stack; s->top->msg = msg; diff --git a/src/upb_parse.c b/src/upb_parse.c index d1d535a..2e910f2 100644 --- a/src/upb_parse.c +++ b/src/upb_parse.c @@ -6,8 +6,10 @@ #include "upb_parse.h" +#include #include #include +#include "upb_def.h" /* Functions to read wire values. *********************************************/ @@ -297,21 +299,38 @@ uint8_t *upb_parse_value(uint8_t *buf, uint8_t *end, upb_field_type_t ft, #undef CASE } +struct upb_cbparser_frame { + struct upb_msgdef *msgdef; + size_t end_offset; // For groups, 0. +}; + struct upb_cbparser { - // Stack entries store the offset where the submsg ends (for groups, 0). - size_t stack[UPB_MAX_NESTING], *top, *limit; - size_t completed_offset; - void *udata; - upb_tag_cb tag_cb; + // Immutable state of the parser. + struct upb_msgdef *toplevel_msgdef; upb_value_cb value_cb; upb_str_cb str_cb; upb_start_cb start_cb; upb_end_cb end_cb; + + // State pertaining to a particular parse (resettable). + // Stack entries store the offset where the submsg ends (for groups, 0). + struct upb_cbparser_frame stack[UPB_MAX_NESTING], *top, *limit; + size_t completed_offset; + void *udata; }; -struct upb_cbparser *upb_cbparser_new(void) +struct upb_cbparser *upb_cbparser_new(struct upb_msgdef *msgdef, + upb_value_cb valuecb, upb_str_cb strcb, + upb_start_cb startcb, upb_end_cb endcb) { - return malloc(sizeof(struct upb_cbparser)); + struct upb_cbparser *p = malloc(sizeof(struct upb_cbparser)); + p->toplevel_msgdef = msgdef; + p->value_cb = valuecb; + p->str_cb = strcb; + p->start_cb = startcb; + p->end_cb = endcb; + p->limit = &p->stack[UPB_MAX_NESTING]; + return p; } void upb_cbparser_free(struct upb_cbparser *p) @@ -319,145 +338,165 @@ void upb_cbparser_free(struct upb_cbparser *p) free(p); } -void upb_cbparser_reset(struct upb_cbparser *p, void *udata, - upb_tag_cb tagcb, - upb_value_cb valuecb, - upb_str_cb strcb, - upb_start_cb startcb, - upb_end_cb endcb) +void upb_cbparser_reset(struct upb_cbparser *p, void *udata) { p->top = p->stack; - p->limit = &p->stack[UPB_MAX_NESTING]; p->completed_offset = 0; p->udata = udata; - p->tag_cb = tagcb; - p->value_cb = valuecb; - p->str_cb = strcb; - p->start_cb = startcb; - p->end_cb = endcb; - + p->top->msgdef = p->toplevel_msgdef; // The top-level message is not delimited (we can keep receiving data for it // indefinitely), so we treat it like a group. - *p->top = 0; + p->top->end_offset = 0; } +static void *get_msgend(struct upb_cbparser *p, uint8_t *start) +{ + if(p->top->end_offset > 0) + return start + (p->top->end_offset - p->completed_offset); + else + return (void*)UINTPTR_MAX; // group. +} + +static bool isgroup(void *submsg_end) +{ + return submsg_end == (void*)UINTPTR_MAX; +} + +extern upb_wire_type_t upb_expected_wire_types[]; +// Returns true if wt is the correct on-the-wire type for ft. +INLINE bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) { + // This doesn't currently support packed arrays. + return upb_type_info[ft].expected_wire_type == wt; +} + + /** * Pushes a new stack frame for a submessage with the given len (which will * be zero if the submessage is a group). */ -static uint8_t *push(struct upb_cbparser *s, uint8_t *start, - uint32_t submsg_len, void *user_field_desc, +static uint8_t *push(struct upb_cbparser *p, uint8_t *start, + uint32_t submsg_len, struct upb_fielddef *f, struct upb_status *status) { - s->top++; - if(s->top >= s->limit) { + p->top++; + if(p->top >= p->limit) { upb_seterr(status, UPB_STATUS_ERROR, "Nesting exceeded maximum (%d levels)\n", UPB_MAX_NESTING); return NULL; } - *s->top = s->completed_offset + submsg_len; + struct upb_cbparser_frame *frame = p->top; + frame->end_offset = p->completed_offset + submsg_len; + frame->msgdef = f->ref.msg; - if(s->start_cb) - s->start_cb(s->udata, user_field_desc); - - if(*s->top > 0) - return start + (*s->top - s->completed_offset); - else - return (void*)UINTPTR_MAX; + if(p->start_cb) p->start_cb(p->udata, f); + return get_msgend(p, start); } /** * Pops a stack frame, returning a pointer for where the next submsg should * end (or a pointer that is out of range for a group). */ -static void *pop(struct upb_cbparser *s, uint8_t *start) +static void *pop(struct upb_cbparser *p, uint8_t *start) { - if(s->end_cb) - s->end_cb(s->udata); - - s->top--; - - if(*s->top > 0) - return (char*)start + (*s->top - s->completed_offset); - else - return (void*)UINTPTR_MAX; // group. + if(p->end_cb) p->end_cb(p->udata); + p->top--; + return get_msgend(p, start); } -size_t upb_cbparser_parse(struct upb_cbparser *s, void *_buf, size_t len, +size_t upb_cbparser_parse(struct upb_cbparser *p, void *_buf, size_t len, struct upb_status *status) { + // buf is our current offset, moves from start to end. uint8_t *buf = _buf; - uint8_t *completed = buf; - uint8_t *const start = buf; // ptr equivalent of s->completed_offset + uint8_t *const start = buf; // ptr equivalent of p->completed_offset uint8_t *end = buf + len; - uint8_t *submsg_end = *s->top > 0 ? buf + *s->top : (uint8_t*)UINTPTR_MAX; + + // When we have fully parsed a tag/value pair, we advance this. + uint8_t *completed = buf; + + uint8_t *submsg_end = get_msgend(p, start); + struct upb_msgdef *msgdef = p->top->msgdef; + bool keep_going = true; // Make local copies so optimizer knows they won't change. - upb_tag_cb tag_cb = s->tag_cb; - upb_str_cb str_cb = s->str_cb; - upb_value_cb value_cb = s->value_cb; - void *udata = s->udata; + upb_str_cb str_cb = p->str_cb; + upb_value_cb value_cb = p->value_cb; + void *udata = p->udata; + // We need to check the status of operations that can fail, but we do so as + // late as possible to avoid introducing branches that have to wait on + // (status->code) which must be loaded from memory. #define CHECK_STATUS() do { if(!upb_ok(status)) goto err; } while(0) - // Main loop: parse a tag, then handle the value. - while(buf < end) { + // Main loop: parse a tag, find the appropriate fielddef. + while(keep_going && buf < end) { struct upb_tag tag; buf = parse_tag(buf, end, &tag, status); if(tag.wire_type == UPB_WIRE_TYPE_END_GROUP) { CHECK_STATUS(); - submsg_end = pop(s, start); + if(!isgroup(submsg_end)) { + upb_seterr(status, UPB_STATUS_ERROR, "End group seen but current " + "message is not a group, byte offset: %zd", + p->completed_offset + (completed - start)); + goto err; + } + submsg_end = pop(p, start); + msgdef = p->top->msgdef; completed = buf; continue; } - void *udesc; - upb_field_type_t ft = tag_cb(udata, &tag, &udesc); + struct upb_fielddef *f = upb_msg_fieldbynum(msgdef, tag.field_number); if(tag.wire_type == UPB_WIRE_TYPE_DELIMITED) { int32_t delim_len; buf = upb_get_INT32(buf, end, &delim_len, status); CHECK_STATUS(); uint8_t *delim_end = buf + delim_len; - if(ft == UPB_TYPENUM(MESSAGE)) { - submsg_end = push(s, start, delim_end - start, udesc, status); + if(f && f->type == UPB_TYPENUM(MESSAGE)) { + submsg_end = push(p, start, delim_end - start, f, status); + msgdef = p->top->msgdef; } else { - if(upb_isstringtype(ft)) { + if(f && upb_isstringtype(f->type)) { size_t avail_len = UPB_MIN(delim_end, end) - buf; - str_cb(udata, buf, avail_len, delim_end - buf, udesc); + keep_going = + str_cb(udata, msgdef, f, buf, avail_len, delim_end - buf); } // else { TODO: packed arrays } + // If field was not found, it is skipped silently. buf = delim_end; // Could be >end. } } else { - // Scalar (non-delimited) value. - switch(ft) { - case 0: // Client elected to skip. - buf = skip_wire_value(buf, end, tag.wire_type, status); - break; - case UPB_TYPENUM(GROUP): - submsg_end = push(s, start, 0, udesc, status); - break; - default: - buf = value_cb(udata, buf, end, udesc, status); - break; + if(!f || !upb_check_type(tag.wire_type, f->type)) { + buf = skip_wire_value(buf, end, tag.wire_type, status); + } else if (f->type == UPB_TYPENUM(GROUP)) { + submsg_end = push(p, start, 0, f, status); + msgdef = p->top->msgdef; + } else { + union upb_value val; + buf = upb_parse_value(buf, end, f->type, upb_value_addrof(&val), + status); + keep_going = value_cb(udata, msgdef, f, val); } } CHECK_STATUS(); while(buf >= submsg_end) { if(buf > submsg_end) { - return UPB_STATUS_ERROR; // Bad submessage end. + upb_seterr(status, UPB_STATUS_ERROR, "Expected submsg end offset " + "did not lie on a tag/value boundary."); + goto err; } - submsg_end = pop(s, start); + submsg_end = pop(p, start); + msgdef = p->top->msgdef; } - // while(buf < s->packed_end) { TODO: packed arrays } + // while(buf < p->packed_end) { TODO: packed arrays } completed = buf; } size_t read; err: read = (char*)completed - (char*)start; - s->completed_offset += read; + p->completed_offset += read; return read; } diff --git a/src/upb_parse.h b/src/upb_parse.h index 9e64a5b..6c26b83 100644 --- a/src/upb_parse.h +++ b/src/upb_parse.h @@ -6,6 +6,9 @@ * into in-memory messages (a more DOM-like model), see the routines in * upb_msg.h, which are layered on top of this parser. * + * TODO: the parser currently does not support returning unknown values. This + * can easily be added when it is needed. + * * Copyright (c) 2009 Joshua Haberman. See LICENSE for details. */ @@ -23,105 +26,63 @@ extern "C" { /* Event Callbacks. ***********************************************************/ -// The tag callback is called immediately after a tag has been parsed. The -// client should determine whether it wants to parse or skip the corresponding -// value. If it wants to parse it, it must discover and return the correct -// .proto type (the tag only contains the wire type) and check that the wire -// type is appropriate for the .proto type. Returning a type for which -// upb_check_type(tag->wire_type, type) == false invokes undefined behavior. -// -// To skip the value (which means skipping all submessages, in the case of a -// submessage), the callback should return zero. -// -// The client can store a void* in *user_field_desc; this will be passed to -// the value callback or the string callback. -typedef upb_field_type_t (*upb_tag_cb)(void *udata, struct upb_tag *tag, - void **user_field_desc); - // The value callback is called when a regular value (ie. not a string or -// submessage) is encountered which the client has opted to parse (by not -// returning 0 from the tag_cb). The client must parse the value by calling -// upb_parse_value(), returning success or failure accordingly. +// submessage) is encountered which was defined in the upb_msgdef. The client +// returns true to continue the parse or false to halt it. // // Note that this callback can be called several times in a row for a single // call to tag_cb in the case of packed arrays. -typedef void *(*upb_value_cb)(void *udata, uint8_t *buf, uint8_t *end, - void *user_field_desc, struct upb_status *status); +typedef bool (*upb_value_cb)(void *udata, struct upb_msgdef *msgdef, + struct upb_fielddef *f, union upb_value val); -// The string callback is called when a string is parsed. avail_len is the -// number of bytes that are currently available at str. If the client is -// streaming and the current buffer ends in the middle of the string, this -// number could be less than total_len. -typedef void (*upb_str_cb)(void *udata, uint8_t *str, size_t avail_len, - size_t total_len, void *user_field_desc); +// The string callback is called when a string that was defined in the +// upb_msgdef is parsed. avail_len is the number of bytes that are currently +// available at str. If the client is streaming and the current buffer ends in +// the middle of the string, this number could be less than total_len. +typedef bool (*upb_str_cb)(void *udata, struct upb_msgdef *msgdef, + struct upb_fielddef *f, uint8_t *str, + size_t avail_len, size_t total_len); // The start and end callbacks are called when a submessage begins and ends, // respectively. -typedef void (*upb_start_cb)(void *udata, void *user_field_desc); +typedef void (*upb_start_cb)(void *udata, struct upb_fielddef *f); typedef void (*upb_end_cb)(void *udata); /* Callback parser interface. *************************************************/ -// Allocates and frees a upb_cbparser, respectively. -struct upb_cbparser *upb_cbparser_new(void); +// Allocates and frees a upb_cbparser, respectively. Callbacks may be NULL, +// in which case they will be skipped. +struct upb_cbparser *upb_cbparser_new(struct upb_msgdef *md, + upb_value_cb valuecb, upb_str_cb strcb, + upb_start_cb startcb, upb_end_cb endcb); void upb_cbparser_free(struct upb_cbparser *p); -// Resets the internal state of an already-allocated parser. Parsers must be -// reset before they can be used. A parser can be reset multiple times. udata -// will be passed as the first argument to callbacks. -// -// tagcb must be set, but all other callbacks can be NULL, in which case they -// will just be skipped. -void upb_cbparser_reset(struct upb_cbparser *p, void *udata, - upb_tag_cb tagcb, - upb_value_cb valuecb, - upb_str_cb strcb, - upb_start_cb startcb, - upb_end_cb endcb); - +// Resets the internal state of an already-allocated parser. This puts it in a +// state where it has not seen any data, and expects the next data to be from +// the beginning of a new protobuf. Parsers must be reset before they can be +// used. A parser can be reset multiple times. udata will be passed as the +// first argument to callbacks. +void upb_cbparser_reset(struct upb_cbparser *p, void *udata); // Parses up to len bytes of protobuf data out of buf, calling the appropriate // callbacks as values are parsed. // // The function returns a status indicating the success of the operation. Data -// is parsed until no more data can be read from buf, or the callback returns an -// error like UPB_STATUS_USER_CANCELLED, or an error occurs. +// is parsed until no more data can be read from buf, or a user callback +// returns false, or an error occurs. // -// *read is set to the number of bytes consumed. Note that this can be greater -// than len in the case that a string was recognized that spans beyond the end -// of the currently provided data. +// The function returns the number of bytes consumed. Note that this can be +// greater than len in the case that a string was recognized that spans beyond +// the end of the currently provided data. // -// The next call to upb_parse must be the first byte after buf + *read, even in -// the case that *read > len. +// The next call to upb_parse must be the first byte after buf + retval, even in +// the case that retval > len. // // TODO: see if we can provide the following guarantee efficiently: -// *read will always be >= len. */ +// retval will always be >= len. */ size_t upb_cbparser_parse(struct upb_cbparser *p, void *buf, size_t len, struct upb_status *status); -extern upb_wire_type_t upb_expected_wire_types[]; -// Returns true if wt is the correct on-the-wire type for ft. -INLINE bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) { - // This doesn't currently support packed arrays. - return upb_type_info[ft].expected_wire_type == wt; -} - -/* Data-consuming functions (to be called from value cb). *********************/ - -// Parses and converts a value from the character data starting at buf (but not -// past end). Returns a pointer that is one past the data that was read. The -// caller must have previously checked that the wire type is appropriate for -// this field type. -uint8_t *upb_parse_value(uint8_t *buf, uint8_t *end, upb_field_type_t ft, - union upb_value_ptr v, struct upb_status *status); - -// Parses a wire value with the given type (which must have been obtained from -// a tag that was just parsed) and returns a pointer to one past the data that -// was read. -uint8_t *upb_parse_wire_value(uint8_t *buf, uint8_t *end, upb_wire_type_t wt, - union upb_wire_value *wv, - struct upb_status *status); - #ifdef __cplusplus } /* extern "C" */ #endif -- cgit v1.2.3