From 4240e0e5989fadd367c29651e0a0b01d499927b0 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Wed, 29 Jul 2009 20:37:32 -0700 Subject: Updated parser semantics to better support streaming. --- src/upb_msg.c | 15 ++++++--- src/upb_parse.c | 100 +++++++++++++++++++------------------------------------ src/upb_parse.h | 37 +++++++++++--------- src/upb_string.h | 6 ++-- 4 files changed, 68 insertions(+), 90 deletions(-) (limited to 'src') diff --git a/src/upb_msg.c b/src/upb_msg.c index 6d6d934..bdebe0d 100644 --- a/src/upb_msg.c +++ b/src/upb_msg.c @@ -291,18 +291,23 @@ static upb_status_t value_cb(void *udata, uint8_t *buf, uint8_t *end, return UPB_STATUS_OK; } -static void str_cb(void *udata, struct upb_string *str, void *user_field_desc) +static void str_cb(void *udata, uint8_t *str, + size_t avail_len, size_t total_len, + void *udesc) { struct upb_msg_parse_state *s = udata; - struct upb_msg_field *f = user_field_desc; + struct upb_msg_field *f = udesc; union upb_value_ptr p = get_value_ptr(s->top->data, f); upb_msg_set(s->top->data, f); + if(avail_len != total_len) abort(); /* TODO: support streaming. */ if(s->byref) { upb_msg_reuse_strref(p.str); - **p.str = *str; + (*p.str)->ptr = (char*)str; + (*p.str)->byte_len = avail_len; } else { - upb_msg_reuse_str(p.str, str->byte_len); - upb_strcpy(*p.str, str); + upb_msg_reuse_str(p.str, avail_len); + memcpy((*p.str)->ptr, str, avail_len); + (*p.str)->byte_len = avail_len; } //google_protobuf_FieldDescriptorProto *fd = upb_msg_field_descriptor(f, s->top->m); //upb_text_printfield(&s->p, *fd->name, f->type, upb_deref(p, fd->type), stdout); diff --git a/src/upb_parse.c b/src/upb_parse.c index 4e1f4a5..a8fa3a6 100644 --- a/src/upb_parse.c +++ b/src/upb_parse.c @@ -39,36 +39,12 @@ upb_status_t upb_get_v_uint64_t_full(uint8_t *restrict buf, uint8_t *end, uint64_t *restrict val, uint8_t **outbuf) { - if(buf + 10 <= end) { - /* >2-byte varint, fast path. */ - uint64_t cont = *(uint64_t*)(buf+2) | 0x7f7f7f7f7f7f7f7fULL; - int num_bytes = __builtin_ffsll(~cont) / 8; - uint32_t part0 = 0, part1 = 0, part2 = 0; - - switch(num_bytes) { - default: return UPB_ERROR_UNTERMINATED_VARINT; - case 8: part2 |= (buf[9] & 0x7F) << 7; - case 7: part2 |= (buf[8] & 0x7F); - case 6: part1 |= (buf[7] & 0x7F) << 21; - case 5: part1 |= (buf[6] & 0x7F) << 14; - case 4: part1 |= (buf[5] & 0x7F) << 7; - case 3: part1 |= (buf[4] & 0x7F); - case 2: part0 |= (buf[3] & 0x7F) << 21; - case 1: part0 |= (buf[2] & 0x7F) << 14; - part0 |= (buf[1] & 0x7F) << 7; - part0 |= (buf[0] & 0x7F); - } - *val = (uint64_t)part0 | ((uint64_t)part1 << 28) | ((uint64_t)part2 << 56); - *outbuf = buf + num_bytes + 2; - } else { - /* >2-byte varint, slow path. */ - uint8_t last = 0x80; - *val = 0; - for(int bitpos = 0; buf < (uint8_t*)end && (last & 0x80); buf++, bitpos += 7) - *val |= ((uint64_t)((last = *buf) & 0x7F)) << bitpos; - if(last & 0x80) return UPB_STATUS_NEED_MORE_DATA; - *outbuf = buf; - } + uint8_t last = 0x80; + *val = 0; + for(int bitpos = 0; buf < (uint8_t*)end && (last & 0x80); buf++, bitpos += 7) + *val |= ((uint64_t)((last = *buf) & 0x7F)) << bitpos; + if(last & 0x80) return UPB_STATUS_NEED_MORE_DATA; + *outbuf = buf; return UPB_STATUS_OK; } @@ -201,15 +177,17 @@ upb_status_t upb_parse(struct upb_parse_state *s, void *_buf, size_t len, uint8_t *buf = _buf; uint8_t *completed = buf; uint8_t *const start = buf; - + uint8_t *end = buf + len; + uint8_t *submsg_end = buf + (*s->top > 0 ? *s->top : 0); upb_status_t status = UPB_STATUS_OK; + + /* Make local copies so optimizer knows they won't change. */ upb_tag_cb tag_cb = s->tag_cb; upb_str_cb str_cb = s->str_cb; upb_value_cb value_cb = s->value_cb; void *udata = s->udata; - uint8_t *end = buf + len; - uint8_t *submsg_end = buf + (*s->top > 0 ? *s->top : 0); + /* Main loop: parse a tag, then handle the value. */ while(buf < end) { struct upb_tag tag; UPB_CHECK(parse_tag(buf, end, &tag, &buf)); @@ -218,53 +196,43 @@ upb_status_t upb_parse(struct upb_parse_state *s, void *_buf, size_t len, completed = buf; continue; } - /* Don't handle START_GROUP here, so client can skip group via tag_cb. */ - void *user_field_desc; - upb_field_type_t ft = tag_cb(udata, &tag, &user_field_desc); + void *udesc; + upb_field_type_t ft = tag_cb(udata, &tag, &udesc); if(tag.wire_type == UPB_WIRE_TYPE_DELIMITED) { int32_t delim_len; UPB_CHECK(upb_get_INT32(buf, end, &delim_len, &buf)); uint8_t *delim_end = buf + delim_len; - - if(delim_end > end) { /* String ends beyond the data we have. */ - if(ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE) { - /* Streaming the body of a message is ok. */ - } else { - /* String, bytes, and packed arrays must have all data present. */ - status = UPB_STATUS_NEED_MORE_DATA; - goto done; - } - } - if(ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE) { - UPB_CHECK(push_stack_frame(s, start, delim_end - start, user_field_desc, &submsg_end)); - } else { /* Delimited data for which we require (and have) all data. */ - if(ft == 0) { - /* Do nothing -- client has elected to skip. */ - } else if(upb_isstringtype(ft)) { - struct upb_string str = {.ptr = (char*)buf, .byte_len = delim_len}; - str_cb(udata, &str, user_field_desc); - } else { /* Packed Array. */ - while(buf < delim_end) - UPB_CHECK(value_cb(udata, buf, end, user_field_desc, &buf)); - } - buf = delim_end; + UPB_CHECK(push_stack_frame( + s, start, delim_end - start, udesc, &submsg_end)); + } else { + if(upb_isstringtype(ft)) + str_cb(udata, buf, UPB_MIN(delim_end, end) - buf, delim_end - buf, udesc); + else + ;/* Set a marker for packed arrays. */ + buf = delim_end; /* Note that this could be greater than end. */ } } else { /* Scalar (non-delimited) value. */ - if(ft == 0) /* Client elected to skip. */ - UPB_CHECK(skip_wire_value(buf, end, tag.wire_type, &buf)); - else if(ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_GROUP) - UPB_CHECK(push_stack_frame(s, start, 0, user_field_desc, &submsg_end)); - else - UPB_CHECK(value_cb(udata, buf, end, user_field_desc, &buf)); + switch(ft) { + case 0: /* Client elected to skip. */ + UPB_CHECK(skip_wire_value(buf, end, tag.wire_type, &buf)); + break; + case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_GROUP: + UPB_CHECK(push_stack_frame(s, start, 0, udesc, &submsg_end)); + break; + default: + UPB_CHECK(value_cb(udata, buf, end, udesc, &buf)); + break; + } } while(buf == submsg_end) submsg_end = pop_stack_frame(s, start); + //while(buf < s->packed_end) /* packed arrays. */ + // UPB_CHECK(value_cb(udata, buf, end, udesc, &buf)); completed = buf; } -done: *read = (char*)completed - (char*)start; s->completed_offset += *read; return status; diff --git a/src/upb_parse.h b/src/upb_parse.h index 09ac4f7..ca18937 100644 --- a/src/upb_parse.h +++ b/src/upb_parse.h @@ -72,8 +72,12 @@ typedef upb_field_type_t (*upb_tag_cb)(void *udata, typedef upb_status_t (*upb_value_cb)(void *udata, uint8_t *buf, uint8_t *end, void *user_field_desc, uint8_t **outbuf); -/* The callback that is called when a string is parsed. */ -typedef void (*upb_str_cb)(void *udata, struct upb_string *str, +/* The callback that is called when a string is parsed. Note that the data + * for the string might not all be available -- we could be streaming, and + * the current buffer might end right in the middle of the string. So we + * pass both the available length and the total length. */ +typedef void (*upb_str_cb)(void *udata, uint8_t *str, + size_t avail_len, size_t total_len, void *user_field_desc); /* Callbacks that are called when a submessage begins and ends, respectively. @@ -96,9 +100,16 @@ struct upb_parse_state { }; /* Parses up to len bytes of protobuf data out of buf, calling cb as needed. - * The function returns how many bytes were consumed from buf. Data is parsed - * until no more data can be read from buf, or the callback sets *done=true, - * or an error occured. Sets *read to the number of bytes consumed. */ + * The function returns a status indicating the success of the operation. Data + * is parsed until no more data can be read from buf, or the callback returns an + * error like UPB_STATUS_USER_CANCELLED, or an error occurs. + * + * *read is set to the number of bytes consumed. Note that this can be greater + * than len in the case that a string was recognized that spans beyond the end + * of the currently provided data. + * + * The next call to upb_parse must be the first byte after buf + *read, even in + * the case that *read > len. */ upb_status_t upb_parse(struct upb_parse_state *s, void *buf, size_t len, size_t *read); @@ -136,18 +147,12 @@ upb_status_t upb_get_v_uint64_t_full(uint8_t *buf, uint8_t *end, uint64_t *val, INLINE upb_status_t upb_get_v_uint64_t(uint8_t *buf, uint8_t *end, uint64_t *val, uint8_t **outbuf) { - /* We inline these two common cases (short varints), if that fails we - * dispatch to the full (non-inlined) version. */ + /* We inline this common case (1-byte varints), if that fails we dispatch to + * the full (non-inlined) version. */ if((*buf & 0x80) == 0) { - /* Single-byte varint -- very common case. */ *val = *buf & 0x7f; *outbuf = buf + 1; return UPB_STATUS_OK; - } else if(buf <= end && (*(buf+1) & 0x80) == 0) { - /* Two-byte varint. */ - *val = (buf[0] & 0x7f) | ((buf[1] & 0x7f) << 7); - *outbuf = buf + 2; - return UPB_STATUS_OK; } else { return upb_get_v_uint64_t_full(buf, end, val, outbuf); } @@ -174,7 +179,7 @@ INLINE upb_status_t upb_get_f_uint32_t(uint8_t *buf, uint8_t *end, *val = *(uint32_t*)buf; #else #define SHL(val, bits) ((uint32_t)val << bits) - *val = SHL(b[0], 0) | SHL(b[1], 8) | SHL(b[2], 16) | SHL(b[3], 24); + *val = SHL(buf[0], 0) | SHL(buf[1], 8) | SHL(buf[2], 16) | SHL(buf[3], 24); #undef SHL #endif *outbuf = uint32_end; @@ -191,8 +196,8 @@ INLINE upb_status_t upb_get_f_uint64_t(uint8_t *buf, uint8_t *end, *val = *(uint64_t*)buf; #else #define SHL(val, bits) ((uint64_t)val << bits) - *val = SHL(b[0], 0) | SHL(b[1], 8) | SHL(b[2], 16) | SHL(b[3], 24) | - SHL(b[4], 32) | SHL(b[5], 40) | SHL(b[6], 48) | SHL(b[7], 56) | + *val = SHL(buf[0], 0) | SHL(buf[1], 8) | SHL(buf[2], 16) | SHL(buf[3], 24) | + SHL(buf[4], 32) | SHL(buf[5], 40) | SHL(buf[6], 48) | SHL(buf[7], 56); #undef SHL #endif *outbuf = uint64_end; diff --git a/src/upb_string.h b/src/upb_string.h index 8e56daa..528c8c8 100644 --- a/src/upb_string.h +++ b/src/upb_string.h @@ -37,6 +37,8 @@ extern "C" { #define INLINE static inline #endif +#define UPB_MIN(x, y) ((x) < (y) ? (x) : (y)) + struct upb_string { /* We expect the data to be 8-bit clean (uint8_t), but char* is such an * ingrained convention that we follow it. */ @@ -44,15 +46,13 @@ struct upb_string { uint32_t byte_len; }; -INLINE uint32_t min(uint32_t a, uint32_t b) { return a < b ? a : b; } - INLINE bool upb_streql(struct upb_string *s1, struct upb_string *s2) { return s1->byte_len == s2->byte_len && memcmp(s1->ptr, s2->ptr, s1->byte_len) == 0; } INLINE int upb_strcmp(struct upb_string s1, struct upb_string s2) { - size_t common_length = min(s1.byte_len, s2.byte_len); + size_t common_length = UPB_MIN(s1.byte_len, s2.byte_len); int common_diff = memcmp(s1.ptr, s2.ptr, common_length); if(common_diff == 0) return s1.byte_len - s2.byte_len; else return common_diff; -- cgit v1.2.3