diff options
Diffstat (limited to 'upb/pb')
-rw-r--r-- | upb/pb/decoder.c | 157 | ||||
-rw-r--r-- | upb/pb/decoder.h | 38 | ||||
-rw-r--r-- | upb/pb/decoder_x64.dasc | 47 | ||||
-rw-r--r-- | upb/pb/glue.c | 13 | ||||
-rw-r--r-- | upb/pb/textprinter.c | 13 | ||||
-rw-r--r-- | upb/pb/varint.h | 2 |
6 files changed, 128 insertions, 142 deletions
diff --git a/upb/pb/decoder.c b/upb/pb/decoder.c index 5844377..ae54e47 100644 --- a/upb/pb/decoder.c +++ b/upb/pb/decoder.c @@ -45,27 +45,29 @@ static void upb_decoder_abort(upb_decoder *d, const char *msg) { /* Buffering ******************************************************************/ -// We operate on one buffer at a time, which may be a subset of the bytesrc -// region we have ref'd. When data for the buffer is completely gone we pull -// the next one. When we've committed our progress we release our ref on any -// previous buffers' regions. - -static size_t upb_decoder_bufleft(upb_decoder *d) { return d->end - d->ptr; } -static void upb_decoder_advance(upb_decoder *d, size_t len) { - assert((size_t)(d->end - d->ptr) >= len); +// We operate on one buffer at a time, which may be a subset of the currently +// loaded byteregion data. When data for the buffer is completely gone we pull +// the next one. When we've committed our progress we discard any previous +// buffers' regions. + +static uint32_t upb_decoder_bufleft(upb_decoder *d) { return d->end - d->ptr; } +static void upb_decoder_advance(upb_decoder *d, uint32_t len) { + assert(upb_decoder_bufleft(d) >= len); d->ptr += len; } -size_t upb_decoder_offset(upb_decoder *d) { - size_t offset = d->bufstart_ofs; - if (d->ptr) offset += (d->ptr - d->buf); - return offset; +uint64_t upb_decoder_offset(upb_decoder *d) { + return d->bufstart_ofs + (d->ptr - d->buf); +} + +uint64_t upb_decoder_bufendofs(upb_decoder *d) { + return d->bufstart_ofs + (d->end - d->buf); } static void upb_decoder_setmsgend(upb_decoder *d) { upb_dispatcher_frame *f = d->dispatcher.top; - size_t delimlen = f->end_ofs - d->bufstart_ofs; - size_t buflen = d->end - d->buf; + uint32_t delimlen = f->end_ofs - d->bufstart_ofs; + uint32_t buflen = d->end - d->buf; d->delim_end = (f->end_ofs != UPB_NONDELIMITED && delimlen <= buflen) ? d->buf + delimlen : NULL; // NULL if not in this buf. d->top_is_packed = f->is_packed; @@ -73,24 +75,25 @@ static void upb_decoder_setmsgend(upb_decoder *d) { static bool upb_trypullbuf(upb_decoder *d) { assert(upb_decoder_bufleft(d) == 0); - if (d->bufend_ofs == d->refend_ofs) { - size_t read = upb_bytesrc_fetch(d->bytesrc, d->refend_ofs, d->status); - if (read <= 0) { - d->ptr = NULL; - d->end = NULL; - if (read == 0) return false; // EOF - upb_decoder_exit(d); // Non-EOF error. - } - d->refend_ofs += read; + d->bufstart_ofs = upb_decoder_offset(d); + d->buf = NULL; + d->ptr = NULL; + d->end = NULL; + if (upb_byteregion_available(d->input, upb_decoder_offset(d)) == 0 && + !upb_byteregion_fetch(d->input, d->status)) { + if (upb_eof(d->status)) return false; + upb_decoder_exit(d); // Non-EOF error. } - d->bufstart_ofs = d->bufend_ofs; - size_t len; - d->buf = upb_bytesrc_getptr(d->bytesrc, d->bufstart_ofs, &len); + uint32_t len; + d->buf = upb_byteregion_getptr(d->input, d->bufstart_ofs, &len); assert(len > 0); - d->bufend_ofs = d->bufstart_ofs + len; d->ptr = d->buf; d->end = d->buf + len; #ifdef UPB_USE_JIT_X64 + // If we start parsing a value, we can parse up to 20 bytes without + // having to bounds-check anything (2 10-byte varints). Since the + // JIT bounds-checks only *between* values (and for strings), the + // JIT bails if there are not 20 bytes available. d->jit_end = d->end - 20; #endif upb_decoder_setmsgend(d); @@ -101,16 +104,21 @@ static void upb_pullbuf(upb_decoder *d) { if (!upb_trypullbuf(d)) upb_decoder_abort(d, "Unexpected EOF"); } -void upb_decoder_commit(upb_decoder *d) { - d->completed_ptr = d->ptr; - if (d->refstart_ofs < d->bufstart_ofs) { - // Drop our ref on the previous buf's region. - upb_bytesrc_refregion(d->bytesrc, d->bufstart_ofs, d->refend_ofs); - upb_bytesrc_unrefregion(d->bytesrc, d->refstart_ofs, d->refend_ofs); - d->refstart_ofs = d->bufstart_ofs; +void upb_decoder_skipto(upb_decoder *d, uint64_t ofs) { + if (ofs < upb_decoder_bufendofs(d)) { + upb_decoder_advance(d, ofs - upb_decoder_offset(d)); + } else { + d->buf = NULL; + d->ptr = NULL; + d->end = NULL; + d->bufstart_ofs = ofs; } } +void upb_decoder_checkpoint(upb_decoder *d) { + upb_byteregion_discard(d->input, upb_decoder_offset(d)); +} + /* Decoding of wire types *****************************************************/ @@ -151,11 +159,12 @@ done: return ret; } +// Returns true on success or false if we've hit a valid EOF. FORCEINLINE bool upb_trydecode_varint32(upb_decoder *d, uint32_t *val) { - if (upb_decoder_bufleft(d) == 0 && upb_dispatcher_islegalend(&d->dispatcher)) { - // Check for our two successful end-of-message conditions - // (user-specified EOM and bytesrc EOF). - if (d->bufend_ofs == d->end_ofs || !upb_trypullbuf(d)) return false; + if (upb_decoder_bufleft(d) == 0 && + upb_dispatcher_islegalend(&d->dispatcher) && + !upb_trypullbuf(d)) { + return false; } *val = upb_decode_varint32(d); return true; @@ -212,26 +221,15 @@ FORCEINLINE uint64_t upb_decode_fixed64(upb_decoder *d) { return u64; // TODO: proper byte swapping } -INLINE upb_strref *upb_decode_string(upb_decoder *d) { +INLINE upb_byteregion *upb_decode_string(upb_decoder *d) { uint32_t strlen = upb_decode_varint32(d); - d->strref.stream_offset = upb_decoder_offset(d); - d->strref.len = strlen; - if (upb_decoder_bufleft(d) == 0) upb_pullbuf(d); - if (upb_decoder_bufleft(d) >= strlen) { - // Fast case. - d->strref.ptr = d->ptr; - upb_decoder_advance(d, strlen); - } else { - // Slow case. - while (1) { - size_t consume = UPB_MIN(upb_decoder_bufleft(d), strlen); - upb_decoder_advance(d, consume); - strlen -= consume; - if (strlen == 0) break; - upb_pullbuf(d); - } - } - return &d->strref; + uint64_t offset = upb_decoder_offset(d); + upb_byteregion_reset(&d->str_byteregion, d->input, offset, strlen); + // Could make it an option on the callback whether we fetchall() first or not. + upb_byteregion_fetchall(&d->str_byteregion, d->status); + if (!upb_ok(d->status)) upb_decoder_exit(d); + upb_decoder_skipto(d, offset + strlen); + return &d->str_byteregion; } INLINE void upb_push(upb_decoder *d, upb_fhandlers *f, uint64_t end) { @@ -272,7 +270,7 @@ T(DOUBLE, fixed64, double, upb_asdouble) T(FLOAT, fixed32, float, upb_asfloat) T(SINT32, varint, int32, upb_zzdec_32) T(SINT64, varint, int64, upb_zzdec_64) -T(STRING, string, strref, upb_strref*) +T(STRING, string, byteregion, upb_byteregion*) static void upb_decode_GROUP(upb_decoder *d, upb_fhandlers *f) { upb_push(d, f, UPB_NONDELIMITED); @@ -352,10 +350,10 @@ INLINE upb_fhandlers *upb_decode_tag(upb_decoder *d) { case UPB_WIRE_TYPE_DELIMITED: upb_decoder_advance(d, upb_decode_varint32(d)); break; default: - upb_decoder_abort(d, "Invavlid wire type"); + upb_decoder_abort(d, "Invalid wire type"); } // TODO: deliver to unknown field callback. - upb_decoder_commit(d); + upb_decoder_checkpoint(d); upb_decoder_checkdelim(d); } } @@ -380,24 +378,18 @@ void upb_decoder_decode(upb_decoder *d, upb_status *status) { return; } f->decode(d, f); - upb_decoder_commit(d); + upb_decoder_checkpoint(d); } } -static void upb_decoder_skip(void *_d, upb_dispatcher_frame *top, - upb_dispatcher_frame *bottom) { - (void)top; - (void)bottom; - (void)_d; -#if 0 +static void upb_decoder_skip(void *_d, upb_dispatcher_frame *f) { upb_decoder *d = _d; - // TODO - if (bottom->end_offset == UPB_NONDELIMITED) { - // TODO: support skipping groups. - abort(); + if (f->end_ofs != UPB_NONDELIMITED) { + upb_decoder_skipto(d, d->dispatcher.top->end_ofs); + } else { + // TODO: how to support skipping groups? Dispatcher could drop callbacks, + // or it could be special-cased inside the decoder. } - d->ptr = d->buf.ptr + bottom->end_offset; -#endif } void upb_decoder_init(upb_decoder *d, upb_handlers *handlers) { @@ -423,24 +415,19 @@ void upb_decoder_init(upb_decoder *d, upb_handlers *handlers) { } } -void upb_decoder_reset(upb_decoder *d, upb_bytesrc *bytesrc, uint64_t start_ofs, - uint64_t end_ofs, void *closure) { +void upb_decoder_reset(upb_decoder *d, upb_byteregion *input, void *closure) { upb_dispatcher_frame *f = upb_dispatcher_reset(&d->dispatcher, closure); - f->end_ofs = end_ofs; - d->end_ofs = end_ofs; - d->refstart_ofs = start_ofs; - d->refend_ofs = start_ofs; - d->bufstart_ofs = start_ofs; - d->bufend_ofs = start_ofs; - d->bytesrc = bytesrc; + f->end_ofs = UPB_NONDELIMITED; + d->input = input; + d->bufstart_ofs = upb_byteregion_startofs(input); d->buf = NULL; d->ptr = NULL; - d->end = NULL; // Force a buffer pull. + d->end = NULL; // Force a buffer pull. + d->delim_end = NULL; // But don't let end-of-message get triggered. + d->str_byteregion.bytesrc = input->bytesrc; #ifdef UPB_USE_JIT_X64 d->jit_end = NULL; #endif - d->delim_end = NULL; // But don't let end-of-message get triggered. - d->strref.bytesrc = bytesrc; } void upb_decoder_uninit(upb_decoder *d) { diff --git a/upb/pb/decoder.h b/upb/pb/decoder.h index 2232c52..c35bec4 100644 --- a/upb/pb/decoder.h +++ b/upb/pb/decoder.h @@ -5,7 +5,7 @@ * Author: Josh Haberman <jhaberman@gmail.com> * * upb_decoder implements a high performance, streaming decoder for protobuf - * data that works by getting its input data from a upb_bytesrc and calling + * data that works by getting its input data from a upb_byteregion and calling * into a upb_handlers. */ @@ -26,24 +26,14 @@ extern "C" { struct dasm_State; typedef struct _upb_decoder { - upb_bytesrc *bytesrc; // Source of our serialized data. - upb_dispatcher dispatcher; // Dispatcher to which we push parsed data. - upb_status *status; // Where we will store any errors that occur. - upb_strref strref; // For passing string data to callbacks. - - // Offsets for the bytesrc region we currently have ref'd. - uint64_t refstart_ofs, refend_ofs; + upb_byteregion *input; // Input data (serialized). + upb_dispatcher dispatcher; // Dispatcher to which we push parsed data. + upb_status *status; // Where we will store any errors that occur. + upb_byteregion str_byteregion; // For passing string data to callbacks. // Current input buffer and its stream offset. const char *buf, *ptr, *end; - uint64_t bufstart_ofs, bufend_ofs; - - // Stream offset for the end of the top-level message, if any. - uint64_t end_ofs; - - // Buf offset as of which we've delivered calbacks; needed for rollback if - // a callback returns UPB_BREAK. - const char *completed_ptr; + uint64_t bufstart_ofs; // End of the delimited region, relative to ptr, or NULL if not in this buf. const char *delim_end; @@ -65,10 +55,6 @@ typedef struct _upb_decoder { sigjmp_buf exitjmp; } upb_decoder; -// Used for frames that have no specific end offset: groups, repeated primitive -// fields inside groups, and the top-level message. -#define UPB_NONDELIMITED UINT64_MAX - // Initializes/uninitializes a decoder for calling into the given handlers // or to write into the given msgdef, given its accessors). Takes a ref // on the handlers. @@ -77,13 +63,13 @@ void upb_decoder_uninit(upb_decoder *d); // Resets the internal state of an already-allocated decoder. This puts it in a // state where it has not seen any data, and expects the next data to be from -// the beginning of a new protobuf. Parsers must be reset before they can be -// used. A decoder can be reset multiple times. -// -// Pass UINT64_MAX for end_ofs to indicate a non-delimited top-level message. -void upb_decoder_reset(upb_decoder *d, upb_bytesrc *src, uint64_t start_ofs, - uint64_t end_ofs, void *closure); +// the beginning of a new protobuf. Decoders must be reset before they can be +// used. A decoder can be reset multiple times. "input" must live until the +// decoder is reset again (or destroyed). +void upb_decoder_reset(upb_decoder *d, upb_byteregion *input, void *closure); +// Decodes serialized data (calling handlers as the data is parsed) until error +// or EOF (see *status for details). void upb_decoder_decode(upb_decoder *d, upb_status *status); #ifdef __cplusplus diff --git a/upb/pb/decoder_x64.dasc b/upb/pb/decoder_x64.dasc index 72c4aa1..75e5b6b 100644 --- a/upb/pb/decoder_x64.dasc +++ b/upb/pb/decoder_x64.dasc @@ -129,7 +129,7 @@ void upb_reg_jit_gdb(upb_decoder *d) { |.define PTR, rbx |.define CLOSURE, r12 |.type FRAME, upb_dispatcher_frame, r13 -|.type STRREF, upb_strref, r14 +|.type BYTEREGION,upb_byteregion, r14 |.type DECODER, upb_decoder, r15 |.type STDARRAY, upb_stdarray | @@ -365,23 +365,26 @@ static void upb_decoder_jit_decodefield(upb_decoder *d, upb_mhandlers *m, // robust checks. | mov ecx, dword [PTR + tag_size] | decode_loaded_varint tag_size + | mov rdi, DECODER->effective_end + | sub rdi, rax + | cmp ARG3_64, rdi // if (len > d->effective_end - str) + | ja ->exit_jit // Can't deliver, whole string not in buf. + + // Update PTR to point past end of string. | mov rdi, rax | add rdi, ARG3_64 - | mov STRREF->len, ARG3_32 - | mov STRREF->ptr, rax - | sub rax, DECODER->buf - | add eax, DECODER->bufstart_ofs // = d->ptr - d->buf + d->bufstart_ofs - | mov STRREF->stream_offset, eax - | mov ARG3_64, STRREF - | cmp rdi, DECODER->effective_end - | ja ->exit_jit // Can't deliver, whole string not in buf. | mov PTR, rdi - break; - case UPB_TYPE_ENDGROUP: // A pseudo-type. - | add PTR, tag_size - | jmp =>m->jit_endofmsg_pclabel - return; + // Populate BYTEREGION appropriately. + | sub rax, DECODER->buf + | add rax, DECODER->bufstart_ofs // = d->ptr - d->buf + d->bufstart_ofs + | mov BYTEREGION->start, rax + | mov BYTEREGION->discard, rax + | add rax, ARG3_64 + | mov BYTEREGION->end, rax + | mov BYTEREGION->fetch, rax // Fast path ensures whole string is loaded + | mov ARG3_64, BYTEREGION + break; // Will dispatch callbacks and call submessage in a second. case UPB_TYPE(MESSAGE): @@ -471,7 +474,6 @@ static void upb_decoder_jit_callcb(upb_decoder *d, upb_fhandlers *f) { | callp f->endsubmsg } | popframe upb_fhandlers_getmsg(f) - } else { | mov ARG1_64, CLOSURE // Test for callbacks we can specialize. @@ -522,8 +524,8 @@ static void upb_decoder_jit_callcb(upb_decoder *d, upb_fhandlers *f) { } // PTR should point to the beginning of the tag. -static void upb_decoder_jit_field(upb_decoder *d, uint32_t tag, uint32_t next_tag, - upb_mhandlers *m, +static void upb_decoder_jit_field(upb_decoder *d, uint32_t tag, + uint32_t next_tag, upb_mhandlers *m, upb_fhandlers *f, upb_fhandlers *next_f) { // PC-label for the dispatch table. // We check the wire type (which must be loaded in edx) because the @@ -546,7 +548,14 @@ static void upb_decoder_jit_field(upb_decoder *d, uint32_t tag, uint32_t next_ta |1: // Label for repeating this field. - upb_decoder_jit_decodefield(d, m, f->type, upb_value_size(tag)); + int tag_size = upb_value_size(tag); + if (f->type == UPB_TYPE_ENDGROUP) { + | add PTR, tag_size + | jmp =>m->jit_endofmsg_pclabel + return; + } + + upb_decoder_jit_decodefield(d, m, f->type, tag_size); upb_decoder_jit_callcb(d, f); // Epilogue: load next tag, check for repeated field. @@ -673,7 +682,7 @@ static void upb_decoder_jit(upb_decoder *d) { | sub rsp, 8 | mov DECODER, ARG1_64 | mov FRAME, DECODER:ARG1_64->dispatcher.top - | lea STRREF, DECODER:ARG1_64->strref + | lea BYTEREGION, DECODER:ARG1_64->str_byteregion | mov CLOSURE, FRAME->closure | mov PTR, DECODER->ptr diff --git a/upb/pb/glue.c b/upb/pb/glue.c index 37b86d9..3176355 100644 --- a/upb/pb/glue.c +++ b/upb/pb/glue.c @@ -23,7 +23,7 @@ void upb_strtomsg(const char *str, size_t len, void *msg, const upb_msgdef *md, upb_accessors_reghandlers(h, md); upb_decoder_init(&d, h); upb_handlers_unref(h); - upb_decoder_reset(&d, upb_stringsrc_bytesrc(&strsrc), 0, UINT64_MAX, msg); + upb_decoder_reset(&d, upb_stringsrc_allbytes(&strsrc), msg); upb_decoder_decode(&d, status); upb_stringsrc_uninit(&strsrc); @@ -84,16 +84,19 @@ upb_def **upb_load_defs_from_descriptor(const char *str, size_t len, int *n, upb_handlers_unref(h); upb_descreader r; upb_descreader_init(&r); - upb_decoder_reset(&d, upb_stringsrc_bytesrc(&strsrc), 0, UINT64_MAX, &r); + upb_decoder_reset(&d, upb_stringsrc_allbytes(&strsrc), &r); upb_decoder_decode(&d, status); + upb_stringsrc_uninit(&strsrc); + upb_decoder_uninit(&d); + if (!upb_ok(status)) { + upb_descreader_uninit(&r); + return NULL; + } upb_def **defs = upb_descreader_getdefs(&r, n); upb_def **defscopy = malloc(sizeof(upb_def*) * (*n)); memcpy(defscopy, defs, sizeof(upb_def*) * (*n)); - upb_descreader_uninit(&r); - upb_stringsrc_uninit(&strsrc); - upb_decoder_uninit(&d); // Set default accessors and layouts on all messages. for(int i = 0; i < *n; i++) { diff --git a/upb/pb/textprinter.c b/upb/pb/textprinter.c index 4056b8f..3f68f90 100644 --- a/upb/pb/textprinter.c +++ b/upb/pb/textprinter.c @@ -35,15 +35,16 @@ err: return -1; } -static int upb_textprinter_putescaped(upb_textprinter *p, const upb_strref *strref, +static int upb_textprinter_putescaped(upb_textprinter *p, + const upb_byteregion *bytes, bool preserve_utf8) { // Based on CEscapeInternal() from Google's protobuf release. // TODO; we could read directly from a bytesrc's buffer instead. - // TODO; we could write strrefs to the sink when possible. + // TODO; we could write byteregions to the sink when possible. char dstbuf[4096], *dst = dstbuf, *dstend = dstbuf + sizeof(dstbuf); - char *buf = malloc(strref->len), *src = buf; - char *end = src + strref->len; - upb_bytesrc_read(strref->bytesrc, strref->stream_offset, strref->len, buf); + char *buf = malloc(upb_byteregion_len(bytes)), *src = buf; + char *end = src + upb_byteregion_len(bytes); + upb_byteregion_copyall(bytes, buf); // I think hex is prettier and more useful, but proto2 uses octal; should // investigate whether it can parse hex also. @@ -142,7 +143,7 @@ static upb_flow_t upb_textprinter_putstr(void *_p, upb_value fval, uint64_t start_ofs = upb_bytesink_getoffset(p->sink); const upb_fielddef *f = upb_value_getfielddef(fval); CHECK(upb_bytesink_putc(p->sink, '"')); - CHECK(upb_textprinter_putescaped(p, upb_value_getstrref(val), + CHECK(upb_textprinter_putescaped(p, upb_value_getbyteregion(val), f->type == UPB_TYPE(STRING))); CHECK(upb_bytesink_putc(p->sink, '"')); return UPB_CONTINUE; diff --git a/upb/pb/varint.h b/upb/pb/varint.h index 1bbd193..19977e9 100644 --- a/upb/pb/varint.h +++ b/upb/pb/varint.h @@ -113,7 +113,7 @@ INLINE upb_decoderet upb_vdecode_max8_fast(upb_decoderet r) { /* Encoding *******************************************************************/ -INLINE size_t upb_value_size(uint64_t val) { +INLINE int upb_value_size(uint64_t val) { #ifdef __GNUC__ int high_bit = 63 - __builtin_clzll(val); // 0-based, undef if val == 0. #else |