diff options
Diffstat (limited to 'upb/pb')
-rw-r--r-- | upb/pb/compile_decoder.c | 9 | ||||
-rw-r--r-- | upb/pb/compile_decoder_x64.dasc | 3 | ||||
-rw-r--r-- | upb/pb/decoder.c | 146 | ||||
-rw-r--r-- | upb/pb/decoder.h | 22 | ||||
-rw-r--r-- | upb/pb/decoder.int.h | 8 | ||||
-rw-r--r-- | upb/pb/textprinter.c | 18 |
6 files changed, 142 insertions, 64 deletions
diff --git a/upb/pb/compile_decoder.c b/upb/pb/compile_decoder.c index 400d6fa..59bd03b 100644 --- a/upb/pb/compile_decoder.c +++ b/upb/pb/compile_decoder.c @@ -4,7 +4,14 @@ * Copyright (c) 2013 Google Inc. See LICENSE for details. * Author: Josh Haberman <jhaberman@gmail.com> * - * Code to compile a upb::MessageDef into bytecode for decoding that message. + * Code to compile a upb::Handlers into bytecode for decoding a protobuf + * according to that specific schema and destination handlers. + * + * Compiling to bytecode is always the first step. If we are using the + * interpreted decoder we leave it as bytecode and interpret that. If we are + * using a JIT decoder we use a code generator to turn the bytecode into native + * code, LLVM IR, etc. + * * Bytecode definition is in decoder.int.h. */ diff --git a/upb/pb/compile_decoder_x64.dasc b/upb/pb/compile_decoder_x64.dasc index 97fb5ce..9eec6a5 100644 --- a/upb/pb/compile_decoder_x64.dasc +++ b/upb/pb/compile_decoder_x64.dasc @@ -5,8 +5,7 @@ |// Author: Josh Haberman <jhaberman@gmail.com> |// |// JIT compiler for upb_pbdecoder on x86-64. Generates machine code from the -|// bytecode generated in compile_decoder.c, but unlike the interpreter we bind -|// to a specific set of handlers for greater efficiency. +|// bytecode generated in compile_decoder.c. | |.arch x64 |.actionlist upb_jit_actionlist diff --git a/upb/pb/decoder.c b/upb/pb/decoder.c index 9c54b8a..1098e63 100644 --- a/upb/pb/decoder.c +++ b/upb/pb/decoder.c @@ -3,6 +3,19 @@ * * Copyright (c) 2008-2013 Google Inc. See LICENSE for details. * Author: Josh Haberman <jhaberman@gmail.com> + * + * This file implements a VM for the interpreted (bytecode) decoder. + * + * Bytecode must previously have been generated using the bytecode compiler in + * compile_decoder.c. This decoder then walks through the bytecode op-by-op to + * parse the input. 
+ * + * Decoding is fully resumable; we just keep a pointer to the current bytecode + * instruction and resume from there. A fair amount of the logic here is to + * handle the fact that values can span buffer seams and we have to be + * capable of suspending/resuming from any byte in the stream. This + * sometimes requires keeping a few trailing bytes from the last buffer around + * in the "residual" buffer. */ #include <inttypes.h> @@ -55,7 +68,7 @@ static bool consumes_input(opcode op) { } } -static bool in_residual_buf(upb_pbdecoder *d, const char *p); +static bool in_residual_buf(const upb_pbdecoder *d, const char *p); // It's unfortunate that we have to micro-manage the compiler this way, // especially since this tuning is necessarily specific to one hardware @@ -83,18 +96,14 @@ void upb_pbdecoder_seterr(upb_pbdecoder *d, const char *msg) { // How many bytes can be safely read from d->ptr without reading past end-of-buf // or past the current delimited end. -static size_t curbufleft(upb_pbdecoder *d) { +static size_t curbufleft(const upb_pbdecoder *d) { assert(d->data_end >= d->ptr); return d->data_end - d->ptr; } -static const char *ptr(upb_pbdecoder *d) { - return d->ptr; -} - -// Overall offset of d->ptr. -uint64_t offset(upb_pbdecoder *d) { - return d->bufstart_ofs + (ptr(d) - d->buf); +// Overall stream offset of d->ptr. +uint64_t offset(const upb_pbdecoder *d) { + return d->bufstart_ofs + (d->ptr - d->buf); } // Advances d->ptr. @@ -107,12 +116,12 @@ static bool in_buf(const char *p, const char *buf, const char *end) { return p >= buf && p <= end; } -static bool in_residual_buf(upb_pbdecoder *d, const char *p) { +static bool in_residual_buf(const upb_pbdecoder *d, const char *p) { return in_buf(p, d->residual, d->residual_end); } -// Calculates the delim_end value, which represents a combination of the -// current buffer and the stack, so must be called whenever either is updated. 
+// Calculates the delim_end value, which is affected by both the current buffer +// and the parsing stack, so must be called whenever either is updated. static void set_delim_end(upb_pbdecoder *d) { size_t delim_ofs = d->top->end_ofs - d->bufstart_ofs; if (delim_ofs <= (d->end - d->buf)) { @@ -141,8 +150,8 @@ static void checkpoint(upb_pbdecoder *d) { // The assertion here is in the interests of efficiency, not correctness. // We are trying to ensure that we don't checkpoint() more often than // necessary. - assert(d->checkpoint != ptr(d)); - d->checkpoint = ptr(d); + assert(d->checkpoint != d->ptr); + d->checkpoint = d->ptr; } // Resumes the decoder from an initial state or from a previous suspend. @@ -154,14 +163,14 @@ int32_t upb_pbdecoder_resume(upb_pbdecoder *d, void *p, const char *buf, d->handle = handle; if (d->residual_end > d->residual) { // We have residual bytes from the last buffer. - assert(ptr(d) == d->residual); + assert(d->ptr == d->residual); } else { switchtobuf(d, buf, buf + size); } - d->checkpoint = ptr(d); + d->checkpoint = d->ptr; if (d->top->groupnum < 0) { CHECK_RETURN(upb_pbdecoder_skipunknown(d, -1, 0)); - d->checkpoint = ptr(d); + d->checkpoint = d->ptr; } return DECODE_OK; } @@ -198,7 +207,7 @@ static size_t suspend_save(upb_pbdecoder *d) { // Checkpoint was in residual buf; append user byte(s) to residual buf. 
assert((d->residual_end - d->residual) + d->size_param <= sizeof(d->residual)); - if (!in_residual_buf(d, ptr(d))) { + if (!in_residual_buf(d, d->ptr)) { d->bufstart_ofs -= (d->residual_end - d->residual); } memcpy(d->residual_end, d->buf_param, d->size_param); @@ -209,7 +218,7 @@ static size_t suspend_save(upb_pbdecoder *d) { d->ptr = d->checkpoint; size_t save = curbufleft(d); assert(save <= sizeof(d->residual)); - memcpy(d->residual, ptr(d), save); + memcpy(d->residual, d->ptr, save); d->residual_end = d->residual + save; d->bufstart_ofs = offset(d); } @@ -218,8 +227,11 @@ static size_t suspend_save(upb_pbdecoder *d) { return d->size_param; } +// Skips "bytes" bytes in the stream, which may be more than available. If we +// skip more bytes than are available, we return a long read count to the caller +// indicating how many bytes the caller should skip before passing a new buffer. static int32_t skip(upb_pbdecoder *d, size_t bytes) { - assert(!in_residual_buf(d, ptr(d)) || d->size_param == 0); + assert(!in_residual_buf(d, d->ptr) || d->size_param == 0); if (curbufleft(d) >= bytes) { // Skipped data is all in current buffer. advance(d, bytes); @@ -235,19 +247,24 @@ static int32_t skip(upb_pbdecoder *d, size_t bytes) { } } +// Copies the next "bytes" bytes into "buf" and advances the stream. +// Requires that this many bytes are available in the current buffer. FORCEINLINE void consumebytes(upb_pbdecoder *d, void *buf, size_t bytes) { assert(bytes <= curbufleft(d)); - memcpy(buf, ptr(d), bytes); + memcpy(buf, d->ptr, bytes); advance(d, bytes); } +// Slow path for getting the next "bytes" bytes, regardless of whether they are +// available in the current buffer or not. Returns a status code as described +// in decoder.int.h. 
static NOINLINE int32_t getbytes_slow(upb_pbdecoder *d, void *buf, size_t bytes) { const size_t avail = curbufleft(d); consumebytes(d, buf, avail); bytes -= avail; assert(bytes > 0); - if (in_residual_buf(d, ptr(d))) { + if (in_residual_buf(d, d->ptr)) { advancetobuf(d, d->buf_param, d->size_param); } if (curbufleft(d) >= bytes) { @@ -261,6 +278,8 @@ static NOINLINE int32_t getbytes_slow(upb_pbdecoder *d, void *buf, } } +// Gets the next "bytes" bytes, regardless of whether they are available in the +// current buffer or not. Returns a status code as described in decoder.int.h. FORCEINLINE int32_t getbytes(upb_pbdecoder *d, void *buf, size_t bytes) { if (curbufleft(d) >= bytes) { // Buffer has enough data to satisfy. @@ -274,8 +293,8 @@ FORCEINLINE int32_t getbytes(upb_pbdecoder *d, void *buf, size_t bytes) { static NOINLINE size_t peekbytes_slow(upb_pbdecoder *d, void *buf, size_t bytes) { size_t ret = curbufleft(d); - memcpy(buf, ptr(d), ret); - if (in_residual_buf(d, ptr(d))) { + memcpy(buf, d->ptr, ret); + if (in_residual_buf(d, d->ptr)) { size_t copy = UPB_MIN(bytes - ret, d->size_param); memcpy(buf + ret, d->buf_param, copy); ret += copy; @@ -285,7 +304,7 @@ static NOINLINE size_t peekbytes_slow(upb_pbdecoder *d, void *buf, FORCEINLINE size_t peekbytes(upb_pbdecoder *d, void *buf, size_t bytes) { if (curbufleft(d) >= bytes) { - memcpy(buf, ptr(d), bytes); + memcpy(buf, d->ptr, bytes); return bytes; } else { return peekbytes_slow(d, buf, bytes); @@ -295,6 +314,8 @@ FORCEINLINE size_t peekbytes(upb_pbdecoder *d, void *buf, size_t bytes) { /* Decoding of wire types *****************************************************/ +// Slow path for decoding a varint from the current buffer position. +// Returns a status code as described in decoder.int.h. 
NOINLINE int32_t upb_pbdecoder_decode_varint_slow(upb_pbdecoder *d, uint64_t *u64) { *u64 = 0; @@ -312,19 +333,21 @@ NOINLINE int32_t upb_pbdecoder_decode_varint_slow(upb_pbdecoder *d, return DECODE_OK; } +// Decodes a varint from the current buffer position. +// Returns a status code as described in decoder.int.h. FORCEINLINE int32_t decode_varint(upb_pbdecoder *d, uint64_t *u64) { - if (curbufleft(d) > 0 && !(*ptr(d) & 0x80)) { - *u64 = *ptr(d); + if (curbufleft(d) > 0 && !(*d->ptr & 0x80)) { + *u64 = *d->ptr; advance(d, 1); return DECODE_OK; } else if (curbufleft(d) >= 10) { // Fast case. - upb_decoderet r = upb_vdecode_fast(ptr(d)); + upb_decoderet r = upb_vdecode_fast(d->ptr); if (r.p == NULL) { seterr(d, kUnterminatedVarint); return upb_pbdecoder_suspend(d); } - advance(d, r.p - ptr(d)); + advance(d, r.p - d->ptr); *u64 = r.val; return DECODE_OK; } else { @@ -333,6 +356,8 @@ FORCEINLINE int32_t decode_varint(upb_pbdecoder *d, uint64_t *u64) { } } +// Decodes a 32-bit varint from the current buffer position. +// Returns a status code as described in decoder.int.h. FORCEINLINE int32_t decode_v32(upb_pbdecoder *d, uint32_t *u32) { uint64_t u64; int32_t ret = decode_varint(d, &u64); @@ -349,16 +374,22 @@ FORCEINLINE int32_t decode_v32(upb_pbdecoder *d, uint32_t *u32) { return DECODE_OK; } +// Decodes a fixed32 from the current buffer position. +// Returns a status code as described in decoder.int.h. // TODO: proper byte swapping for big-endian machines. FORCEINLINE int32_t decode_fixed32(upb_pbdecoder *d, uint32_t *u32) { return getbytes(d, u32, 4); } +// Decodes a fixed64 from the current buffer position. +// Returns a status code as described in decoder.int.h. // TODO: proper byte swapping for big-endian machines. FORCEINLINE int32_t decode_fixed64(upb_pbdecoder *d, uint64_t *u64) { return getbytes(d, u64, 8); } +// Non-static versions of the above functions. +// These are called by the JIT for fallback paths. 
int32_t upb_pbdecoder_decode_f32(upb_pbdecoder *d, uint32_t *u32) { return decode_fixed32(d, u32); } @@ -370,6 +401,7 @@ int32_t upb_pbdecoder_decode_f64(upb_pbdecoder *d, uint64_t *u64) { static double as_double(uint64_t n) { double d; memcpy(&d, &n, 8); return d; } static float as_float(uint32_t n) { float f; memcpy(&f, &n, 4); return f; } +// Pushes a frame onto the decoder stack. static bool push(upb_pbdecoder *d, uint64_t end) { upb_pbdecoder_frame *fr = d->top; @@ -400,6 +432,7 @@ static bool pushtagdelim(upb_pbdecoder *d, uint32_t arg) { return true; } +// Pops a frame from the decoder stack. static void pop(upb_pbdecoder *d) { d->top--; } NOINLINE int32_t upb_pbdecoder_checktag_slow(upb_pbdecoder *d, @@ -477,7 +510,7 @@ have_tag: return DECODE_OK; } - if (ptr(d) == d->delim_end) { + if (d->ptr == d->delim_end) { seterr(d, "Enclosing submessage ended in the middle of value or group"); // Unlike most errors we notice during parsing, right now we have consumed // all of the user's input. @@ -516,6 +549,12 @@ static void goto_endmsg(upb_pbdecoder *d) { d->pc = d->top->base + upb_value_getuint64(v); } +// Parses a tag and jumps to the corresponding bytecode instruction for this +// field. +// +// If the tag is unknown (or the wire type doesn't match), parses the field as +// unknown. If the tag is a valid ENDGROUP tag, jumps to the bytecode +// instruction for the end of message. static int32_t dispatch(upb_pbdecoder *d) { upb_inttable *dispatch = d->top->dispatch; @@ -564,6 +603,8 @@ upb_pbdecoder_frame *outer_frame(upb_pbdecoder *d) { /* The main decoding loop *****************************************************/ +// The main decoder VM function. Uses a traditional bytecode dispatch loop with a +// switch() statement. 
size_t upb_pbdecoder_decode(void *closure, const void *hd, const char *buf, size_t size, const upb_bufhandle *handle) { upb_pbdecoder *d = closure; @@ -591,15 +632,15 @@ size_t upb_pbdecoder_decode(void *closure, const void *hd, const char *buf, opcode op = getop(instruction); uint32_t arg = instruction >> 8; int32_t longofs = arg; - assert(ptr(d) != d->residual_end); + assert(d->ptr != d->residual_end); #ifdef UPB_DUMP_BYTECODE fprintf(stderr, "s_ofs=%d buf_ofs=%d data_rem=%d buf_rem=%d delim_rem=%d " "%x %s (%d)\n", (int)offset(d), - (int)(ptr(d) - d->buf), - (int)(d->data_end - ptr(d)), - (int)(d->end - ptr(d)), - (int)((d->top->end_ofs - d->bufstart_ofs) - (ptr(d) - d->buf)), + (int)(d->ptr - d->buf), + (int)(d->data_end - d->ptr), + (int)(d->end - d->ptr), + (int)((d->top->end_ofs - d->bufstart_ofs) - (d->ptr - d->buf)), (int)(d->pc - 1 - group->bytecode), upb_pbdecoder_getopname(op), arg); @@ -657,25 +698,24 @@ size_t upb_pbdecoder_decode(void *closure, const void *hd, const char *buf, ) VMCASE(OP_STRING, uint32_t len = curbufleft(d); - size_t n = upb_sink_putstring(&d->top->sink, arg, ptr(d), len, handle); + size_t n = upb_sink_putstring(&d->top->sink, arg, d->ptr, len, handle); if (n > len) { if (n > d->top->end_ofs - offset(d)) { seterr(d, "Tried to skip past end of string."); return upb_pbdecoder_suspend(d); } else { - return skip(d, n); + int32_t ret = skip(d, n); + // This shouldn't return DECODE_OK, because n > len. + assert(ret >= 0); + return ret; } - } else if (n < len) { - advance(d, n); + } + advance(d, n); + if (n < len || d->delim_end == NULL) { + // We aren't finished with this string yet. + d->pc--; // Repeat OP_STRING. + if (n > 0) checkpoint(d); return upb_pbdecoder_suspend(d); - } else { - advance(d, n); - if (d->delim_end == NULL) { // String extends beyond this buf? - d->pc--; // Do OP_STRING again when we resume. 
- d->bufstart_ofs += size; - d->residual_end = d->residual; - return size; - } } ) VMCASE(OP_ENDSTR, @@ -703,8 +743,8 @@ size_t upb_pbdecoder_decode(void *closure, const void *hd, const char *buf, VMCASE(OP_CHECKDELIM, // We are guaranteed of this assert because we never allow ourselves to // consume bytes beyond data_end, which covers delim_end when non-NULL. - assert(!(d->delim_end && ptr(d) > d->delim_end)); - if (ptr(d) == d->delim_end) + assert(!(d->delim_end && d->ptr > d->delim_end)); + if (d->ptr == d->delim_end) d->pc += longofs; ) VMCASE(OP_CALL, @@ -721,7 +761,7 @@ size_t upb_pbdecoder_decode(void *closure, const void *hd, const char *buf, VMCASE(OP_TAG1, CHECK_SUSPEND(curbufleft(d) > 0); uint8_t expected = (arg >> 8) & 0xff; - if (*ptr(d) == expected) { + if (*d->ptr == expected) { advance(d, 1); } else { int8_t shortofs; @@ -740,7 +780,7 @@ size_t upb_pbdecoder_decode(void *closure, const void *hd, const char *buf, uint16_t expected = (arg >> 8) & 0xffff; if (curbufleft(d) >= 2) { uint16_t actual; - memcpy(&actual, ptr(d), 2); + memcpy(&actual, d->ptr, 2); if (expected == actual) { advance(d, 2); } else { @@ -856,6 +896,10 @@ void upb_pbdecoder_reset(upb_pbdecoder *d) { d->call_len = 1; } +uint64_t upb_pbdecoder_bytesparsed(const upb_pbdecoder *d) { + return offset(d); +} + // Not currently required, but to support outgrowing the static stack we need // this. void upb_pbdecoder_uninit(upb_pbdecoder *d) { diff --git a/upb/pb/decoder.h b/upb/pb/decoder.h index 586d2d5..0aa35ec 100644 --- a/upb/pb/decoder.h +++ b/upb/pb/decoder.h @@ -1,11 +1,18 @@ /* * upb - a minimalist implementation of protocol buffers. * - * Copyright (c) 2009-2013 Google Inc. See LICENSE for details. + * Copyright (c) 2009-2014 Google Inc. See LICENSE for details. * Author: Josh Haberman <jhaberman@gmail.com> * * upb::pb::Decoder implements a high performance, streaming, resumable decoder * for the binary protobuf format. 
+ * + * This interface works the same regardless of what decoder backend is being + * used. A client of this class does not need to know whether decoding is using + * a JITted decoder (DynASM, LLVM, etc) or an interpreted decoder. By default, + * it will always use the fastest available decoder. However, you can call + * set_allow_jit(false) to disable any JIT decoder that might be available. + * This is primarily useful for testing purposes. */ #ifndef UPB_DECODER_H_ @@ -200,6 +207,15 @@ class upb::pb::Decoder { // Resets the state of the decoder. void Reset(); + // Returns number of bytes successfully parsed. + // + // This can be useful for determining the stream position where an error + // occurred. + // + // This value may not be up-to-date when called from inside a parsing + // callback. + uint64_t BytesParsed() const; + // Resets the output sink of the Decoder. // The given sink must match method()->dest_handlers(). // @@ -332,6 +348,7 @@ void upb_pbdecoder_reset(upb_pbdecoder *d); const upb_pbdecodermethod *upb_pbdecoder_method(const upb_pbdecoder *d); bool upb_pbdecoder_resetoutput(upb_pbdecoder *d, upb_sink *sink); upb_bytessink *upb_pbdecoder_input(upb_pbdecoder *d); +uint64_t upb_pbdecoder_bytesparsed(const upb_pbdecoder *d); void upb_pbdecodermethodopts_init(upb_pbdecodermethodopts *opts, const upb_handlers *h); @@ -400,6 +417,9 @@ inline const DecoderMethod* Decoder::method() const { inline void Decoder::Reset() { upb_pbdecoder_reset(this); } +inline uint64_t Decoder::BytesParsed() const { + return upb_pbdecoder_bytesparsed(this); +} inline bool Decoder::ResetOutput(Sink* sink) { return upb_pbdecoder_resetoutput(this, sink); } diff --git a/upb/pb/decoder.int.h b/upb/pb/decoder.int.h index 11aa133..d0f12cc 100644 --- a/upb/pb/decoder.int.h +++ b/upb/pb/decoder.int.h @@ -1,3 +1,11 @@ +/* + * upb - a minimalist implementation of protocol buffers. + * + * Copyright (c) 2009-2014 Google Inc. See LICENSE for details. 
+ * Author: Josh Haberman <jhaberman@gmail.com> + * + * Internal-only definitions for the decoder. + */ #ifndef UPB_DECODER_INT_H_ #define UPB_DECODER_INT_H_ diff --git a/upb/pb/textprinter.c b/upb/pb/textprinter.c index 8a49c73..94f19e2 100644 --- a/upb/pb/textprinter.c +++ b/upb/pb/textprinter.c @@ -170,11 +170,10 @@ static bool putenum(void *closure, const void *handler_data, int32_t val) { putf(p, "%s: %s", upb_fielddef_name(f), label); endfield(p); } else { - CHECK(putint32(closure, handler_data, val)); + if (!putint32(closure, handler_data, val)) + return false; } return true; -err: - return false; } static void *startstr(void *closure, const void *handler_data, @@ -182,6 +181,7 @@ static void *startstr(void *closure, const void *handler_data, const upb_fielddef *f = handler_data; UPB_UNUSED(size_hint); upb_textprinter *p = closure; + indent(p); putf(p, "%s: \"", upb_fielddef_name(f)); return p; } @@ -244,16 +244,18 @@ void upb_textprinter_reset(upb_textprinter *p, bool single_line) { p->indent_depth_ = 0; } -static void onmreg(void *c, upb_handlers *h) { - (void)c; +static void onmreg(const void *c, upb_handlers *h) { + UPB_UNUSED(c); const upb_msgdef *m = upb_handlers_msgdef(h); + upb_handlers_setstartmsg(h, startmsg, NULL); upb_handlers_setendmsg(h, endmsg, NULL); + upb_msg_iter i; for(upb_msg_begin(&i, m); !upb_msg_done(&i); upb_msg_next(&i)) { upb_fielddef *f = upb_msg_iter_field(&i); upb_handlerattr attr = UPB_HANDLERATTR_INITIALIZER; - upb_handlerattr_sethandlerdata(&attr, f, NULL); + upb_handlerattr_sethandlerdata(&attr, f); switch (upb_fielddef_type(f)) { case UPB_TYPE_INT32: upb_handlers_setint32(h, f, putint32, &attr); @@ -287,9 +289,7 @@ static void onmreg(void *c, upb_handlers *h) { upb_fielddef_istagdelim(f) ? shortname(upb_msgdef_fullname(upb_fielddef_msgsubdef(f))) : upb_fielddef_name(f); - // TODO(haberman): add "setconsthandlerdata"? If we pass NULL for - // cleanup then we don't need a non-const pointer. 
- upb_handlerattr_sethandlerdata(&attr, (void*)name, NULL); + upb_handlerattr_sethandlerdata(&attr, name); upb_handlers_setstartsubmsg(h, f, startsubmsg, &attr); upb_handlers_setendsubmsg(h, f, endsubmsg, &attr); break; |