From 10265aa56b22ac4f04e7ba08330138e4507534e4 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Fri, 15 Jul 2011 12:05:43 -0700 Subject: Directory restructure. Includes are now via upb/foo.h. Files specific to the protobuf format are now in upb/pb (the core library is concerned with message definitions, handlers, and byte streams, but knows nothing about any particular serializationf format). --- upb/pb/decoder.c | 469 ++++++++++++++++++++++++++++++ upb/pb/decoder.h | 99 +++++++ upb/pb/decoder_x86.dasc | 694 ++++++++++++++++++++++++++++++++++++++++++++ upb/pb/encoder.c | 421 +++++++++++++++++++++++++++ upb/pb/encoder.h | 58 ++++ upb/pb/glue.c | 129 ++++++++ upb/pb/glue.h | 62 ++++ upb/pb/jit_debug_elf_file.s | 7 + upb/pb/textprinter.c | 199 +++++++++++++ upb/pb/textprinter.h | 31 ++ upb/pb/varint.c | 54 ++++ upb/pb/varint.h | 142 +++++++++ 12 files changed, 2365 insertions(+) create mode 100644 upb/pb/decoder.c create mode 100644 upb/pb/decoder.h create mode 100644 upb/pb/decoder_x86.dasc create mode 100644 upb/pb/encoder.c create mode 100644 upb/pb/encoder.h create mode 100644 upb/pb/glue.c create mode 100644 upb/pb/glue.h create mode 100644 upb/pb/jit_debug_elf_file.s create mode 100644 upb/pb/textprinter.c create mode 100644 upb/pb/textprinter.h create mode 100644 upb/pb/varint.c create mode 100644 upb/pb/varint.h (limited to 'upb/pb') diff --git a/upb/pb/decoder.c b/upb/pb/decoder.c new file mode 100644 index 0000000..218c780 --- /dev/null +++ b/upb/pb/decoder.c @@ -0,0 +1,469 @@ +/* + * upb - a minimalist implementation of protocol buffers. + * + * Copyright (c) 2008-2011 Google Inc. See LICENSE for details. + * Author: Josh Haberman + */ + +#include +#include +#include +#include "upb/bytestream.h" +#include "upb/msg.h" +#include "upb/pb/decoder.h" +#include "upb/pb/varint.h" + +// Used for frames that have no specific end offset: groups, repeated primitive +// fields inside groups, and the top-level message. +#define UPB_NONDELIMITED UINT32_MAX + +#ifdef UPB_USE_JIT_X64 +#define Dst_DECL upb_decoder *d +#define Dst_REF (d->dynasm) +#define Dst (d) +#include "dynasm/dasm_proto.h" +#include "upb/pb/decoder_x86.h" +#endif + +// It's unfortunate that we have to micro-manage the compiler this way, +// especially since this tuning is necessarily specific to one hardware +// configuration. But emperically on a Core i7, performance increases 30-50% +// with these annotations. Every instance where these appear, gcc 4.2.1 made +// the wrong decision and degraded performance in benchmarks. +#define FORCEINLINE static __attribute__((always_inline)) +#define NOINLINE static __attribute__((noinline)) + +static void upb_decoder_exit(upb_decoder *d) { siglongjmp(d->exitjmp, 1); } +static void upb_decoder_exit2(void *_d) { + upb_decoder *d = _d; + upb_decoder_exit(d); +} +static void upb_decoder_abort(upb_decoder *d, const char *msg) { + upb_status_setf(d->status, UPB_ERROR, msg); + upb_decoder_exit(d); +} + +/* Decoding/Buffering of wire types *******************************************/ + +static size_t upb_decoder_bufleft(upb_decoder *d) { return d->end - d->ptr; } +static void upb_decoder_advance(upb_decoder *d, size_t len) { + assert((size_t)(d->end - d->ptr) >= len); + d->ptr += len; +} + +size_t upb_decoder_offset(upb_decoder *d) { + size_t offset = d->bufstart_ofs; + if (d->ptr) offset += (d->ptr - d->buf); + return offset; +} + +static void upb_decoder_setmsgend(upb_decoder *d) { + upb_dispatcher_frame *f = d->dispatcher.top; + size_t delimlen = f->end_ofs - d->bufstart_ofs; + size_t buflen = d->end - d->buf; + if (f->end_ofs != UINT64_MAX && delimlen <= buflen) { + d->delim_end = (uintptr_t)(d->buf + delimlen); + } else { + // Buffers must not run up against the end of memory. + assert((uintptr_t)d->end < UINTPTR_MAX); + d->delim_end = UINTPTR_MAX; + } +} + +// Pulls the next buffer from the bytesrc. Should be called only when the +// current buffer is completely empty. +static bool upb_trypullbuf(upb_decoder *d) { + assert(upb_decoder_bufleft(d) == 0); + if (d->bufend_ofs == d->refend_ofs) { + d->refend_ofs += upb_bytesrc_fetch(d->bytesrc, d->refend_ofs, d->status); + if (!upb_ok(d->status)) { + d->ptr = NULL; + d->end = NULL; + if (upb_iseof(d->status)) return false; + upb_decoder_exit(d); + } + } + d->bufstart_ofs = d->bufend_ofs; + size_t len; + d->buf = upb_bytesrc_getptr(d->bytesrc, d->bufstart_ofs, &len); + assert(len > 0); + d->bufend_ofs = d->bufstart_ofs + len; + d->ptr = d->buf; + d->end = d->buf + len; +#ifdef UPB_USE_JIT_X64 + d->jit_end = d->end - 20; +#endif + upb_decoder_setmsgend(d); + return true; +} + +static void upb_pullbuf(upb_decoder *d) { + if (!upb_trypullbuf(d)) upb_decoder_abort(d, "Unexpected EOF"); +} + +void upb_decoder_commit(upb_decoder *d) { + d->completed_ptr = d->ptr; + if (d->refstart_ofs < d->bufstart_ofs) { + // Drop our ref on the previous buf's region. + upb_bytesrc_refregion(d->bytesrc, d->bufstart_ofs, d->refend_ofs); + upb_bytesrc_unrefregion(d->bytesrc, d->refstart_ofs, d->refend_ofs); + d->refstart_ofs = d->bufstart_ofs; + } +} + +NOINLINE uint64_t upb_decode_varint_slow(upb_decoder *d) { + uint8_t byte = 0x80; + uint64_t u64 = 0; + int bitpos; + const char *ptr = d->ptr; + for(bitpos = 0; bitpos < 70 && (byte & 0x80); bitpos += 7) { + if (upb_decoder_bufleft(d) == 0) { + upb_pullbuf(d); + ptr = d->ptr; + } + u64 |= ((uint64_t)(byte = *ptr++) & 0x7F) << bitpos; + } + if(bitpos == 70 && (byte & 0x80)) upb_decoder_abort(d, "Unterminated varint"); + return u64; +} + +// For tags and delimited lengths, which must be <=32bit and are usually small. +FORCEINLINE uint32_t upb_decode_varint32(upb_decoder *d) { + const char *p = d->ptr; + uint32_t ret; + uint64_t u64; + // Nearly all will be either 1 byte (1-16) or 2 bytes (17-2048). + if (upb_decoder_bufleft(d) < 2) goto slow; // unlikely. + ret = *p & 0x7f; + if ((*(p++) & 0x80) == 0) goto done; // predictable if fields are in order + ret |= (*p & 0x7f) << 7; + if ((*(p++) & 0x80) == 0) goto done; // likely +slow: + u64 = upb_decode_varint_slow(d); + if (u64 > 0xffffffff) upb_decoder_abort(d, "Unterminated 32-bit varint"); + ret = (uint32_t)u64; + p = d->ptr; // Turn the next line into a nop. +done: + upb_decoder_advance(d, p - d->ptr); + return ret; +} + +FORCEINLINE bool upb_trydecode_varint32(upb_decoder *d, uint32_t *val) { + if (upb_decoder_bufleft(d) == 0) { + // Check for our two normal end-of-message conditions. + if (d->bufend_ofs == d->end_ofs) return false; + if (!upb_trypullbuf(d)) return false; + } + *val = upb_decode_varint32(d); + return true; +} + +FORCEINLINE uint64_t upb_decode_varint(upb_decoder *d) { + if (upb_decoder_bufleft(d) >= 10) { + // Fast case. + upb_decoderet r = upb_vdecode_fast(d->ptr); + if (r.p == NULL) upb_decoder_abort(d, "Unterminated varint"); + upb_decoder_advance(d, r.p - d->ptr); + return r.val; + } else if (upb_decoder_bufleft(d) > 0) { + // Intermediate case -- worth it? + char tmpbuf[10]; + memset(tmpbuf, 0x80, 10); + memcpy(tmpbuf, d->ptr, upb_decoder_bufleft(d)); + upb_decoderet r = upb_vdecode_fast(tmpbuf); + if (r.p != NULL) { + upb_decoder_advance(d, r.p - tmpbuf); + return r.val; + } + } + // Slow case -- varint spans buffer seam. + return upb_decode_varint_slow(d); +} + +FORCEINLINE void upb_decode_fixed(upb_decoder *d, char *buf, size_t bytes) { + if (upb_decoder_bufleft(d) >= bytes) { + // Fast case. + memcpy(buf, d->ptr, bytes); + upb_decoder_advance(d, bytes); + } else { + // Slow case. + size_t read = 0; + while (read < bytes) { + size_t avail = upb_decoder_bufleft(d); + memcpy(buf + read, d->ptr, avail); + upb_decoder_advance(d, avail); + read += avail; + } + } +} + +FORCEINLINE uint32_t upb_decode_fixed32(upb_decoder *d) { + uint32_t u32; + upb_decode_fixed(d, (char*)&u32, sizeof(uint32_t)); + return u32; // TODO: proper byte swapping +} +FORCEINLINE uint64_t upb_decode_fixed64(upb_decoder *d) { + uint64_t u64; + upb_decode_fixed(d, (char*)&u64, sizeof(uint64_t)); + return u64; // TODO: proper byte swapping +} + +INLINE upb_strref *upb_decode_string(upb_decoder *d) { + uint32_t strlen = upb_decode_varint32(d); + d->strref.stream_offset = upb_decoder_offset(d); + d->strref.len = strlen; + if (upb_decoder_bufleft(d) == 0) upb_pullbuf(d); + if (upb_decoder_bufleft(d) >= strlen) { + // Fast case. + d->strref.ptr = d->ptr; + upb_decoder_advance(d, strlen); + } else { + // Slow case. + while (1) { + size_t consume = UPB_MIN(upb_decoder_bufleft(d), strlen); + upb_decoder_advance(d, consume); + strlen -= consume; + if (strlen == 0) break; + upb_pullbuf(d); + } + } + return &d->strref; +} + +INLINE void upb_push(upb_decoder *d, upb_fhandlers *f, uint32_t end) { + upb_dispatch_startsubmsg(&d->dispatcher, f)->end_ofs = end; + upb_decoder_setmsgend(d); +} + + +/* Decoding of .proto types ***************************************************/ + +// Technically, we are losing data if we see a 32-bit varint that is not +// properly sign-extended. We could detect this and error about the data loss, +// but proto2 does not do this, so we pass. + +#define T(type, wt, valtype, convfunc) \ + INLINE void upb_decode_ ## type(upb_decoder *d, upb_fhandlers *f) { \ + upb_value val; \ + upb_value_set ## valtype(&val, (convfunc)(upb_decode_ ## wt(d))); \ + upb_dispatch_value(&d->dispatcher, f, val); \ + } \ + +static double upb_asdouble(uint64_t n) { double d; memcpy(&d, &n, 8); return d; } +static float upb_asfloat(uint32_t n) { float f; memcpy(&f, &n, 4); return f; } +static int32_t upb_zzdec_32(uint32_t n) { return (n >> 1) ^ -(int32_t)(n & 1); } +static int64_t upb_zzdec_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); } + +T(INT32, varint, int32, int32_t) +T(INT64, varint, int64, int64_t) +T(UINT32, varint, uint32, uint32_t) +T(UINT64, varint, uint64, uint64_t) +T(FIXED32, fixed32, uint32, uint32_t) +T(FIXED64, fixed64, uint64, uint64_t) +T(SFIXED32, fixed32, int32, int32_t) +T(SFIXED64, fixed64, int64, int64_t) +T(BOOL, varint, bool, bool) +T(ENUM, varint, int32, int32_t) +T(DOUBLE, fixed64, double, upb_asdouble) +T(FLOAT, fixed32, float, upb_asfloat) +T(SINT32, varint, int32, upb_zzdec_32) +T(SINT64, varint, int64, upb_zzdec_64) +T(STRING, string, strref, upb_strref*) + +static void upb_decode_GROUP(upb_decoder *d, upb_fhandlers *f) { + upb_push(d, f, UPB_NONDELIMITED); +} +static void upb_endgroup(upb_decoder *d, upb_fhandlers *f) { + (void)f; + upb_dispatch_endsubmsg(&d->dispatcher); + upb_decoder_setmsgend(d); +} +static void upb_decode_MESSAGE(upb_decoder *d, upb_fhandlers *f) { + upb_push(d, f, upb_decode_varint32(d) + (d->ptr - d->buf)); +} + + +/* The main decoding loop *****************************************************/ + +static void upb_decoder_checkdelim(upb_decoder *d) { + while ((uintptr_t)d->ptr >= d->delim_end) { + if ((uintptr_t)d->ptr > d->delim_end) + upb_decoder_abort(d, "Bad submessage end"); + + if (d->dispatcher.top->is_sequence) { + upb_dispatch_endseq(&d->dispatcher); + } else { + upb_dispatch_endsubmsg(&d->dispatcher); + } + upb_decoder_setmsgend(d); + } +} + +static void upb_decoder_enterjit(upb_decoder *d) { + (void)d; +#ifdef UPB_USE_JIT_X64 + if (d->jit_code && d->dispatcher.top == d->dispatcher.stack && d->ptr < d->jit_end) { + // Decodes as many fields as possible, updating d->ptr appropriately, + // before falling through to the slow(er) path. + void (*upb_jit_decode)(upb_decoder *d) = (void*)d->jit_code; + upb_jit_decode(d); + } +#endif +} + +INLINE upb_fhandlers *upb_decode_tag(upb_decoder *d) { + while (1) { + uint32_t tag; + if (!upb_trydecode_varint32(d, &tag)) return NULL; + upb_fhandlers *f = upb_dispatcher_lookup(&d->dispatcher, tag); + + // There are no explicit "startseq" or "endseq" markers in protobuf + // streams, so we have to infer them by noticing when a repeated field + // starts or ends. + if (d->dispatcher.top->is_sequence && d->dispatcher.top->f != f) { + upb_dispatch_endseq(&d->dispatcher); + upb_decoder_setmsgend(d); + } + if (f && f->repeated && d->dispatcher.top->f != f) { + // TODO: support packed. + assert(upb_issubmsgtype(f->type) || upb_isstringtype(f->type) || + (tag & 0x7) != UPB_WIRE_TYPE_DELIMITED); + uint32_t end = d->dispatcher.top->end_ofs; + upb_dispatch_startseq(&d->dispatcher, f)->end_ofs = end; + upb_decoder_setmsgend(d); + } + if (f) return f; + + // Unknown field. + switch (tag & 0x7) { + case UPB_WIRE_TYPE_VARINT: upb_decode_varint(d); break; + case UPB_WIRE_TYPE_32BIT: upb_decoder_advance(d, 4); break; + case UPB_WIRE_TYPE_64BIT: upb_decoder_advance(d, 8); break; + case UPB_WIRE_TYPE_DELIMITED: + upb_decoder_advance(d, upb_decode_varint32(d)); break; + default: + upb_decoder_abort(d, "Invavlid wire type"); + } + // TODO: deliver to unknown field callback. + upb_decoder_commit(d); + upb_decoder_checkdelim(d); + } +} + +void upb_decoder_onexit(upb_decoder *d) { + if (d->dispatcher.top->is_sequence) upb_dispatch_endseq(&d->dispatcher); + if (d->status->code == UPB_EOF && upb_dispatcher_stackempty(&d->dispatcher)) { + // Normal end-of-file. + upb_status_clear(d->status); + upb_dispatch_endmsg(&d->dispatcher, d->status); + } else { + if (d->status->code == UPB_EOF) + upb_status_setf(d->status, UPB_ERROR, "Input ended mid-submessage."); + } +} + +void upb_decoder_decode(upb_decoder *d, upb_status *status) { + if (sigsetjmp(d->exitjmp, 0)) { + upb_decoder_onexit(d); + return; + } + d->status = status; + upb_dispatch_startmsg(&d->dispatcher); + while(1) { // Main loop: executed once per tag/field pair. + upb_decoder_checkdelim(d); + upb_decoder_enterjit(d); + // if (!d->dispatcher.top->is_packed) + upb_fhandlers *f = upb_decode_tag(d); + if (!f) upb_decoder_exit2(d); + f->decode(d, f); + upb_decoder_commit(d); + } +} + +static void upb_decoder_skip(void *_d, upb_dispatcher_frame *top, + upb_dispatcher_frame *bottom) { + (void)top; + (void)bottom; + (void)_d; +#if 0 + upb_decoder *d = _d; + // TODO + if (bottom->end_offset == UPB_NONDELIMITED) { + // TODO: support skipping groups. + abort(); + } + d->ptr = d->buf.ptr + bottom->end_offset; +#endif +} + +void upb_decoder_initforhandlers(upb_decoder *d, upb_handlers *handlers) { + upb_dispatcher_init( + &d->dispatcher, handlers, upb_decoder_skip, upb_decoder_exit2, d); +#ifdef UPB_USE_JIT_X64 + d->jit_code = NULL; + if (d->dispatcher.handlers->should_jit) upb_decoder_makejit(d); +#endif + // Set function pointers for each field's decode function. + for (int i = 0; i < handlers->msgs_len; i++) { + upb_mhandlers *m = handlers->msgs[i]; + for(upb_inttable_iter i = upb_inttable_begin(&m->fieldtab); !upb_inttable_done(i); + i = upb_inttable_next(&m->fieldtab, i)) { + upb_fhandlers *f = upb_inttable_iter_value(i); + switch (f->type) { + case UPB_TYPE(INT32): f->decode = &upb_decode_INT32; break; + case UPB_TYPE(INT64): f->decode = &upb_decode_INT64; break; + case UPB_TYPE(UINT32): f->decode = &upb_decode_UINT32; break; + case UPB_TYPE(UINT64): f->decode = &upb_decode_UINT64; break; + case UPB_TYPE(FIXED32): f->decode = &upb_decode_FIXED32; break; + case UPB_TYPE(FIXED64): f->decode = &upb_decode_FIXED64; break; + case UPB_TYPE(SFIXED32): f->decode = &upb_decode_SFIXED32; break; + case UPB_TYPE(SFIXED64): f->decode = &upb_decode_SFIXED64; break; + case UPB_TYPE(BOOL): f->decode = &upb_decode_BOOL; break; + case UPB_TYPE(ENUM): f->decode = &upb_decode_ENUM; break; + case UPB_TYPE(DOUBLE): f->decode = &upb_decode_DOUBLE; break; + case UPB_TYPE(FLOAT): f->decode = &upb_decode_FLOAT; break; + case UPB_TYPE(SINT32): f->decode = &upb_decode_SINT32; break; + case UPB_TYPE(SINT64): f->decode = &upb_decode_SINT64; break; + case UPB_TYPE(STRING): f->decode = &upb_decode_STRING; break; + case UPB_TYPE(BYTES): f->decode = &upb_decode_STRING; break; + case UPB_TYPE(GROUP): f->decode = &upb_decode_GROUP; break; + case UPB_TYPE(MESSAGE): f->decode = &upb_decode_MESSAGE; break; + case UPB_TYPE_ENDGROUP: f->decode = &upb_endgroup; break; + } + } + } +} + +void upb_decoder_initformsgdef(upb_decoder *d, upb_msgdef *m) { + upb_handlers *h = upb_handlers_new(); + upb_accessors_reghandlers(h, m); + upb_decoder_initforhandlers(d, h); + upb_handlers_unref(h); +} + +void upb_decoder_reset(upb_decoder *d, upb_bytesrc *bytesrc, uint64_t start_ofs, + uint64_t end_ofs, void *closure) { + upb_dispatcher_frame *f = upb_dispatcher_reset(&d->dispatcher, closure); + f->end_ofs = end_ofs; + d->end_ofs = end_ofs; + d->refstart_ofs = start_ofs; + d->refend_ofs = start_ofs; + d->bufstart_ofs = start_ofs; + d->bufend_ofs = start_ofs; + d->bytesrc = bytesrc; + d->buf = NULL; + d->ptr = NULL; + d->end = NULL; // Force a buffer pull. +#ifdef UPB_USE_JIT_X64 + d->jit_end = NULL; +#endif + d->delim_end = UINTPTR_MAX; // But don't let end-of-message get triggered. + d->strref.bytesrc = bytesrc; +} + +void upb_decoder_uninit(upb_decoder *d) { +#ifdef UPB_USE_JIT_X64 + if (d->dispatcher.handlers->should_jit) upb_decoder_freejit(d); +#endif + upb_dispatcher_uninit(&d->dispatcher); +} diff --git a/upb/pb/decoder.h b/upb/pb/decoder.h new file mode 100644 index 0000000..3981359 --- /dev/null +++ b/upb/pb/decoder.h @@ -0,0 +1,99 @@ +/* + * upb - a minimalist implementation of protocol buffers. + * + * Copyright (c) 2009-2010 Google Inc. See LICENSE for details. + * Author: Josh Haberman + * + * upb_decoder implements a high performance, streaming decoder for protobuf + * data that works by implementing upb_src and getting its data from a + * upb_bytesrc. + * + * The decoder does not currently support non-blocking I/O, in the sense that + * if the bytesrc returns UPB_STATUS_TRYAGAIN it is not possible to resume the + * decoder when data becomes available again. Support for this could be added, + * but it would add complexity and perhaps cost efficiency also. + */ + +#ifndef UPB_DECODER_H_ +#define UPB_DECODER_H_ + +#include +#include +#include +#include "upb/handlers.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* upb_decoder *****************************************************************/ + +struct dasm_State; + +typedef struct _upb_decoder { + upb_bytesrc *bytesrc; // Source of our serialized data. + upb_dispatcher dispatcher; // Dispatcher to which we push parsed data. + upb_status *status; // Where we will store any errors that occur. + upb_strref strref; // For passing string data to callbacks. + + // Offsets for the region we currently have ref'd. + uint64_t refstart_ofs, refend_ofs; + + // Current buffer and its stream offset. + const char *buf, *ptr, *end; + uint64_t bufstart_ofs, bufend_ofs; + + // Stream offset for the end of the top-level message, if any. + uint64_t end_ofs; + + // Buf offset as of which we've delivered calbacks; needed for rollback on + // UPB_TRYAGAIN (or in the future, UPB_SUSPEND). + const char *completed_ptr; + + // End of the delimited region, relative to ptr, or UINTPTR_MAX if not in + // this buf. + uintptr_t delim_end; + +#ifdef UPB_USE_JIT_X64 + // For JIT, which doesn't do bounds checks in the middle of parsing a field. + const char *jit_end, *effective_end; // == MIN(jit_end, submsg_end) + + // JIT-generated machine code (else NULL). + char *jit_code; + size_t jit_size; + char *debug_info; + + struct dasm_State *dynasm; +#endif + + sigjmp_buf exitjmp; +} upb_decoder; + +// Initializes/uninitializes a decoder for calling into the given handlers +// or to write into the given msgdef, given its accessors). Takes a ref +// on the handlers or msgdef. +void upb_decoder_initforhandlers(upb_decoder *d, upb_handlers *h); + +// Equivalent to: +// upb_accessors_reghandlers(m, h); +// upb_decoder_initforhandlers(d, h); +// except possibly more efficient, by using cached state in the msgdef. +void upb_decoder_initformsgdef(upb_decoder *d, upb_msgdef *m); +void upb_decoder_uninit(upb_decoder *d); + +// Resets the internal state of an already-allocated decoder. This puts it in a +// state where it has not seen any data, and expects the next data to be from +// the beginning of a new protobuf. Parsers must be reset before they can be +// used. A decoder can be reset multiple times. +// +// Pass UINT64_MAX for end_ofs to indicate a non-delimited top-level message. +void upb_decoder_reset(upb_decoder *d, upb_bytesrc *src, uint64_t start_ofs, + uint64_t end_ofs, void *closure); + +void upb_decoder_decode(upb_decoder *d, upb_status *status); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* UPB_DECODER_H_ */ diff --git a/upb/pb/decoder_x86.dasc b/upb/pb/decoder_x86.dasc new file mode 100644 index 0000000..19043c6 --- /dev/null +++ b/upb/pb/decoder_x86.dasc @@ -0,0 +1,694 @@ +|// +|// upb - a minimalist implementation of protocol buffers. +|// +|// Copyright (c) 2011 Google Inc. See LICENSE for details. +|// Author: Josh Haberman +|// +|// JIT compiler for upb_decoder on x86. Given a upb_handlers object, +|// generates code specialized to parsing the specific message and +|// calling specific handlers. + +#define UPB_NONE -1 +#define UPB_MULTIPLE -2 +#define UPB_TOPLEVEL_ONE -3 + +#include +#include "dynasm/dasm_proto.h" +#include "dynasm/dasm_x86.h" + +#ifndef MAP_ANONYMOUS +# define MAP_ANONYMOUS MAP_ANON +#endif + +// We map into the low 32 bits when we can, but if this is not available +// (like on OS X) we take what we can get. It's not required for correctness, +// it's just a performance thing that makes it more likely that our jumps +// can be rel32 (i.e. within 32-bits of our pc) instead of the longer +// sequence required for other jumps (see callp). +#ifndef MAP_32BIT +#define MAP_32BIT 0 +#endif + +// To debug JIT-ted code with GDB we need to tell GDB about the JIT-ted code +// at runtime. GDB 7.x+ has defined an interface for doing this, and these +// structure/function defintions are copied out of gdb/jit.h +// +// We need to give GDB an ELF file at runtime describing the symbols we have +// generated. To avoid implementing the ELF format, we generate an ELF file +// at compile-time and compile it in as a character string. We can replace +// a few key constants (address of JIT-ted function and its size) by looking +// for a few magic numbers and doing a dumb string replacement. + +#ifndef __APPLE__ +#include "upb/pb/jit_debug_elf_file.h" + +typedef enum +{ + GDB_JIT_NOACTION = 0, + GDB_JIT_REGISTER, + GDB_JIT_UNREGISTER +} jit_actions_t; + +typedef struct gdb_jit_entry { + struct gdb_jit_entry *next_entry; + struct gdb_jit_entry *prev_entry; + const char *symfile_addr; + uint64_t symfile_size; +} gdb_jit_entry; + +typedef struct { + uint32_t version; + uint32_t action_flag; + gdb_jit_entry *relevant_entry; + gdb_jit_entry *first_entry; +} gdb_jit_descriptor; + +gdb_jit_descriptor __jit_debug_descriptor = {1, GDB_JIT_NOACTION, NULL, NULL}; + +void __attribute__((noinline)) __jit_debug_register_code() { __asm__ __volatile__(""); } + +void upb_reg_jit_gdb(upb_decoder *d) { + // Create debug info. + size_t elf_len = upb_pb_jit_debug_elf_file_o_len; + d->debug_info = malloc(elf_len); + memcpy(d->debug_info, upb_pb_jit_debug_elf_file_o, elf_len); + uint64_t *p = (void*)d->debug_info; + for (; (void*)(p+1) <= (void*)d->debug_info + elf_len; ++p) { + if (*p == 0x12345678) { *p = (uintptr_t)d->jit_code; } + if (*p == 0x321) { *p = d->jit_size; } + } + + // Register the JIT-ted code with GDB. + gdb_jit_entry *e = malloc(sizeof(gdb_jit_entry)); + e->next_entry = __jit_debug_descriptor.first_entry; + e->prev_entry = NULL; + if (e->next_entry) e->next_entry->prev_entry = e; + e->symfile_addr = d->debug_info; + e->symfile_size = elf_len; + __jit_debug_descriptor.first_entry = e; + __jit_debug_descriptor.relevant_entry = e; + __jit_debug_descriptor.action_flag = GDB_JIT_REGISTER; + __jit_debug_register_code(); +} + +#else + +void upb_reg_jit_gdb(upb_decoder *d) { + (void)d; +} + +#endif + +|.arch x64 +|.actionlist upb_jit_actionlist +|.globals UPB_JIT_GLOBAL_ +|.globalnames upb_jit_globalnames +| +|// Calling conventions. +|.define ARG1_64, rdi +|.define ARG2_8, sil +|.define ARG2_32, esi +|.define ARG2_64, rsi +|.define ARG3_8, dl +|.define ARG3_32, edx +|.define ARG3_64, rdx +| +|// Register allocation / type map. +|// ALL of the code in this file uses these register allocations. +|// When we "call" within this file, we do not use regular calling +|// conventions, but of course when calling to user callbacks we must. +|.define PTR, rbx +|.define CLOSURE, r12 +|.type FRAME, upb_dispatcher_frame, r13 +|.type STRREF, upb_strref, r14 +|.type DECODER, upb_decoder, r15 +| +|.macro callp, addr +|| if ((uintptr_t)addr < 0xffffffff) { + | call &addr +|| } else { + | mov64 rax, (uintptr_t)addr + | call rax +|| } +|.endmacro +| +|// Checks PTR for end-of-buffer. +|.macro check_eob, m +| cmp PTR, DECODER->effective_end +|| if (m->is_group) { + | jae ->exit_jit +|| } else { + | jae =>m->jit_endofbuf_pclabel +|| } +|.endmacro +| +|// Decodes varint from [PTR + offset] -> ARG3. +|// Saves new pointer as rax. +|.macro decode_loaded_varint, offset +| // Check for <=2 bytes inline, otherwise jump to 2-10 byte decoder. +| lea rax, [PTR + offset + 1] +| mov ARG3_32, ecx +| and ARG3_32, 0x7f +| test cl, cl +| jns >9 +| lea rax, [PTR + offset + 2] +| movzx esi, ch +| and esi, 0x7f +| shl esi, 7 +| or ARG3_32, esi +| test cx, cx +| jns >9 +| mov ARG1_64, rax +| mov ARG2_32, ARG3_32 +| callp upb_vdecode_max8_fast +| test rax, rax +| jz ->exit_jit // >10-byte varint. +|9: +|.endmacro +| +|.macro decode_varint, offset +| mov ecx, dword [PTR + offset] +| decode_loaded_varint offset +| mov PTR, rax +|.endmacro +| +|// Decode the tag -> edx. +|// Could specialize this by avoiding the value masking: could just key the +|// table on the raw (length-masked) varint to save 3-4 cycles of latency. +|// Currently only support tables where all entries are in the array part. +|.macro dyndispatch, m +| decode_loaded_varint, 0 +| mov ecx, edx +| shr ecx, 3 +| and edx, 0x7 +| cmp ecx, m->max_field_number // Bounds-check the field. +| ja ->exit_jit // In the future; could be unknown label +|| if ((uintptr_t)m->tablearray < 0xffffffff) { +| mov rax, qword [rcx*8 + m->tablearray] // TODO: support hybrid array/hash tables. +|| } else { +| mov64 rax, (uintptr_t)m->tablearray +| mov rax, qword [rax + rcx*8] +|| } +| jmp rax // Dispatch: unpredictable jump. +|.endmacro +| +|// Push a stack frame (not the CPU stack, the upb_decoder stack). +|.macro pushframe, f, closure_, end_offset_, is_sequence_ +| lea rax, [FRAME + sizeof(upb_dispatcher_frame)] // rax for shorter addressing. +| cmp rax, qword DECODER->dispatcher.limit +| jae ->exit_jit // Frame stack overflow. +| mov qword FRAME:rax->f, f +| mov qword FRAME:rax->closure, closure_ +| mov dword FRAME:rax->end_ofs, end_offset_ +| mov byte FRAME:rax->is_sequence, is_sequence_ +| mov CLOSURE, rdx +| mov DECODER->dispatcher.top, rax +| mov FRAME, rax +|.endmacro +| +|.macro popframe +| sub FRAME, sizeof(upb_dispatcher_frame) +| mov DECODER->dispatcher.top, FRAME +| setmsgend m +| mov CLOSURE, FRAME->closure +|.endmacro +| +|.macro setmsgend, m +| mov rsi, DECODER->jit_end +|| if (m->is_group) { +| mov64 rax, 0xffffffffffffffff +| mov qword DECODER->delim_end, rax +| mov DECODER->effective_end, rsi +|| } else { +| // Could store a correctly-biased version in the frame, at the cost of +| // a larger stack. +| mov eax, dword FRAME->end_ofs +| add rax, qword DECODER->buf +| mov DECODER->delim_end, rax // delim_end = d->buf + f->end_ofs +| cmp rax, rsi +| jb >8 +| mov rax, rsi // effective_end = min(d->delim_end, d->jit_end) +|8: +| mov DECODER->effective_end, rax +|| } +|.endmacro +| +|// rax contains the tag, compare it against "tag", but since it is a varint +|// we must only compare as many bytes as actually have data. +|.macro checktag, tag +|| switch (upb_value_size(tag)) { +|| case 1: +| cmp cl, tag +|| break; +|| case 2: +| cmp cx, tag +|| break; +|| case 3: +| and ecx, 0xffffff // 3 bytes +| cmp rcx, tag +|| case 4: +| cmp ecx, tag +|| break; +|| case 5: +| mov64 rdx, 0xffffffffff // 5 bytes +| and rcx, rdx +| cmp rcx, tag +|| break; +|| default: abort(); +|| } +|.endmacro +| +|// TODO: optimize for 0 (xor) and 32-bits. +|.macro loadfval, f +|| if (f->fval.val.uint64 == 0) { +| xor ARG2_32, ARG2_32 +|| } else if (f->fval.val.uint64 < 0xffffffff) { +| mov ARG2_32, f->fval.val.uint64 +|| } else { +| mov64 ARG2_64, f->fval.val.uint64 +|| } +|.endmacro + +#include +#include "upb/pb/varint.h" + +// PTR should point to the beginning of the tag. +static void upb_decoder_jit_field(upb_decoder *d, uint32_t tag, uint32_t next_tag, + upb_mhandlers *m, + upb_fhandlers *f, upb_fhandlers *next_f) { + int tag_size = upb_value_size(tag); + + // PC-label for the dispatch table. + // We check the wire type (which must be loaded in edx) because the + // table is keyed on field number, not type. + |=>f->jit_pclabel: + | cmp edx, (tag & 0x7) + | jne ->exit_jit // In the future: could be an unknown field or packed. + |=>f->jit_pclabel_notypecheck: + if (f->repeated) { + if (f->startseq) { + | mov ARG1_64, CLOSURE + | loadfval f + | callp f->startseq + } else { + | mov rdx, CLOSURE + } + | mov esi, FRAME->end_ofs + | pushframe f, rdx, esi, true + } + + |1: // Label for repeating this field. + + // Decode the value into arg 3 for the callback. + switch (f->type) { + case UPB_TYPE(DOUBLE): + case UPB_TYPE(FIXED64): + case UPB_TYPE(SFIXED64): + | mov ARG3_64, qword [PTR + tag_size] + | add PTR, 8 + tag_size + break; + + case UPB_TYPE(FLOAT): + case UPB_TYPE(FIXED32): + case UPB_TYPE(SFIXED32): + | mov ARG3_32, dword [PTR + tag_size] + | add PTR, 4 + tag_size + break; + + case UPB_TYPE(BOOL): + // Can't assume it's one byte long, because bool must be wire-compatible + // with all of the varint integer types. + | decode_varint tag_size + | test ARG3_64, ARG3_64 + | setne ARG3_8 // Other bytes left with val, should be ok. + break; + + case UPB_TYPE(INT64): + case UPB_TYPE(UINT64): + case UPB_TYPE(INT32): + case UPB_TYPE(UINT32): + case UPB_TYPE(ENUM): + | decode_varint tag_size + break; + + case UPB_TYPE(SINT64): + // 64-bit zig-zag decoding. + | decode_varint tag_size + | mov rax, ARG3_64 + | shr ARG3_64, 1 + | and rax, 1 + | neg rax + | xor ARG3_64, rax + break; + + case UPB_TYPE(SINT32): + // 32-bit zig-zag decoding. + | decode_varint tag_size + | mov eax, ARG3_32 + | shr ARG3_32, 1 + | and eax, 1 + | neg eax + | xor ARG3_32, eax + break; + + case UPB_TYPE(STRING): + case UPB_TYPE(BYTES): + // We only handle the case where the entire string is in our current + // buf, which sidesteps any security problems. The C path has more + // robust checks. + | decode_varint tag_size + | mov STRREF->len, ARG3_32 + | mov STRREF->ptr, PTR + | mov rax, PTR + | sub rax, DECODER->buf + | add eax, DECODER->bufstart_ofs // = d->ptr - d->buf + d->bufstart_ofs + | mov STRREF->stream_offset, eax + | add PTR, ARG3_64 + | mov ARG3_64, STRREF + | cmp PTR, DECODER->effective_end + | ja ->exit_jit // Can't deliver, whole string not in buf. + break; + + case UPB_TYPE_ENDGROUP: // A pseudo-type. + | add PTR, tag_size + | mov DECODER->ptr, PTR + | jmp =>m->jit_endofmsg_pclabel + return; + + // Will dispatch callbacks and call submessage in a second. + case UPB_TYPE(MESSAGE): + | decode_varint tag_size + break; + case UPB_TYPE(GROUP): + | add PTR, tag_size + break; + + default: abort(); + } + // Commit our work by advancing ptr. + // (If in the future we wanted to support a UPB_SUSPEND_AGAIN that + // suspends the decoder and redelivers the value later, we would + // need to adjust this to happen perhaps after the callback ran). + | mov DECODER->ptr, PTR + + // Load closure and fval into arg registers. + | mov ARG1_64, CLOSURE + | loadfval f + + // Call callbacks. + if (upb_issubmsgtype(f->type)) { + // Call startsubmsg handler (if any). + if (f->startsubmsg) { + // upb_sflow_t startsubmsg(void *closure, upb_value fval) + | mov r12d, ARG3_32 + | callp f->startsubmsg + } else { + | mov rdx, CLOSURE + | mov r12d, ARG3_32 + } + if (f->type == UPB_TYPE(MESSAGE)) { + | mov rsi, PTR + | sub rsi, DECODER->buf + | add esi, r12d // = (d->ptr - d->buf) + delim_len + } else { + assert(f->type == UPB_TYPE(GROUP)); + | mov esi, UPB_NONDELIMITED + } + | pushframe f, rdx, esi, false + + upb_mhandlers *sub_m = upb_fhandlers_getsubmsg(f); + if (sub_m->jit_parent_field_done_pclabel != UPB_MULTIPLE) { + | jmp =>sub_m->jit_startmsg_pclabel; + } else { + | call =>sub_m->jit_startmsg_pclabel; + } + + |=>f->jit_submsg_done_pclabel: + | popframe + + // Call endsubmsg handler (if any). + if (f->endsubmsg) { + // upb_flow_t endsubmsg(void *closure, upb_value fval); + | mov ARG1_64, CLOSURE + | loadfval f + | callp f->endsubmsg + } + } else { + | callp f->value + } + // TODO: Handle UPB_SKIPSUBMSG, UPB_BREAK + + // Epilogue: load next tag, check for repeated field. + | check_eob m + | mov rcx, qword [PTR] + if (f->repeated) { + | checktag tag + | je <1 + | popframe + if (f->endseq) { + | mov ARG1_64, CLOSURE + | loadfval f + | callp f->endseq + } + } + if (next_tag != 0) { + | checktag next_tag + | je =>next_f->jit_pclabel_notypecheck + } + + // Fall back to dynamic dispatch. Replicate the dispatch + // here so we can learn what fields generally follow others. + | dyndispatch m + |1: +} + +static int upb_compare_uint32(const void *a, const void *b) { + // TODO: always put ENDGROUP at the end. + return *(uint32_t*)a - *(uint32_t*)b; +} + +static void upb_decoder_jit_msg(upb_decoder *d, upb_mhandlers *m) { + |=>m->jit_startmsg_pclabel: + // Call startmsg handler (if any): + if (m->startmsg) { + // upb_flow_t startmsg(void *closure); + | mov ARG1_64, FRAME->closure + | callp m->startmsg + // TODO: Handle UPB_SKIPSUBMSG, UPB_BREAK + } + + | setmsgend m + | check_eob m + | mov ecx, dword [PTR] + | dyndispatch m + + // --------- New code section (does not fall through) ------------------------ + + // Emit code for parsing each field (dynamic dispatch contains pointers to + // all of these). + + // Create an ordering over the fields (inttable ordering is undefined). + int num_keys = upb_inttable_count(&m->fieldtab); + uint32_t *keys = malloc(num_keys * sizeof(*keys)); + int idx = 0; + for(upb_inttable_iter i = upb_inttable_begin(&m->fieldtab); !upb_inttable_done(i); + i = upb_inttable_next(&m->fieldtab, i)) { + keys[idx++] = upb_inttable_iter_key(i); + } + qsort(keys, num_keys, sizeof(uint32_t), &upb_compare_uint32); + + upb_fhandlers *last_f = NULL; + uint32_t last_tag = 0; + for(int i = 0; i < num_keys; i++) { + uint32_t key = keys[i]; + upb_fhandlers *f = upb_inttable_lookup(&m->fieldtab, key); + uint32_t tag = upb_vencode32(key); + if (last_f) upb_decoder_jit_field(d, last_tag, tag, m, last_f, f); + last_tag = tag; + last_f = f; + } + upb_decoder_jit_field(d, last_tag, 0, m, last_f, NULL); + + free(keys); + + // --------- New code section (does not fall through) ------------------------ + + // End-of-buf / end-of-message. + if (!m->is_group) { + // This case doesn't exist for groups, because there eob really means + // eob, so that case just exits the jit directly. + |=>m->jit_endofbuf_pclabel: + | cmp PTR, DECODER->delim_end + | jb ->exit_jit // We are at eob, but not end-of-submsg. + } + + |=>m->jit_endofmsg_pclabel: + // We are at end-of-submsg: call endmsg handler (if any): + if (m->endmsg) { + // void endmsg(void *closure, upb_status *status) { + | mov ARG1_64, FRAME->closure + | lea ARG2_64, DECODER->dispatcher.status + | callp m->endmsg + } + + if (m->jit_parent_field_done_pclabel == UPB_MULTIPLE) { + | ret + } else if (m->jit_parent_field_done_pclabel == UPB_TOPLEVEL_ONE) { + | jmp ->exit_jit + } else { + | jmp =>m->jit_parent_field_done_pclabel + } + +} + +static const char *dbgfmt = + "JIT encountered unknown field! wt=%d, fn=%d\n"; + +static void upb_decoder_jit(upb_decoder *d) { + | push rbp + | mov rbp, rsp + | push r15 + | push r14 + | push r13 + | push r12 + | push rbx + | mov DECODER, ARG1_64 + | mov FRAME, DECODER:ARG1_64->dispatcher.top + | lea STRREF, DECODER:ARG1_64->strref + | mov CLOSURE, FRAME->closure + | mov PTR, DECODER->ptr + + upb_handlers *h = d->dispatcher.handlers; + if (h->msgs[0]->jit_parent_field_done_pclabel == UPB_MULTIPLE) { + | call =>h->msgs[0]->jit_startmsg_pclabel + | jmp ->exit_jit + } + + // TODO: push return addresses for re-entry (will be necessary for multiple + // buffer support). + for (int i = 0; i < h->msgs_len; i++) upb_decoder_jit_msg(d, h->msgs[i]); + + |->exit_jit: + | pop rbx + | pop r12 + | pop r13 + | pop r14 + | pop r15 + | leave + | ret + |=>0: + | mov rdi, stderr + | mov rsi, dbgfmt + | callp fprintf + | callp abort +} + +void upb_decoder_jit_assignfieldlabs(upb_fhandlers *f, + uint32_t *pclabel_count) { + f->jit_pclabel = (*pclabel_count)++; + f->jit_pclabel_notypecheck = (*pclabel_count)++; + f->jit_submsg_done_pclabel = (*pclabel_count)++; +} + +void upb_decoder_jit_assignmsglabs(upb_mhandlers *m, uint32_t *pclabel_count) { + m->jit_startmsg_pclabel = (*pclabel_count)++; + m->jit_endofbuf_pclabel = (*pclabel_count)++; + m->jit_endofmsg_pclabel = (*pclabel_count)++; + m->jit_unknownfield_pclabel = (*pclabel_count)++; + m->jit_parent_field_done_pclabel = UPB_NONE; + m->max_field_number = 0; + upb_inttable_iter i; + for(i = upb_inttable_begin(&m->fieldtab); !upb_inttable_done(i); + i = upb_inttable_next(&m->fieldtab, i)) { + uint32_t key = upb_inttable_iter_key(i); + m->max_field_number = UPB_MAX(m->max_field_number, key); + upb_fhandlers *f = upb_inttable_iter_value(i); + upb_decoder_jit_assignfieldlabs(f, pclabel_count); + } + // XXX: Won't work for large field numbers; will need to use a upb_table. + m->tablearray = malloc((m->max_field_number + 1) * sizeof(void*)); +} + +// Second pass: for messages that have only one parent, link them to the field +// from which they are called. +void upb_decoder_jit_assignmsglabs2(upb_mhandlers *m) { + upb_inttable_iter i; + for(i = upb_inttable_begin(&m->fieldtab); !upb_inttable_done(i); + i = upb_inttable_next(&m->fieldtab, i)) { + upb_fhandlers *f = upb_inttable_iter_value(i); + if (upb_issubmsgtype(f->type)) { + upb_mhandlers *sub_m = upb_fhandlers_getsubmsg(f); + if (sub_m->jit_parent_field_done_pclabel == UPB_NONE) { + sub_m->jit_parent_field_done_pclabel = f->jit_submsg_done_pclabel; + } else { + sub_m->jit_parent_field_done_pclabel = UPB_MULTIPLE; + } + } + } +} + +void upb_decoder_makejit(upb_decoder *d) { + d->debug_info = NULL; + + // Assign pclabels. + uint32_t pclabel_count = 1; + upb_handlers *h = d->dispatcher.handlers; + for (int i = 0; i < h->msgs_len; i++) + upb_decoder_jit_assignmsglabs(h->msgs[i], &pclabel_count); + for (int i = 0; i < h->msgs_len; i++) + upb_decoder_jit_assignmsglabs2(h->msgs[i]); + + if (h->msgs[0]->jit_parent_field_done_pclabel == UPB_NONE) { + h->msgs[0]->jit_parent_field_done_pclabel = UPB_TOPLEVEL_ONE; + } + + void **globals = malloc(UPB_JIT_GLOBAL__MAX * sizeof(*globals)); + dasm_init(d, 1); + dasm_setupglobal(d, globals, UPB_JIT_GLOBAL__MAX); + dasm_growpc(d, pclabel_count); + dasm_setup(d, upb_jit_actionlist); + + upb_decoder_jit(d); + + dasm_link(d, &d->jit_size); + + d->jit_code = mmap(NULL, d->jit_size, PROT_READ | PROT_WRITE, + MAP_32BIT | MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + + upb_reg_jit_gdb(d); + + dasm_encode(d, d->jit_code); + + // Create dispatch tables. + for (int i = 0; i < h->msgs_len; i++) { + upb_mhandlers *m = h->msgs[i]; + for (uint32_t j = 0; j <= m->max_field_number; j++) { + upb_fhandlers *f = NULL; + for (int k = 0; k < 8; k++) { + f = upb_inttable_lookup(&m->fieldtab, (j << 3) | k); + if (f) break; + } + if (f) { + m->tablearray[j] = d->jit_code + dasm_getpclabel(d, f->jit_pclabel); + } else { + // Don't handle unknown fields yet. + m->tablearray[j] = d->jit_code + dasm_getpclabel(d, 0); + } + } + } + + dasm_free(d); + free(globals); + + mprotect(d->jit_code, d->jit_size, PROT_EXEC | PROT_READ); + + FILE *f = fopen("/tmp/machine-code", "wb"); + fwrite(d->jit_code, d->jit_size, 1, f); + fclose(f); +} + +void upb_decoder_freejit(upb_decoder *d) { + munmap(d->jit_code, d->jit_size); + free(d->debug_info); + // TODO: unregister +} diff --git a/upb/pb/encoder.c b/upb/pb/encoder.c new file mode 100644 index 0000000..139dc88 --- /dev/null +++ b/upb/pb/encoder.c @@ -0,0 +1,421 @@ +/* + * upb - a minimalist implementation of protocol buffers. + * + * Copyright (c) 2009 Google Inc. See LICENSE for details. + * Author: Josh Haberman + */ + +#include "upb_encoder.h" + +#include +#include "descriptor.h" + +/* Functions for calculating sizes of wire values. ****************************/ + +static size_t upb_v_uint64_t_size(uint64_t val) { +#ifdef __GNUC__ + int high_bit = 63 - __builtin_clzll(val); // 0-based, undef if val == 0. +#else + int high_bit = 0; + uint64_t tmp = val; + while(tmp >>= 1) high_bit++; +#endif + return val == 0 ? 1 : high_bit / 7 + 1; +} + +static size_t upb_v_int32_t_size(int32_t val) { + // v_uint32's are sign-extended to maintain wire compatibility with int64s. + return upb_v_uint64_t_size((int64_t)val); +} +static size_t upb_v_uint32_t_size(uint32_t val) { + return upb_v_uint64_t_size(val); +} +static size_t upb_f_uint64_t_size(uint64_t val) { + (void)val; // Length is independent of value. + return sizeof(uint64_t); +} +static size_t upb_f_uint32_t_size(uint32_t val) { + (void)val; // Length is independent of value. + return sizeof(uint32_t); +} + + +/* Functions to write wire values. ********************************************/ + +// Since we know in advance the longest that the value could be, we always make +// sure that our buffer is long enough. This saves us from having to perform +// bounds checks. + +// Puts a varint (wire type: UPB_WIRE_TYPE_VARINT). +static uint8_t *upb_put_v_uint64_t(uint8_t *buf, uint64_t val) +{ + do { + uint8_t byte = val & 0x7f; + val >>= 7; + if(val) byte |= 0x80; + *buf++ = byte; + } while(val); + return buf; +} + +// Puts an unsigned 32-bit varint, verbatim. Never uses the high 64 bits. +static uint8_t *upb_put_v_uint32_t(uint8_t *buf, uint32_t val) +{ + return upb_put_v_uint64_t(buf, val); +} + +// Puts a signed 32-bit varint, first sign-extending to 64-bits. We do this to +// maintain wire-compatibility with 64-bit signed integers. +static uint8_t *upb_put_v_int32_t(uint8_t *buf, int32_t val) +{ + return upb_put_v_uint64_t(buf, (int64_t)val); +} + +static void upb_put32(uint8_t *buf, uint32_t val) { + buf[0] = val & 0xff; + buf[1] = (val >> 8) & 0xff; + buf[2] = (val >> 16) & 0xff; + buf[3] = (val >> 24); +} + +// Puts a fixed-length 32-bit integer (wire type: UPB_WIRE_TYPE_32BIT). +static uint8_t *upb_put_f_uint32_t(uint8_t *buf, uint32_t val) +{ + uint8_t *uint32_end = buf + sizeof(uint32_t); +#if UPB_UNALIGNED_READS_OK + *(uint32_t*)buf = val; +#else + upb_put32(buf, val); +#endif + return uint32_end; +} + +// Puts a fixed-length 64-bit integer (wire type: UPB_WIRE_TYPE_64BIT). +static uint8_t *upb_put_f_uint64_t(uint8_t *buf, uint64_t val) +{ + uint8_t *uint64_end = buf + sizeof(uint64_t); +#if UPB_UNALIGNED_READS_OK + *(uint64_t*)buf = val; +#else + upb_put32(buf, (uint32_t)val); + upb_put32(buf, (uint32_t)(val >> 32)); +#endif + return uint64_end; +} + +/* Functions to write and calculate sizes for .proto values. ******************/ + +// Performs zig-zag encoding, which is used by sint32 and sint64. +static uint32_t upb_zzenc_32(int32_t n) { return (n << 1) ^ (n >> 31); } +static uint64_t upb_zzenc_64(int64_t n) { return (n << 1) ^ (n >> 63); } + +/* Use macros to define a set of two functions for each .proto type: + * + * // Converts and writes a .proto value into buf. "end" indicates the end + * // of the current available buffer (if the buffer does not contain enough + * // space UPB_STATUS_NEED_MORE_DATA is returned). On success, *outbuf will + * // point one past the data that was written. + * uint8_t *upb_put_INT32(uint8_t *buf, int32_t val); + * + * // Returns the number of bytes required to encode val. + * size_t upb_get_INT32_size(int32_t val); + * + * // Given a .proto value s (source) convert it to a wire value. + * uint32_t upb_vtowv_INT32(int32_t s); + */ + +#define VTOWV(type, wire_t, val_t) \ + static wire_t upb_vtowv_ ## type(val_t s) + +#define PUT(type, v_or_f, wire_t, val_t, member_name) \ + static uint8_t *upb_put_ ## type(uint8_t *buf, val_t val) { \ + wire_t tmp = upb_vtowv_ ## type(val); \ + return upb_put_ ## v_or_f ## _ ## wire_t(buf, tmp); \ + } + +#define T(type, v_or_f, wire_t, val_t, member_name) \ + static size_t upb_get_ ## type ## _size(val_t val) { \ + return upb_ ## v_or_f ## _ ## wire_t ## _size(val); \ + } \ + VTOWV(type, wire_t, val_t); /* prototype for PUT below */ \ + PUT(type, v_or_f, wire_t, val_t, member_name) \ + VTOWV(type, wire_t, val_t) + +T(INT32, v, int32_t, int32_t, int32) { return (uint32_t)s; } +T(INT64, v, uint64_t, int64_t, int64) { return (uint64_t)s; } +T(UINT32, v, uint32_t, uint32_t, uint32) { return s; } +T(UINT64, v, uint64_t, uint64_t, uint64) { return s; } +T(SINT32, v, uint32_t, int32_t, int32) { return upb_zzenc_32(s); } +T(SINT64, v, uint64_t, int64_t, int64) { return upb_zzenc_64(s); } +T(FIXED32, f, uint32_t, uint32_t, uint32) { return s; } +T(FIXED64, f, uint64_t, uint64_t, uint64) { return s; } +T(SFIXED32, f, uint32_t, int32_t, int32) { return (uint32_t)s; } +T(SFIXED64, f, uint64_t, int64_t, int64) { return (uint64_t)s; } +T(BOOL, v, uint32_t, bool, _bool) { return (uint32_t)s; } +T(ENUM, v, uint32_t, int32_t, int32) { return (uint32_t)s; } +T(DOUBLE, f, uint64_t, double, _double) { + upb_value v; + v._double = s; + return v.uint64; +} +T(FLOAT, f, uint32_t, float, _float) { + upb_value v; + v._float = s; + return v.uint32; +} +#undef VTOWV +#undef PUT +#undef T + +static uint8_t *upb_encode_value(uint8_t *buf, upb_field_type_t ft, upb_value v) +{ +#define CASE(t, member_name) \ + case UPB_TYPE(t): return upb_put_ ## t(buf, v.member_name); + switch(ft) { + CASE(DOUBLE, _double) + CASE(FLOAT, _float) + CASE(INT32, int32) + CASE(INT64, int64) + CASE(UINT32, uint32) + CASE(UINT64, uint64) + CASE(SINT32, int32) + CASE(SINT64, int64) + CASE(FIXED32, uint32) + CASE(FIXED64, uint64) + CASE(SFIXED32, int32) + CASE(SFIXED64, int64) + CASE(BOOL, _bool) + CASE(ENUM, int32) + default: assert(false); return buf; + } +#undef CASE +} + +static uint32_t _upb_get_value_size(upb_field_type_t ft, upb_value v) +{ +#define CASE(t, member_name) \ + case UPB_TYPE(t): return upb_get_ ## t ## _size(v.member_name); + switch(ft) { + CASE(DOUBLE, _double) + CASE(FLOAT, _float) + CASE(INT32, int32) + CASE(INT64, int64) + CASE(UINT32, uint32) + CASE(UINT64, uint64) + CASE(SINT32, int32) + CASE(SINT64, int64) + CASE(FIXED32, uint32) + CASE(FIXED64, uint64) + CASE(SFIXED32, int32) + CASE(SFIXED64, int64) + CASE(BOOL, _bool) + CASE(ENUM, int32) + default: assert(false); return 0; + } +#undef CASE +} + +static uint8_t *_upb_put_tag(uint8_t *buf, upb_field_number_t num, + upb_wire_type_t wt) +{ + return upb_put_UINT32(buf, wt | (num << 3)); +} + +static uint32_t _upb_get_tag_size(upb_field_number_t num) +{ + return upb_get_UINT32_size(num << 3); +} + + +/* upb_sizebuilder ************************************************************/ + +struct upb_sizebuilder { + // Accumulating size for the current level. + uint32_t size; + + // Stack of sizes for our current nesting. + uint32_t stack[UPB_MAX_NESTING], *top; + + // Vector of sizes. + uint32_t *sizes; + int sizes_len; + int sizes_size; + + upb_status status; +}; + +// upb_sink callbacks. +static upb_sink_status _upb_sizebuilder_valuecb(upb_sink *sink, upb_fielddef *f, + upb_value val, + upb_status *status) +{ + (void)status; + upb_sizebuilder *sb = (upb_sizebuilder*)sink; + uint32_t size = 0; + size += _upb_get_tag_size(f->number); + size += _upb_get_value_size(f->type, val); + sb->size += size; + return UPB_SINK_CONTINUE; +} + +static upb_sink_status _upb_sizebuilder_strcb(upb_sink *sink, upb_fielddef *f, + upb_strptr str, + int32_t start, uint32_t end, + upb_status *status) +{ + (void)status; + (void)str; // String data itself is not used. + upb_sizebuilder *sb = (upb_sizebuilder*)sink; + if(start >= 0) { + uint32_t size = 0; + size += _upb_get_tag_size(f->number); + size += upb_get_UINT32_size(end - start); + sb->size += size; + } + return UPB_SINK_CONTINUE; +} + +static upb_sink_status _upb_sizebuilder_startcb(upb_sink *sink, upb_fielddef *f, + upb_status *status) +{ + (void)status; + (void)f; // Unused (we calculate tag size and delimiter in endcb). + upb_sizebuilder *sb = (upb_sizebuilder*)sink; + if(f->type == UPB_TYPE(MESSAGE)) { + *sb->top = sb->size; + sb->top++; + sb->size = 0; + } else { + assert(f->type == UPB_TYPE(GROUP)); + sb->size += _upb_get_tag_size(f->number); + } + return UPB_SINK_CONTINUE; +} + +static upb_sink_status _upb_sizebuilder_endcb(upb_sink *sink, upb_fielddef *f, + upb_status *status) +{ + (void)status; + upb_sizebuilder *sb = (upb_sizebuilder*)sink; + if(f->type == UPB_TYPE(MESSAGE)) { + sb->top--; + if(sb->sizes_len == sb->sizes_size) { + sb->sizes_size *= 2; + sb->sizes = realloc(sb->sizes, sb->sizes_size * sizeof(*sb->sizes)); + } + uint32_t child_size = sb->size; + uint32_t parent_size = *sb->top; + sb->sizes[sb->sizes_len++] = child_size; + // The size according to the parent includes the tag size and delimiter of + // the submessage. + parent_size += upb_get_UINT32_size(child_size); + parent_size += _upb_get_tag_size(f->number); + // Include size accumulated in parent before child began. + sb->size = child_size + parent_size; + } else { + assert(f->type == UPB_TYPE(GROUP)); + // As an optimization, we could just add this number twice in startcb, to + // avoid having to recalculate it. + sb->size += _upb_get_tag_size(f->number); + } + return UPB_SINK_CONTINUE; +} + +upb_sink_callbacks _upb_sizebuilder_sink_vtbl = { + _upb_sizebuilder_valuecb, + _upb_sizebuilder_strcb, + _upb_sizebuilder_startcb, + _upb_sizebuilder_endcb +}; + + +/* upb_sink callbacks *********************************************************/ + +struct upb_encoder { + upb_sink base; + //upb_bytesink *bytesink; + uint32_t *sizes; + int size_offset; +}; + + +// Within one callback we may need to encode up to two separate values. +#define UPB_ENCODER_BUFSIZE (UPB_MAX_ENCODED_SIZE * 2) + +static upb_sink_status _upb_encoder_push_buf(upb_encoder *s, const uint8_t *buf, + size_t len, upb_status *status) +{ + // TODO: conjure a upb_strptr that points to buf. + //upb_strptr ptr; + (void)s; + (void)buf; + (void)status; + size_t written = 5;// = upb_bytesink_onbytes(s->bytesink, ptr); + if(written < len) { + // TODO: mark to skip "written" bytes next time. + return UPB_SINK_STOP; + } else { + return UPB_SINK_CONTINUE; + } +} + +static upb_sink_status _upb_encoder_valuecb(upb_sink *sink, upb_fielddef *f, + upb_value val, upb_status *status) +{ + upb_encoder *s = (upb_encoder*)sink; + uint8_t buf[UPB_ENCODER_BUFSIZE], *ptr = buf; + upb_wire_type_t wt = upb_types[f->type].expected_wire_type; + // TODO: handle packed encoding. + ptr = _upb_put_tag(ptr, f->number, wt); + ptr = upb_encode_value(ptr, f->type, val); + return _upb_encoder_push_buf(s, buf, ptr - buf, status); +} + +static upb_sink_status _upb_encoder_strcb(upb_sink *sink, upb_fielddef *f, + upb_strptr str, + int32_t start, uint32_t end, + upb_status *status) +{ + upb_encoder *s = (upb_encoder*)sink; + uint8_t buf[UPB_ENCODER_BUFSIZE], *ptr = buf; + if(start >= 0) { + ptr = _upb_put_tag(ptr, f->number, UPB_WIRE_TYPE_DELIMITED); + ptr = upb_put_UINT32(ptr, end - start); + } + // TODO: properly handle partially consumed strings and partially supplied + // strings. + _upb_encoder_push_buf(s, buf, ptr - buf, status); + return _upb_encoder_push_buf(s, (uint8_t*)upb_string_getrobuf(str), end - start, status); +} + +static upb_sink_status _upb_encoder_startcb(upb_sink *sink, upb_fielddef *f, + upb_status *status) +{ + upb_encoder *s = (upb_encoder*)sink; + uint8_t buf[UPB_ENCODER_BUFSIZE], *ptr = buf; + if(f->type == UPB_TYPE(GROUP)) { + ptr = _upb_put_tag(ptr, f->number, UPB_WIRE_TYPE_START_GROUP); + } else { + ptr = _upb_put_tag(ptr, f->number, UPB_WIRE_TYPE_DELIMITED); + ptr = upb_put_UINT32(ptr, s->sizes[--s->size_offset]); + } + return _upb_encoder_push_buf(s, buf, ptr - buf, status); +} + +static upb_sink_status _upb_encoder_endcb(upb_sink *sink, upb_fielddef *f, + upb_status *status) +{ + upb_encoder *s = (upb_encoder*)sink; + uint8_t buf[UPB_ENCODER_BUFSIZE], *ptr = buf; + if(f->type != UPB_TYPE(GROUP)) return UPB_SINK_CONTINUE; + ptr = _upb_put_tag(ptr, f->number, UPB_WIRE_TYPE_END_GROUP); + return _upb_encoder_push_buf(s, buf, ptr - buf, status); +} + +upb_sink_callbacks _upb_encoder_sink_vtbl = { + _upb_encoder_valuecb, + _upb_encoder_strcb, + _upb_encoder_startcb, + _upb_encoder_endcb +}; + diff --git a/upb/pb/encoder.h b/upb/pb/encoder.h new file mode 100644 index 0000000..64c5047 --- /dev/null +++ b/upb/pb/encoder.h @@ -0,0 +1,58 @@ +/* + * upb - a minimalist implementation of protocol buffers. + * + * Copyright (c) 2009-2010 Google Inc. See LICENSE for details. + * Author: Josh Haberman + * + * Implements a set of upb_handlers that write protobuf data to the binary wire + * format. + * + * For messages that have any submessages, the encoder needs a buffer + * containing the submessage sizes, so they can be properly written at the + * front of each message. Note that groups do *not* have this requirement. + */ + +#ifndef UPB_ENCODER_H_ +#define UPB_ENCODER_H_ + +#include "upb.h" +#include "upb_stream.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* upb_encoder ****************************************************************/ + +// A upb_encoder is a upb_sink that emits data to a upb_bytesink in the protocol +// buffer binary wire format. +struct upb_encoder; +typedef struct upb_encoder upb_encoder; + +upb_encoder *upb_encoder_new(upb_msgdef *md); +void upb_encoder_free(upb_encoder *e); + +// Resets the given upb_encoder such that is is ready to begin encoding, +// outputting data to "bytesink" (which must live until the encoder is +// reset or destroyed). +void upb_encoder_reset(upb_encoder *e, upb_bytesink *bytesink); + +// Returns the upb_sink to which data can be written. The sink is invalidated +// when the encoder is reset or destroyed. Note that if the client wants to +// encode any length-delimited submessages it must first call +// upb_encoder_buildsizes() below. +upb_sink *upb_encoder_sink(upb_encoder *e); + +// Call prior to pushing any data with embedded submessages. "src" must yield +// exactly the same data as what will next be encoded, but in reverse order. +// The encoder iterates over this data in order to determine the sizes of the +// submessages. If any errors are returned by the upb_src, the status will +// be saved in *status. If the client is sure that the upb_src will not throw +// any errors, "status" may be NULL. +void upb_encoder_buildsizes(upb_encoder *e, upb_src *src, upb_status *status); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* UPB_ENCODER_H_ */ diff --git a/upb/pb/glue.c b/upb/pb/glue.c new file mode 100644 index 0000000..3763ae0 --- /dev/null +++ b/upb/pb/glue.c @@ -0,0 +1,129 @@ +/* + * upb - a minimalist implementation of protocol buffers. + * + * Copyright (c) 2010 Google Inc. See LICENSE for details. + * Author: Josh Haberman + */ + +#include "upb/bytestream.h" +#include "upb/descriptor.h" +#include "upb/msg.h" +#include "upb/pb/decoder.h" +#include "upb/pb/glue.h" +#include "upb/pb/textprinter.h" + +void upb_strtomsg(const char *str, size_t len, void *msg, upb_msgdef *md, + upb_status *status) { + upb_stringsrc strsrc; + upb_stringsrc_init(&strsrc); + upb_stringsrc_reset(&strsrc, str, len); + + upb_decoder d; + upb_decoder_initformsgdef(&d, md); + upb_decoder_reset(&d, upb_stringsrc_bytesrc(&strsrc), 0, UINT64_MAX, msg); + upb_decoder_decode(&d, status); + + upb_stringsrc_uninit(&strsrc); + upb_decoder_uninit(&d); +} + +#if 0 +void upb_msgtotext(upb_string *str, upb_msg *msg, upb_msgdef *md, + bool single_line) { + upb_stringsink strsink; + upb_stringsink_init(&strsink); + upb_stringsink_reset(&strsink, str); + + upb_textprinter *p = upb_textprinter_new(); + upb_handlers *h = upb_handlers_new(); + upb_textprinter_reghandlers(h, md); + upb_textprinter_reset(p, upb_stringsink_bytesink(&strsink), single_line); + + upb_status status = UPB_STATUS_INIT; + upb_msg_runhandlers(msg, md, h, p, &status); + // None of {upb_msg_runhandlers, upb_textprinter, upb_stringsink} should be + // capable of returning an error. + assert(upb_ok(&status)); + upb_status_uninit(&status); + + upb_stringsink_uninit(&strsink); + upb_textprinter_free(p); + upb_handlers_unref(h); +} +#endif + +// TODO: read->load. +void upb_read_descriptor(upb_symtab *symtab, const char *str, size_t len, + upb_status *status) { + upb_stringsrc strsrc; + upb_stringsrc_init(&strsrc); + upb_stringsrc_reset(&strsrc, str, len); + + upb_handlers *h = upb_handlers_new(); + upb_descreader_reghandlers(h); + + upb_decoder d; + upb_decoder_initforhandlers(&d, h); + upb_handlers_unref(h); + upb_descreader r; + upb_symtabtxn txn; + upb_symtabtxn_init(&txn); + upb_descreader_init(&r, &txn); + upb_decoder_reset(&d, upb_stringsrc_bytesrc(&strsrc), 0, UINT64_MAX, &r); + + upb_decoder_decode(&d, status); + + // Set default accessors and layouts on all messages. + // for msgdef in symtabtxn: + upb_symtabtxn_iter i; + upb_symtabtxn_begin(&i, &txn); + for(; !upb_symtabtxn_done(&i); upb_symtabtxn_next(&i)) { + upb_def *def = upb_symtabtxn_iter_def(&i); + upb_msgdef *md = upb_dyncast_msgdef(def); + if (!md) return; + // For field in msgdef: + upb_msg_iter i; + for(i = upb_msg_begin(md); !upb_msg_done(i); i = upb_msg_next(md, i)) { + upb_fielddef *f = upb_msg_iter_field(i); + upb_fielddef_setaccessor(f, upb_stdmsg_accessor(f)); + } + upb_msgdef_layout(md); + } + + if (upb_ok(status)) upb_symtab_commit(symtab, &txn, status); + + upb_symtabtxn_uninit(&txn); + upb_descreader_uninit(&r); + upb_stringsrc_uninit(&strsrc); + upb_decoder_uninit(&d); +} + +char *upb_readfile(const char *filename, size_t *len) { + FILE *f = fopen(filename, "rb"); + if(!f) return NULL; + if(fseek(f, 0, SEEK_END) != 0) goto error; + long size = ftell(f); + if(size < 0) goto error; + if(fseek(f, 0, SEEK_SET) != 0) goto error; + char *buf = malloc(size); + if(fread(buf, size, 1, f) != 1) goto error; + fclose(f); + if (len) *len = size; + return buf; + +error: + fclose(f); + return NULL; +} + +void upb_read_descriptorfile(upb_symtab *symtab, const char *fname, + upb_status *status) { + size_t len; + char *data = upb_readfile(fname, &len); + if (!data) { + upb_status_setf(status, UPB_ERROR, "Couldn't read file: %s", fname); + return; + } + upb_read_descriptor(symtab, data, len, status); + free(data); +} diff --git a/upb/pb/glue.h b/upb/pb/glue.h new file mode 100644 index 0000000..5359120 --- /dev/null +++ b/upb/pb/glue.h @@ -0,0 +1,62 @@ +/* + * upb - a minimalist implementation of protocol buffers. + * + * Copyright (c) 2011 Google Inc. See LICENSE for details. + * Author: Josh Haberman + * + * upb's core components like upb_decoder and upb_msg are carefully designed to + * avoid depending on each other for maximum orthogonality. In other words, + * you can use a upb_decoder to decode into *any* kind of structure; upb_msg is + * just one such structure. A upb_msg can be serialized/deserialized into any + * format, protobuf binary format is just one such format. + * + * However, for convenience we provide functions here for doing common + * operations like deserializing protobuf binary format into a upb_msg. The + * compromise is that this file drags in almost all of upb as a dependency, + * which could be undesirable if you're trying to use a trimmed-down build of + * upb. + * + * While these routines are convenient, they do not reuse any encoding/decoding + * state. For example, if a decoder is JIT-based, it will be re-JITted every + * time these functions are called. For this reason, if you are parsing lots + * of data and efficiency is an issue, these may not be the best functions to + * use (though they are useful for prototyping, before optimizing). + */ + +#ifndef UPB_GLUE_H +#define UPB_GLUE_H + +#include +#include "upb/upb.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Forward-declares so we don't have to include everything in this .h file. +// Clients should use the regular, typedef'd names (eg. upb_string). +struct _upb_msg; +struct _upb_msgdef; +struct _upb_symtab; + +// Decodes the given string, which must be in protobuf binary format, to the +// given upb_msg with msgdef "md", storing the status of the operation in "s". +void upb_strtomsg(const char *str, size_t len, void *msg, + struct _upb_msgdef *md, upb_status *s); + +//void upb_msgtotext(struct _upb_string *str, void *msg, +// struct _upb_msgdef *md, bool single_line); + +void upb_read_descriptor(struct _upb_symtab *symtab, const char *str, size_t len, + upb_status *status); + +void upb_read_descriptorfile(struct _upb_symtab *symtab, const char *fname, + upb_status *status); + +char *upb_readfile(const char *filename, size_t *len); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/upb/pb/jit_debug_elf_file.s b/upb/pb/jit_debug_elf_file.s new file mode 100644 index 0000000..0b74630 --- /dev/null +++ b/upb/pb/jit_debug_elf_file.s @@ -0,0 +1,7 @@ + .file "JIT mcode" + .text +upb_jit_compiled_decoder: + .globl upb_jit_compiled_decoder + .size upb_jit_compiled_decoder, 0x321 + .type upb_jit_compiled_decoder STT_FUNC + .space 0x321 diff --git a/upb/pb/textprinter.c b/upb/pb/textprinter.c new file mode 100644 index 0000000..ce029d5 --- /dev/null +++ b/upb/pb/textprinter.c @@ -0,0 +1,199 @@ +/* + * upb - a minimalist implementation of protocol buffers. + * + * Copyright (c) 2009 Google Inc. See LICENSE for details. + * Author: Josh Haberman + */ + +#include +#include +#include +#include +#include "upb/pb/textprinter.h" + +struct _upb_textprinter { + upb_bytesink *bytesink; + int indent_depth; + bool single_line; + upb_status status; +}; + +#define CHECK(x) if ((x) < 0) goto err; + +static int upb_textprinter_putescaped(upb_textprinter *p, upb_strref *strref, + bool preserve_utf8) { + // Based on CEscapeInternal() from Google's protobuf release. + // TODO; we could read directly fraom a bytesrc's buffer instead. + // TODO; we could write directly into a bytesink's buffer instead. + char dstbuf[4096], *dst = dstbuf, *dstend = dstbuf + sizeof(dstbuf); + char buf[strref->len], *src = buf; + char *end = src + strref->len; + upb_bytesrc_read(strref->bytesrc, strref->stream_offset, strref->len, buf); + + // I think hex is prettier and more useful, but proto2 uses octal; should + // investigate whether it can parse hex also. + bool use_hex = false; + bool last_hex_escape = false; // true if last output char was \xNN + + for (; src < end; src++) { + if (dstend - dst < 4) { + CHECK(upb_bytesink_write(p->bytesink, dstbuf, dst - dstbuf, &p->status)); + dst = dstbuf; + } + + bool is_hex_escape = false; + switch (*src) { + case '\n': *(dst++) = '\\'; *(dst++) = 'n'; break; + case '\r': *(dst++) = '\\'; *(dst++) = 'r'; break; + case '\t': *(dst++) = '\\'; *(dst++) = 't'; break; + case '\"': *(dst++) = '\\'; *(dst++) = '\"'; break; + case '\'': *(dst++) = '\\'; *(dst++) = '\''; break; + case '\\': *(dst++) = '\\'; *(dst++) = '\\'; break; + default: + // Note that if we emit \xNN and the src character after that is a hex + // digit then that digit must be escaped too to prevent it being + // interpreted as part of the character code by C. + if ((!preserve_utf8 || (uint8_t)*src < 0x80) && + (!isprint(*src) || (last_hex_escape && isxdigit(*src)))) { + sprintf(dst, (use_hex ? "\\x%02x" : "\\%03o"), (uint8_t)*src); + is_hex_escape = use_hex; + dst += 4; + } else { + *(dst++) = *src; break; + } + } + last_hex_escape = is_hex_escape; + } + // Flush remaining data. + CHECK(upb_bytesink_write(p->bytesink, dst, dst - dstbuf, &p->status)); + return 0; +err: + return -1; +} + +static int upb_textprinter_indent(upb_textprinter *p) { + if(!p->single_line) + for(int i = 0; i < p->indent_depth; i++) + CHECK(upb_bytesink_writestr(p->bytesink, " ", &p->status)); + return 0; +err: + return -1; +} + +static int upb_textprinter_endfield(upb_textprinter *p) { + if(p->single_line) { + CHECK(upb_bytesink_writestr(p->bytesink, " ", &p->status)); + } else { + CHECK(upb_bytesink_writestr(p->bytesink, "\n", &p->status)); + } + return 0; +err: + return -1; +} + +static upb_flow_t upb_textprinter_value(void *_p, upb_value fval, + upb_value val) { + upb_textprinter *p = _p; + upb_fielddef *f = upb_value_getfielddef(fval); + upb_textprinter_indent(p); + CHECK(upb_bytesink_printf(p->bytesink, &p->status, "%s: ", f->name)); +#define CASE(fmtstr, member) \ + CHECK(upb_bytesink_printf(p->bytesink, &p->status, fmtstr, upb_value_get ## member(val))); break; + switch(f->type) { + // TODO: figure out what we should really be doing for these + // floating-point formats. + case UPB_TYPE(DOUBLE): + CHECK(upb_bytesink_printf(p->bytesink, &p->status, "%.*g", DBL_DIG, upb_value_getdouble(val))); break; + case UPB_TYPE(FLOAT): + CHECK(upb_bytesink_printf(p->bytesink, &p->status, "%.*g", FLT_DIG+2, upb_value_getfloat(val))); break; + case UPB_TYPE(INT64): + case UPB_TYPE(SFIXED64): + case UPB_TYPE(SINT64): + CASE("%" PRId64, int64) + case UPB_TYPE(UINT64): + case UPB_TYPE(FIXED64): + CASE("%" PRIu64, uint64) + case UPB_TYPE(UINT32): + case UPB_TYPE(FIXED32): + CASE("%" PRIu32, uint32); + case UPB_TYPE(ENUM): { + upb_enumdef *enum_def = upb_downcast_enumdef(f->def); + const char *label = upb_enumdef_iton(enum_def, upb_value_getint32(val)); + if (label) { + // We found a corresponding string for this enum. Otherwise we fall + // through to the int32 code path. + CHECK(upb_bytesink_writestr(p->bytesink, label, &p->status)); + break; + } + } + case UPB_TYPE(INT32): + case UPB_TYPE(SFIXED32): + case UPB_TYPE(SINT32): + CASE("%" PRId32, int32) + case UPB_TYPE(BOOL): + CASE("%hhu", bool); + case UPB_TYPE(STRING): + case UPB_TYPE(BYTES): { + CHECK(upb_bytesink_writestr(p->bytesink, "\"", &p->status)); + CHECK(upb_textprinter_putescaped(p, upb_value_getstrref(val), + f->type == UPB_TYPE(STRING))); + CHECK(upb_bytesink_writestr(p->bytesink, "\"", &p->status)); + break; + } + } + upb_textprinter_endfield(p); + return UPB_CONTINUE; +err: + return UPB_BREAK; +} + +static upb_sflow_t upb_textprinter_startsubmsg(void *_p, upb_value fval) { + upb_textprinter *p = _p; + upb_fielddef *f = upb_value_getfielddef(fval); + upb_textprinter_indent(p); + bool ret = upb_bytesink_printf(p->bytesink, &p->status, "%s {", f->name); + if (!ret) return UPB_SBREAK; + if (!p->single_line) + upb_bytesink_writestr(p->bytesink, "\n", &p->status); + p->indent_depth++; + return UPB_CONTINUE_WITH(_p); +} + +static upb_flow_t upb_textprinter_endsubmsg(void *_p, upb_value fval) { + (void)fval; + upb_textprinter *p = _p; + p->indent_depth--; + upb_textprinter_indent(p); + upb_bytesink_writestr(p->bytesink, "}", &p->status); + upb_textprinter_endfield(p); + return UPB_CONTINUE; +} + +upb_textprinter *upb_textprinter_new() { + upb_textprinter *p = malloc(sizeof(*p)); + return p; +} + +void upb_textprinter_free(upb_textprinter *p) { + free(p); +} + +void upb_textprinter_reset(upb_textprinter *p, upb_bytesink *sink, + bool single_line) { + p->bytesink = sink; + p->single_line = single_line; + p->indent_depth = 0; +} + +upb_mhandlers *upb_textprinter_reghandlers(upb_handlers *h, upb_msgdef *m) { + upb_handlerset hset = { + NULL, // startmsg + NULL, // endmsg + upb_textprinter_value, + upb_textprinter_startsubmsg, + upb_textprinter_endsubmsg, + NULL, // startseq + NULL, // endseq + }; + return upb_handlers_reghandlerset(h, m, &hset); +} diff --git a/upb/pb/textprinter.h b/upb/pb/textprinter.h new file mode 100644 index 0000000..9455208 --- /dev/null +++ b/upb/pb/textprinter.h @@ -0,0 +1,31 @@ +/* + * upb - a minimalist implementation of protocol buffers. + * + * Copyright (c) 2009 Google Inc. See LICENSE for details. + * Author: Josh Haberman + */ + +#ifndef UPB_TEXT_H_ +#define UPB_TEXT_H_ + +#include "upb/bytestream.h" +#include "upb/handlers.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct _upb_textprinter; +typedef struct _upb_textprinter upb_textprinter; + +upb_textprinter *upb_textprinter_new(); +void upb_textprinter_free(upb_textprinter *p); +void upb_textprinter_reset(upb_textprinter *p, upb_bytesink *sink, + bool single_line); +upb_mhandlers *upb_textprinter_reghandlers(upb_handlers *h, upb_msgdef *m); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* UPB_TEXT_H_ */ diff --git a/upb/pb/varint.c b/upb/pb/varint.c new file mode 100644 index 0000000..45caec1 --- /dev/null +++ b/upb/pb/varint.c @@ -0,0 +1,54 @@ +/* + * upb - a minimalist implementation of protocol buffers. + * + * Copyright (c) 2011 Google Inc. See LICENSE for details. + * Author: Josh Haberman + */ + +#include "upb/pb/varint.h" + +// Given an encoded varint v, returns an integer with a single bit set that +// indicates the end of the varint. Subtracting one from this value will +// yield a mask that leaves only bits that are part of the varint. Returns +// 0 if the varint is unterminated. +INLINE uint64_t upb_get_vstopbit(uint64_t v) { + uint64_t cbits = v | 0x7f7f7f7f7f7f7f7fULL; + return ~cbits & (cbits+1); +} +INLINE uint64_t upb_get_vmask(uint64_t v) { return upb_get_vstopbit(v) - 1; } + +upb_decoderet upb_vdecode_max8_massimino(upb_decoderet r) { + uint64_t b; + memcpy(&b, r.p, sizeof(b)); + uint64_t stop_bit = upb_get_vstopbit(b); + b = (b & 0x7f7f7f7f7f7f7f7fULL) & (stop_bit - 1); + b += b & 0x007f007f007f007fULL; + b += 3 * (b & 0x0000ffff0000ffffULL); + b += 15 * (b & 0x00000000ffffffffULL); + if (stop_bit == 0) { + // Error: unterminated varint. + upb_decoderet err_r = {(void*)0, 0}; + return err_r; + } + upb_decoderet my_r = {r.p + ((__builtin_ctzll(stop_bit) + 1) / 8), + r.val | (b << 7)}; + return my_r; +} + +upb_decoderet upb_vdecode_max8_wright(upb_decoderet r) { + uint64_t b; + memcpy(&b, r.p, sizeof(b)); + uint64_t stop_bit = upb_get_vstopbit(b); + b &= (stop_bit - 1); + b = ((b & 0x7f007f007f007f00) >> 1) | (b & 0x007f007f007f007f); + b = ((b & 0xffff0000ffff0000) >> 2) | (b & 0x0000ffff0000ffff); + b = ((b & 0xffffffff00000000) >> 4) | (b & 0x00000000ffffffff); + if (stop_bit == 0) { + // Error: unterminated varint. + upb_decoderet err_r = {(void*)0, 0}; + return err_r; + } + upb_decoderet my_r = {r.p + ((__builtin_ctzll(stop_bit) + 1) / 8), + r.val | (b << 14)}; + return my_r; +} diff --git a/upb/pb/varint.h b/upb/pb/varint.h new file mode 100644 index 0000000..1bbd193 --- /dev/null +++ b/upb/pb/varint.h @@ -0,0 +1,142 @@ +/* + * upb - a minimalist implementation of protocol buffers. + * + * Copyright (c) 2011 Google Inc. See LICENSE for details. + * Author: Josh Haberman + * + * A number of routines for varint manipulation (we keep them all around to + * have multiple approaches available for benchmarking). + */ + +#ifndef UPB_VARINT_DECODER_H_ +#define UPB_VARINT_DECODER_H_ + +#include +#include +#include "upb/upb.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Decoding *******************************************************************/ + +// All decoding functions return this struct by value. +typedef struct { + const char *p; // NULL if the varint was unterminated. + uint64_t val; +} upb_decoderet; + +// A basic branch-based decoder, uses 32-bit values to get good performance +// on 32-bit architectures (but performs well on 64-bits also). +INLINE upb_decoderet upb_vdecode_branch32(const char *p) { + upb_decoderet r = {NULL, 0}; + uint32_t low, high = 0; + uint32_t b; + b = *(p++); low = (b & 0x7f) ; if(!(b & 0x80)) goto done; + b = *(p++); low |= (b & 0x7f) << 7; if(!(b & 0x80)) goto done; + b = *(p++); low |= (b & 0x7f) << 14; if(!(b & 0x80)) goto done; + b = *(p++); low |= (b & 0x7f) << 21; if(!(b & 0x80)) goto done; + b = *(p++); low |= (b & 0x7f) << 28; + high = (b & 0x7f) >> 4; if(!(b & 0x80)) goto done; + b = *(p++); high |= (b & 0x7f) << 3; if(!(b & 0x80)) goto done; + b = *(p++); high |= (b & 0x7f) << 10; if(!(b & 0x80)) goto done; + b = *(p++); high |= (b & 0x7f) << 17; if(!(b & 0x80)) goto done; + b = *(p++); high |= (b & 0x7f) << 24; if(!(b & 0x80)) goto done; + b = *(p++); high |= (b & 0x7f) << 31; if(!(b & 0x80)) goto done; + return r; + +done: + r.val = ((uint64_t)high << 32) | low; + r.p = p; + return r; +} + +// Like the previous, but uses 64-bit values. +INLINE upb_decoderet upb_vdecode_branch64(const char *p) { + uint64_t val; + uint64_t b; + upb_decoderet r = {(void*)0, 0}; + b = *(p++); val = (b & 0x7f) ; if(!(b & 0x80)) goto done; + b = *(p++); val |= (b & 0x7f) << 7; if(!(b & 0x80)) goto done; + b = *(p++); val |= (b & 0x7f) << 14; if(!(b & 0x80)) goto done; + b = *(p++); val |= (b & 0x7f) << 21; if(!(b & 0x80)) goto done; + b = *(p++); val |= (b & 0x7f) << 28; if(!(b & 0x80)) goto done; + b = *(p++); val |= (b & 0x7f) << 35; if(!(b & 0x80)) goto done; + b = *(p++); val |= (b & 0x7f) << 42; if(!(b & 0x80)) goto done; + b = *(p++); val |= (b & 0x7f) << 49; if(!(b & 0x80)) goto done; + b = *(p++); val |= (b & 0x7f) << 56; if(!(b & 0x80)) goto done; + b = *(p++); val |= (b & 0x7f) << 63; if(!(b & 0x80)) goto done; + return r; + +done: + r.val = val; + r.p = p; + return r; +} + +// Decodes a varint of at most 8 bytes without branching (except for error). +upb_decoderet upb_vdecode_max8_wright(upb_decoderet r); + +// Another implementation of the previous. +upb_decoderet upb_vdecode_max8_massimino(upb_decoderet r); + +// Template for a function that checks the first two bytes with branching +// and dispatches 2-10 bytes with a separate function. +#define UPB_VARINT_DECODER_CHECK2(name, decode_max8_function) \ +INLINE upb_decoderet upb_vdecode_check2_ ## name(const char *_p) { \ + uint8_t *p = (uint8_t*)_p; \ + if ((*p & 0x80) == 0) { upb_decoderet r = {_p + 1, *p & 0x7f}; return r; } \ + upb_decoderet r = {_p + 2, (*p & 0x7f) | ((*(p + 1) & 0x7f) << 7)}; \ + if ((*(p + 1) & 0x80) == 0) return r; \ + return decode_max8_function(r); \ +} + +UPB_VARINT_DECODER_CHECK2(wright, upb_vdecode_max8_wright); +UPB_VARINT_DECODER_CHECK2(massimino, upb_vdecode_max8_massimino); +#undef UPB_VARINT_DECODER_CHECK2 + +// Our canonical functions for decoding varints, based on the currently +// favored best-performing implementations. +INLINE upb_decoderet upb_vdecode_fast(const char *p) { + // Use nobranch2 on 64-bit, branch32 on 32-bit. + if (sizeof(long) == 8) + return upb_vdecode_check2_massimino(p); + else + return upb_vdecode_branch32(p); +} + +INLINE upb_decoderet upb_vdecode_max8_fast(upb_decoderet r) { + return upb_vdecode_max8_massimino(r); +} + + +/* Encoding *******************************************************************/ + +INLINE size_t upb_value_size(uint64_t val) { +#ifdef __GNUC__ + int high_bit = 63 - __builtin_clzll(val); // 0-based, undef if val == 0. +#else + int high_bit = 0; + uint64_t tmp = val; + while(tmp >>= 1) high_bit++; +#endif + return val == 0 ? 1 : high_bit / 8 + 1; +} + +// Encodes a 32-bit varint, *not* sign-extended. +INLINE uint64_t upb_vencode32(uint32_t val) { + uint64_t ret = 0; + for (int bitpos = 0; val; bitpos+=8, val >>=7) { + if (bitpos > 0) ret |= (1 << (bitpos-1)); + ret |= (val & 0x7f) << bitpos; + } + return ret; +} + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* UPB_VARINT_DECODER_H_ */ -- cgit v1.2.3