From 3d0c7c45da5b72a88bfb03dc5ce3384b7f01cef6 Mon Sep 17 00:00:00 2001 From: Josh Haberman Date: Tue, 18 Nov 2014 15:21:50 -0800 Subject: Sync to Google-internal development. --- upb/pb/compile_decoder.c | 308 ++++++++-------- upb/pb/compile_decoder_x64.c | 2 +- upb/pb/compile_decoder_x64.dasc | 28 +- upb/pb/encoder.c | 769 ++++++++++++++++++++++------------------ upb/pb/encoder.h | 163 +++++++-- upb/pb/varint.c | 23 ++ upb/pb/varint.int.h | 9 + 7 files changed, 760 insertions(+), 542 deletions(-) (limited to 'upb/pb') diff --git a/upb/pb/compile_decoder.c b/upb/pb/compile_decoder.c index 8452bea..64689f6 100644 --- a/upb/pb/compile_decoder.c +++ b/upb/pb/compile_decoder.c @@ -149,7 +149,7 @@ const upb_pbdecodermethod *upb_pbdecodermethod_new( } -/* compiler *******************************************************************/ +/* bytecode compiler **********************************************************/ // Data used only at compilation time. typedef struct { @@ -575,8 +575,8 @@ static void putsel(compiler *c, opcode op, upb_selector_t sel, // Puts an opcode to call a callback, but only if a callback actually exists for // this field and handler type. -static void putcb(compiler *c, opcode op, const upb_handlers *h, - const upb_fielddef *f, upb_handlertype_t type) { +static void maybeput(compiler *c, opcode op, const upb_handlers *h, + const upb_fielddef *f, upb_handlertype_t type) { putsel(c, op, getsel(f, type), h); } @@ -589,40 +589,165 @@ static bool haslazyhandlers(const upb_handlers *h, const upb_fielddef *f) { upb_handlers_gethandler(h, getsel(f, UPB_HANDLER_ENDSTR)); } + +/* bytecode compiler code generation ******************************************/ + +// Symbolic names for our local labels. +#define LABEL_LOOPSTART 1 // Top of a repeated field loop. +#define LABEL_LOOPBREAK 2 // To jump out of a repeated loop +#define LABEL_FIELD 3 // Jump backward to find the most recent field. +#define LABEL_ENDMSG 4 // To reach the OP_ENDMSG instr for this msg. + +// Generates bytecode to parse a single non-lazy message field. +static void generate_msgfield(compiler *c, const upb_fielddef *f, + upb_pbdecodermethod *method) { + const upb_handlers *h = upb_pbdecodermethod_desthandlers(method); + const upb_pbdecodermethod *sub_m = find_submethod(c, method, f); + + if (!sub_m) { + // Don't emit any code for this field at all; it will be parsed as an + // unknown field. + return; + } + + label(c, LABEL_FIELD); + + int wire_type = + (upb_fielddef_descriptortype(f) == UPB_DESCRIPTOR_TYPE_MESSAGE) + ? UPB_WIRE_TYPE_DELIMITED + : UPB_WIRE_TYPE_START_GROUP; + + if (upb_fielddef_isseq(f)) { + putop(c, OP_CHECKDELIM, LABEL_ENDMSG); + putchecktag(c, f, wire_type, LABEL_DISPATCH); + dispatchtarget(c, method, f, wire_type); + putop(c, OP_PUSHTAGDELIM, 0); + putop(c, OP_STARTSEQ, getsel(f, UPB_HANDLER_STARTSEQ)); + label(c, LABEL_LOOPSTART); + putpush(c, f); + putop(c, OP_STARTSUBMSG, getsel(f, UPB_HANDLER_STARTSUBMSG)); + putop(c, OP_CALL, sub_m); + putop(c, OP_POP); + maybeput(c, OP_ENDSUBMSG, h, f, UPB_HANDLER_ENDSUBMSG); + if (wire_type == UPB_WIRE_TYPE_DELIMITED) { + putop(c, OP_SETDELIM); + } + putop(c, OP_CHECKDELIM, LABEL_LOOPBREAK); + putchecktag(c, f, wire_type, LABEL_LOOPBREAK); + putop(c, OP_BRANCH, -LABEL_LOOPSTART); + label(c, LABEL_LOOPBREAK); + putop(c, OP_POP); + maybeput(c, OP_ENDSEQ, h, f, UPB_HANDLER_ENDSEQ); + } else { + putop(c, OP_CHECKDELIM, LABEL_ENDMSG); + putchecktag(c, f, wire_type, LABEL_DISPATCH); + dispatchtarget(c, method, f, wire_type); + putpush(c, f); + putop(c, OP_STARTSUBMSG, getsel(f, UPB_HANDLER_STARTSUBMSG)); + putop(c, OP_CALL, sub_m); + putop(c, OP_POP); + maybeput(c, OP_ENDSUBMSG, h, f, UPB_HANDLER_ENDSUBMSG); + if (wire_type == UPB_WIRE_TYPE_DELIMITED) { + putop(c, OP_SETDELIM); + } + } +} + +// Generates bytecode to parse a single string or lazy submessage field. +static void generate_delimfield(compiler *c, const upb_fielddef *f, + upb_pbdecodermethod *method) { + const upb_handlers *h = upb_pbdecodermethod_desthandlers(method); + + label(c, LABEL_FIELD); + if (upb_fielddef_isseq(f)) { + putop(c, OP_CHECKDELIM, LABEL_ENDMSG); + putchecktag(c, f, UPB_WIRE_TYPE_DELIMITED, LABEL_DISPATCH); + dispatchtarget(c, method, f, UPB_WIRE_TYPE_DELIMITED); + putop(c, OP_PUSHTAGDELIM, 0); + putop(c, OP_STARTSEQ, getsel(f, UPB_HANDLER_STARTSEQ)); + label(c, LABEL_LOOPSTART); + putop(c, OP_PUSHLENDELIM); + putop(c, OP_STARTSTR, getsel(f, UPB_HANDLER_STARTSTR)); + // Need to emit even if no handler to skip past the string. + putop(c, OP_STRING, getsel(f, UPB_HANDLER_STRING)); + putop(c, OP_POP); + maybeput(c, OP_ENDSTR, h, f, UPB_HANDLER_ENDSTR); + putop(c, OP_SETDELIM); + putop(c, OP_CHECKDELIM, LABEL_LOOPBREAK); + putchecktag(c, f, UPB_WIRE_TYPE_DELIMITED, LABEL_LOOPBREAK); + putop(c, OP_BRANCH, -LABEL_LOOPSTART); + label(c, LABEL_LOOPBREAK); + putop(c, OP_POP); + maybeput(c, OP_ENDSEQ, h, f, UPB_HANDLER_ENDSEQ); + } else { + putop(c, OP_CHECKDELIM, LABEL_ENDMSG); + putchecktag(c, f, UPB_WIRE_TYPE_DELIMITED, LABEL_DISPATCH); + dispatchtarget(c, method, f, UPB_WIRE_TYPE_DELIMITED); + putop(c, OP_PUSHLENDELIM); + putop(c, OP_STARTSTR, getsel(f, UPB_HANDLER_STARTSTR)); + putop(c, OP_STRING, getsel(f, UPB_HANDLER_STRING)); + putop(c, OP_POP); + maybeput(c, OP_ENDSTR, h, f, UPB_HANDLER_ENDSTR); + putop(c, OP_SETDELIM); + } +} + +// Generates bytecode to parse a single primitive field. +static void generate_primitivefield(compiler *c, const upb_fielddef *f, + upb_pbdecodermethod *method) { + label(c, LABEL_FIELD); + + const upb_handlers *h = upb_pbdecodermethod_desthandlers(method); + upb_descriptortype_t descriptor_type = upb_fielddef_descriptortype(f); + + // From a decoding perspective, ENUM is the same as INT32. + if (descriptor_type == UPB_DESCRIPTOR_TYPE_ENUM) + descriptor_type = UPB_DESCRIPTOR_TYPE_INT32; + + opcode parse_type = (opcode)descriptor_type; + + // TODO(haberman): generate packed or non-packed first depending on "packed" + // setting in the fielddef. This will favor (in speed) whichever was + // specified. + + assert((int)parse_type >= 0 && parse_type <= OP_MAX); + upb_selector_t sel = getsel(f, upb_handlers_getprimitivehandlertype(f)); + int wire_type = upb_pb_native_wire_types[upb_fielddef_descriptortype(f)]; + if (upb_fielddef_isseq(f)) { + putop(c, OP_CHECKDELIM, LABEL_ENDMSG); + putchecktag(c, f, UPB_WIRE_TYPE_DELIMITED, LABEL_DISPATCH); + dispatchtarget(c, method, f, UPB_WIRE_TYPE_DELIMITED); + putop(c, OP_PUSHLENDELIM); + putop(c, OP_STARTSEQ, getsel(f, UPB_HANDLER_STARTSEQ)); // Packed + label(c, LABEL_LOOPSTART); + putop(c, parse_type, sel); + putop(c, OP_CHECKDELIM, LABEL_LOOPBREAK); + putop(c, OP_BRANCH, -LABEL_LOOPSTART); + dispatchtarget(c, method, f, wire_type); + putop(c, OP_PUSHTAGDELIM, 0); + putop(c, OP_STARTSEQ, getsel(f, UPB_HANDLER_STARTSEQ)); // Non-packed + label(c, LABEL_LOOPSTART); + putop(c, parse_type, sel); + putop(c, OP_CHECKDELIM, LABEL_LOOPBREAK); + putchecktag(c, f, wire_type, LABEL_LOOPBREAK); + putop(c, OP_BRANCH, -LABEL_LOOPSTART); + label(c, LABEL_LOOPBREAK); + putop(c, OP_POP); // Packed and non-packed join. + maybeput(c, OP_ENDSEQ, h, f, UPB_HANDLER_ENDSEQ); + putop(c, OP_SETDELIM); // Could remove for non-packed by dup ENDSEQ. + } else { + putop(c, OP_CHECKDELIM, LABEL_ENDMSG); + putchecktag(c, f, wire_type, LABEL_DISPATCH); + dispatchtarget(c, method, f, wire_type); + putop(c, parse_type, sel); + } +} + // Adds bytecode for parsing the given message to the given decoderplan, // while adding all dispatch targets to this message's dispatch table. static void compile_method(compiler *c, upb_pbdecodermethod *method) { assert(method); - // Symbolic names for our local labels. - const int LABEL_LOOPSTART = 1; // Top of a repeated field loop. - const int LABEL_LOOPBREAK = 2; // To jump out of a repeated loop - const int LABEL_FIELD = 3; // Jump backward to find the most recent field. - const int LABEL_ENDMSG = 4; // To reach the OP_ENDMSG instr for this msg. - - // Index is descriptor type. - static const uint8_t native_wire_types[] = { - UPB_WIRE_TYPE_END_GROUP, // ENDGROUP - UPB_WIRE_TYPE_64BIT, // DOUBLE - UPB_WIRE_TYPE_32BIT, // FLOAT - UPB_WIRE_TYPE_VARINT, // INT64 - UPB_WIRE_TYPE_VARINT, // UINT64 - UPB_WIRE_TYPE_VARINT, // INT32 - UPB_WIRE_TYPE_64BIT, // FIXED64 - UPB_WIRE_TYPE_32BIT, // FIXED32 - UPB_WIRE_TYPE_VARINT, // BOOL - UPB_WIRE_TYPE_DELIMITED, // STRING - UPB_WIRE_TYPE_START_GROUP, // GROUP - UPB_WIRE_TYPE_DELIMITED, // MESSAGE - UPB_WIRE_TYPE_DELIMITED, // BYTES - UPB_WIRE_TYPE_VARINT, // UINT32 - UPB_WIRE_TYPE_VARINT, // ENUM - UPB_WIRE_TYPE_32BIT, // SFIXED32 - UPB_WIRE_TYPE_64BIT, // SFIXED64 - UPB_WIRE_TYPE_VARINT, // SINT32 - UPB_WIRE_TYPE_VARINT, // SINT64 - }; - // Clear all entries in the dispatch table. upb_inttable_uninit(&method->dispatch); upb_inttable_init(&method->dispatch, UPB_CTYPE_UINT64); @@ -637,128 +762,15 @@ static void compile_method(compiler *c, upb_pbdecodermethod *method) { upb_msg_iter i; for(upb_msg_begin(&i, md); !upb_msg_done(&i); upb_msg_next(&i)) { const upb_fielddef *f = upb_msg_iter_field(&i); - upb_descriptortype_t descriptor_type = upb_fielddef_descriptortype(f); upb_fieldtype_t type = upb_fielddef_type(f); - // From a decoding perspective, ENUM is the same as INT32. - if (descriptor_type == UPB_DESCRIPTOR_TYPE_ENUM) - descriptor_type = UPB_DESCRIPTOR_TYPE_INT32; - if (type == UPB_TYPE_MESSAGE && !(haslazyhandlers(h, f) && c->lazy)) { - const upb_pbdecodermethod *sub_m = find_submethod(c, method, f); - if (!sub_m) { - // Don't emit any code for this field at all; it will be parsed as an - // unknown field. - continue; - } - - label(c, LABEL_FIELD); - - int wire_type = (descriptor_type == UPB_DESCRIPTOR_TYPE_MESSAGE) - ? UPB_WIRE_TYPE_DELIMITED - : UPB_WIRE_TYPE_START_GROUP; - if (upb_fielddef_isseq(f)) { - putop(c, OP_CHECKDELIM, LABEL_ENDMSG); - putchecktag(c, f, wire_type, LABEL_DISPATCH); - dispatchtarget(c, method, f, wire_type); - putop(c, OP_PUSHTAGDELIM, 0); - putop(c, OP_STARTSEQ, getsel(f, UPB_HANDLER_STARTSEQ)); - label(c, LABEL_LOOPSTART); - putpush(c, f); - putop(c, OP_STARTSUBMSG, getsel(f, UPB_HANDLER_STARTSUBMSG)); - putop(c, OP_CALL, sub_m); - putop(c, OP_POP); - putcb(c, OP_ENDSUBMSG, h, f, UPB_HANDLER_ENDSUBMSG); - if (wire_type == UPB_WIRE_TYPE_DELIMITED) { - putop(c, OP_SETDELIM); - } - putop(c, OP_CHECKDELIM, LABEL_LOOPBREAK); - putchecktag(c, f, wire_type, LABEL_LOOPBREAK); - putop(c, OP_BRANCH, -LABEL_LOOPSTART); - label(c, LABEL_LOOPBREAK); - putop(c, OP_POP); - putcb(c, OP_ENDSEQ, h, f, UPB_HANDLER_ENDSEQ); - } else { - putop(c, OP_CHECKDELIM, LABEL_ENDMSG); - putchecktag(c, f, wire_type, LABEL_DISPATCH); - dispatchtarget(c, method, f, wire_type); - putpush(c, f); - putop(c, OP_STARTSUBMSG, getsel(f, UPB_HANDLER_STARTSUBMSG)); - putop(c, OP_CALL, sub_m); - putop(c, OP_POP); - putcb(c, OP_ENDSUBMSG, h, f, UPB_HANDLER_ENDSUBMSG); - if (wire_type == UPB_WIRE_TYPE_DELIMITED) { - putop(c, OP_SETDELIM); - } - } + generate_msgfield(c, f, method); } else if (type == UPB_TYPE_STRING || type == UPB_TYPE_BYTES || type == UPB_TYPE_MESSAGE) { - label(c, LABEL_FIELD); - if (upb_fielddef_isseq(f)) { - putop(c, OP_CHECKDELIM, LABEL_ENDMSG); - putchecktag(c, f, UPB_WIRE_TYPE_DELIMITED, LABEL_DISPATCH); - dispatchtarget(c, method, f, UPB_WIRE_TYPE_DELIMITED); - putop(c, OP_PUSHTAGDELIM, 0); - putop(c, OP_STARTSEQ, getsel(f, UPB_HANDLER_STARTSEQ)); - label(c, LABEL_LOOPSTART); - putop(c, OP_PUSHLENDELIM); - putop(c, OP_STARTSTR, getsel(f, UPB_HANDLER_STARTSTR)); - // Need to emit even if no handler to skip past the string. - putop(c, OP_STRING, getsel(f, UPB_HANDLER_STRING)); - putop(c, OP_POP); - putcb(c, OP_ENDSTR, h, f, UPB_HANDLER_ENDSTR); - putop(c, OP_SETDELIM); - putop(c, OP_CHECKDELIM, LABEL_LOOPBREAK); - putchecktag(c, f, UPB_WIRE_TYPE_DELIMITED, LABEL_LOOPBREAK); - putop(c, OP_BRANCH, -LABEL_LOOPSTART); - label(c, LABEL_LOOPBREAK); - putop(c, OP_POP); - putcb(c, OP_ENDSEQ, h, f, UPB_HANDLER_ENDSEQ); - } else { - putop(c, OP_CHECKDELIM, LABEL_ENDMSG); - putchecktag(c, f, UPB_WIRE_TYPE_DELIMITED, LABEL_DISPATCH); - dispatchtarget(c, method, f, UPB_WIRE_TYPE_DELIMITED); - putop(c, OP_PUSHLENDELIM); - putop(c, OP_STARTSTR, getsel(f, UPB_HANDLER_STARTSTR)); - putop(c, OP_STRING, getsel(f, UPB_HANDLER_STRING)); - putop(c, OP_POP); - putcb(c, OP_ENDSTR, h, f, UPB_HANDLER_ENDSTR); - putop(c, OP_SETDELIM); - } + generate_delimfield(c, f, method); } else { - label(c, LABEL_FIELD); - opcode parse_type = (opcode)descriptor_type; - assert((int)parse_type >= 0 && parse_type <= OP_MAX); - upb_selector_t sel = getsel(f, upb_handlers_getprimitivehandlertype(f)); - int wire_type = native_wire_types[upb_fielddef_descriptortype(f)]; - if (upb_fielddef_isseq(f)) { - putop(c, OP_CHECKDELIM, LABEL_ENDMSG); - putchecktag(c, f, UPB_WIRE_TYPE_DELIMITED, LABEL_DISPATCH); - dispatchtarget(c, method, f, UPB_WIRE_TYPE_DELIMITED); - putop(c, OP_PUSHLENDELIM); - putop(c, OP_STARTSEQ, getsel(f, UPB_HANDLER_STARTSEQ)); // Packed - label(c, LABEL_LOOPSTART); - putop(c, parse_type, sel); - putop(c, OP_CHECKDELIM, LABEL_LOOPBREAK); - putop(c, OP_BRANCH, -LABEL_LOOPSTART); - dispatchtarget(c, method, f, wire_type); - putop(c, OP_PUSHTAGDELIM, 0); - putop(c, OP_STARTSEQ, getsel(f, UPB_HANDLER_STARTSEQ)); // Non-packed - label(c, LABEL_LOOPSTART); - putop(c, parse_type, sel); - putop(c, OP_CHECKDELIM, LABEL_LOOPBREAK); - putchecktag(c, f, wire_type, LABEL_LOOPBREAK); - putop(c, OP_BRANCH, -LABEL_LOOPSTART); - label(c, LABEL_LOOPBREAK); - putop(c, OP_POP); // Packed and non-packed join. - putcb(c, OP_ENDSEQ, h, f, UPB_HANDLER_ENDSEQ); - putop(c, OP_SETDELIM); // Could remove for non-packed by dup ENDSEQ. - } else { - putop(c, OP_CHECKDELIM, LABEL_ENDMSG); - putchecktag(c, f, wire_type, LABEL_DISPATCH); - dispatchtarget(c, method, f, wire_type); - putop(c, parse_type, sel); - } + generate_primitivefield(c, f, method); } } diff --git a/upb/pb/compile_decoder_x64.c b/upb/pb/compile_decoder_x64.c index 44c4419..b4086c7 100644 --- a/upb/pb/compile_decoder_x64.c +++ b/upb/pb/compile_decoder_x64.c @@ -23,7 +23,7 @@ // // Note: this mode requires that we can shell out to gcc. // -// 2. Run the test once locally. This will load the JIT code by building a +// 2. Run the test locally. This will load the JIT code by building a // .so (/tmp/upb-jit-code.so) and using dlopen, so more of the tooling will // work properly (like GDB). // diff --git a/upb/pb/compile_decoder_x64.dasc b/upb/pb/compile_decoder_x64.dasc index 180017f..a87b376 100644 --- a/upb/pb/compile_decoder_x64.dasc +++ b/upb/pb/compile_decoder_x64.dasc @@ -61,17 +61,21 @@ | add DELIMEND, DECODER->buf |.endmacro | -| // OPT: use "call rel32" where possible. +| // Calls an external C function at address "addr". |.macro callp, addr -|| { -|| //int64_t ofs = (int64_t)addr - (int64_t)upb_status_init; -|| //if (ofs > (1 << 30) || ofs < -(1 << 30)) { | mov64 rax, (uintptr_t)addr +| +| // Stack must be 16-byte aligned (x86-64 ABI requires this). +| // +| // OPT: possibly remove this by statically ensuring correct alignment. +| // +| // OPT: use "call rel32" where possible. +| push r12 +| mov r12, rsp +| and rsp, 0xfffffffffffffff0UL // Align stack. | call rax -|| //} else { -| // call &addr -|| //} -|| } +| mov rsp, r12 +| pop r12 |.endmacro | |.macro ld64, val @@ -208,12 +212,6 @@ static void emit_static_asm(jitcompiler *jc) { | push r12 | push rbx | - | // Align stack. - | // Since the JIT can call other functions (the JIT'ted code is not a leaf - | // function) we must respect alignment rules. All x86-64 systems require - | // 16-byte stack alignment. - | sub rsp, 8 - | | mov rbx, ARG2_64 // Preserve JIT method. | | mov DECODER, rdi @@ -234,7 +232,6 @@ static void emit_static_asm(jitcompiler *jc) { | mov rax, DECODER->size_param | mov qword DECODER->call_len, 0 |1: - | add rsp, 8 // Counter previous alignment. | pop rbx | pop r12 | pop r13 @@ -270,7 +267,6 @@ static void emit_static_asm(jitcompiler *jc) { | // Must NOT do this before the memcpy(), otherwise memcpy() will | // clobber the stack we are trying to save! | mov rsp, DECODER->saved_rsp - | add rsp, 8 // Counter previous alignment. | pop rbx | pop r12 | pop r13 diff --git a/upb/pb/encoder.c b/upb/pb/encoder.c index 975f3ab..4681c20 100644 --- a/upb/pb/encoder.c +++ b/upb/pb/encoder.c @@ -1,421 +1,496 @@ /* * upb - a minimalist implementation of protocol buffers. * - * Copyright (c) 2009 Google Inc. See LICENSE for details. + * Copyright (c) 2014 Google Inc. See LICENSE for details. * Author: Josh Haberman + * + * Since we are implementing pure handlers (ie. without any out-of-band access + * to pre-computed lengths), we have to buffer all submessages before we can + * emit even their first byte. + * + * Not knowing the size of submessages also means we can't write a perfect + * zero-copy implementation, even with buffering. Lengths are stored as + * varints, which means that we don't know how many bytes to reserve for the + * length until we know what the length is. + * + * This leaves us with three main choices: + * + * 1. buffer all submessage data in a temporary buffer, then copy it exactly + * once into the output buffer. + * + * 2. attempt to buffer data directly into the output buffer, estimating how + * many bytes each length will take. When our guesses are wrong, use + * memmove() to grow or shrink the allotted space. + * + * 3. buffer directly into the output buffer, allocating a max length + * ahead-of-time for each submessage length. If we overallocated, we waste + * space, but no memcpy() or memmove() is required. This approach requires + * defining a maximum size for submessages and rejecting submessages that + * exceed that size. + * + * (2) and (3) have the potential to have better performance, but they are more + * complicated and subtle to implement: + * + * (3) requires making an arbitrary choice of the maximum message size; it + * wastes space when submessages are shorter than this and fails + * completely when they are longer. This makes it more finicky and + * requires configuration based on the input. It also makes it impossible + * to perfectly match the output of reference encoders that always use the + * optimal amount of space for each length. + * + * (2) requires guessing the the size upfront, and if multiple lengths are + * guessed wrong the minimum required number of memmove() operations may + * be complicated to compute correctly. Implemented properly, it may have + * a useful amortized or average cost, but more investigation is required + * to determine this and what the optimal algorithm is to achieve it. + * + * (1) makes you always pay for exactly one copy, but its implementation is + * the simplest and its performance is predictable. + * + * So for now, we implement (1) only. If we wish to optimize later, we should + * be able to do it without affecting users. + * + * The strategy is to buffer the segments of data that do *not* depend on + * unknown lengths in one buffer, and keep a separate buffer of segment pointers + * and lengths. When the top-level submessage ends, we can go beginning to end, + * alternating the writing of lengths with memcpy() of the rest of the data. + * At the top level though, no buffering is required. */ #include "upb/pb/encoder.h" +#include "upb/pb/varint.int.h" #include -#include "upb/descriptor.h" - -/* Functions for calculating sizes of wire values. ****************************/ - -static size_t upb_v_uint64_t_size(uint64_t val) { -#ifdef __GNUC__ - int high_bit = 63 - __builtin_clzll(val); // 0-based, undef if val == 0. -#else - int high_bit = 0; - uint64_t tmp = val; - while(tmp >>= 1) high_bit++; -#endif - return val == 0 ? 1 : high_bit / 7 + 1; -} -static size_t upb_v_int32_t_size(int32_t val) { - // v_uint32's are sign-extended to maintain wire compatibility with int64s. - return upb_v_uint64_t_size((int64_t)val); +/* low-level buffering ********************************************************/ + +// Low-level functions for interacting with the output buffer. + +// TODO(haberman): handle pushback +static void putbuf(upb_pb_encoder *e, const char *buf, size_t len) { + size_t n = upb_bytessink_putbuf(e->output_, e->subc, buf, len, NULL); + UPB_ASSERT_VAR(n, n == len); } -static size_t upb_v_uint32_t_size(uint32_t val) { - return upb_v_uint64_t_size(val); + +static upb_pb_encoder_segment *top(upb_pb_encoder *e) { + return &e->segbuf[*e->top]; } -static size_t upb_f_uint64_t_size(uint64_t val) { - (void)val; // Length is independent of value. - return sizeof(uint64_t); + +// Call to ensure that at least "bytes" bytes are available for writing at +// e->ptr. Returns false if the bytes could not be allocated. +static bool reserve(upb_pb_encoder *e, size_t bytes) { + if ((e->limit - e->ptr) < bytes) { + size_t needed = bytes + (e->ptr - e->buf); + size_t old_size = e->limit - e->buf; + size_t new_size = old_size; + while (new_size < needed) { + new_size *= 2; + } + + char *realloc_from = (e->buf == e->initbuf) ? NULL : e->buf; + char *new_buf = realloc(realloc_from, new_size); + + if (new_buf == NULL) { + return false; + } + + if (realloc_from == NULL) { + memcpy(new_buf, e->initbuf, old_size); + } + + e->ptr = new_buf + (e->ptr - e->buf); + e->runbegin = new_buf + (e->runbegin - e->buf); + e->limit = new_buf + new_size; + e->buf = new_buf; + } + + return true; } -static size_t upb_f_uint32_t_size(uint32_t val) { - (void)val; // Length is independent of value. - return sizeof(uint32_t); + +// Call when "bytes" bytes have been writte at e->ptr. The caller *must* have +// previously called reserve() with at least this many bytes. +static void advance(upb_pb_encoder *e, size_t bytes) { + assert((e->limit - e->ptr) >= bytes); + e->ptr += bytes; } +// Call when all of the bytes for a handler have been written. Flushes the +// bytes if possible and necessary, returning false if this failed. +static bool commit(upb_pb_encoder *e) { + if (!e->top) { + // We aren't inside a delimited region. Flush our accumulated bytes to + // the output. + // + // TODO(haberman): in the future we may want to delay flushing for + // efficiency reasons. + putbuf(e, e->buf, e->ptr - e->buf); + e->ptr = e->buf; + } -/* Functions to write wire values. ********************************************/ + return true; +} -// Since we know in advance the longest that the value could be, we always make -// sure that our buffer is long enough. This saves us from having to perform -// bounds checks. +// Writes the given bytes to the buffer, handling reserve/advance. +static bool encode_bytes(upb_pb_encoder *e, const void *data, size_t len) { + if (!reserve(e, len)) { + return false; + } -// Puts a varint (wire type: UPB_WIRE_TYPE_VARINT). -static uint8_t *upb_put_v_uint64_t(uint8_t *buf, uint64_t val) -{ - do { - uint8_t byte = val & 0x7f; - val >>= 7; - if(val) byte |= 0x80; - *buf++ = byte; - } while(val); - return buf; + memcpy(e->ptr, data, len); + advance(e, len); + return true; } -// Puts an unsigned 32-bit varint, verbatim. Never uses the high 64 bits. -static uint8_t *upb_put_v_uint32_t(uint8_t *buf, uint32_t val) -{ - return upb_put_v_uint64_t(buf, val); +// Finish the current run by adding the run totals to the segment and message +// length. +static void accumulate(upb_pb_encoder *e) { + assert(e->ptr >= e->runbegin); + size_t run_len = e->ptr - e->runbegin; + e->segptr->seglen += run_len; + top(e)->msglen += run_len; + e->runbegin = e->ptr; } -// Puts a signed 32-bit varint, first sign-extending to 64-bits. We do this to -// maintain wire-compatibility with 64-bit signed integers. -static uint8_t *upb_put_v_int32_t(uint8_t *buf, int32_t val) -{ - return upb_put_v_uint64_t(buf, (int64_t)val); +// Call to indicate the start of delimited region for which the full length is +// not yet known. All data will be buffered until the length is known. +// Delimited regions may be nested; their lengths will all be tracked properly. +static bool start_delim(upb_pb_encoder *e) { + if (e->top) { + // We are already buffering, advance to the next segment and push it on the + // stack. + accumulate(e); + + if (++e->top == e->stacklimit) { + // TODO(haberman): grow stack? + return false; + } + + if (++e->segptr == e->seglimit) { + upb_pb_encoder_segment *realloc_from = + (e->segbuf == e->seginitbuf) ? NULL : e->segbuf; + size_t old_size = + (e->seglimit - e->segbuf) * sizeof(upb_pb_encoder_segment); + size_t new_size = old_size * 2; + upb_pb_encoder_segment *new_buf = realloc(realloc_from, new_size); + + if (new_buf == NULL) { + return false; + } + + if (realloc_from == NULL) { + memcpy(new_buf, e->seginitbuf, old_size); + } + + e->segptr = new_buf + (e->segptr - e->segbuf); + e->seglimit = new_buf + (new_size / sizeof(upb_pb_encoder_segment)); + e->segbuf = new_buf; + } + } else { + // We were previously at the top level, start buffering. + e->segptr = e->segbuf; + e->top = e->stack; + e->runbegin = e->ptr; + } + + *e->top = e->segptr - e->segbuf; + e->segptr->seglen = 0; + e->segptr->msglen = 0; + + return true; } -static void upb_put32(uint8_t *buf, uint32_t val) { - buf[0] = val & 0xff; - buf[1] = (val >> 8) & 0xff; - buf[2] = (val >> 16) & 0xff; - buf[3] = (val >> 24); +// Call to indicate the end of a delimited region. We now know the length of +// the delimited region. If we are not nested inside any other delimited +// regions, we can now emit all of the buffered data we accumulated. +static bool end_delim(upb_pb_encoder *e) { + accumulate(e); + size_t msglen = top(e)->msglen; + + if (e->top == e->stack) { + // All lengths are now available, emit all buffered data. + char buf[UPB_PB_VARINT_MAX_LEN]; + upb_pb_encoder_segment *s; + const char *ptr = e->buf; + for (s = e->segbuf; s <= e->segptr; s++) { + size_t lenbytes = upb_vencode64(s->msglen, buf); + putbuf(e, buf, lenbytes); + putbuf(e, ptr, s->seglen); + ptr += s->seglen; + } + + e->ptr = e->buf; + e->top = NULL; + } else { + // Need to keep buffering; propagate length info into enclosing submessages. + --e->top; + top(e)->msglen += msglen + upb_varint_size(msglen); + } + + return true; } -// Puts a fixed-length 32-bit integer (wire type: UPB_WIRE_TYPE_32BIT). -static uint8_t *upb_put_f_uint32_t(uint8_t *buf, uint32_t val) -{ - uint8_t *uint32_end = buf + sizeof(uint32_t); -#if UPB_UNALIGNED_READS_OK - *(uint32_t*)buf = val; -#else - upb_put32(buf, val); -#endif - return uint32_end; + +/* tag_t **********************************************************************/ + +// A precomputed (pre-encoded) tag and length. + +typedef struct { + uint8_t bytes; + char tag[7]; +} tag_t; + +// Allocates a new tag for this field, and sets it in these handlerattr. +static void new_tag(upb_handlers *h, const upb_fielddef *f, upb_wiretype_t wt, + upb_handlerattr *attr) { + uint32_t n = upb_fielddef_number(f); + + tag_t *tag = malloc(sizeof(tag_t)); + tag->bytes = upb_vencode64((n << 3) | wt, tag->tag); + + upb_handlerattr_init(attr); + upb_handlerattr_sethandlerdata(attr, tag); + upb_handlers_addcleanup(h, tag, free); } -// Puts a fixed-length 64-bit integer (wire type: UPB_WIRE_TYPE_64BIT). -static uint8_t *upb_put_f_uint64_t(uint8_t *buf, uint64_t val) -{ - uint8_t *uint64_end = buf + sizeof(uint64_t); -#if UPB_UNALIGNED_READS_OK - *(uint64_t*)buf = val; -#else - upb_put32(buf, (uint32_t)val); - upb_put32(buf, (uint32_t)(val >> 32)); -#endif - return uint64_end; +static bool encode_tag(upb_pb_encoder *e, const tag_t *tag) { + return encode_bytes(e, tag->tag, tag->bytes); } -/* Functions to write and calculate sizes for .proto values. ******************/ -// Performs zig-zag encoding, which is used by sint32 and sint64. -static uint32_t upb_zzenc_32(int32_t n) { return (n << 1) ^ (n >> 31); } -static uint64_t upb_zzenc_64(int64_t n) { return (n << 1) ^ (n >> 63); } +/* encoding of wire types *****************************************************/ -/* Use macros to define a set of two functions for each .proto type: - * - * // Converts and writes a .proto value into buf. "end" indicates the end - * // of the current available buffer (if the buffer does not contain enough - * // space UPB_STATUS_NEED_MORE_DATA is returned). On success, *outbuf will - * // point one past the data that was written. - * uint8_t *upb_put_INT32(uint8_t *buf, int32_t val); - * - * // Returns the number of bytes required to encode val. - * size_t upb_get_INT32_size(int32_t val); - * - * // Given a .proto value s (source) convert it to a wire value. - * uint32_t upb_vtowv_INT32(int32_t s); - */ +static bool encode_fixed64(upb_pb_encoder *e, uint64_t val) { + // TODO(haberman): byte-swap for big endian. + return encode_bytes(e, &val, sizeof(uint64_t)); +} -#define VTOWV(type, wire_t, val_t) \ - static wire_t upb_vtowv_ ## type(val_t s) +static bool encode_fixed32(upb_pb_encoder *e, uint32_t val) { + // TODO(haberman): byte-swap for big endian. + return encode_bytes(e, &val, sizeof(uint32_t)); +} -#define PUT(type, v_or_f, wire_t, val_t, member_name) \ - static uint8_t *upb_put_ ## type(uint8_t *buf, val_t val) { \ - wire_t tmp = upb_vtowv_ ## type(val); \ - return upb_put_ ## v_or_f ## _ ## wire_t(buf, tmp); \ +static bool encode_varint(upb_pb_encoder *e, uint64_t val) { + if (!reserve(e, UPB_PB_VARINT_MAX_LEN)) { + return false; } -#define T(type, v_or_f, wire_t, val_t, member_name) \ - static size_t upb_get_ ## type ## _size(val_t val) { \ - return upb_ ## v_or_f ## _ ## wire_t ## _size(val); \ - } \ - VTOWV(type, wire_t, val_t); /* prototype for PUT below */ \ - PUT(type, v_or_f, wire_t, val_t, member_name) \ - VTOWV(type, wire_t, val_t) - -T(INT32, v, int32_t, int32_t, int32) { return (uint32_t)s; } -T(INT64, v, uint64_t, int64_t, int64) { return (uint64_t)s; } -T(UINT32, v, uint32_t, uint32_t, uint32) { return s; } -T(UINT64, v, uint64_t, uint64_t, uint64) { return s; } -T(SINT32, v, uint32_t, int32_t, int32) { return upb_zzenc_32(s); } -T(SINT64, v, uint64_t, int64_t, int64) { return upb_zzenc_64(s); } -T(FIXED32, f, uint32_t, uint32_t, uint32) { return s; } -T(FIXED64, f, uint64_t, uint64_t, uint64) { return s; } -T(SFIXED32, f, uint32_t, int32_t, int32) { return (uint32_t)s; } -T(SFIXED64, f, uint64_t, int64_t, int64) { return (uint64_t)s; } -T(BOOL, v, uint32_t, bool, _bool) { return (uint32_t)s; } -T(ENUM, v, uint32_t, int32_t, int32) { return (uint32_t)s; } -T(DOUBLE, f, uint64_t, double, _double) { - upb_value v; - v._double = s; - return v.uint64; + advance(e, upb_vencode64(val, e->ptr)); + return true; } -T(FLOAT, f, uint32_t, float, _float) { - upb_value v; - v._float = s; - return v.uint32; + +static uint64_t dbl2uint64(double d) { + uint64_t ret; + memcpy(&ret, &d, sizeof(uint64_t)); + return ret; } -#undef VTOWV -#undef PUT -#undef T -static uint8_t *upb_encode_value(uint8_t *buf, upb_field_type_t ft, upb_value v) -{ -#define CASE(t, member_name) \ - case UPB_TYPE(t): return upb_put_ ## t(buf, v.member_name); - switch(ft) { - CASE(DOUBLE, _double) - CASE(FLOAT, _float) - CASE(INT32, int32) - CASE(INT64, int64) - CASE(UINT32, uint32) - CASE(UINT64, uint64) - CASE(SINT32, int32) - CASE(SINT64, int64) - CASE(FIXED32, uint32) - CASE(FIXED64, uint64) - CASE(SFIXED32, int32) - CASE(SFIXED64, int64) - CASE(BOOL, _bool) - CASE(ENUM, int32) - default: assert(false); return buf; +static uint32_t flt2uint32(float d) { + uint32_t ret; + memcpy(&ret, &d, sizeof(uint32_t)); + return ret; +} + + +/* encoding of proto types ****************************************************/ + +static bool startmsg(void *c, const void *hd) { + upb_pb_encoder *e = c; + UPB_UNUSED(hd); + if (e->depth++ == 0) { + upb_bytessink_start(e->output_, 0, &e->subc); } -#undef CASE + return true; } -static uint32_t _upb_get_value_size(upb_field_type_t ft, upb_value v) -{ -#define CASE(t, member_name) \ - case UPB_TYPE(t): return upb_get_ ## t ## _size(v.member_name); - switch(ft) { - CASE(DOUBLE, _double) - CASE(FLOAT, _float) - CASE(INT32, int32) - CASE(INT64, int64) - CASE(UINT32, uint32) - CASE(UINT64, uint64) - CASE(SINT32, int32) - CASE(SINT64, int64) - CASE(FIXED32, uint32) - CASE(FIXED64, uint64) - CASE(SFIXED32, int32) - CASE(SFIXED64, int64) - CASE(BOOL, _bool) - CASE(ENUM, int32) - default: assert(false); return 0; +static bool endmsg(void *c, const void *hd, upb_status *status) { + upb_pb_encoder *e = c; + UPB_UNUSED(hd); + UPB_UNUSED(status); + if (--e->depth == 0) { + upb_bytessink_end(e->output_); } -#undef CASE + return true; } -static uint8_t *_upb_put_tag(uint8_t *buf, upb_field_number_t num, - upb_wire_type_t wt) -{ - return upb_put_UINT32(buf, wt | (num << 3)); +static void *encode_startdelimfield(void *c, const void *hd) { + bool ok = encode_tag(c, hd) && commit(c) && start_delim(c); + return ok ? c : UPB_BREAK; } -static uint32_t _upb_get_tag_size(upb_field_number_t num) -{ - return upb_get_UINT32_size(num << 3); +static bool encode_enddelimfield(void *c, const void *hd) { + UPB_UNUSED(hd); + return end_delim(c); } +static void *encode_startgroup(void *c, const void *hd) { + return (encode_tag(c, hd) && commit(c)) ? c : UPB_BREAK; +} -/* upb_sizebuilder ************************************************************/ +static bool encode_endgroup(void *c, const void *hd) { + return encode_tag(c, hd) && commit(c); +} -struct upb_sizebuilder { - // Accumulating size for the current level. - uint32_t size; +static void *encode_startstr(void *c, const void *hd, size_t size_hint) { + UPB_UNUSED(size_hint); + return encode_startdelimfield(c, hd); +} - // Stack of sizes for our current nesting. - uint32_t stack[UPB_MAX_NESTING], *top; +static size_t encode_strbuf(void *c, const void *hd, const char *buf, + size_t len, const upb_bufhandle *h) { + UPB_UNUSED(hd); + UPB_UNUSED(h); + return encode_bytes(c, buf, len) ? len : 0; +} - // Vector of sizes. - uint32_t *sizes; - int sizes_len; - int sizes_size; +#define T(type, ctype, convert, encode) \ + static bool encode_scalar_##type(void *e, const void *hd, ctype val) { \ + return encode_tag(e, hd) && encode(e, (convert)(val)) && commit(e); \ + } \ + static bool encode_packed_##type(void *e, const void *hd, ctype val) { \ + UPB_UNUSED(hd); \ + return encode(e, (convert)(val)); \ + } - upb_status status; -}; +T(double, double, dbl2uint64, encode_fixed64) +T(float, float, flt2uint32, encode_fixed32); +T(int64, int64_t, uint64_t, encode_varint); +T(int32, int32_t, uint32_t, encode_varint); +T(fixed64, uint64_t, uint64_t, encode_fixed64); +T(fixed32, uint32_t, uint32_t, encode_fixed32); +T(bool, bool, bool, encode_varint); +T(uint32, uint32_t, uint32_t, encode_varint); +T(uint64, uint64_t, uint64_t, encode_varint); +T(enum, int32_t, uint32_t, encode_varint); +T(sfixed32, int32_t, uint32_t, encode_fixed32); +T(sfixed64, int64_t, uint64_t, encode_fixed64); +T(sint32, int32_t, upb_zzenc_32, encode_varint); +T(sint64, int64_t, upb_zzenc_64, encode_varint); -// upb_sink callbacks. -static upb_sink_status _upb_sizebuilder_valuecb(upb_sink *sink, upb_fielddef *f, - upb_value val, - upb_status *status) -{ - (void)status; - upb_sizebuilder *sb = (upb_sizebuilder*)sink; - uint32_t size = 0; - size += _upb_get_tag_size(f->number); - size += _upb_get_value_size(f->type, val); - sb->size += size; - return UPB_SINK_CONTINUE; -} +#undef T -static upb_sink_status _upb_sizebuilder_strcb(upb_sink *sink, upb_fielddef *f, - upb_strptr str, - int32_t start, uint32_t end, - upb_status *status) -{ - (void)status; - (void)str; // String data itself is not used. - upb_sizebuilder *sb = (upb_sizebuilder*)sink; - if(start >= 0) { - uint32_t size = 0; - size += _upb_get_tag_size(f->number); - size += upb_get_UINT32_size(end - start); - sb->size += size; - } - return UPB_SINK_CONTINUE; -} -static upb_sink_status _upb_sizebuilder_startcb(upb_sink *sink, upb_fielddef *f, - upb_status *status) -{ - (void)status; - (void)f; // Unused (we calculate tag size and delimiter in endcb). - upb_sizebuilder *sb = (upb_sizebuilder*)sink; - if(f->type == UPB_TYPE(MESSAGE)) { - *sb->top = sb->size; - sb->top++; - sb->size = 0; - } else { - assert(f->type == UPB_TYPE(GROUP)); - sb->size += _upb_get_tag_size(f->number); - } - return UPB_SINK_CONTINUE; -} +/* code to build the handlers *************************************************/ + +static void newhandlers_callback(const void *closure, upb_handlers *h) { + UPB_UNUSED(closure); -static upb_sink_status _upb_sizebuilder_endcb(upb_sink *sink, upb_fielddef *f, - upb_status *status) -{ - (void)status; - upb_sizebuilder *sb = (upb_sizebuilder*)sink; - if(f->type == UPB_TYPE(MESSAGE)) { - sb->top--; - if(sb->sizes_len == sb->sizes_size) { - sb->sizes_size *= 2; - sb->sizes = realloc(sb->sizes, sb->sizes_size * sizeof(*sb->sizes)); + upb_handlers_setstartmsg(h, startmsg, NULL); + upb_handlers_setendmsg(h, endmsg, NULL); + + const upb_msgdef *m = upb_handlers_msgdef(h); + upb_msg_iter i; + for(upb_msg_begin(&i, m); !upb_msg_done(&i); upb_msg_next(&i)) { + const upb_fielddef *f = upb_msg_iter_field(&i); + bool packed = upb_fielddef_isseq(f) && upb_fielddef_isprimitive(f) && + upb_fielddef_packed(f); + upb_handlerattr attr; + upb_wiretype_t wt = + packed ? UPB_WIRE_TYPE_DELIMITED + : upb_pb_native_wire_types[upb_fielddef_descriptortype(f)]; + + // Pre-encode the tag for this field. + new_tag(h, f, wt, &attr); + + if (packed) { + upb_handlers_setstartseq(h, f, encode_startdelimfield, &attr); + upb_handlers_setendseq(h, f, encode_enddelimfield, &attr); } - uint32_t child_size = sb->size; - uint32_t parent_size = *sb->top; - sb->sizes[sb->sizes_len++] = child_size; - // The size according to the parent includes the tag size and delimiter of - // the submessage. - parent_size += upb_get_UINT32_size(child_size); - parent_size += _upb_get_tag_size(f->number); - // Include size accumulated in parent before child began. - sb->size = child_size + parent_size; - } else { - assert(f->type == UPB_TYPE(GROUP)); - // As an optimization, we could just add this number twice in startcb, to - // avoid having to recalculate it. - sb->size += _upb_get_tag_size(f->number); + +#define T(upper, lower, upbtype) \ + case UPB_DESCRIPTOR_TYPE_##upper: \ + if (packed) { \ + upb_handlers_set##upbtype(h, f, encode_packed_##lower, &attr); \ + } else { \ + upb_handlers_set##upbtype(h, f, encode_scalar_##lower, &attr); \ + } \ + break; + + switch (upb_fielddef_descriptortype(f)) { + T(DOUBLE, double, double); + T(FLOAT, float, float); + T(INT64, int64, int64); + T(INT32, int32, int32); + T(FIXED64, fixed64, uint64); + T(FIXED32, fixed32, uint32); + T(BOOL, bool, bool); + T(UINT32, uint32, uint32); + T(UINT64, uint64, uint64); + T(ENUM, enum, int32); + T(SFIXED32, sfixed32, int32); + T(SFIXED64, sfixed64, int64); + T(SINT32, sint32, int32); + T(SINT64, sint64, int64); + case UPB_DESCRIPTOR_TYPE_STRING: + case UPB_DESCRIPTOR_TYPE_BYTES: + upb_handlers_setstartstr(h, f, encode_startstr, &attr); + upb_handlers_setendstr(h, f, encode_enddelimfield, &attr); + upb_handlers_setstring(h, f, encode_strbuf, &attr); + break; + case UPB_DESCRIPTOR_TYPE_MESSAGE: + upb_handlers_setstartsubmsg(h, f, encode_startdelimfield, &attr); + upb_handlers_setendsubmsg(h, f, encode_enddelimfield, &attr); + break; + case UPB_DESCRIPTOR_TYPE_GROUP: { + // Endgroup takes a different tag (wire_type = END_GROUP). + upb_handlerattr attr2; + new_tag(h, f, UPB_WIRE_TYPE_END_GROUP, &attr2); + + upb_handlers_setstartsubmsg(h, f, encode_startgroup, &attr); + upb_handlers_setendsubmsg(h, f, encode_endgroup, &attr2); + + upb_handlerattr_uninit(&attr2); + break; + } + } + +#undef T + + upb_handlerattr_uninit(&attr); } - return UPB_SINK_CONTINUE; } -upb_sink_callbacks _upb_sizebuilder_sink_vtbl = { - _upb_sizebuilder_valuecb, - _upb_sizebuilder_strcb, - _upb_sizebuilder_startcb, - _upb_sizebuilder_endcb -}; - - -/* upb_sink callbacks *********************************************************/ - -struct upb_encoder { - upb_sink base; - //upb_bytesink *bytesink; - uint32_t *sizes; - int size_offset; -}; - - -// Within one callback we may need to encode up to two separate values. -#define UPB_ENCODER_BUFSIZE (UPB_MAX_ENCODED_SIZE * 2) - -static upb_sink_status _upb_encoder_push_buf(upb_encoder *s, const uint8_t *buf, - size_t len, upb_status *status) -{ - // TODO: conjure a upb_strptr that points to buf. - //upb_strptr ptr; - (void)s; - (void)buf; - (void)status; - size_t written = 5;// = upb_bytesink_onbytes(s->bytesink, ptr); - if(written < len) { - // TODO: mark to skip "written" bytes next time. - return UPB_SINK_STOP; - } else { - return UPB_SINK_CONTINUE; - } + +/* public API *****************************************************************/ + +const upb_handlers *upb_pb_encoder_newhandlers(const upb_msgdef *m, + const void *owner) { + return upb_handlers_newfrozen(m, owner, newhandlers_callback, NULL); } -static upb_sink_status _upb_encoder_valuecb(upb_sink *sink, upb_fielddef *f, - upb_value val, upb_status *status) -{ - upb_encoder *s = (upb_encoder*)sink; - uint8_t buf[UPB_ENCODER_BUFSIZE], *ptr = buf; - upb_wire_type_t wt = upb_types[f->type].expected_wire_type; - // TODO: handle packed encoding. - ptr = _upb_put_tag(ptr, f->number, wt); - ptr = upb_encode_value(ptr, f->type, val); - return _upb_encoder_push_buf(s, buf, ptr - buf, status); +#define ARRAYSIZE(x) (sizeof(x) / sizeof(x[0])) + +void upb_pb_encoder_init(upb_pb_encoder *e, const upb_handlers *h) { + e->output_ = NULL; + e->subc = NULL; + e->buf = e->initbuf; + e->ptr = e->buf; + e->limit = e->buf + ARRAYSIZE(e->initbuf); + e->segbuf = e->seginitbuf; + e->seglimit = e->segbuf + ARRAYSIZE(e->seginitbuf); + e->stacklimit = e->stack + ARRAYSIZE(e->stack); + upb_sink_reset(&e->input_, h, e); } -static upb_sink_status _upb_encoder_strcb(upb_sink *sink, upb_fielddef *f, - upb_strptr str, - int32_t start, uint32_t end, - upb_status *status) -{ - upb_encoder *s = (upb_encoder*)sink; - uint8_t buf[UPB_ENCODER_BUFSIZE], *ptr = buf; - if(start >= 0) { - ptr = _upb_put_tag(ptr, f->number, UPB_WIRE_TYPE_DELIMITED); - ptr = upb_put_UINT32(ptr, end - start); +void upb_pb_encoder_uninit(upb_pb_encoder *e) { + if (e->buf != e->initbuf) { + free(e->buf); } - // TODO: properly handle partially consumed strings and partially supplied - // strings. - _upb_encoder_push_buf(s, buf, ptr - buf, status); - return _upb_encoder_push_buf(s, (uint8_t*)upb_string_getrobuf(str), end - start, status); -} -static upb_sink_status _upb_encoder_startcb(upb_sink *sink, upb_fielddef *f, - upb_status *status) -{ - upb_encoder *s = (upb_encoder*)sink; - uint8_t buf[UPB_ENCODER_BUFSIZE], *ptr = buf; - if(f->type == UPB_TYPE(GROUP)) { - ptr = _upb_put_tag(ptr, f->number, UPB_WIRE_TYPE_START_GROUP); - } else { - ptr = _upb_put_tag(ptr, f->number, UPB_WIRE_TYPE_DELIMITED); - ptr = upb_put_UINT32(ptr, s->sizes[--s->size_offset]); + if (e->segbuf != e->seginitbuf) { + free(e->segbuf); } - return _upb_encoder_push_buf(s, buf, ptr - buf, status); } -static upb_sink_status _upb_encoder_endcb(upb_sink *sink, upb_fielddef *f, - upb_status *status) -{ - upb_encoder *s = (upb_encoder*)sink; - uint8_t buf[UPB_ENCODER_BUFSIZE], *ptr = buf; - if(f->type != UPB_TYPE(GROUP)) return UPB_SINK_CONTINUE; - ptr = _upb_put_tag(ptr, f->number, UPB_WIRE_TYPE_END_GROUP); - return _upb_encoder_push_buf(s, buf, ptr - buf, status); +void upb_pb_encoder_resetoutput(upb_pb_encoder *e, upb_bytessink *output) { + upb_pb_encoder_reset(e); + e->output_ = output; + e->subc = output->closure; } -upb_sink_callbacks _upb_encoder_sink_vtbl = { - _upb_encoder_valuecb, - _upb_encoder_strcb, - _upb_encoder_startcb, - _upb_encoder_endcb -}; +void upb_pb_encoder_reset(upb_pb_encoder *e) { + e->segptr = NULL; + e->top = NULL; + e->depth = 0; +} +upb_sink *upb_pb_encoder_input(upb_pb_encoder *e) { return &e->input_; } diff --git a/upb/pb/encoder.h b/upb/pb/encoder.h index 563b78d..2df5797 100644 --- a/upb/pb/encoder.h +++ b/upb/pb/encoder.h @@ -7,52 +7,155 @@ * Implements a set of upb_handlers that write protobuf data to the binary wire * format. * - * For messages that have any submessages, the encoder needs a buffer - * containing the submessage sizes, so they can be properly written at the - * front of each message. Note that groups do *not* have this requirement. + * This encoder implementation does not have any access to any out-of-band or + * precomputed lengths for submessages, so it must buffer submessages internally + * before it can emit the first byte. */ #ifndef UPB_ENCODER_H_ #define UPB_ENCODER_H_ -#include "upb/upb.h" -#include "upb/bytestream.h" +#include "upb/sink.h" #ifdef __cplusplus -extern "C" { +namespace upb { +namespace pb { +class Encoder; +} // namespace pb +} // namespace upb #endif -/* upb_encoder ****************************************************************/ +UPB_DECLARE_TYPE(upb::pb::Encoder, upb_pb_encoder); -// A upb_encoder is a upb_sink that emits data to a upb_bytesink in the protocol -// buffer binary wire format. -struct upb_encoder; -typedef struct upb_encoder upb_encoder; +#define UPB_PBENCODER_MAX_NESTING 100 -upb_encoder *upb_encoder_new(upb_msgdef *md); -void upb_encoder_free(upb_encoder *e); +/* upb::pb::Encoder ***********************************************************/ -// Resets the given upb_encoder such that is is ready to begin encoding, -// outputting data to "bytesink" (which must live until the encoder is -// reset or destroyed). -void upb_encoder_reset(upb_encoder *e, upb_bytesink *bytesink); +// The output buffer is divided into segments; a segment is a string of data +// that is "ready to go" -- it does not need any varint lengths inserted into +// the middle. The seams between segments are where varints will be inserted +// once they are known. +// +// We also use the concept of a "run", which is a range of encoded bytes that +// occur at a single submessage level. Every segment contains one or more runs. +// +// A segment can span messages. Consider: +// +// .--Submessage lengths---------. +// | | | +// | V V +// V | |--------------- | |----------------- +// Submessages: | |----------------------------------------------- +// Top-level msg: ------------------------------------------------------------ +// +// Segments: ----- ------------------- ----------------- +// Runs: *---- *--------------*--- *---------------- +// (* marks the start) +// +// Note that the top-level menssage is not in any segment because it does not +// have any length preceding it. +// +// A segment is only interrupted when another length needs to be inserted. So +// observe how the second segment spans both the inner submessage and part of +// the next enclosing message. +typedef struct { + UPB_PRIVATE_FOR_CPP + uint32_t msglen; // The length to varint-encode before this segment. + uint32_t seglen; // Length of the segment. +} upb_pb_encoder_segment; -// Returns the upb_sink to which data can be written. The sink is invalidated -// when the encoder is reset or destroyed. Note that if the client wants to -// encode any length-delimited submessages it must first call -// upb_encoder_buildsizes() below. -upb_sink *upb_encoder_sink(upb_encoder *e); +UPB_DEFINE_CLASS0(upb::pb::Encoder, + public: + Encoder(const upb::Handlers* handlers); + ~Encoder(); -// Call prior to pushing any data with embedded submessages. "src" must yield -// exactly the same data as what will next be encoded, but in reverse order. -// The encoder iterates over this data in order to determine the sizes of the -// submessages. If any errors are returned by the upb_src, the status will -// be saved in *status. If the client is sure that the upb_src will not throw -// any errors, "status" may be NULL. -void upb_encoder_buildsizes(upb_encoder *e, upb_src *src, upb_status *status); + static reffed_ptr NewHandlers(const upb::MessageDef* msg); + + // Resets the state of the printer, so that it will expect to begin a new + // document. + void Reset(); + + // Resets the output pointer which will serve as our closure. + void ResetOutput(BytesSink* output); + + // The input to the encoder. + Sink* input(); + + private: + UPB_DISALLOW_COPY_AND_ASSIGN(Encoder); +, +UPB_DEFINE_STRUCT0(upb_pb_encoder, UPB_QUOTE( + // Our input and output. + upb_sink input_; + upb_bytessink *output_; + + // The "subclosure" -- used as the inner closure as part of the bytessink + // protocol. + void *subc; + + // The output buffer and limit, and our current write position. "buf" + // initially points to "initbuf", but is dynamically allocated if we need to + // grow beyond the initial size. + char *buf, *ptr, *limit; + + // The beginning of the current run, or undefined if we are at the top level. + char *runbegin; + + // The list of segments we are accumulating. + upb_pb_encoder_segment *segbuf, *segptr, *seglimit; + + // The stack of enclosing submessages. Each entry in the stack points to the + // segment where this submessage's length is being accumulated. + int stack[UPB_PBENCODER_MAX_NESTING], *top, *stacklimit; + + // Depth of startmsg/endmsg calls. + int depth; + + // Initial buffers for the output buffer and segment buffer. If we outgrow + // these we will dynamically allocate bigger ones. + char initbuf[256]; + upb_pb_encoder_segment seginitbuf[32]; +))); + +UPB_BEGIN_EXTERN_C + +const upb_handlers *upb_pb_encoder_newhandlers(const upb_msgdef *m, + const void *owner); +void upb_pb_encoder_reset(upb_pb_encoder *e); +upb_sink *upb_pb_encoder_input(upb_pb_encoder *p); +void upb_pb_encoder_init(upb_pb_encoder *e, const upb_handlers *h); +void upb_pb_encoder_resetoutput(upb_pb_encoder *e, upb_bytessink *output); +void upb_pb_encoder_uninit(upb_pb_encoder *e); + +UPB_END_EXTERN_C #ifdef __cplusplus -} /* extern "C" */ + +namespace upb { +namespace pb { +inline Encoder::Encoder(const upb::Handlers* handlers) { + upb_pb_encoder_init(this, handlers); +} +inline Encoder::~Encoder() { + upb_pb_encoder_uninit(this); +} +inline void Encoder::Reset() { + upb_pb_encoder_reset(this); +} +inline void Encoder::ResetOutput(BytesSink* output) { + upb_pb_encoder_resetoutput(this, output); +} +inline Sink* Encoder::input() { + return upb_pb_encoder_input(this); +} +inline reffed_ptr Encoder::NewHandlers( + const upb::MessageDef *md) { + const Handlers* h = upb_pb_encoder_newhandlers(md, &h); + return reffed_ptr(h, &h); +} +} // namespace pb +} // namespace upb + #endif #endif /* UPB_ENCODER_H_ */ diff --git a/upb/pb/varint.c b/upb/pb/varint.c index ccd752d..365deb4 100644 --- a/upb/pb/varint.c +++ b/upb/pb/varint.c @@ -7,6 +7,29 @@ #include "upb/pb/varint.int.h" +// Index is descriptor type. +const uint8_t upb_pb_native_wire_types[] = { + UPB_WIRE_TYPE_END_GROUP, // ENDGROUP + UPB_WIRE_TYPE_64BIT, // DOUBLE + UPB_WIRE_TYPE_32BIT, // FLOAT + UPB_WIRE_TYPE_VARINT, // INT64 + UPB_WIRE_TYPE_VARINT, // UINT64 + UPB_WIRE_TYPE_VARINT, // INT32 + UPB_WIRE_TYPE_64BIT, // FIXED64 + UPB_WIRE_TYPE_32BIT, // FIXED32 + UPB_WIRE_TYPE_VARINT, // BOOL + UPB_WIRE_TYPE_DELIMITED, // STRING + UPB_WIRE_TYPE_START_GROUP, // GROUP + UPB_WIRE_TYPE_DELIMITED, // MESSAGE + UPB_WIRE_TYPE_DELIMITED, // BYTES + UPB_WIRE_TYPE_VARINT, // UINT32 + UPB_WIRE_TYPE_VARINT, // ENUM + UPB_WIRE_TYPE_32BIT, // SFIXED32 + UPB_WIRE_TYPE_64BIT, // SFIXED64 + UPB_WIRE_TYPE_VARINT, // SINT32 + UPB_WIRE_TYPE_VARINT, // SINT64 +}; + // A basic branch-based decoder, uses 32-bit values to get good performance // on 32-bit architectures (but performs well on 64-bits also). // This scheme comes from the original Google Protobuf implementation (proto2). diff --git a/upb/pb/varint.int.h b/upb/pb/varint.int.h index d92fef9..8498acd 100644 --- a/upb/pb/varint.int.h +++ b/upb/pb/varint.int.h @@ -37,6 +37,10 @@ typedef enum { // wiki document about this). #define UPB_PB_VARINT_MAX_LEN 10 +// Array of the "native" (ie. non-packed-repeated) wire type for the given a +// descriptor type (upb_descriptortype_t). +extern const uint8_t upb_pb_native_wire_types[]; + /* Zig-zag encoding/decoding **************************************************/ UPB_INLINE int32_t upb_zzdec_32(uint32_t n) { @@ -129,6 +133,11 @@ UPB_INLINE size_t upb_vencode64(uint64_t val, char *buf) { return i; } +UPB_INLINE size_t upb_varint_size(uint64_t val) { + char buf[UPB_PB_VARINT_MAX_LEN]; + return upb_vencode64(val, buf); +} + // Encodes a 32-bit varint, *not* sign-extended. UPB_INLINE uint64_t upb_vencode32(uint32_t val) { char buf[UPB_PB_VARINT_MAX_LEN]; -- cgit v1.2.3