From 699b51b441c6f963ca95e9a7c40c2cb14adde014 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Mon, 10 Jul 2017 16:25:50 -0500 Subject: Lots of encoder/decoder work (backwards encoder). --- upb/encode.c | 401 ++++++++++++++++++++--------------------------------------- 1 file changed, 135 insertions(+), 266 deletions(-) (limited to 'upb/encode.c') diff --git a/upb/encode.c b/upb/encode.c index 30f2da7..2fe1cc3 100644 --- a/upb/encode.c +++ b/upb/encode.c @@ -3,6 +3,7 @@ #include "upb/structs.int.h" #define UPB_PB_VARINT_MAX_LEN 10 +#define CHK(x) do { if (!(x)) { return false; } } while(0) static size_t upb_encode_varint(uint64_t val, char *buf) { size_t i; @@ -17,11 +18,6 @@ static size_t upb_encode_varint(uint64_t val, char *buf) { return i; } -static size_t upb_varint_size(uint64_t val) { - char buf[UPB_PB_VARINT_MAX_LEN]; - return upb_encode_varint(val, buf); -} - static uint32_t upb_zzenc_32(int32_t n) { return (n << 1) ^ (n >> 31); } static uint64_t upb_zzenc_64(int64_t n) { return (n << 1) ^ (n >> 63); } @@ -57,78 +53,29 @@ const uint8_t upb_native_wiretypes[] = { UPB_WIRE_TYPE_VARINT, /* SINT64 */ }; -/* The output buffer is divided into segments; a segment is a string of data - * that is "ready to go" -- it does not need any varint lengths inserted into - * the middle. The seams between segments are where varints will be inserted - * once they are known. - * - * We also use the concept of a "run", which is a range of encoded bytes that - * occur at a single submessage level. Every segment contains one or more runs. - * - * A segment can span messages. Consider: - * - * .--Submessage lengths---------. - * | | | - * | V V - * V | |--------------- | |----------------- - * Submessages: | |----------------------------------------------- - * Top-level msg: ------------------------------------------------------------ - * - * Segments: ----- ------------------- ----------------- - * Runs: *---- *--------------*--- *---------------- - * (* marks the start) - * - * Note that the top-level menssage is not in any segment because it does not - * have any length preceding it. - * - * A segment is only interrupted when another length needs to be inserted. So - * observe how the second segment spans both the inner submessage and part of - * the next enclosing message. */ - -typedef struct { - uint32_t msglen; /* The length to varint-encode before this segment. */ - uint32_t seglen; /* Length of the segment. */ -} upb_segment; - typedef struct { upb_env *env; char *buf, *ptr, *limit; - - /* The beginning of the current run, or undefined if we are at the top - * level. */ - char *runbegin; - - /* The list of segments we are accumulating. */ - upb_segment *segbuf, *segptr, *seglimit; - - /* The stack of enclosing submessages. Each entry in the stack points to the - * segment where this submessage's length is being accumulated. */ - int *stack, *top, *stacklimit; } upb_encstate; -static upb_segment *upb_encode_top(upb_encstate *e) { - return &e->segbuf[*e->top]; +static size_t upb_roundup_pow2(size_t bytes) { + size_t ret = 128; + while (ret < bytes) { + ret *= 2; + } + return ret; } static bool upb_encode_growbuffer(upb_encstate *e, size_t bytes) { - char *new_buf; - size_t needed = bytes + (e->ptr - e->buf); size_t old_size = e->limit - e->buf; + size_t new_size = upb_roundup_pow2(bytes + (e->limit - e->ptr)); + char *new_buf = upb_env_realloc(e->env, e->buf, old_size, new_size); + CHK(new_buf); - size_t new_size = old_size; + /* We want previous data at the end, realloc() put it at the beginning. */ + memmove(e->limit - old_size, e->buf, old_size); - while (new_size < needed) { - new_size *= 2; - } - - new_buf = upb_env_realloc(e->env, e->buf, old_size, new_size); - - if (new_buf == NULL) { - return false; - } - - e->ptr = new_buf + (e->ptr - e->buf); - e->runbegin = new_buf + (e->runbegin - e->buf); + e->ptr = new_buf + new_size - (e->limit - e->ptr); e->limit = new_buf + new_size; e->buf = new_buf; return true; @@ -137,120 +84,20 @@ static bool upb_encode_growbuffer(upb_encstate *e, size_t bytes) { /* Call to ensure that at least "bytes" bytes are available for writing at * e->ptr. Returns false if the bytes could not be allocated. */ static bool upb_encode_reserve(upb_encstate *e, size_t bytes) { - if (UPB_LIKELY((size_t)(e->limit - e->ptr) >= bytes)) { - return true; - } - - return upb_encode_growbuffer(e, bytes); -} + CHK(UPB_LIKELY((size_t)(e->ptr - e->buf) >= bytes) || + upb_encode_growbuffer(e, bytes)); -/* Call when "bytes" bytes have been writte at e->ptr. The caller *must* have - * previously called reserve() with at least this many bytes. */ -static void upb_encode_advance(upb_encstate *e, size_t bytes) { - UPB_ASSERT((size_t)(e->limit - e->ptr) >= bytes); - e->ptr += bytes; + e->ptr -= bytes; + return true; } /* Writes the given bytes to the buffer, handling reserve/advance. */ static bool upb_put_bytes(upb_encstate *e, const void *data, size_t len) { - if (!upb_encode_reserve(e, len)) { - return false; - } - + CHK(upb_encode_reserve(e, len)); memcpy(e->ptr, data, len); - upb_encode_advance(e, len); - return true; -} - -/* Finish the current run by adding the run totals to the segment and message - * length. */ -static void upb_encode_accumulate(upb_encstate *e) { - size_t run_len; - UPB_ASSERT(e->ptr >= e->runbegin); - run_len = e->ptr - e->runbegin; - e->segptr->seglen += run_len; - upb_encode_top(e)->msglen += run_len; - e->runbegin = e->ptr; -} - -/* Call to indicate the start of delimited region for which the full length is - * not yet known. The length will be inserted at the current position once it - * is known (and subsequent data moved if necessary). */ -static bool upb_encode_startdelim(upb_encstate *e) { - if (e->top) { - /* We are already buffering, advance to the next segment and push it on the - * stack. */ - upb_encode_accumulate(e); - - if (++e->top == e->stacklimit) { - /* TODO(haberman): grow stack? */ - return false; - } - - if (++e->segptr == e->seglimit) { - /* Grow segment buffer. */ - size_t old_size = - (e->seglimit - e->segbuf) * sizeof(upb_segment); - size_t new_size = old_size * 2; - upb_segment *new_buf = - upb_env_realloc(e->env, e->segbuf, old_size, new_size); - - if (new_buf == NULL) { - return false; - } - - e->segptr = new_buf + (e->segptr - e->segbuf); - e->seglimit = new_buf + (new_size / sizeof(upb_segment)); - e->segbuf = new_buf; - } - } else { - /* We were previously at the top level, start buffering. */ - e->segptr = e->segbuf; - e->top = e->stack; - e->runbegin = e->ptr; - } - - *e->top = e->segptr - e->segbuf; - e->segptr->seglen = 0; - e->segptr->msglen = 0; - return true; } -/* Call to indicate the end of a delimited region. We now know the length of - * the delimited region. If we are not nested inside any other delimited - * regions, we can now emit all of the buffered data we accumulated. */ -static bool upb_encode_enddelim(upb_encstate *e) { - size_t msglen; - upb_encode_accumulate(e); - msglen = upb_encode_top(e)->msglen; - - if (e->top == e->stack) { - /* All lengths are now available, emit all buffered data. */ - char buf[UPB_PB_VARINT_MAX_LEN]; - upb_segment *s; - const char *ptr = e->buf; - for (s = e->segbuf; s <= e->segptr; s++) { - size_t lenbytes = upb_encode_varint(s->msglen, buf); - //putbuf(e, buf, lenbytes); - //putbuf(e, ptr, s->seglen); - ptr += s->seglen; - } - - e->ptr = e->buf; - e->top = NULL; - } else { - /* Need to keep buffering; propagate length info into enclosing - * submessages. */ - --e->top; - upb_encode_top(e)->msglen += msglen + upb_varint_size(msglen); - } - - return true; -} - -/* encoding of wire types *****************************************************/ - static bool upb_put_fixed64(upb_encstate *e, uint64_t val) { /* TODO(haberman): byte-swap for big endian. */ return upb_put_bytes(e, &val, sizeof(uint64_t)); @@ -262,11 +109,13 @@ static bool upb_put_fixed32(upb_encstate *e, uint32_t val) { } static bool upb_put_varint(upb_encstate *e, uint64_t val) { - if (!upb_encode_reserve(e, UPB_PB_VARINT_MAX_LEN)) { - return false; - } - - upb_encode_advance(e, upb_encode_varint(val, e->ptr)); + size_t len; + char *start; + CHK(upb_encode_reserve(e, UPB_PB_VARINT_MAX_LEN)); + len = upb_encode_varint(val, e->ptr); + start = e->ptr + UPB_PB_VARINT_MAX_LEN - len; + memmove(start, e->ptr, len); + e->ptr = start; return true; } @@ -304,11 +153,12 @@ static bool upb_put_tag(upb_encstate *e, int field_number, int wire_type) { static bool upb_put_fixedarray(upb_encstate *e, const upb_array *arr, size_t size) { size_t bytes = arr->len * size; - return upb_put_varint(e, bytes) && upb_put_bytes(e, arr->data, bytes); + return upb_put_bytes(e, arr->data, bytes) && upb_put_varint(e, bytes); } bool upb_encode_message(upb_encstate *e, const char *msg, - const upb_msglayout_msginit_v1 *m); + const upb_msglayout_msginit_v1 *m, + size_t *size); static bool upb_encode_array(upb_encstate *e, const char *field_mem, const upb_msglayout_msginit_v1 *m, @@ -319,146 +169,161 @@ static bool upb_encode_array(upb_encstate *e, const char *field_mem, return true; } - /* We encode all primitive arrays as packed, regardless of what was specified - * in the .proto file. Could special case 1-sized arrays. */ - if (!upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED)) { - return false; - } - -#define VARINT_CASE(ctype, encode) { \ - uint64_t *data = arr->data; \ - uint64_t *limit = data + arr->len; \ - if (!upb_encode_startdelim(e)) { \ - return false; \ - } \ - for (; data < limit; data++) { \ - if (!upb_put_varint(e, encode)) { \ - return false; \ - } \ - } \ - return upb_encode_enddelim(e); \ -} +#define VARINT_CASE(ctype, encode) do { \ + uint64_t *start = arr->data; \ + uint64_t *ptr = start + arr->len; \ + char *buf_ptr = e->ptr; \ + do { \ + ptr--; \ + CHK(upb_put_varint(e, encode)); \ + } while (ptr != start); \ + CHK(upb_put_varint(e, buf_ptr - e->ptr)); \ + break; \ +} while(0) switch (f->type) { case UPB_DESCRIPTOR_TYPE_DOUBLE: - return upb_put_fixedarray(e, arr, sizeof(double)); + CHK(upb_put_fixedarray(e, arr, sizeof(double))); + break; case UPB_DESCRIPTOR_TYPE_FLOAT: - return upb_put_fixedarray(e, arr, sizeof(float)); + CHK(upb_put_fixedarray(e, arr, sizeof(float))); + break; case UPB_DESCRIPTOR_TYPE_SFIXED64: case UPB_DESCRIPTOR_TYPE_FIXED64: - return upb_put_fixedarray(e, arr, sizeof(uint64_t)); + CHK(upb_put_fixedarray(e, arr, sizeof(uint64_t))); + break; case UPB_DESCRIPTOR_TYPE_FIXED32: case UPB_DESCRIPTOR_TYPE_SFIXED32: - return upb_put_fixedarray(e, arr, sizeof(uint32_t)); + CHK(upb_put_fixedarray(e, arr, sizeof(uint32_t))); + break; case UPB_DESCRIPTOR_TYPE_INT64: case UPB_DESCRIPTOR_TYPE_UINT64: - VARINT_CASE(uint64_t, *data); + VARINT_CASE(uint64_t, *ptr); case UPB_DESCRIPTOR_TYPE_UINT32: case UPB_DESCRIPTOR_TYPE_INT32: case UPB_DESCRIPTOR_TYPE_ENUM: - VARINT_CASE(uint32_t, *data); + VARINT_CASE(uint32_t, *ptr); case UPB_DESCRIPTOR_TYPE_BOOL: - VARINT_CASE(bool, *data); + VARINT_CASE(bool, *ptr); case UPB_DESCRIPTOR_TYPE_SINT32: - VARINT_CASE(int32_t, upb_zzenc_32(*data)); + VARINT_CASE(int32_t, upb_zzenc_32(*ptr)); case UPB_DESCRIPTOR_TYPE_SINT64: - VARINT_CASE(int64_t, upb_zzenc_64(*data)); + VARINT_CASE(int64_t, upb_zzenc_64(*ptr)); case UPB_DESCRIPTOR_TYPE_STRING: case UPB_DESCRIPTOR_TYPE_BYTES: { - upb_stringview *data = arr->data; - upb_stringview *limit = data + arr->len; - goto put_string_data; /* Skip first tag, we already put it. */ - for (; data < limit; data++) { - if (!upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED)) { - return false; - } -put_string_data: - if (!upb_put_varint(e, data->size) || - !upb_put_bytes(e, data->data, data->size)) { - return false; - } - } + upb_stringview *start = arr->data; + upb_stringview *ptr = start + arr->len; + do { + ptr--; + CHK(upb_put_bytes(e, ptr->data, ptr->size) && + upb_put_varint(e, ptr->size) && + upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED)); + } while (ptr != start); + return true; + } + case UPB_DESCRIPTOR_TYPE_GROUP: { + void **start = arr->data; + void **ptr = start + arr->len; + const upb_msglayout_msginit_v1 *subm = m->submsgs[f->submsg_index]; + do { + size_t size; + ptr--; + CHK(upb_put_tag(e, f->number, UPB_WIRE_TYPE_END_GROUP) && + upb_encode_message(e, *ptr, subm, &size) && + upb_put_tag(e, f->number, UPB_WIRE_TYPE_START_GROUP)); + } while (ptr != start); + return true; } - case UPB_DESCRIPTOR_TYPE_GROUP: case UPB_DESCRIPTOR_TYPE_MESSAGE: { - void **data = arr->data; - void **limit = data + arr->len; + void **start = arr->data; + void **ptr = start + arr->len; const upb_msglayout_msginit_v1 *subm = m->submsgs[f->submsg_index]; - goto put_submsg_data; /* Skip first tag, we already put it. */ - for (; data < limit; data++) { - if (!upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED)) { - return false; - } -put_submsg_data: - if (!upb_encode_startdelim(e) || - !upb_encode_message(e, *data, subm) || - !upb_encode_enddelim(e)) { - return false; - } - } + do { + size_t size; + ptr--; + CHK(upb_encode_message(e, *ptr, subm, &size) && + upb_put_varint(e, size) && + upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED)); + } while (ptr != start); + return true; } } - UPB_UNREACHABLE(); #undef VARINT_CASE + + /* We encode all primitive arrays as packed, regardless of what was specified + * in the .proto file. Could special case 1-sized arrays. */ + CHK(upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED)); + return true; } static bool upb_encode_scalarfield(upb_encstate *e, const char *field_mem, const upb_msglayout_msginit_v1 *m, const upb_msglayout_fieldinit_v1 *f, bool is_proto3) { -#define CASE(ctype, type, wire_type, encodeval) { \ +#define CASE(ctype, type, wire_type, encodeval) do { \ ctype val = *(ctype*)field_mem; \ if (is_proto3 && val == 0) { \ return true; \ } \ - return upb_put_tag(e, f->number, wire_type) && \ - upb_put_ ## type(e, encodeval); \ -} + return upb_put_ ## type(e, encodeval) && \ + upb_put_tag(e, f->number, wire_type); \ +} while(0) switch (f->type) { case UPB_DESCRIPTOR_TYPE_DOUBLE: - CASE(double, double, UPB_WIRE_TYPE_64BIT, val) + CASE(double, double, UPB_WIRE_TYPE_64BIT, val); case UPB_DESCRIPTOR_TYPE_FLOAT: - CASE(float, float, UPB_WIRE_TYPE_32BIT, val) + CASE(float, float, UPB_WIRE_TYPE_32BIT, val); case UPB_DESCRIPTOR_TYPE_INT64: case UPB_DESCRIPTOR_TYPE_UINT64: - CASE(uint64_t, varint, UPB_WIRE_TYPE_VARINT, val) + CASE(uint64_t, varint, UPB_WIRE_TYPE_VARINT, val); case UPB_DESCRIPTOR_TYPE_UINT32: case UPB_DESCRIPTOR_TYPE_INT32: case UPB_DESCRIPTOR_TYPE_ENUM: - CASE(uint32_t, varint, UPB_WIRE_TYPE_VARINT, val) + CASE(uint32_t, varint, UPB_WIRE_TYPE_VARINT, val); case UPB_DESCRIPTOR_TYPE_SFIXED64: case UPB_DESCRIPTOR_TYPE_FIXED64: - CASE(uint64_t, fixed64, UPB_WIRE_TYPE_64BIT, val) + CASE(uint64_t, fixed64, UPB_WIRE_TYPE_64BIT, val); case UPB_DESCRIPTOR_TYPE_FIXED32: case UPB_DESCRIPTOR_TYPE_SFIXED32: - CASE(uint32_t, fixed32, UPB_WIRE_TYPE_32BIT, val) + CASE(uint32_t, fixed32, UPB_WIRE_TYPE_32BIT, val); case UPB_DESCRIPTOR_TYPE_BOOL: - CASE(bool, varint, UPB_WIRE_TYPE_VARINT, val) + CASE(bool, varint, UPB_WIRE_TYPE_VARINT, val); case UPB_DESCRIPTOR_TYPE_SINT32: - CASE(int32_t, varint, UPB_WIRE_TYPE_VARINT, upb_zzenc_32(val)) + CASE(int32_t, varint, UPB_WIRE_TYPE_VARINT, upb_zzenc_32(val)); case UPB_DESCRIPTOR_TYPE_SINT64: - CASE(int64_t, varint, UPB_WIRE_TYPE_VARINT, upb_zzenc_64(val)) + CASE(int64_t, varint, UPB_WIRE_TYPE_VARINT, upb_zzenc_64(val)); case UPB_DESCRIPTOR_TYPE_STRING: case UPB_DESCRIPTOR_TYPE_BYTES: { upb_stringview view = *(upb_stringview*)field_mem; if (is_proto3 && view.size == 0) { return true; } - return upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED) && + return upb_put_bytes(e, view.data, view.size) && upb_put_varint(e, view.size) && - upb_put_bytes(e, view.data, view.size); + upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED); + } + case UPB_DESCRIPTOR_TYPE_GROUP: { + size_t size; + void *submsg = *(void**)field_mem; + const upb_msglayout_msginit_v1 *subm = m->submsgs[f->submsg_index]; + if (is_proto3 && submsg == NULL) { + return true; + } + return upb_put_tag(e, f->number, UPB_WIRE_TYPE_END_GROUP) && + upb_encode_message(e, submsg, subm, &size) && + upb_put_tag(e, f->number, UPB_WIRE_TYPE_START_GROUP); } - case UPB_DESCRIPTOR_TYPE_GROUP: case UPB_DESCRIPTOR_TYPE_MESSAGE: { + size_t size; void *submsg = *(void**)field_mem; + const upb_msglayout_msginit_v1 *subm = m->submsgs[f->submsg_index]; if (is_proto3 && submsg == NULL) { return true; } - return upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED) && - upb_encode_startdelim(e) && - upb_encode_message(e, submsg, m->submsgs[f->submsg_index]) && - upb_encode_enddelim(e); + return upb_encode_message(e, submsg, subm, &size) && + upb_put_varint(e, size) && + upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED); } } #undef CASE @@ -479,34 +344,38 @@ bool upb_encode_hasscalarfield(const char *msg, } bool upb_encode_message(upb_encstate* e, const char *msg, - const upb_msglayout_msginit_v1 *m) { + const upb_msglayout_msginit_v1 *m, + size_t *size) { int i; - for (i = 0; i < m->field_count; i++) { + char *buf_end = e->ptr; + for (i = m->field_count - 1; i >= 0; i--) { const upb_msglayout_fieldinit_v1 *f = &m->fields[i]; if (f->label == UPB_LABEL_REPEATED) { - if (!upb_encode_array(e, msg, m, f)) { - return NULL; - } + CHK(upb_encode_array(e, msg, m, f)); } else { - if (upb_encode_hasscalarfield(msg, m, f) && - !upb_encode_scalarfield(e, msg + f->offset, m, f, !m->is_proto2)) { - return NULL; + if (upb_encode_hasscalarfield(msg, m, f)) { + CHK(upb_encode_scalarfield(e, msg + f->offset, m, f, !m->is_proto2)); } } } + *size = buf_end - e->ptr; return true; } char *upb_encode(const void *msg, const upb_msglayout_msginit_v1 *m, upb_env *env, size_t *size) { upb_encstate e; + e.env = env; + e.buf = NULL; + e.limit = NULL; + e.ptr = NULL; - if (!upb_encode_message(&e, msg, m)) { + if (!upb_encode_message(&e, msg, m, size)) { return false; } - *size = e.ptr - e.buf; - return e.buf; + *size = e.limit - e.ptr; + return e.ptr; } -- cgit v1.2.3