diff options
author | Josh Haberman <jhaberman@gmail.com> | 2017-07-08 00:00:05 -0700 |
---|---|---|
committer | Josh Haberman <jhaberman@gmail.com> | 2017-07-08 00:00:05 -0700 |
commit | 1aafd4111b9b6d08d2d0937b0f396a4caa9ea04d (patch) | |
tree | 9bcb6c61ee333de00819973f3a3acd8aab9238e5 /upb/encode.c | |
parent | dd536fd567b2ba9bed67a0094c5cd905c774f951 (diff) |
A good start on upb_encode and upb_decode.
Diffstat (limited to 'upb/encode.c')
-rw-r--r-- | upb/encode.c | 512 |
1 files changed, 512 insertions, 0 deletions
diff --git a/upb/encode.c b/upb/encode.c new file mode 100644 index 0000000..30f2da7 --- /dev/null +++ b/upb/encode.c @@ -0,0 +1,512 @@ + +#include "upb/encode.h" +#include "upb/structs.int.h" + +#define UPB_PB_VARINT_MAX_LEN 10 + +static size_t upb_encode_varint(uint64_t val, char *buf) { + size_t i; + if (val == 0) { buf[0] = 0; return 1; } + i = 0; + while (val) { + uint8_t byte = val & 0x7fU; + val >>= 7; + if (val) byte |= 0x80U; + buf[i++] = byte; + } + return i; +} + +static size_t upb_varint_size(uint64_t val) { + char buf[UPB_PB_VARINT_MAX_LEN]; + return upb_encode_varint(val, buf); +} + +static uint32_t upb_zzenc_32(int32_t n) { return (n << 1) ^ (n >> 31); } +static uint64_t upb_zzenc_64(int64_t n) { return (n << 1) ^ (n >> 63); } + +typedef enum { + UPB_WIRE_TYPE_VARINT = 0, + UPB_WIRE_TYPE_64BIT = 1, + UPB_WIRE_TYPE_DELIMITED = 2, + UPB_WIRE_TYPE_START_GROUP = 3, + UPB_WIRE_TYPE_END_GROUP = 4, + UPB_WIRE_TYPE_32BIT = 5 +} upb_wiretype_t; + +/* Index is descriptor type. */ +const uint8_t upb_native_wiretypes[] = { + UPB_WIRE_TYPE_END_GROUP, /* ENDGROUP */ + UPB_WIRE_TYPE_64BIT, /* DOUBLE */ + UPB_WIRE_TYPE_32BIT, /* FLOAT */ + UPB_WIRE_TYPE_VARINT, /* INT64 */ + UPB_WIRE_TYPE_VARINT, /* UINT64 */ + UPB_WIRE_TYPE_VARINT, /* INT32 */ + UPB_WIRE_TYPE_64BIT, /* FIXED64 */ + UPB_WIRE_TYPE_32BIT, /* FIXED32 */ + UPB_WIRE_TYPE_VARINT, /* BOOL */ + UPB_WIRE_TYPE_DELIMITED, /* STRING */ + UPB_WIRE_TYPE_START_GROUP, /* GROUP */ + UPB_WIRE_TYPE_DELIMITED, /* MESSAGE */ + UPB_WIRE_TYPE_DELIMITED, /* BYTES */ + UPB_WIRE_TYPE_VARINT, /* UINT32 */ + UPB_WIRE_TYPE_VARINT, /* ENUM */ + UPB_WIRE_TYPE_32BIT, /* SFIXED32 */ + UPB_WIRE_TYPE_64BIT, /* SFIXED64 */ + UPB_WIRE_TYPE_VARINT, /* SINT32 */ + UPB_WIRE_TYPE_VARINT, /* SINT64 */ +}; + +/* The output buffer is divided into segments; a segment is a string of data + * that is "ready to go" -- it does not need any varint lengths inserted into + * the middle. The seams between segments are where varints will be inserted + * once they are known. + * + * We also use the concept of a "run", which is a range of encoded bytes that + * occur at a single submessage level. Every segment contains one or more runs. + * + * A segment can span messages. Consider: + * + * .--Submessage lengths---------. + * | | | + * | V V + * V | |--------------- | |----------------- + * Submessages: | |----------------------------------------------- + * Top-level msg: ------------------------------------------------------------ + * + * Segments: ----- ------------------- ----------------- + * Runs: *---- *--------------*--- *---------------- + * (* marks the start) + * + * Note that the top-level menssage is not in any segment because it does not + * have any length preceding it. + * + * A segment is only interrupted when another length needs to be inserted. So + * observe how the second segment spans both the inner submessage and part of + * the next enclosing message. */ + +typedef struct { + uint32_t msglen; /* The length to varint-encode before this segment. */ + uint32_t seglen; /* Length of the segment. */ +} upb_segment; + +typedef struct { + upb_env *env; + char *buf, *ptr, *limit; + + /* The beginning of the current run, or undefined if we are at the top + * level. */ + char *runbegin; + + /* The list of segments we are accumulating. */ + upb_segment *segbuf, *segptr, *seglimit; + + /* The stack of enclosing submessages. Each entry in the stack points to the + * segment where this submessage's length is being accumulated. */ + int *stack, *top, *stacklimit; +} upb_encstate; + +static upb_segment *upb_encode_top(upb_encstate *e) { + return &e->segbuf[*e->top]; +} + +static bool upb_encode_growbuffer(upb_encstate *e, size_t bytes) { + char *new_buf; + size_t needed = bytes + (e->ptr - e->buf); + size_t old_size = e->limit - e->buf; + + size_t new_size = old_size; + + while (new_size < needed) { + new_size *= 2; + } + + new_buf = upb_env_realloc(e->env, e->buf, old_size, new_size); + + if (new_buf == NULL) { + return false; + } + + e->ptr = new_buf + (e->ptr - e->buf); + e->runbegin = new_buf + (e->runbegin - e->buf); + e->limit = new_buf + new_size; + e->buf = new_buf; + return true; +} + +/* Call to ensure that at least "bytes" bytes are available for writing at + * e->ptr. Returns false if the bytes could not be allocated. */ +static bool upb_encode_reserve(upb_encstate *e, size_t bytes) { + if (UPB_LIKELY((size_t)(e->limit - e->ptr) >= bytes)) { + return true; + } + + return upb_encode_growbuffer(e, bytes); +} + +/* Call when "bytes" bytes have been writte at e->ptr. The caller *must* have + * previously called reserve() with at least this many bytes. */ +static void upb_encode_advance(upb_encstate *e, size_t bytes) { + UPB_ASSERT((size_t)(e->limit - e->ptr) >= bytes); + e->ptr += bytes; +} + +/* Writes the given bytes to the buffer, handling reserve/advance. */ +static bool upb_put_bytes(upb_encstate *e, const void *data, size_t len) { + if (!upb_encode_reserve(e, len)) { + return false; + } + + memcpy(e->ptr, data, len); + upb_encode_advance(e, len); + return true; +} + +/* Finish the current run by adding the run totals to the segment and message + * length. */ +static void upb_encode_accumulate(upb_encstate *e) { + size_t run_len; + UPB_ASSERT(e->ptr >= e->runbegin); + run_len = e->ptr - e->runbegin; + e->segptr->seglen += run_len; + upb_encode_top(e)->msglen += run_len; + e->runbegin = e->ptr; +} + +/* Call to indicate the start of delimited region for which the full length is + * not yet known. The length will be inserted at the current position once it + * is known (and subsequent data moved if necessary). */ +static bool upb_encode_startdelim(upb_encstate *e) { + if (e->top) { + /* We are already buffering, advance to the next segment and push it on the + * stack. */ + upb_encode_accumulate(e); + + if (++e->top == e->stacklimit) { + /* TODO(haberman): grow stack? */ + return false; + } + + if (++e->segptr == e->seglimit) { + /* Grow segment buffer. */ + size_t old_size = + (e->seglimit - e->segbuf) * sizeof(upb_segment); + size_t new_size = old_size * 2; + upb_segment *new_buf = + upb_env_realloc(e->env, e->segbuf, old_size, new_size); + + if (new_buf == NULL) { + return false; + } + + e->segptr = new_buf + (e->segptr - e->segbuf); + e->seglimit = new_buf + (new_size / sizeof(upb_segment)); + e->segbuf = new_buf; + } + } else { + /* We were previously at the top level, start buffering. */ + e->segptr = e->segbuf; + e->top = e->stack; + e->runbegin = e->ptr; + } + + *e->top = e->segptr - e->segbuf; + e->segptr->seglen = 0; + e->segptr->msglen = 0; + + return true; +} + +/* Call to indicate the end of a delimited region. We now know the length of + * the delimited region. If we are not nested inside any other delimited + * regions, we can now emit all of the buffered data we accumulated. */ +static bool upb_encode_enddelim(upb_encstate *e) { + size_t msglen; + upb_encode_accumulate(e); + msglen = upb_encode_top(e)->msglen; + + if (e->top == e->stack) { + /* All lengths are now available, emit all buffered data. */ + char buf[UPB_PB_VARINT_MAX_LEN]; + upb_segment *s; + const char *ptr = e->buf; + for (s = e->segbuf; s <= e->segptr; s++) { + size_t lenbytes = upb_encode_varint(s->msglen, buf); + //putbuf(e, buf, lenbytes); + //putbuf(e, ptr, s->seglen); + ptr += s->seglen; + } + + e->ptr = e->buf; + e->top = NULL; + } else { + /* Need to keep buffering; propagate length info into enclosing + * submessages. */ + --e->top; + upb_encode_top(e)->msglen += msglen + upb_varint_size(msglen); + } + + return true; +} + +/* encoding of wire types *****************************************************/ + +static bool upb_put_fixed64(upb_encstate *e, uint64_t val) { + /* TODO(haberman): byte-swap for big endian. */ + return upb_put_bytes(e, &val, sizeof(uint64_t)); +} + +static bool upb_put_fixed32(upb_encstate *e, uint32_t val) { + /* TODO(haberman): byte-swap for big endian. */ + return upb_put_bytes(e, &val, sizeof(uint32_t)); +} + +static bool upb_put_varint(upb_encstate *e, uint64_t val) { + if (!upb_encode_reserve(e, UPB_PB_VARINT_MAX_LEN)) { + return false; + } + + upb_encode_advance(e, upb_encode_varint(val, e->ptr)); + return true; +} + +static bool upb_put_double(upb_encstate *e, double d) { + uint64_t u64; + UPB_ASSERT(sizeof(double) == sizeof(uint64_t)); + memcpy(&u64, &d, sizeof(uint64_t)); + return upb_put_fixed64(e, u64); +} + +static bool upb_put_float(upb_encstate *e, float d) { + uint32_t u32; + UPB_ASSERT(sizeof(float) == sizeof(uint32_t)); + memcpy(&u32, &d, sizeof(uint32_t)); + return upb_put_fixed32(e, u32); +} + +static uint32_t upb_readcase(const char *msg, const upb_msglayout_msginit_v1 *m, + int oneof_index) { + uint32_t ret; + memcpy(&ret, msg + m->oneofs[oneof_index].case_offset, sizeof(ret)); + return ret; +} + +static bool upb_readhasbit(const char *msg, + const upb_msglayout_fieldinit_v1 *f) { + UPB_ASSERT(f->hasbit != UPB_NO_HASBIT); + return msg[f->hasbit / 8] & (1 << (f->hasbit % 8)); +} + +static bool upb_put_tag(upb_encstate *e, int field_number, int wire_type) { + return upb_put_varint(e, (field_number << 3) | wire_type); +} + +static bool upb_put_fixedarray(upb_encstate *e, const upb_array *arr, + size_t size) { + size_t bytes = arr->len * size; + return upb_put_varint(e, bytes) && upb_put_bytes(e, arr->data, bytes); +} + +bool upb_encode_message(upb_encstate *e, const char *msg, + const upb_msglayout_msginit_v1 *m); + +static bool upb_encode_array(upb_encstate *e, const char *field_mem, + const upb_msglayout_msginit_v1 *m, + const upb_msglayout_fieldinit_v1 *f) { + const upb_array *arr = *(const upb_array**)field_mem; + + if (arr->len == 0) { + return true; + } + + /* We encode all primitive arrays as packed, regardless of what was specified + * in the .proto file. Could special case 1-sized arrays. */ + if (!upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED)) { + return false; + } + +#define VARINT_CASE(ctype, encode) { \ + uint64_t *data = arr->data; \ + uint64_t *limit = data + arr->len; \ + if (!upb_encode_startdelim(e)) { \ + return false; \ + } \ + for (; data < limit; data++) { \ + if (!upb_put_varint(e, encode)) { \ + return false; \ + } \ + } \ + return upb_encode_enddelim(e); \ +} + + switch (f->type) { + case UPB_DESCRIPTOR_TYPE_DOUBLE: + return upb_put_fixedarray(e, arr, sizeof(double)); + case UPB_DESCRIPTOR_TYPE_FLOAT: + return upb_put_fixedarray(e, arr, sizeof(float)); + case UPB_DESCRIPTOR_TYPE_SFIXED64: + case UPB_DESCRIPTOR_TYPE_FIXED64: + return upb_put_fixedarray(e, arr, sizeof(uint64_t)); + case UPB_DESCRIPTOR_TYPE_FIXED32: + case UPB_DESCRIPTOR_TYPE_SFIXED32: + return upb_put_fixedarray(e, arr, sizeof(uint32_t)); + case UPB_DESCRIPTOR_TYPE_INT64: + case UPB_DESCRIPTOR_TYPE_UINT64: + VARINT_CASE(uint64_t, *data); + case UPB_DESCRIPTOR_TYPE_UINT32: + case UPB_DESCRIPTOR_TYPE_INT32: + case UPB_DESCRIPTOR_TYPE_ENUM: + VARINT_CASE(uint32_t, *data); + case UPB_DESCRIPTOR_TYPE_BOOL: + VARINT_CASE(bool, *data); + case UPB_DESCRIPTOR_TYPE_SINT32: + VARINT_CASE(int32_t, upb_zzenc_32(*data)); + case UPB_DESCRIPTOR_TYPE_SINT64: + VARINT_CASE(int64_t, upb_zzenc_64(*data)); + case UPB_DESCRIPTOR_TYPE_STRING: + case UPB_DESCRIPTOR_TYPE_BYTES: { + upb_stringview *data = arr->data; + upb_stringview *limit = data + arr->len; + goto put_string_data; /* Skip first tag, we already put it. */ + for (; data < limit; data++) { + if (!upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED)) { + return false; + } +put_string_data: + if (!upb_put_varint(e, data->size) || + !upb_put_bytes(e, data->data, data->size)) { + return false; + } + } + } + case UPB_DESCRIPTOR_TYPE_GROUP: + case UPB_DESCRIPTOR_TYPE_MESSAGE: { + void **data = arr->data; + void **limit = data + arr->len; + const upb_msglayout_msginit_v1 *subm = m->submsgs[f->submsg_index]; + goto put_submsg_data; /* Skip first tag, we already put it. */ + for (; data < limit; data++) { + if (!upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED)) { + return false; + } +put_submsg_data: + if (!upb_encode_startdelim(e) || + !upb_encode_message(e, *data, subm) || + !upb_encode_enddelim(e)) { + return false; + } + } + } + } + UPB_UNREACHABLE(); +#undef VARINT_CASE +} + +static bool upb_encode_scalarfield(upb_encstate *e, const char *field_mem, + const upb_msglayout_msginit_v1 *m, + const upb_msglayout_fieldinit_v1 *f, + bool is_proto3) { +#define CASE(ctype, type, wire_type, encodeval) { \ + ctype val = *(ctype*)field_mem; \ + if (is_proto3 && val == 0) { \ + return true; \ + } \ + return upb_put_tag(e, f->number, wire_type) && \ + upb_put_ ## type(e, encodeval); \ +} + + switch (f->type) { + case UPB_DESCRIPTOR_TYPE_DOUBLE: + CASE(double, double, UPB_WIRE_TYPE_64BIT, val) + case UPB_DESCRIPTOR_TYPE_FLOAT: + CASE(float, float, UPB_WIRE_TYPE_32BIT, val) + case UPB_DESCRIPTOR_TYPE_INT64: + case UPB_DESCRIPTOR_TYPE_UINT64: + CASE(uint64_t, varint, UPB_WIRE_TYPE_VARINT, val) + case UPB_DESCRIPTOR_TYPE_UINT32: + case UPB_DESCRIPTOR_TYPE_INT32: + case UPB_DESCRIPTOR_TYPE_ENUM: + CASE(uint32_t, varint, UPB_WIRE_TYPE_VARINT, val) + case UPB_DESCRIPTOR_TYPE_SFIXED64: + case UPB_DESCRIPTOR_TYPE_FIXED64: + CASE(uint64_t, fixed64, UPB_WIRE_TYPE_64BIT, val) + case UPB_DESCRIPTOR_TYPE_FIXED32: + case UPB_DESCRIPTOR_TYPE_SFIXED32: + CASE(uint32_t, fixed32, UPB_WIRE_TYPE_32BIT, val) + case UPB_DESCRIPTOR_TYPE_BOOL: + CASE(bool, varint, UPB_WIRE_TYPE_VARINT, val) + case UPB_DESCRIPTOR_TYPE_SINT32: + CASE(int32_t, varint, UPB_WIRE_TYPE_VARINT, upb_zzenc_32(val)) + case UPB_DESCRIPTOR_TYPE_SINT64: + CASE(int64_t, varint, UPB_WIRE_TYPE_VARINT, upb_zzenc_64(val)) + case UPB_DESCRIPTOR_TYPE_STRING: + case UPB_DESCRIPTOR_TYPE_BYTES: { + upb_stringview view = *(upb_stringview*)field_mem; + if (is_proto3 && view.size == 0) { + return true; + } + return upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED) && + upb_put_varint(e, view.size) && + upb_put_bytes(e, view.data, view.size); + } + case UPB_DESCRIPTOR_TYPE_GROUP: + case UPB_DESCRIPTOR_TYPE_MESSAGE: { + void *submsg = *(void**)field_mem; + if (is_proto3 && submsg == NULL) { + return true; + } + return upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED) && + upb_encode_startdelim(e) && + upb_encode_message(e, submsg, m->submsgs[f->submsg_index]) && + upb_encode_enddelim(e); + } + } +#undef CASE + UPB_UNREACHABLE(); +} + +bool upb_encode_hasscalarfield(const char *msg, + const upb_msglayout_msginit_v1 *m, + const upb_msglayout_fieldinit_v1 *f) { + if (f->oneof_index != UPB_NOT_IN_ONEOF) { + return upb_readcase(msg, m, f->oneof_index) == f->number; + } else if (m->is_proto2) { + return upb_readhasbit(msg, f); + } else { + /* For proto3, we'll test for the field being empty later. */ + return true; + } +} + +bool upb_encode_message(upb_encstate* e, const char *msg, + const upb_msglayout_msginit_v1 *m) { + int i; + for (i = 0; i < m->field_count; i++) { + const upb_msglayout_fieldinit_v1 *f = &m->fields[i]; + + if (f->label == UPB_LABEL_REPEATED) { + if (!upb_encode_array(e, msg, m, f)) { + return NULL; + } + } else { + if (upb_encode_hasscalarfield(msg, m, f) && + !upb_encode_scalarfield(e, msg + f->offset, m, f, !m->is_proto2)) { + return NULL; + } + } + } + + return true; +} + +char *upb_encode(const void *msg, const upb_msglayout_msginit_v1 *m, + upb_env *env, size_t *size) { + upb_encstate e; + + if (!upb_encode_message(&e, msg, m)) { + return false; + } + + *size = e.ptr - e.buf; + return e.buf; +} |