summaryrefslogtreecommitdiff
path: root/upb/encode.c
diff options
context:
space:
mode:
authorJosh Haberman <jhaberman@gmail.com>2017-07-08 00:00:05 -0700
committerJosh Haberman <jhaberman@gmail.com>2017-07-08 00:00:05 -0700
commit1aafd4111b9b6d08d2d0937b0f396a4caa9ea04d (patch)
tree9bcb6c61ee333de00819973f3a3acd8aab9238e5 /upb/encode.c
parentdd536fd567b2ba9bed67a0094c5cd905c774f951 (diff)
A good start on upb_encode and upb_decode.
Diffstat (limited to 'upb/encode.c')
-rw-r--r--upb/encode.c512
1 files changed, 512 insertions, 0 deletions
diff --git a/upb/encode.c b/upb/encode.c
new file mode 100644
index 0000000..30f2da7
--- /dev/null
+++ b/upb/encode.c
@@ -0,0 +1,512 @@
+
+#include "upb/encode.h"
+#include "upb/structs.int.h"
+
+#define UPB_PB_VARINT_MAX_LEN 10
+
+static size_t upb_encode_varint(uint64_t val, char *buf) {
+ size_t i;
+ if (val == 0) { buf[0] = 0; return 1; }
+ i = 0;
+ while (val) {
+ uint8_t byte = val & 0x7fU;
+ val >>= 7;
+ if (val) byte |= 0x80U;
+ buf[i++] = byte;
+ }
+ return i;
+}
+
+static size_t upb_varint_size(uint64_t val) {
+ char buf[UPB_PB_VARINT_MAX_LEN];
+ return upb_encode_varint(val, buf);
+}
+
+static uint32_t upb_zzenc_32(int32_t n) { return (n << 1) ^ (n >> 31); }
+static uint64_t upb_zzenc_64(int64_t n) { return (n << 1) ^ (n >> 63); }
+
+typedef enum {
+ UPB_WIRE_TYPE_VARINT = 0,
+ UPB_WIRE_TYPE_64BIT = 1,
+ UPB_WIRE_TYPE_DELIMITED = 2,
+ UPB_WIRE_TYPE_START_GROUP = 3,
+ UPB_WIRE_TYPE_END_GROUP = 4,
+ UPB_WIRE_TYPE_32BIT = 5
+} upb_wiretype_t;
+
+/* Index is descriptor type. */
+const uint8_t upb_native_wiretypes[] = {
+ UPB_WIRE_TYPE_END_GROUP, /* ENDGROUP */
+ UPB_WIRE_TYPE_64BIT, /* DOUBLE */
+ UPB_WIRE_TYPE_32BIT, /* FLOAT */
+ UPB_WIRE_TYPE_VARINT, /* INT64 */
+ UPB_WIRE_TYPE_VARINT, /* UINT64 */
+ UPB_WIRE_TYPE_VARINT, /* INT32 */
+ UPB_WIRE_TYPE_64BIT, /* FIXED64 */
+ UPB_WIRE_TYPE_32BIT, /* FIXED32 */
+ UPB_WIRE_TYPE_VARINT, /* BOOL */
+ UPB_WIRE_TYPE_DELIMITED, /* STRING */
+ UPB_WIRE_TYPE_START_GROUP, /* GROUP */
+ UPB_WIRE_TYPE_DELIMITED, /* MESSAGE */
+ UPB_WIRE_TYPE_DELIMITED, /* BYTES */
+ UPB_WIRE_TYPE_VARINT, /* UINT32 */
+ UPB_WIRE_TYPE_VARINT, /* ENUM */
+ UPB_WIRE_TYPE_32BIT, /* SFIXED32 */
+ UPB_WIRE_TYPE_64BIT, /* SFIXED64 */
+ UPB_WIRE_TYPE_VARINT, /* SINT32 */
+ UPB_WIRE_TYPE_VARINT, /* SINT64 */
+};
+
+/* The output buffer is divided into segments; a segment is a string of data
+ * that is "ready to go" -- it does not need any varint lengths inserted into
+ * the middle. The seams between segments are where varints will be inserted
+ * once they are known.
+ *
+ * We also use the concept of a "run", which is a range of encoded bytes that
+ * occur at a single submessage level. Every segment contains one or more runs.
+ *
+ * A segment can span messages. Consider:
+ *
+ * .--Submessage lengths---------.
+ * | | |
+ * | V V
+ * V | |--------------- | |-----------------
+ * Submessages: | |-----------------------------------------------
+ * Top-level msg: ------------------------------------------------------------
+ *
+ * Segments: ----- ------------------- -----------------
+ * Runs: *---- *--------------*--- *----------------
+ * (* marks the start)
+ *
+ * Note that the top-level menssage is not in any segment because it does not
+ * have any length preceding it.
+ *
+ * A segment is only interrupted when another length needs to be inserted. So
+ * observe how the second segment spans both the inner submessage and part of
+ * the next enclosing message. */
+
+typedef struct {
+ uint32_t msglen; /* The length to varint-encode before this segment. */
+ uint32_t seglen; /* Length of the segment. */
+} upb_segment;
+
+typedef struct {
+ upb_env *env;
+ char *buf, *ptr, *limit;
+
+ /* The beginning of the current run, or undefined if we are at the top
+ * level. */
+ char *runbegin;
+
+ /* The list of segments we are accumulating. */
+ upb_segment *segbuf, *segptr, *seglimit;
+
+ /* The stack of enclosing submessages. Each entry in the stack points to the
+ * segment where this submessage's length is being accumulated. */
+ int *stack, *top, *stacklimit;
+} upb_encstate;
+
+static upb_segment *upb_encode_top(upb_encstate *e) {
+ return &e->segbuf[*e->top];
+}
+
+static bool upb_encode_growbuffer(upb_encstate *e, size_t bytes) {
+ char *new_buf;
+ size_t needed = bytes + (e->ptr - e->buf);
+ size_t old_size = e->limit - e->buf;
+
+ size_t new_size = old_size;
+
+ while (new_size < needed) {
+ new_size *= 2;
+ }
+
+ new_buf = upb_env_realloc(e->env, e->buf, old_size, new_size);
+
+ if (new_buf == NULL) {
+ return false;
+ }
+
+ e->ptr = new_buf + (e->ptr - e->buf);
+ e->runbegin = new_buf + (e->runbegin - e->buf);
+ e->limit = new_buf + new_size;
+ e->buf = new_buf;
+ return true;
+}
+
+/* Call to ensure that at least "bytes" bytes are available for writing at
+ * e->ptr. Returns false if the bytes could not be allocated. */
+static bool upb_encode_reserve(upb_encstate *e, size_t bytes) {
+ if (UPB_LIKELY((size_t)(e->limit - e->ptr) >= bytes)) {
+ return true;
+ }
+
+ return upb_encode_growbuffer(e, bytes);
+}
+
+/* Call when "bytes" bytes have been writte at e->ptr. The caller *must* have
+ * previously called reserve() with at least this many bytes. */
+static void upb_encode_advance(upb_encstate *e, size_t bytes) {
+ UPB_ASSERT((size_t)(e->limit - e->ptr) >= bytes);
+ e->ptr += bytes;
+}
+
+/* Writes the given bytes to the buffer, handling reserve/advance. */
+static bool upb_put_bytes(upb_encstate *e, const void *data, size_t len) {
+ if (!upb_encode_reserve(e, len)) {
+ return false;
+ }
+
+ memcpy(e->ptr, data, len);
+ upb_encode_advance(e, len);
+ return true;
+}
+
+/* Finish the current run by adding the run totals to the segment and message
+ * length. */
+static void upb_encode_accumulate(upb_encstate *e) {
+ size_t run_len;
+ UPB_ASSERT(e->ptr >= e->runbegin);
+ run_len = e->ptr - e->runbegin;
+ e->segptr->seglen += run_len;
+ upb_encode_top(e)->msglen += run_len;
+ e->runbegin = e->ptr;
+}
+
+/* Call to indicate the start of delimited region for which the full length is
+ * not yet known. The length will be inserted at the current position once it
+ * is known (and subsequent data moved if necessary). */
+static bool upb_encode_startdelim(upb_encstate *e) {
+ if (e->top) {
+ /* We are already buffering, advance to the next segment and push it on the
+ * stack. */
+ upb_encode_accumulate(e);
+
+ if (++e->top == e->stacklimit) {
+ /* TODO(haberman): grow stack? */
+ return false;
+ }
+
+ if (++e->segptr == e->seglimit) {
+ /* Grow segment buffer. */
+ size_t old_size =
+ (e->seglimit - e->segbuf) * sizeof(upb_segment);
+ size_t new_size = old_size * 2;
+ upb_segment *new_buf =
+ upb_env_realloc(e->env, e->segbuf, old_size, new_size);
+
+ if (new_buf == NULL) {
+ return false;
+ }
+
+ e->segptr = new_buf + (e->segptr - e->segbuf);
+ e->seglimit = new_buf + (new_size / sizeof(upb_segment));
+ e->segbuf = new_buf;
+ }
+ } else {
+ /* We were previously at the top level, start buffering. */
+ e->segptr = e->segbuf;
+ e->top = e->stack;
+ e->runbegin = e->ptr;
+ }
+
+ *e->top = e->segptr - e->segbuf;
+ e->segptr->seglen = 0;
+ e->segptr->msglen = 0;
+
+ return true;
+}
+
+/* Call to indicate the end of a delimited region. We now know the length of
+ * the delimited region. If we are not nested inside any other delimited
+ * regions, we can now emit all of the buffered data we accumulated. */
+static bool upb_encode_enddelim(upb_encstate *e) {
+ size_t msglen;
+ upb_encode_accumulate(e);
+ msglen = upb_encode_top(e)->msglen;
+
+ if (e->top == e->stack) {
+ /* All lengths are now available, emit all buffered data. */
+ char buf[UPB_PB_VARINT_MAX_LEN];
+ upb_segment *s;
+ const char *ptr = e->buf;
+ for (s = e->segbuf; s <= e->segptr; s++) {
+ size_t lenbytes = upb_encode_varint(s->msglen, buf);
+ //putbuf(e, buf, lenbytes);
+ //putbuf(e, ptr, s->seglen);
+ ptr += s->seglen;
+ }
+
+ e->ptr = e->buf;
+ e->top = NULL;
+ } else {
+ /* Need to keep buffering; propagate length info into enclosing
+ * submessages. */
+ --e->top;
+ upb_encode_top(e)->msglen += msglen + upb_varint_size(msglen);
+ }
+
+ return true;
+}
+
+/* encoding of wire types *****************************************************/
+
+static bool upb_put_fixed64(upb_encstate *e, uint64_t val) {
+ /* TODO(haberman): byte-swap for big endian. */
+ return upb_put_bytes(e, &val, sizeof(uint64_t));
+}
+
+static bool upb_put_fixed32(upb_encstate *e, uint32_t val) {
+ /* TODO(haberman): byte-swap for big endian. */
+ return upb_put_bytes(e, &val, sizeof(uint32_t));
+}
+
+static bool upb_put_varint(upb_encstate *e, uint64_t val) {
+ if (!upb_encode_reserve(e, UPB_PB_VARINT_MAX_LEN)) {
+ return false;
+ }
+
+ upb_encode_advance(e, upb_encode_varint(val, e->ptr));
+ return true;
+}
+
+static bool upb_put_double(upb_encstate *e, double d) {
+ uint64_t u64;
+ UPB_ASSERT(sizeof(double) == sizeof(uint64_t));
+ memcpy(&u64, &d, sizeof(uint64_t));
+ return upb_put_fixed64(e, u64);
+}
+
+static bool upb_put_float(upb_encstate *e, float d) {
+ uint32_t u32;
+ UPB_ASSERT(sizeof(float) == sizeof(uint32_t));
+ memcpy(&u32, &d, sizeof(uint32_t));
+ return upb_put_fixed32(e, u32);
+}
+
+static uint32_t upb_readcase(const char *msg, const upb_msglayout_msginit_v1 *m,
+ int oneof_index) {
+ uint32_t ret;
+ memcpy(&ret, msg + m->oneofs[oneof_index].case_offset, sizeof(ret));
+ return ret;
+}
+
+static bool upb_readhasbit(const char *msg,
+ const upb_msglayout_fieldinit_v1 *f) {
+ UPB_ASSERT(f->hasbit != UPB_NO_HASBIT);
+ return msg[f->hasbit / 8] & (1 << (f->hasbit % 8));
+}
+
+static bool upb_put_tag(upb_encstate *e, int field_number, int wire_type) {
+ return upb_put_varint(e, (field_number << 3) | wire_type);
+}
+
+static bool upb_put_fixedarray(upb_encstate *e, const upb_array *arr,
+ size_t size) {
+ size_t bytes = arr->len * size;
+ return upb_put_varint(e, bytes) && upb_put_bytes(e, arr->data, bytes);
+}
+
+bool upb_encode_message(upb_encstate *e, const char *msg,
+ const upb_msglayout_msginit_v1 *m);
+
+static bool upb_encode_array(upb_encstate *e, const char *field_mem,
+ const upb_msglayout_msginit_v1 *m,
+ const upb_msglayout_fieldinit_v1 *f) {
+ const upb_array *arr = *(const upb_array**)field_mem;
+
+ if (arr->len == 0) {
+ return true;
+ }
+
+ /* We encode all primitive arrays as packed, regardless of what was specified
+ * in the .proto file. Could special case 1-sized arrays. */
+ if (!upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED)) {
+ return false;
+ }
+
+#define VARINT_CASE(ctype, encode) { \
+ uint64_t *data = arr->data; \
+ uint64_t *limit = data + arr->len; \
+ if (!upb_encode_startdelim(e)) { \
+ return false; \
+ } \
+ for (; data < limit; data++) { \
+ if (!upb_put_varint(e, encode)) { \
+ return false; \
+ } \
+ } \
+ return upb_encode_enddelim(e); \
+}
+
+ switch (f->type) {
+ case UPB_DESCRIPTOR_TYPE_DOUBLE:
+ return upb_put_fixedarray(e, arr, sizeof(double));
+ case UPB_DESCRIPTOR_TYPE_FLOAT:
+ return upb_put_fixedarray(e, arr, sizeof(float));
+ case UPB_DESCRIPTOR_TYPE_SFIXED64:
+ case UPB_DESCRIPTOR_TYPE_FIXED64:
+ return upb_put_fixedarray(e, arr, sizeof(uint64_t));
+ case UPB_DESCRIPTOR_TYPE_FIXED32:
+ case UPB_DESCRIPTOR_TYPE_SFIXED32:
+ return upb_put_fixedarray(e, arr, sizeof(uint32_t));
+ case UPB_DESCRIPTOR_TYPE_INT64:
+ case UPB_DESCRIPTOR_TYPE_UINT64:
+ VARINT_CASE(uint64_t, *data);
+ case UPB_DESCRIPTOR_TYPE_UINT32:
+ case UPB_DESCRIPTOR_TYPE_INT32:
+ case UPB_DESCRIPTOR_TYPE_ENUM:
+ VARINT_CASE(uint32_t, *data);
+ case UPB_DESCRIPTOR_TYPE_BOOL:
+ VARINT_CASE(bool, *data);
+ case UPB_DESCRIPTOR_TYPE_SINT32:
+ VARINT_CASE(int32_t, upb_zzenc_32(*data));
+ case UPB_DESCRIPTOR_TYPE_SINT64:
+ VARINT_CASE(int64_t, upb_zzenc_64(*data));
+ case UPB_DESCRIPTOR_TYPE_STRING:
+ case UPB_DESCRIPTOR_TYPE_BYTES: {
+ upb_stringview *data = arr->data;
+ upb_stringview *limit = data + arr->len;
+ goto put_string_data; /* Skip first tag, we already put it. */
+ for (; data < limit; data++) {
+ if (!upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED)) {
+ return false;
+ }
+put_string_data:
+ if (!upb_put_varint(e, data->size) ||
+ !upb_put_bytes(e, data->data, data->size)) {
+ return false;
+ }
+ }
+ }
+ case UPB_DESCRIPTOR_TYPE_GROUP:
+ case UPB_DESCRIPTOR_TYPE_MESSAGE: {
+ void **data = arr->data;
+ void **limit = data + arr->len;
+ const upb_msglayout_msginit_v1 *subm = m->submsgs[f->submsg_index];
+ goto put_submsg_data; /* Skip first tag, we already put it. */
+ for (; data < limit; data++) {
+ if (!upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED)) {
+ return false;
+ }
+put_submsg_data:
+ if (!upb_encode_startdelim(e) ||
+ !upb_encode_message(e, *data, subm) ||
+ !upb_encode_enddelim(e)) {
+ return false;
+ }
+ }
+ }
+ }
+ UPB_UNREACHABLE();
+#undef VARINT_CASE
+}
+
+static bool upb_encode_scalarfield(upb_encstate *e, const char *field_mem,
+ const upb_msglayout_msginit_v1 *m,
+ const upb_msglayout_fieldinit_v1 *f,
+ bool is_proto3) {
+#define CASE(ctype, type, wire_type, encodeval) { \
+ ctype val = *(ctype*)field_mem; \
+ if (is_proto3 && val == 0) { \
+ return true; \
+ } \
+ return upb_put_tag(e, f->number, wire_type) && \
+ upb_put_ ## type(e, encodeval); \
+}
+
+ switch (f->type) {
+ case UPB_DESCRIPTOR_TYPE_DOUBLE:
+ CASE(double, double, UPB_WIRE_TYPE_64BIT, val)
+ case UPB_DESCRIPTOR_TYPE_FLOAT:
+ CASE(float, float, UPB_WIRE_TYPE_32BIT, val)
+ case UPB_DESCRIPTOR_TYPE_INT64:
+ case UPB_DESCRIPTOR_TYPE_UINT64:
+ CASE(uint64_t, varint, UPB_WIRE_TYPE_VARINT, val)
+ case UPB_DESCRIPTOR_TYPE_UINT32:
+ case UPB_DESCRIPTOR_TYPE_INT32:
+ case UPB_DESCRIPTOR_TYPE_ENUM:
+ CASE(uint32_t, varint, UPB_WIRE_TYPE_VARINT, val)
+ case UPB_DESCRIPTOR_TYPE_SFIXED64:
+ case UPB_DESCRIPTOR_TYPE_FIXED64:
+ CASE(uint64_t, fixed64, UPB_WIRE_TYPE_64BIT, val)
+ case UPB_DESCRIPTOR_TYPE_FIXED32:
+ case UPB_DESCRIPTOR_TYPE_SFIXED32:
+ CASE(uint32_t, fixed32, UPB_WIRE_TYPE_32BIT, val)
+ case UPB_DESCRIPTOR_TYPE_BOOL:
+ CASE(bool, varint, UPB_WIRE_TYPE_VARINT, val)
+ case UPB_DESCRIPTOR_TYPE_SINT32:
+ CASE(int32_t, varint, UPB_WIRE_TYPE_VARINT, upb_zzenc_32(val))
+ case UPB_DESCRIPTOR_TYPE_SINT64:
+ CASE(int64_t, varint, UPB_WIRE_TYPE_VARINT, upb_zzenc_64(val))
+ case UPB_DESCRIPTOR_TYPE_STRING:
+ case UPB_DESCRIPTOR_TYPE_BYTES: {
+ upb_stringview view = *(upb_stringview*)field_mem;
+ if (is_proto3 && view.size == 0) {
+ return true;
+ }
+ return upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED) &&
+ upb_put_varint(e, view.size) &&
+ upb_put_bytes(e, view.data, view.size);
+ }
+ case UPB_DESCRIPTOR_TYPE_GROUP:
+ case UPB_DESCRIPTOR_TYPE_MESSAGE: {
+ void *submsg = *(void**)field_mem;
+ if (is_proto3 && submsg == NULL) {
+ return true;
+ }
+ return upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED) &&
+ upb_encode_startdelim(e) &&
+ upb_encode_message(e, submsg, m->submsgs[f->submsg_index]) &&
+ upb_encode_enddelim(e);
+ }
+ }
+#undef CASE
+ UPB_UNREACHABLE();
+}
+
+bool upb_encode_hasscalarfield(const char *msg,
+ const upb_msglayout_msginit_v1 *m,
+ const upb_msglayout_fieldinit_v1 *f) {
+ if (f->oneof_index != UPB_NOT_IN_ONEOF) {
+ return upb_readcase(msg, m, f->oneof_index) == f->number;
+ } else if (m->is_proto2) {
+ return upb_readhasbit(msg, f);
+ } else {
+ /* For proto3, we'll test for the field being empty later. */
+ return true;
+ }
+}
+
+bool upb_encode_message(upb_encstate* e, const char *msg,
+ const upb_msglayout_msginit_v1 *m) {
+ int i;
+ for (i = 0; i < m->field_count; i++) {
+ const upb_msglayout_fieldinit_v1 *f = &m->fields[i];
+
+ if (f->label == UPB_LABEL_REPEATED) {
+ if (!upb_encode_array(e, msg, m, f)) {
+ return NULL;
+ }
+ } else {
+ if (upb_encode_hasscalarfield(msg, m, f) &&
+ !upb_encode_scalarfield(e, msg + f->offset, m, f, !m->is_proto2)) {
+ return NULL;
+ }
+ }
+ }
+
+ return true;
+}
+
+char *upb_encode(const void *msg, const upb_msglayout_msginit_v1 *m,
+ upb_env *env, size_t *size) {
+ upb_encstate e;
+
+ if (!upb_encode_message(&e, msg, m)) {
+ return false;
+ }
+
+ *size = e.ptr - e.buf;
+ return e.buf;
+}
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback