From 699b51b441c6f963ca95e9a7c40c2cb14adde014 Mon Sep 17 00:00:00 2001
From: Joshua Haberman <jhaberman@gmail.com>
Date: Mon, 10 Jul 2017 16:25:50 -0500
Subject: Lots of encoder/decoder work (backwards encoder).

---
 upb/encode.c | 401 ++++++++++++++++++++---------------------------------------
 1 file changed, 135 insertions(+), 266 deletions(-)

(limited to 'upb/encode.c')

diff --git a/upb/encode.c b/upb/encode.c
index 30f2da7..2fe1cc3 100644
--- a/upb/encode.c
+++ b/upb/encode.c
@@ -3,6 +3,7 @@
 #include "upb/structs.int.h"
 
 #define UPB_PB_VARINT_MAX_LEN 10
+#define CHK(x) do { if (!(x)) { return false; } } while(0)
 
 static size_t upb_encode_varint(uint64_t val, char *buf) {
   size_t i;
@@ -17,11 +18,6 @@ static size_t upb_encode_varint(uint64_t val, char *buf) {
   return i;
 }
 
-static size_t upb_varint_size(uint64_t val) {
-  char buf[UPB_PB_VARINT_MAX_LEN];
-  return upb_encode_varint(val, buf);
-}
-
 static uint32_t upb_zzenc_32(int32_t n) { return (n << 1) ^ (n >> 31); }
 static uint64_t upb_zzenc_64(int64_t n) { return (n << 1) ^ (n >> 63); }
 
@@ -57,78 +53,29 @@ const uint8_t upb_native_wiretypes[] = {
   UPB_WIRE_TYPE_VARINT,        /* SINT64 */
 };
 
-/* The output buffer is divided into segments; a segment is a string of data
- * that is "ready to go" -- it does not need any varint lengths inserted into
- * the middle.  The seams between segments are where varints will be inserted
- * once they are known.
- *
- * We also use the concept of a "run", which is a range of encoded bytes that
- * occur at a single submessage level.  Every segment contains one or more runs.
- *
- * A segment can span messages.  Consider:
- *
- *                  .--Submessage lengths---------.
- *                  |       |                     |
- *                  |       V                     V
- *                  V      | |---------------    | |-----------------
- * Submessages:    | |-----------------------------------------------
- * Top-level msg: ------------------------------------------------------------
- *
- * Segments:          -----   -------------------   -----------------
- * Runs:              *----   *--------------*---   *----------------
- * (* marks the start)
- *
- * Note that the top-level menssage is not in any segment because it does not
- * have any length preceding it.
- *
- * A segment is only interrupted when another length needs to be inserted.  So
- * observe how the second segment spans both the inner submessage and part of
- * the next enclosing message. */
-
-typedef struct {
-  uint32_t msglen;  /* The length to varint-encode before this segment. */
-  uint32_t seglen;  /* Length of the segment. */
-} upb_segment;
-
 typedef struct {
   upb_env *env;
   char *buf, *ptr, *limit;
-
-  /* The beginning of the current run, or undefined if we are at the top
-   * level. */
-  char *runbegin;
-
-  /* The list of segments we are accumulating. */
-  upb_segment *segbuf, *segptr, *seglimit;
-
-  /* The stack of enclosing submessages.  Each entry in the stack points to the
-   * segment where this submessage's length is being accumulated. */
-  int *stack, *top, *stacklimit;
 } upb_encstate;
 
-static upb_segment *upb_encode_top(upb_encstate *e) {
-  return &e->segbuf[*e->top];
+static size_t upb_roundup_pow2(size_t bytes) {
+  size_t ret = 128;
+  while (ret < bytes) {
+    ret *= 2;
+  }
+  return ret;
 }
 
 static bool upb_encode_growbuffer(upb_encstate *e, size_t bytes) {
-  char *new_buf;
-  size_t needed = bytes + (e->ptr - e->buf);
   size_t old_size = e->limit - e->buf;
+  size_t new_size = upb_roundup_pow2(bytes + (e->limit - e->ptr));
+  char *new_buf = upb_env_realloc(e->env, e->buf, old_size, new_size);
+  CHK(new_buf);
 
-  size_t new_size = old_size;
+  /* We want previous data at the end, realloc() put it at the beginning. */
+  memmove(e->limit - old_size, e->buf, old_size);
 
-  while (new_size < needed) {
-    new_size *= 2;
-  }
-
-  new_buf = upb_env_realloc(e->env, e->buf, old_size, new_size);
-
-  if (new_buf == NULL) {
-    return false;
-  }
-
-  e->ptr = new_buf + (e->ptr - e->buf);
-  e->runbegin = new_buf + (e->runbegin - e->buf);
+  e->ptr = new_buf + new_size - (e->limit - e->ptr);
   e->limit = new_buf + new_size;
   e->buf = new_buf;
   return true;
@@ -137,120 +84,20 @@ static bool upb_encode_growbuffer(upb_encstate *e, size_t bytes) {
 /* Call to ensure that at least "bytes" bytes are available for writing at
  * e->ptr.  Returns false if the bytes could not be allocated. */
 static bool upb_encode_reserve(upb_encstate *e, size_t bytes) {
-  if (UPB_LIKELY((size_t)(e->limit - e->ptr) >= bytes)) {
-    return true;
-  }
-
-  return upb_encode_growbuffer(e, bytes);
-}
+  CHK(UPB_LIKELY((size_t)(e->ptr - e->buf) >= bytes) ||
+      upb_encode_growbuffer(e, bytes));
 
-/* Call when "bytes" bytes have been writte at e->ptr.  The caller *must* have
- * previously called reserve() with at least this many bytes. */
-static void upb_encode_advance(upb_encstate *e, size_t bytes) {
-  UPB_ASSERT((size_t)(e->limit - e->ptr) >= bytes);
-  e->ptr += bytes;
+  e->ptr -= bytes;
+  return true;
 }
 
 /* Writes the given bytes to the buffer, handling reserve/advance. */
 static bool upb_put_bytes(upb_encstate *e, const void *data, size_t len) {
-  if (!upb_encode_reserve(e, len)) {
-    return false;
-  }
-
+  CHK(upb_encode_reserve(e, len));
   memcpy(e->ptr, data, len);
-  upb_encode_advance(e, len);
-  return true;
-}
-
-/* Finish the current run by adding the run totals to the segment and message
- * length. */
-static void upb_encode_accumulate(upb_encstate *e) {
-  size_t run_len;
-  UPB_ASSERT(e->ptr >= e->runbegin);
-  run_len = e->ptr - e->runbegin;
-  e->segptr->seglen += run_len;
-  upb_encode_top(e)->msglen += run_len;
-  e->runbegin = e->ptr;
-}
-
-/* Call to indicate the start of delimited region for which the full length is
- * not yet known.  The length will be inserted at the current position once it
- * is known (and subsequent data moved if necessary). */
-static bool upb_encode_startdelim(upb_encstate *e) {
-  if (e->top) {
-    /* We are already buffering, advance to the next segment and push it on the
-     * stack. */
-    upb_encode_accumulate(e);
-
-    if (++e->top == e->stacklimit) {
-      /* TODO(haberman): grow stack? */
-      return false;
-    }
-
-    if (++e->segptr == e->seglimit) {
-      /* Grow segment buffer. */
-      size_t old_size =
-          (e->seglimit - e->segbuf) * sizeof(upb_segment);
-      size_t new_size = old_size * 2;
-      upb_segment *new_buf =
-          upb_env_realloc(e->env, e->segbuf, old_size, new_size);
-
-      if (new_buf == NULL) {
-        return false;
-      }
-
-      e->segptr = new_buf + (e->segptr - e->segbuf);
-      e->seglimit = new_buf + (new_size / sizeof(upb_segment));
-      e->segbuf = new_buf;
-    }
-  } else {
-    /* We were previously at the top level, start buffering. */
-    e->segptr = e->segbuf;
-    e->top = e->stack;
-    e->runbegin = e->ptr;
-  }
-
-  *e->top = e->segptr - e->segbuf;
-  e->segptr->seglen = 0;
-  e->segptr->msglen = 0;
-
   return true;
 }
 
-/* Call to indicate the end of a delimited region.  We now know the length of
- * the delimited region.  If we are not nested inside any other delimited
- * regions, we can now emit all of the buffered data we accumulated. */
-static bool upb_encode_enddelim(upb_encstate *e) {
-  size_t msglen;
-  upb_encode_accumulate(e);
-  msglen = upb_encode_top(e)->msglen;
-
-  if (e->top == e->stack) {
-    /* All lengths are now available, emit all buffered data. */
-    char buf[UPB_PB_VARINT_MAX_LEN];
-    upb_segment *s;
-    const char *ptr = e->buf;
-    for (s = e->segbuf; s <= e->segptr; s++) {
-      size_t lenbytes = upb_encode_varint(s->msglen, buf);
-      //putbuf(e, buf, lenbytes);
-      //putbuf(e, ptr, s->seglen);
-      ptr += s->seglen;
-    }
-
-    e->ptr = e->buf;
-    e->top = NULL;
-  } else {
-    /* Need to keep buffering; propagate length info into enclosing
-     * submessages. */
-    --e->top;
-    upb_encode_top(e)->msglen += msglen + upb_varint_size(msglen);
-  }
-
-  return true;
-}
-
-/* encoding of wire types *****************************************************/
-
 static bool upb_put_fixed64(upb_encstate *e, uint64_t val) {
   /* TODO(haberman): byte-swap for big endian. */
   return upb_put_bytes(e, &val, sizeof(uint64_t));
@@ -262,11 +109,13 @@ static bool upb_put_fixed32(upb_encstate *e, uint32_t val) {
 }
 
 static bool upb_put_varint(upb_encstate *e, uint64_t val) {
-  if (!upb_encode_reserve(e, UPB_PB_VARINT_MAX_LEN)) {
-    return false;
-  }
-
-  upb_encode_advance(e, upb_encode_varint(val, e->ptr));
+  size_t len;
+  char *start;
+  CHK(upb_encode_reserve(e, UPB_PB_VARINT_MAX_LEN));
+  len = upb_encode_varint(val, e->ptr);
+  start = e->ptr + UPB_PB_VARINT_MAX_LEN - len;
+  memmove(start, e->ptr, len);
+  e->ptr = start;
   return true;
 }
 
@@ -304,11 +153,12 @@ static bool upb_put_tag(upb_encstate *e, int field_number, int wire_type) {
 static bool upb_put_fixedarray(upb_encstate *e, const upb_array *arr,
                                size_t size) {
   size_t bytes = arr->len * size;
-  return upb_put_varint(e, bytes) && upb_put_bytes(e, arr->data, bytes);
+  return upb_put_bytes(e, arr->data, bytes) && upb_put_varint(e, bytes);
 }
 
 bool upb_encode_message(upb_encstate *e, const char *msg,
-                        const upb_msglayout_msginit_v1 *m);
+                        const upb_msglayout_msginit_v1 *m,
+                        size_t *size);
 
 static bool upb_encode_array(upb_encstate *e, const char *field_mem,
                              const upb_msglayout_msginit_v1 *m,
@@ -319,146 +169,161 @@ static bool upb_encode_array(upb_encstate *e, const char *field_mem,
     return true;
   }
 
-  /* We encode all primitive arrays as packed, regardless of what was specified
-   * in the .proto file.  Could special case 1-sized arrays. */
-  if (!upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED)) {
-    return false;
-  }
-
-#define VARINT_CASE(ctype, encode) { \
-  uint64_t *data = arr->data; \
-  uint64_t *limit = data + arr->len; \
-  if (!upb_encode_startdelim(e)) { \
-    return false; \
-  } \
-  for (; data < limit; data++) { \
-    if (!upb_put_varint(e, encode)) { \
-      return false; \
-    } \
-  } \
-  return upb_encode_enddelim(e); \
-}
+#define VARINT_CASE(ctype, encode) do { \
+  uint64_t *start = arr->data; \
+  uint64_t *ptr = start + arr->len; \
+  char *buf_ptr = e->ptr; \
+  do { \
+    ptr--; \
+    CHK(upb_put_varint(e, encode)); \
+  } while (ptr != start); \
+  CHK(upb_put_varint(e, buf_ptr - e->ptr)); \
+  break; \
+} while(0)
 
   switch (f->type) {
     case UPB_DESCRIPTOR_TYPE_DOUBLE:
-      return upb_put_fixedarray(e, arr, sizeof(double));
+      CHK(upb_put_fixedarray(e, arr, sizeof(double)));
+      break;
     case UPB_DESCRIPTOR_TYPE_FLOAT:
-      return upb_put_fixedarray(e, arr, sizeof(float));
+      CHK(upb_put_fixedarray(e, arr, sizeof(float)));
+      break;
     case UPB_DESCRIPTOR_TYPE_SFIXED64:
     case UPB_DESCRIPTOR_TYPE_FIXED64:
-      return upb_put_fixedarray(e, arr, sizeof(uint64_t));
+      CHK(upb_put_fixedarray(e, arr, sizeof(uint64_t)));
+      break;
     case UPB_DESCRIPTOR_TYPE_FIXED32:
     case UPB_DESCRIPTOR_TYPE_SFIXED32:
-      return upb_put_fixedarray(e, arr, sizeof(uint32_t));
+      CHK(upb_put_fixedarray(e, arr, sizeof(uint32_t)));
+      break;
     case UPB_DESCRIPTOR_TYPE_INT64:
     case UPB_DESCRIPTOR_TYPE_UINT64:
-      VARINT_CASE(uint64_t, *data);
+      VARINT_CASE(uint64_t, *ptr);
     case UPB_DESCRIPTOR_TYPE_UINT32:
     case UPB_DESCRIPTOR_TYPE_INT32:
     case UPB_DESCRIPTOR_TYPE_ENUM:
-      VARINT_CASE(uint32_t, *data);
+      VARINT_CASE(uint32_t, *ptr);
     case UPB_DESCRIPTOR_TYPE_BOOL:
-      VARINT_CASE(bool, *data);
+      VARINT_CASE(bool, *ptr);
     case UPB_DESCRIPTOR_TYPE_SINT32:
-      VARINT_CASE(int32_t, upb_zzenc_32(*data));
+      VARINT_CASE(int32_t, upb_zzenc_32(*ptr));
     case UPB_DESCRIPTOR_TYPE_SINT64:
-      VARINT_CASE(int64_t, upb_zzenc_64(*data));
+      VARINT_CASE(int64_t, upb_zzenc_64(*ptr));
     case UPB_DESCRIPTOR_TYPE_STRING:
     case UPB_DESCRIPTOR_TYPE_BYTES: {
-      upb_stringview *data = arr->data;
-      upb_stringview *limit = data + arr->len;
-      goto put_string_data;  /* Skip first tag, we already put it. */
-      for (; data < limit; data++) {
-        if (!upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED)) {
-          return false;
-        }
-put_string_data:
-        if (!upb_put_varint(e, data->size) ||
-            !upb_put_bytes(e, data->data, data->size)) {
-          return false;
-        }
-      }
+      upb_stringview *start = arr->data;
+      upb_stringview *ptr = start + arr->len;
+      do {
+        ptr--;
+        CHK(upb_put_bytes(e, ptr->data, ptr->size) &&
+            upb_put_varint(e, ptr->size) &&
+            upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED));
+      } while (ptr != start);
+      return true;
+    }
+    case UPB_DESCRIPTOR_TYPE_GROUP: {
+      void **start = arr->data;
+      void **ptr = start + arr->len;
+      const upb_msglayout_msginit_v1 *subm = m->submsgs[f->submsg_index];
+      do {
+        size_t size;
+        ptr--;
+        CHK(upb_put_tag(e, f->number, UPB_WIRE_TYPE_END_GROUP) &&
+            upb_encode_message(e, *ptr, subm, &size) &&
+            upb_put_tag(e, f->number, UPB_WIRE_TYPE_START_GROUP));
+      } while (ptr != start);
+      return true;
     }
-    case UPB_DESCRIPTOR_TYPE_GROUP:
     case UPB_DESCRIPTOR_TYPE_MESSAGE: {
-      void **data = arr->data;
-      void **limit = data + arr->len;
+      void **start = arr->data;
+      void **ptr = start + arr->len;
       const upb_msglayout_msginit_v1 *subm = m->submsgs[f->submsg_index];
-      goto put_submsg_data;  /* Skip first tag, we already put it. */
-      for (; data < limit; data++) {
-        if (!upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED)) {
-          return false;
-        }
-put_submsg_data:
-        if (!upb_encode_startdelim(e) ||
-            !upb_encode_message(e, *data, subm) ||
-            !upb_encode_enddelim(e)) {
-          return false;
-        }
-      }
+      do {
+        size_t size;
+        ptr--;
+        CHK(upb_encode_message(e, *ptr, subm, &size) &&
+            upb_put_varint(e, size) &&
+            upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED));
+      } while (ptr != start);
+      return true;
     }
   }
-  UPB_UNREACHABLE();
 #undef VARINT_CASE
+
+  /* We encode all primitive arrays as packed, regardless of what was specified
+   * in the .proto file.  Could special case 1-sized arrays. */
+  CHK(upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED));
+  return true;
 }
 
 static bool upb_encode_scalarfield(upb_encstate *e, const char *field_mem,
                                    const upb_msglayout_msginit_v1 *m,
                                    const upb_msglayout_fieldinit_v1 *f,
                                    bool is_proto3) {
-#define CASE(ctype, type, wire_type, encodeval) { \
+#define CASE(ctype, type, wire_type, encodeval) do { \
   ctype val = *(ctype*)field_mem; \
   if (is_proto3 && val == 0) { \
     return true; \
   } \
-  return upb_put_tag(e, f->number, wire_type) && \
-      upb_put_ ## type(e, encodeval); \
-}
+  return upb_put_ ## type(e, encodeval) && \
+      upb_put_tag(e, f->number, wire_type); \
+} while(0)
 
   switch (f->type) {
     case UPB_DESCRIPTOR_TYPE_DOUBLE:
-      CASE(double, double, UPB_WIRE_TYPE_64BIT, val)
+      CASE(double, double, UPB_WIRE_TYPE_64BIT, val);
     case UPB_DESCRIPTOR_TYPE_FLOAT:
-      CASE(float, float, UPB_WIRE_TYPE_32BIT, val)
+      CASE(float, float, UPB_WIRE_TYPE_32BIT, val);
     case UPB_DESCRIPTOR_TYPE_INT64:
     case UPB_DESCRIPTOR_TYPE_UINT64:
-      CASE(uint64_t, varint, UPB_WIRE_TYPE_VARINT, val)
+      CASE(uint64_t, varint, UPB_WIRE_TYPE_VARINT, val);
     case UPB_DESCRIPTOR_TYPE_UINT32:
     case UPB_DESCRIPTOR_TYPE_INT32:
     case UPB_DESCRIPTOR_TYPE_ENUM:
-      CASE(uint32_t, varint, UPB_WIRE_TYPE_VARINT, val)
+      CASE(uint32_t, varint, UPB_WIRE_TYPE_VARINT, val);
     case UPB_DESCRIPTOR_TYPE_SFIXED64:
     case UPB_DESCRIPTOR_TYPE_FIXED64:
-      CASE(uint64_t, fixed64, UPB_WIRE_TYPE_64BIT, val)
+      CASE(uint64_t, fixed64, UPB_WIRE_TYPE_64BIT, val);
     case UPB_DESCRIPTOR_TYPE_FIXED32:
     case UPB_DESCRIPTOR_TYPE_SFIXED32:
-      CASE(uint32_t, fixed32, UPB_WIRE_TYPE_32BIT, val)
+      CASE(uint32_t, fixed32, UPB_WIRE_TYPE_32BIT, val);
     case UPB_DESCRIPTOR_TYPE_BOOL:
-      CASE(bool, varint, UPB_WIRE_TYPE_VARINT, val)
+      CASE(bool, varint, UPB_WIRE_TYPE_VARINT, val);
     case UPB_DESCRIPTOR_TYPE_SINT32:
-      CASE(int32_t, varint, UPB_WIRE_TYPE_VARINT, upb_zzenc_32(val))
+      CASE(int32_t, varint, UPB_WIRE_TYPE_VARINT, upb_zzenc_32(val));
     case UPB_DESCRIPTOR_TYPE_SINT64:
-      CASE(int64_t, varint, UPB_WIRE_TYPE_VARINT, upb_zzenc_64(val))
+      CASE(int64_t, varint, UPB_WIRE_TYPE_VARINT, upb_zzenc_64(val));
     case UPB_DESCRIPTOR_TYPE_STRING:
     case UPB_DESCRIPTOR_TYPE_BYTES: {
       upb_stringview view = *(upb_stringview*)field_mem;
       if (is_proto3 && view.size == 0) {
         return true;
       }
-      return upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED) &&
+      return upb_put_bytes(e, view.data, view.size) &&
           upb_put_varint(e, view.size) &&
-          upb_put_bytes(e, view.data, view.size);
+          upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED);
+    }
+    case UPB_DESCRIPTOR_TYPE_GROUP: {
+      size_t size;
+      void *submsg = *(void**)field_mem;
+      const upb_msglayout_msginit_v1 *subm = m->submsgs[f->submsg_index];
+      if (is_proto3 && submsg == NULL) {
+        return true;
+      }
+      return upb_put_tag(e, f->number, UPB_WIRE_TYPE_END_GROUP) &&
+          upb_encode_message(e, submsg, subm, &size) &&
+          upb_put_tag(e, f->number, UPB_WIRE_TYPE_START_GROUP);
     }
-    case UPB_DESCRIPTOR_TYPE_GROUP:
     case UPB_DESCRIPTOR_TYPE_MESSAGE: {
+      size_t size;
       void *submsg = *(void**)field_mem;
+      const upb_msglayout_msginit_v1 *subm = m->submsgs[f->submsg_index];
       if (is_proto3 && submsg == NULL) {
         return true;
       }
-      return upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED) &&
-          upb_encode_startdelim(e) &&
-          upb_encode_message(e, submsg, m->submsgs[f->submsg_index]) &&
-          upb_encode_enddelim(e);
+      return upb_encode_message(e, submsg, subm, &size) &&
+          upb_put_varint(e, size) &&
+          upb_put_tag(e, f->number, UPB_WIRE_TYPE_DELIMITED);
     }
   }
 #undef CASE
@@ -479,34 +344,38 @@ bool upb_encode_hasscalarfield(const char *msg,
 }
 
 bool upb_encode_message(upb_encstate* e, const char *msg,
-                        const upb_msglayout_msginit_v1 *m) {
+                        const upb_msglayout_msginit_v1 *m,
+                        size_t *size) {
   int i;
-  for (i = 0; i < m->field_count; i++) {
+  char *buf_end = e->ptr;
+  for (i = m->field_count - 1; i >= 0; i--) {
     const upb_msglayout_fieldinit_v1 *f = &m->fields[i];
 
     if (f->label == UPB_LABEL_REPEATED) {
-      if (!upb_encode_array(e, msg, m, f)) {
-        return NULL;
-      }
+      CHK(upb_encode_array(e, msg, m, f));
     } else {
-      if (upb_encode_hasscalarfield(msg, m, f) &&
-          !upb_encode_scalarfield(e, msg + f->offset, m, f, !m->is_proto2)) {
-        return NULL;
+      if (upb_encode_hasscalarfield(msg, m, f)) {
+        CHK(upb_encode_scalarfield(e, msg + f->offset, m, f, !m->is_proto2));
       }
     }
   }
 
+  *size = buf_end - e->ptr;
   return true;
 }
 
 char *upb_encode(const void *msg, const upb_msglayout_msginit_v1 *m,
                  upb_env *env, size_t *size) {
   upb_encstate e;
+  e.env = env;
+  e.buf = NULL;
+  e.limit = NULL;
+  e.ptr = NULL;
 
-  if (!upb_encode_message(&e, msg, m)) {
+  if (!upb_encode_message(&e, msg, m, size)) {
     return false;
   }
 
-  *size = e.ptr - e.buf;
-  return e.buf;
+  *size = e.limit - e.ptr;
+  return e.ptr;
 }
-- 
cgit v1.2.3