5 files changed, 169 insertions, 74 deletions
diff --git a/Makefile b/Makefile
index 0b0d9da..2a1e5b9 100644
--- a/Makefile
+++ b/Makefile
@@ -16,6 +16,7 @@ clean:
 src/libupb.a: $(OBJ)
 	ar rcs src/libupb.a $(OBJ)
 tests/test_table: src/libupb.a
+tests/tests: src/libupb.a
 tools/upbc: src/libupb.a
 benchmark/benchmark: src/libupb.a benchmark/google_messages.pb.h benchmark/google_messages.pb.o benchmark/benchmark.o
 	$(CXX) $(CPPFLAGS) -o benchmark/benchmark benchmark/google_messages.pb.o benchmark/benchmark.cc src/libupb.a -lm -lprotobuf -lpthread
diff --git a/README b/README
index 10e905a..e869667 100644
--- a/README
+++ b/README
@@ -4,6 +4,32 @@ upb - a minimalist implementation of protocol buffers.
 - For API documentation, see the header files.
 - To build type "make".
 
+
+ROADMAP OF THE SOURCE
+=====================
+
+benchmark/
+  Benchmarks of upb and other protocol buffer implementations.
+descriptor/
+  Files that describe the format of Protocol Buffer "descriptors", which are
+  protocol buffers that describe the format of other protocol buffers.  These
+  are used extensively inside upb.
+labs/
+  Code that is not part of upb, but contains efficiency-related experiments
+  about alternate ways of implementing things.  When possible, these are
+  benchmarked by the tests in benchmark/.  We also test these with the tests
+  in tests/, to ensure that the alternate implementations are actually correct.
+src/
+  The core source directory.  builds into src/libupb.a.
+tests/
+  Unit tests.
+tools/
+  Command-line tools like the upb compiler.
+
+
+CONTACT
+=======
+
 Author: Joshua Haberman (joshua@reverberate.org, haberman@google.com)
 See LICENSE for copyright information.
 
diff --git a/src/upb_parse.c b/src/upb_parse.c
index a8fa3a6..1411bbd 100644
--- a/src/upb_parse.c
+++ b/src/upb_parse.c
@@ -35,50 +35,21 @@ struct upb_type_info upb_type_info[] = {
 
 /* This is called by the inline version of the function if the varint turns out
  * to be >= 2 bytes. */
-upb_status_t upb_get_v_uint64_t_full(uint8_t *restrict buf, uint8_t *end,
-                                     uint64_t *restrict val,
+upb_status_t upb_get_v_uint64_t_full(uint8_t *buf, uint8_t *end, uint64_t *val,
                                      uint8_t **outbuf)
 {
+  uint8_t *const *maxend = buf + 10;
   uint8_t last = 0x80;
   *val = 0;
-  for(int bitpos = 0; buf < (uint8_t*)end && (last & 0x80); buf++, bitpos += 7)
+  int bitpos;
+  for(bitpos = 0; buf < (uint8_t*)end && (last & 0x80); buf++, bitpos += 7)
     *val |= ((uint64_t)((last = *buf) & 0x7F)) << bitpos;
-  if(last & 0x80) return UPB_STATUS_NEED_MORE_DATA;
+  if(buf >= end && buf <= maxend && (last & 0x80)) return UPB_STATUS_NEED_MORE_DATA;
+  if(buf > maxend) return UPB_ERROR_UNTERMINATED_VARINT;
   *outbuf = buf;
   return UPB_STATUS_OK;
 }
 
-static upb_status_t skip_v_uint64_t(uint8_t *buf, uint8_t *end, uint8_t **outbuf)
-{
-  /* TODO: fix and optimize. */
-  uint8_t last = 0x80;
-  for(; buf < end && (last & 0x80); buf++) {
-    last = *buf;
-  }
-
-  if(last & 0x80) {
-    return UPB_ERROR_UNTERMINATED_VARINT;
-  }
-  *outbuf = buf;
-  return UPB_STATUS_OK;
-}
-
-static upb_status_t skip_f_uint32_t(uint8_t *buf, uint8_t *end, uint8_t **outbuf)
-{
-  uint8_t *uint32_end = buf + sizeof(uint32_t);
-  if(uint32_end > end) return UPB_STATUS_NEED_MORE_DATA;
-  *outbuf = uint32_end;
-  return UPB_STATUS_OK;
-}
-
-static upb_status_t skip_f_uint64_t(uint8_t *buf, uint8_t *end, uint8_t **outbuf)
-{
-  uint8_t *uint64_end = buf + sizeof(uint64_t);
-  if(uint64_end > end) return UPB_STATUS_NEED_MORE_DATA;
-  *outbuf = uint64_end;
-  return UPB_STATUS_OK;
-}
-
 upb_status_t upb_parse_wire_value(uint8_t *buf, uint8_t *end, upb_wire_type_t wt,
                                   union upb_wire_value *wv, uint8_t **outbuf)
 {
@@ -94,9 +65,9 @@ static upb_status_t skip_wire_value(uint8_t *buf, uint8_t *end, upb_wire_type_t
                                     uint8_t **outbuf)
 {
   switch(wt) {
-    case UPB_WIRE_TYPE_VARINT: return skip_v_uint64_t(buf, end, outbuf);
-    case UPB_WIRE_TYPE_64BIT:  return skip_f_uint64_t(buf, end, outbuf);
-    case UPB_WIRE_TYPE_32BIT:  return skip_f_uint32_t(buf, end, outbuf);
+    case UPB_WIRE_TYPE_VARINT: return upb_skip_v_uint64_t(buf, end, outbuf);
+    case UPB_WIRE_TYPE_64BIT:  return upb_skip_f_uint64_t(buf, end, outbuf);
+    case UPB_WIRE_TYPE_32BIT:  return upb_skip_f_uint32_t(buf, end, outbuf);
     case UPB_WIRE_TYPE_START_GROUP: /* TODO: skip to matching end group. */
     case UPB_WIRE_TYPE_END_GROUP: return UPB_STATUS_OK;
     default: return UPB_ERROR_ILLEGAL;
diff --git a/src/upb_parse.h b/src/upb_parse.h
index e727c11..f0ec5e2 100644
--- a/src/upb_parse.h
+++ b/src/upb_parse.h
@@ -208,6 +208,38 @@ INLINE upb_status_t upb_get_f_uint64_t(uint8_t *buf, uint8_t *end,
   return UPB_STATUS_OK;
 }
 
+INLINE upb_status_t upb_skip_v_uint64_t(uint8_t *buf, uint8_t *end,
+                                        uint8_t **outbuf)
+{
+  uint8_t *const maxend = buf + 10;
+  uint8_t last = 0x80;
+  for(; buf < (uint8_t*)end && (last & 0x80); buf++)
+    last = *buf;
+  if(buf >= end && buf <= maxend && (last & 0x80)) return UPB_STATUS_NEED_MORE_DATA;
+  if(buf > maxend) return UPB_ERROR_UNTERMINATED_VARINT;
+  *outbuf = buf;
+  return UPB_STATUS_OK;
+}
+
+INLINE upb_status_t upb_skip_f_uint32_t(uint8_t *buf, uint8_t *end, uint8_t
+                                        **outbuf)
+{
+  uint8_t *uint32_end = buf + sizeof(uint32_t);
+  if(uint32_end > end) return UPB_STATUS_NEED_MORE_DATA;
+  *outbuf = uint32_end;
+  return UPB_STATUS_OK;
+}
+
+INLINE upb_status_t upb_skip_f_uint64_t(uint8_t *buf, uint8_t *end, uint8_t
+                                        **outbuf)
+{
+  uint8_t *uint64_end = buf + sizeof(uint64_t);
+  if(uint64_end > end) return UPB_STATUS_NEED_MORE_DATA;
+  *outbuf = uint64_end;
+  return UPB_STATUS_OK;
+}
+
+
 /* Functions to read .proto values. *******************************************/
 
 /* These functions read the appropriate wire value for a given .proto type
diff --git a/tests/tests.c b/tests/tests.c
index 497910f..2a769a9 100644
--- a/tests/tests.c
+++ b/tests/tests.c
@@ -3,14 +3,8 @@
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include "descriptor.c"
-#include "upb_context.c"
-#include "upb_enum.c"
-#include "upb_msg.c"
-#include "upb_parse.c"
-#include "upb_serialize.c"
-#include "upb_table.c"
-#include "upb_text.c"
+#include "upb_parse.h"
+#include "upb_context.h"
 
 int num_assertions = 0;
 #define ASSERT(expr) do { \
@@ -25,10 +19,15 @@ static void test_get_v_uint64_t()
     uint8_t name[] = bytes; \
     uint8_t *name ## _buf = name; \
     uint64_t name ## _val = 0; \
-    status = upb_get_v_uint64_t(name ## _buf, name + sizeof(name), &name ## _val, &name ## _buf); \
+    status = upb_get_v_uint64_t(name, name + sizeof(name) - 1, &name ## _val, &name ## _buf); \
     ASSERT(status == UPB_STATUS_OK); \
     ASSERT(name ## _val == val); \
     ASSERT(name ## _buf == name + sizeof(name) - 1);  /* - 1 for NULL */ \
+    /* Test NEED_MORE_DATA. */ \
+    if(sizeof(name) > 2) { \
+      status = upb_get_v_uint64_t(name, name + sizeof(name) - 2, &name ## _val, &name ## _buf); \
+      ASSERT(status == UPB_STATUS_NEED_MORE_DATA); \
+    } \
   }
 
   TEST(zero,   "\x00",                                                      0ULL);
@@ -42,18 +41,37 @@ static void test_get_v_uint64_t()
   TEST(eightb, "\x81\x83\x87\x8f\x9f\xbf\xff\x01",            0x3fdf9f1e1c181ULL);
   TEST(nineb,  "\x81\x83\x87\x8f\x9f\xbf\xff\x81\x03",      0x303fdf9f1e1c181ULL);
   TEST(tenb,   "\x81\x83\x87\x8f\x9f\xbf\xff\x81\x83\x07", 0x8303fdf9f1e1c181ULL);
+#undef TEST
 
-  uint8_t elevenbyte[] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01};
-  uint8_t *elevenbyte_buf = elevenbyte;
-  uint64_t elevenbyte_val = 0;
-  upb_status_t status = upb_get_v_uint64_t(elevenbyte_buf, elevenbyte + sizeof(elevenbyte), &elevenbyte_val, &elevenbyte_buf);
+  uint8_t twelvebyte[] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, 0x01};
+  uint8_t *twelvebyte_buf = twelvebyte;
+  uint64_t twelvebyte_val = 0;
+  upb_status_t status;
+  /* A varint that terminates before hitting the end of the provided buffer,
+   * but in too many bytes (11 instead of 10). */
+  status = upb_get_v_uint64_t(twelvebyte_buf, twelvebyte + 12, &twelvebyte_val, &twelvebyte_buf);
   ASSERT(status == UPB_ERROR_UNTERMINATED_VARINT);
-  status = upb_get_v_uint64_t(elevenbyte_buf, elevenbyte + sizeof(elevenbyte)-1, &elevenbyte_val, &elevenbyte_buf);
-  /* Byte 10 is 0x80, so we know it's unterminated. */
+
+  /* A varint that terminates simultaneously with the end of the provided
+   * buffer, but in too many bytes (11 instead of 10). */
+  status = upb_get_v_uint64_t(twelvebyte_buf, twelvebyte + 11, &twelvebyte_val, &twelvebyte_buf);
   ASSERT(status == UPB_ERROR_UNTERMINATED_VARINT);
-  status = upb_get_v_uint64_t(elevenbyte_buf, elevenbyte + sizeof(elevenbyte)-2, &elevenbyte_val, &elevenbyte_buf);
+
+  /* A varint whose buffer ends on exactly the byte where the varint must
+   * terminate, but the final byte does not terminate.  The absolutely most
+   * correct return code here is UPB_ERROR_UNTERMINATED_VARINT, because we know
+   * by this point that the varint does not properly terminate.  But we also
+   * allow a return value of UPB_STATUS_NEED_MORE_DATA here, because it does not
+   * compromise overall correctness -- clients who supply more data later will
+   * then receive a UPB_ERROR_UNTERMINATED_VARINT error; clients who have no
+   * more data to supply will (rightly) conclude that their protobuf is corrupt.
+   */
+  status = upb_get_v_uint64_t(twelvebyte_buf, twelvebyte + 10, &twelvebyte_val, &twelvebyte_buf);
+  ASSERT(status == UPB_ERROR_UNTERMINATED_VARINT ||
+         status == UPB_STATUS_NEED_MORE_DATA);
+
+  status = upb_get_v_uint64_t(twelvebyte_buf, twelvebyte + 9, &twelvebyte_val, &twelvebyte_buf);
   ASSERT(status == UPB_STATUS_NEED_MORE_DATA);
-#undef TEST
 }
 
 static void test_get_v_uint32_t()
@@ -63,10 +81,15 @@ static void test_get_v_uint32_t()
     uint8_t name[] = bytes; \
     uint8_t *name ## _buf = name; \
     uint32_t name ## _val = 0; \
-    status = upb_get_v_uint32_t(name ## _buf, name + sizeof(name), &name ## _val, &name ## _buf); \
+    status = upb_get_v_uint32_t(name, name + sizeof(name), &name ## _val, &name ## _buf); \
     ASSERT(status == UPB_STATUS_OK); \
     ASSERT(name ## _val == val); \
     ASSERT(name ## _buf == name + sizeof(name) - 1);  /* - 1 for NULL */ \
+    /* Test NEED_MORE_DATA. */ \
+    if(sizeof(name) > 2) { \
+      status = upb_get_v_uint32_t(name, name + sizeof(name) - 2, &name ## _val, &name ## _buf); \
+      ASSERT(status == UPB_STATUS_NEED_MORE_DATA); \
+    } \
   }
 
   TEST(zero,   "\x00",                                              0UL);
@@ -81,18 +104,37 @@ static void test_get_v_uint32_t()
   TEST(eightb, "\x81\x83\x87\x8f\x9f\xbf\xff\x01",         0xf1e1c181UL);
   TEST(nineb,  "\x81\x83\x87\x8f\x9f\xbf\xff\x81\x03",     0xf1e1c181UL);
   TEST(tenb,   "\x81\x83\x87\x8f\x9f\xbf\xff\x81\x83\x07", 0xf1e1c181UL);
+#undef TEST
 
-  uint8_t elevenbyte[] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01};
-  uint8_t *elevenbyte_buf = elevenbyte;
-  uint64_t elevenbyte_val = 0;
-  upb_status_t status = upb_get_v_uint64_t(elevenbyte_buf, elevenbyte + sizeof(elevenbyte), &elevenbyte_val, &elevenbyte_buf);
+  uint8_t twelvebyte[] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, 0x01};
+  uint8_t *twelvebyte_buf = twelvebyte;
+  uint32_t twelvebyte_val = 0;
+  upb_status_t status;
+  /* A varint that terminates before hitting the end of the provided buffer,
+   * but in too many bytes (11 instead of 10). */
+  status = upb_get_v_uint32_t(twelvebyte_buf, twelvebyte + 12, &twelvebyte_val, &twelvebyte_buf);
   ASSERT(status == UPB_ERROR_UNTERMINATED_VARINT);
-  status = upb_get_v_uint64_t(elevenbyte_buf, elevenbyte + sizeof(elevenbyte)-1, &elevenbyte_val, &elevenbyte_buf);
-  /* Byte 10 is 0x80, so we know it's unterminated. */
+
+  /* A varint that terminates simultaneously with the end of the provided
+   * buffer, but in too many bytes (11 instead of 10). */
+  status = upb_get_v_uint32_t(twelvebyte_buf, twelvebyte + 11, &twelvebyte_val, &twelvebyte_buf);
   ASSERT(status == UPB_ERROR_UNTERMINATED_VARINT);
-  status = upb_get_v_uint64_t(elevenbyte_buf, elevenbyte + sizeof(elevenbyte)-2, &elevenbyte_val, &elevenbyte_buf);
+
+  /* A varint whose buffer ends on exactly the byte where the varint must
+   * terminate, but the final byte does not terminate.  The absolutely most
+   * correct return code here is UPB_ERROR_UNTERMINATED_VARINT, because we know
+   * by this point that the varint does not properly terminate.  But we also
+   * allow a return value of UPB_STATUS_NEED_MORE_DATA here, because it does not
+   * compromise overall correctness -- clients who supply more data later will
+   * then receive a UPB_ERROR_UNTERMINATED_VARINT error; clients who have no
+   * more data to supply will (rightly) conclude that their protobuf is corrupt.
+   */
+  status = upb_get_v_uint32_t(twelvebyte_buf, twelvebyte + 10, &twelvebyte_val, &twelvebyte_buf);
+  ASSERT(status == UPB_ERROR_UNTERMINATED_VARINT ||
+         status == UPB_STATUS_NEED_MORE_DATA);
+
+  status = upb_get_v_uint32_t(twelvebyte_buf, twelvebyte + 9, &twelvebyte_val, &twelvebyte_buf);
   ASSERT(status == UPB_STATUS_NEED_MORE_DATA);
-#undef TEST
 }
 
 static void test_skip_v_uint64_t()
@@ -101,9 +143,14 @@ static void test_skip_v_uint64_t()
     upb_status_t status; \
     uint8_t name[] = bytes; \
     uint8_t *name ## _buf = name; \
-    status = skip_v_uint64_t(name ## _buf, name + sizeof(name), &name ## _buf); \
+    status = upb_skip_v_uint64_t(name ## _buf, name + sizeof(name), &name ## _buf); \
     ASSERT(status == UPB_STATUS_OK); \
     ASSERT(name ## _buf == name + sizeof(name) - 1);  /* - 1 for NULL */ \
+    /* Test NEED_MORE_DATA. */ \
+    if(sizeof(name) > 2) { \
+      status = upb_skip_v_uint64_t(name, name + sizeof(name) - 2, &name ## _buf); \
+      ASSERT(status == UPB_STATUS_NEED_MORE_DATA); \
+    } \
   }
 
   TEST(zero,   "\x00");
@@ -117,18 +164,36 @@ static void test_skip_v_uint64_t()
   TEST(eightb, "\x81\x83\x87\x8f\x9f\xbf\xff\x01");
   TEST(nineb,  "\x81\x83\x87\x8f\x9f\xbf\xff\x81\x03");
   TEST(tenb,   "\x81\x83\x87\x8f\x9f\xbf\xff\x81\x83\x07");
+#undef TEST
 
-  uint8_t elevenbyte[] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01};
-  uint8_t *elevenbyte_buf = elevenbyte;
-  upb_status_t status = skip_v_uint64_t(elevenbyte_buf, elevenbyte + sizeof(elevenbyte), &elevenbyte_buf);
-  printf("%d\n", status);
+  uint8_t twelvebyte[] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, 0x01};
+  uint8_t *twelvebyte_buf = twelvebyte;
+  upb_status_t status;
+  /* A varint that terminates before hitting the end of the provided buffer,
+   * but in too many bytes (11 instead of 10). */
+  status = upb_skip_v_uint64_t(twelvebyte_buf, twelvebyte + 12, &twelvebyte_buf);
   ASSERT(status == UPB_ERROR_UNTERMINATED_VARINT);
-  status = skip_v_uint64_t(elevenbyte_buf, elevenbyte + sizeof(elevenbyte)-1, &elevenbyte_buf);
-  /* Byte 10 is 0x80, so we know it's unterminated. */
+
+  /* A varint that terminates simultaneously with the end of the provided
+   * buffer, but in too many bytes (11 instead of 10). */
+  status = upb_skip_v_uint64_t(twelvebyte_buf, twelvebyte + 11, &twelvebyte_buf);
   ASSERT(status == UPB_ERROR_UNTERMINATED_VARINT);
-  status = skip_v_uint64_t(elevenbyte_buf, elevenbyte + sizeof(elevenbyte)-2, &elevenbyte_buf);
+
+  /* A varint whose buffer ends on exactly the byte where the varint must
+   * terminate, but the final byte does not terminate.  The absolutely most
+   * correct return code here is UPB_ERROR_UNTERMINATED_VARINT, because we know
+   * by this point that the varint does not properly terminate.  But we also
+   * allow a return value of UPB_STATUS_NEED_MORE_DATA here, because it does not
+   * compromise overall correctness -- clients who supply more data later will
+   * then receive a UPB_ERROR_UNTERMINATED_VARINT error; clients who have no
+   * more data to supply will (rightly) conclude that their protobuf is corrupt.
+   */
+  status = upb_skip_v_uint64_t(twelvebyte_buf, twelvebyte + 10, &twelvebyte_buf);
+  ASSERT(status == UPB_ERROR_UNTERMINATED_VARINT ||
+         status == UPB_STATUS_NEED_MORE_DATA);
+
+  status = upb_skip_v_uint64_t(twelvebyte_buf, twelvebyte + 9, &twelvebyte_buf);
   ASSERT(status == UPB_STATUS_NEED_MORE_DATA);
-#undef TEST
 }
 
 static void test_get_f_uint32_t()
@@ -150,7 +215,7 @@ static void test_get_f_uint32_t()
   uint8_t threeb[] = {0x00, 0x00, 0x00};
   uint8_t *threeb_buf = threeb;
   uint32_t threeb_val;
-  upb_status_t status = upb_get_f_uint32_t(threeb_buf, threeb + sizeof(threeb), &threeb_val, &threeb_buf);
+  upb_status_t status = upb_get_f_uint32_t(threeb, threeb + sizeof(threeb), &threeb_val, &threeb_buf);
   ASSERT(status == UPB_STATUS_NEED_MORE_DATA);
 
 #undef TEST