From ee84a7da167d2211066c4a663d41febdf9544438 Mon Sep 17 00:00:00 2001
From: Joshua Haberman <joshua@reverberate.org>
Date: Thu, 10 Feb 2011 23:37:47 -0800
Subject: Add (but do not activate) an SSE varint decoder.

---
 stream/upb_decoder.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++------
 tests/tests.c        |  8 +++++---
 2 files changed, 55 insertions(+), 9 deletions(-)

diff --git a/stream/upb_decoder.c b/stream/upb_decoder.c
index 7da8993..4a43c4b 100644
--- a/stream/upb_decoder.c
+++ b/stream/upb_decoder.c
@@ -16,8 +16,47 @@
 // The key fast-path varint-decoding routine.  Here we can assume we have at
 // least UPB_MAX_VARINT_ENCODED_SIZE bytes available.  There are a lot of
 // possibilities for optimization/experimentation here.
-INLINE bool upb_decode_varint_fast(const char **ptr, uint64_t *val,
-                                   upb_status *status) {
+
+#ifdef USE_SSE_VARINT_DECODING
+#include <emmintrin.h>
+
+// SSE2 varint decoder: does one unaligned 16-byte load at *ptr, so 16 bytes must
+// be readable.  Works, but is empirically slower than the branchy version below
+// (most varints are very short).  Next step: branch for 1/2-byte, SSE for 3-10.
+INLINE bool upb_decode_varint_fast(const char **ptr, uint64_t *val, upb_status *s) {
+  const char *p = *ptr;
+  __m128i val128 = _mm_loadu_si128((void*)p);  // Unaligned load of 16 bytes starting at p.
+  unsigned int continuation_bits = _mm_movemask_epi8(val128);  // Bit i = high (continuation) bit of byte i.
+  unsigned int bsr_val = ~continuation_bits;  // Bit i set = byte i ends a varint; bits 16+ are set too, so never zero.
+  int varint_length = __builtin_ffs(bsr_val);  // 1-based index of the first terminating byte == encoded length.
+  if (varint_length > 10) {  // More than 10 bytes cannot be a valid 64-bit varint.
+    upb_seterr(s, UPB_ERROR, "Unterminated varint");
+    return false;  // *ptr and *val have not been written on this path.
+  }
+
+  uint16_t twob;  // Bytes 0-1 of the encoding.
+  memcpy(&twob, p, 2);  // Byte 0 must land in the low 8 bits (little-endian layout).
+  twob &= 0x7f7f;  // Drop both continuation bits.
+  twob = ((twob & 0xff00) >> 1) | (twob & 0xff);  // Close the 1-bit gap: 14 contiguous payload bits.
+
+  uint64_t eightb;  // Bytes 2-9 of the encoding.
+  memcpy(&eightb, p + 2, 8);
+  eightb &= 0x7f7f7f7f7f7f7f7f;  // Drop all eight continuation bits.
+  eightb = ((eightb & 0xff00ff00ff00ff00) >> 1) | (eightb & 0x00ff00ff00ff00ff);  // 7+7 -> 14 bits per 16-bit lane.
+  eightb = ((eightb & 0xffff0000ffff0000) >> 2) | (eightb & 0x0000ffff0000ffff);  // 14+14 -> 28 bits per 32-bit lane.
+  eightb = ((eightb & 0xffffffff00000000) >> 4) | (eightb & 0x00000000ffffffff);  // 28+28 -> 56 contiguous bits.
+
+  uint64_t all_bits = twob | (eightb << 14);  // 14 + 56 payload bits; anything above bit 63 is shifted out.
+  int varint_bits = varint_length * 7;  // Payload bits that belong to this varint (7..70).
+  uint64_t mask = varint_bits == 70 ? (uint64_t)-1 : (1ULL << (varint_bits)) - 1;  // 70 would be an out-of-range shift, so use all ones.
+  *val = all_bits & mask;  // Discard payload bits taken from bytes past the end of the varint.
+  *ptr = p + varint_length;  // Advance the caller's cursor past the consumed bytes.
+  return true;
+}
+
+#else
+
+INLINE bool upb_decode_varint_fast(const char **ptr, uint64_t *val, upb_status *s) {
   const char *p = *ptr;
   uint32_t low, high = 0;
   uint32_t b;
@@ -33,14 +72,17 @@ INLINE bool upb_decode_varint_fast(const char **ptr, uint64_t *val,
   b = *(p++); high |= (b & 0x7f) << 24; if(!(b & 0x80)) goto done;
   b = *(p++); high |= (b & 0x7f) << 31; if(!(b & 0x80)) goto done;
 
-  upb_seterr(status, UPB_ERROR, "Unterminated varint");
+  upb_seterr(s, UPB_ERROR, "Unterminated varint");
   return false;
+
 done:
-  *ptr = p;
   *val = ((uint64_t)high << 32) | low;
+  *ptr = p;
   return true;
 }
 
+#endif
+
 
 /* Decoding/Buffering of individual values ************************************/
 
@@ -163,7 +205,7 @@ done:
 }
 
 INLINE bool upb_decode_varint(upb_decoder *d, upb_dstate *s, upb_value *val) {
-  if (s->len >= UPB_MAX_VARINT_ENCODED_SIZE) {
+  if (s->len >= 16) {
     // Common (fast) case.
     uint64_t val64;
     const char *p = s->ptr;
@@ -315,7 +357,9 @@ void upb_decoder_run(upb_src *src, upb_status *status) {
       CHECK_FLOW(upb_dispatch_unknownval(&d->dispatcher, tag.field_number, val));
     } else if (!upb_check_type(tag.wire_type, f->type)) {
       // TODO: put more details in this error msg.
-      upb_seterr(status, UPB_ERROR, "Field had incorrect type.");
+      upb_seterr(status, UPB_ERROR, "Field had incorrect type, name: " UPB_STRFMT, UPB_STRARG(f->name));
+      upb_printerr(status);
+      *(int*)0 = 0;
       goto err;
     }
 
diff --git a/tests/tests.c b/tests/tests.c
index 17e00f3..c691b18 100644
--- a/tests/tests.c
+++ b/tests/tests.c
@@ -17,17 +17,18 @@ static void test_get_v_uint64_t()
 {
 #define TEST(name, bytes, val) {\
     upb_status status = UPB_STATUS_INIT; \
-    const char name[] = bytes; \
+    const char name[] = bytes "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" ; \
     const char *name ## _buf = name; \
     uint64_t name ## _val = 0; \
     upb_decode_varint_fast(&name ## _buf, &name ## _val, &status); \
     ASSERT(upb_ok(&status)); \
     ASSERT(name ## _val == val); \
-    ASSERT(name ## _buf == name + sizeof(name) - 1);  /* - 1 for NULL */ \
+    ASSERT(name ## _buf == name + sizeof(name) - 16);  /* - 16: 0xff padding + NUL terminator */ \
   }
 
   TEST(zero,   "\x00",                                                      0ULL);
   TEST(one,    "\x01",                                                      1ULL);
+  TEST(twob,   "\x81\x14",                                              0xa01ULL);
   TEST(twob,   "\x81\x03",                                              0x181ULL);
   TEST(threeb, "\x81\x83\x07",                                        0x1c181ULL);
   TEST(fourb,  "\x81\x83\x87\x0f",                                  0x1e1c181ULL);
@@ -39,7 +40,7 @@ static void test_get_v_uint64_t()
   TEST(tenb,   "\x81\x83\x87\x8f\x9f\xbf\xff\x81\x83\x07", 0x8303fdf9f1e1c181ULL);
 #undef TEST
 
-  char twelvebyte[] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, 0x01};
+  char twelvebyte[16] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, 0x01};
   const char *twelvebyte_buf = twelvebyte;
   uint64_t twelvebyte_val = 0;
   upb_status status = UPB_STATUS_INIT;
@@ -214,6 +215,7 @@ static void test_upb_symtab() {
   }
   upb_status status = UPB_STATUS_INIT;
   upb_parsedesc(s, descriptor, &status);
+  upb_printerr(&status);
   ASSERT(upb_ok(&status));
   upb_status_uninit(&status);
   upb_string_unref(descriptor);
-- 
cgit v1.2.3