From ee84a7da167d2211066c4a663d41febdf9544438 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Thu, 10 Feb 2011 23:37:47 -0800 Subject: Add (but do not activate) an SSE varint decoder. --- stream/upb_decoder.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++------ tests/tests.c | 8 +++++--- 2 files changed, 55 insertions(+), 9 deletions(-) diff --git a/stream/upb_decoder.c b/stream/upb_decoder.c index 7da8993..4a43c4b 100644 --- a/stream/upb_decoder.c +++ b/stream/upb_decoder.c @@ -16,8 +16,47 @@ // The key fast-path varint-decoding routine. Here we can assume we have at // least UPB_MAX_VARINT_ENCODED_SIZE bytes available. There are a lot of // possibilities for optimization/experimentation here. -INLINE bool upb_decode_varint_fast(const char **ptr, uint64_t *val, - upb_status *status) { + +#ifdef USE_SSE_VARINT_DECODING +#include <emmintrin.h> + +// This works, but is empirically slower than the branchy version below. Why? +// Most varints are very short. Next step: use branches for 1/2-byte varints, +// but use the SSE version for 3-10 byte varints. 
+INLINE bool upb_decode_varint_fast(const char **ptr, uint64_t *val, upb_status *s) { + const char *p = *ptr; + __m128i val128 = _mm_loadu_si128((void*)p); + unsigned int continuation_bits = _mm_movemask_epi8(val128); + unsigned int bsr_val = ~continuation_bits; + int varint_length = __builtin_ffs(bsr_val); + if (varint_length > 10) { + upb_seterr(s, UPB_ERROR, "Unterminated varint"); + return false; + } + + uint16_t twob; + memcpy(&twob, p, 2); + twob &= 0x7f7f; + twob = ((twob & 0xff00) >> 1) | (twob & 0xff); + + uint64_t eightb; + memcpy(&eightb, p + 2, 8); + eightb &= 0x7f7f7f7f7f7f7f7f; + eightb = ((eightb & 0xff00ff00ff00ff00) >> 1) | (eightb & 0x00ff00ff00ff00ff); + eightb = ((eightb & 0xffff0000ffff0000) >> 2) | (eightb & 0x0000ffff0000ffff); + eightb = ((eightb & 0xffffffff00000000) >> 4) | (eightb & 0x00000000ffffffff); + + uint64_t all_bits = twob | (eightb << 14); + int varint_bits = varint_length * 7; + uint64_t mask = varint_bits == 70 ? (uint64_t)-1 : (1ULL << (varint_bits)) - 1; + *val = all_bits & mask; + *ptr = p + varint_length; + return true; +} + +#else + +INLINE bool upb_decode_varint_fast(const char **ptr, uint64_t *val, upb_status *s) { const char *p = *ptr; uint32_t low, high = 0; uint32_t b; @@ -33,14 +72,17 @@ INLINE bool upb_decode_varint_fast(const char **ptr, uint64_t *val, b = *(p++); high |= (b & 0x7f) << 24; if(!(b & 0x80)) goto done; b = *(p++); high |= (b & 0x7f) << 31; if(!(b & 0x80)) goto done; - upb_seterr(status, UPB_ERROR, "Unterminated varint"); + upb_seterr(s, UPB_ERROR, "Unterminated varint"); return false; + done: - *ptr = p; *val = ((uint64_t)high << 32) | low; + *ptr = p; return true; } +#endif + /* Decoding/Buffering of individual values ************************************/ @@ -163,7 +205,7 @@ done: } INLINE bool upb_decode_varint(upb_decoder *d, upb_dstate *s, upb_value *val) { - if (s->len >= UPB_MAX_VARINT_ENCODED_SIZE) { + if (s->len >= 16) { // Common (fast) case. 
 uint64_t val64; const char *p = s->ptr; @@ -315,7 +357,9 @@ void upb_decoder_run(upb_src *src, upb_status *status) { CHECK_FLOW(upb_dispatch_unknownval(&d->dispatcher, tag.field_number, val)); } else if (!upb_check_type(tag.wire_type, f->type)) { // TODO: put more details in this error msg. - upb_seterr(status, UPB_ERROR, "Field had incorrect type."); + upb_seterr(status, UPB_ERROR, "Field had incorrect type, name: " UPB_STRFMT, UPB_STRARG(f->name)); + upb_printerr(status); + *(int*)0 = 0; goto err; } diff --git a/tests/tests.c b/tests/tests.c index 17e00f3..c691b18 100644 --- a/tests/tests.c +++ b/tests/tests.c @@ -17,17 +17,18 @@ static void test_get_v_uint64_t() { #define TEST(name, bytes, val) {\ upb_status status = UPB_STATUS_INIT; \ - const char name[] = bytes; \ + const char name[] = bytes "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" ; \ const char *name ## _buf = name; \ uint64_t name ## _val = 0; \ upb_decode_varint_fast(&name ## _buf, &name ## _val, &status); \ ASSERT(upb_ok(&status)); \ ASSERT(name ## _val == val); \ - ASSERT(name ## _buf == name + sizeof(name) - 1); /* - 1 for NULL */ \ + ASSERT(name ## _buf == name + sizeof(name) - 16); /* - 16 for the 15 padding bytes and the NUL */ \ } TEST(zero, "\x00", 0ULL); TEST(one, "\x01", 1ULL); + TEST(twob, "\x81\x14", 0xa01ULL); TEST(twob, "\x81\x03", 0x181ULL); TEST(threeb, "\x81\x83\x07", 0x1c181ULL); TEST(fourb, "\x81\x83\x87\x0f", 0x1e1c181ULL); @@ -39,7 +40,7 @@ static void test_get_v_uint64_t() TEST(tenb, "\x81\x83\x87\x8f\x9f\xbf\xff\x81\x83\x07", 0x8303fdf9f1e1c181ULL); #undef TEST - char twelvebyte[] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, 0x01}; + char twelvebyte[16] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, 0x01}; const char *twelvebyte_buf = twelvebyte; uint64_t twelvebyte_val = 0; upb_status status = UPB_STATUS_INIT; @@ -214,6 +215,7 @@ static void test_upb_symtab() { } upb_status status = UPB_STATUS_INIT; upb_parsedesc(s, descriptor, &status); + 
upb_printerr(&status); ASSERT(upb_ok(&status)); upb_status_uninit(&status); upb_string_unref(descriptor); -- cgit v1.2.3