summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJoshua Haberman <joshua@reverberate.org>2011-03-28 16:03:22 -0700
committerJoshua Haberman <joshua@reverberate.org>2011-03-28 16:03:22 -0700
commit19517cc6f39871abf4a0705b49cfed9049ca6033 (patch)
tree1449952d59e93184abdd5505db6645d4de74cef0 /src
parent2c86e7eddb23c92cc83391c3f751c72237ec3759 (diff)
Switch to non-branching varint decoder.
Diffstat (limited to 'src')
-rw-r--r--src/upb_varint_decoder.h92
1 files changed, 59 insertions, 33 deletions
diff --git a/src/upb_varint_decoder.h b/src/upb_varint_decoder.h
index 6d88022..7297f43 100644
--- a/src/upb_varint_decoder.h
+++ b/src/upb_varint_decoder.h
@@ -76,43 +76,69 @@ done:
return r;
}
-#ifdef __SSE__
-
-#include <xmmintrin.h>
-
-// Avoids branches (this can very likely be improved). Requires SSE.
-INLINE upb_decoderet upb_decode_varint_nobranch(const char *p) {
- upb_decoderet r = {(void*)0, 0};
- __m128i val128 = _mm_loadu_si128((void*)p);
- unsigned int continuation_bits = _mm_movemask_epi8(val128);
- unsigned int bsr_val = ~continuation_bits;
- int varint_length = __builtin_ffs(bsr_val);
- if (varint_length > 10) return r;
-
- uint16_t twob;
- memcpy(&twob, p, 2);
- twob &= 0x7f7f;
- twob = ((twob & 0xff00) >> 1) | (twob & 0xff);
-
- uint64_t eightb;
- memcpy(&eightb, p + 2, 8);
- eightb &= 0x7f7f7f7f7f7f7f7f;
- eightb = ((eightb & 0xff00ff00ff00ff00) >> 1) | (eightb & 0x00ff00ff00ff00ff);
- eightb = ((eightb & 0xffff0000ffff0000) >> 2) | (eightb & 0x0000ffff0000ffff);
- eightb = ((eightb & 0xffffffff00000000) >> 4) | (eightb & 0x00000000ffffffff);
-
- uint64_t all_bits = twob | (eightb << 14);
- int varint_bits = varint_length * 7;
- uint64_t mask = varint_bits == 70 ? (uint64_t)-1 : (1ULL << (varint_bits)) - 1;
- r.val = all_bits & mask;
- r.p = p + varint_length;
+// Avoids branches for values >2-bytes.
+INLINE upb_decoderet upb_decode_varint_nobranch1(const char *p) {
+ uint64_t b = 0;
+ upb_decoderet r = {p, 0};
+ memcpy(&b, r.p, 2);
+ if ((b & 0x80) == 0) { r.val = (b & 0x7f); r.p = p + 1; return r; }
+ r.val = (b & 0x7f) | ((b & 0x7f00) >> 1);
+ r.p = p + 2;
+ if ((b & 0x8000) == 0) return r;
+
+ // >2-byte varint.
+ memcpy(&b, r.p, sizeof(b));
+ uint64_t cbits = b | 0x7f7f7f7f7f7f7f7fULL;
+ uint64_t stop_bit = ~cbits & (cbits+1);
+ b &= (stop_bit - 1);
+ b = ((b & 0x7f007f007f007f00) >> 1) | (b & 0x007f007f007f007f);
+ b = ((b & 0xffff0000ffff0000) >> 2) | (b & 0x0000ffff0000ffff);
+ b = ((b & 0xffffffff00000000) >> 4) | (b & 0x00000000ffffffff);
+ r.val |= b << 14;
+ r.p += (__builtin_ctzll(stop_bit) + 1) / 8;
+ if (stop_bit == 0) {
+ // Error: unterminated varint.
+ upb_decoderet err_r = {(void*)0, 0};
+ return err_r;
+ }
return r;
}
-#endif
+// Avoids branches for values >2-bytes.
+INLINE upb_decoderet upb_decode_varint_nobranch2(const char *p) {
+ uint64_t b = 0;
+ upb_decoderet r = {p, 0};
+ memcpy(&b, r.p, 2);
+ if ((b & 0x80) == 0) { r.val = (b & 0x7f); r.p = p + 1; return r; }
+ r.val = (b & 0x7f) | ((b & 0x7f00) >> 1);
+ r.p = p + 2;
+ if ((b & 0x8000) == 0) return r;
+
+ // >2-byte varint.
+ memcpy(&b, r.p, sizeof(b));
+ uint64_t cbits = b | 0x7f7f7f7f7f7f7f7fULL;
+ uint64_t stop_bit = ~cbits & (cbits + 1);
+ b = (b & 0x7f7f7f7f7f7f7f7fULL) & (stop_bit - 1);
+ b += b & 0x007f007f007f007fULL;
+ b += 3 * (b & 0x0000ffff0000ffffULL);
+ b += 15 * (b & 0x00000000ffffffffULL);
+ r.val |= b << 7;
+ r.p += (__builtin_ctzll(stop_bit) + 1) / 8;
+ if (stop_bit == 0) {
+ // Error: unterminated varint.
+ upb_decoderet err_r = {(void*)0, 0};
+ return err_r;
+ }
+ return r;
+}
-// For now, always use the branch32 decoder.
-#define upb_decode_varint_fast upb_decode_varint_branch32
+INLINE upb_decoderet upb_decode_varint_fast(const char *p) {
+ // Use nobranch2 on 64-bit, branch32 on 32-bit.
+ if (sizeof(long) == 8)
+ return upb_decode_varint_nobranch2(p);
+ else
+ return upb_decode_varint_branch32(p);
+}
#ifdef __cplusplus
} /* extern "C" */
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback