From 9eb4d695c49a85f7f72ad68c3c31affd61fef984 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Fri, 1 Apr 2011 15:40:06 -0700 Subject: First rough version of the JIT. It can successfully parse SpeedMessage1. Preliminary results: 750MB/s on Core2 2.4GHz. This number is 2.5x proto2. This isn't apples-to-apples, because proto2 is parsing to a struct and we are just doing stream parsing, but for apps that are currently using proto2, this is the improvement they would see if they could move to stream-based processing. Unfortunately perf-regression-test.py is broken, and I'm not 100% sure why. It would be nice to fix it first (to ensure that there are no performance regressions for the table-based decoder) but I'm really impatient to get the JIT checked in. --- src/upb_varint_decoder.h | 74 +++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 33 deletions(-) (limited to 'src/upb_varint_decoder.h') diff --git a/src/upb_varint_decoder.h b/src/upb_varint_decoder.h index 7297f43..d7af90a 100644 --- a/src/upb_varint_decoder.h +++ b/src/upb_varint_decoder.h @@ -30,7 +30,7 @@ typedef struct { // A basic branch-based decoder, uses 32-bit values to get good performance // on 32-bit architectures (but performs well on 64-bits also). -INLINE upb_decoderet upb_decode_varint_branch32(const char *p) { +INLINE upb_decoderet upb_vdecode_branch32(const char *p) { upb_decoderet r = {NULL, 0}; uint32_t low, high = 0; uint32_t b; @@ -54,7 +54,7 @@ done: } // Like the previous, but uses 64-bit values. -INLINE upb_decoderet upb_decode_varint_branch64(const char *p) { +INLINE upb_decoderet upb_vdecode_branch64(const char *p) { uint64_t val; uint64_t b; upb_decoderet r = {(void*)0, 0}; @@ -76,17 +76,9 @@ done: return r; } -// Avoids branches for values >2-bytes. -INLINE upb_decoderet upb_decode_varint_nobranch1(const char *p) { - uint64_t b = 0; - upb_decoderet r = {p, 0}; - memcpy(&b, r.p, 2); - if ((b & 0x80) == 0) { r.val = (b & 0x7f); r.p = p + 1; return r; } - r.val = (b & 0x7f) | ((b & 0x7f00) >> 1); - r.p = p + 2; - if ((b & 0x8000) == 0) return r; - - // >2-byte varint. +// Decodes a varint of at most 8 bytes without branching (except for error). +INLINE upb_decoderet upb_vdecode_max8_wright(upb_decoderet r) { + uint64_t b; memcpy(&b, r.p, sizeof(b)); uint64_t cbits = b | 0x7f7f7f7f7f7f7f7fULL; uint64_t stop_bit = ~cbits & (cbits+1); @@ -94,27 +86,19 @@ INLINE upb_decoderet upb_decode_varint_nobranch1(const char *p) { b = ((b & 0x7f007f007f007f00) >> 1) | (b & 0x007f007f007f007f); b = ((b & 0xffff0000ffff0000) >> 2) | (b & 0x0000ffff0000ffff); b = ((b & 0xffffffff00000000) >> 4) | (b & 0x00000000ffffffff); - r.val |= b << 14; - r.p += (__builtin_ctzll(stop_bit) + 1) / 8; if (stop_bit == 0) { // Error: unterminated varint. upb_decoderet err_r = {(void*)0, 0}; return err_r; } - return r; + upb_decoderet my_r = {r.p + ((__builtin_ctzll(stop_bit) + 1) / 8), + r.val | (b << 14)}; + return my_r; } -// Avoids branches for values >2-bytes. -INLINE upb_decoderet upb_decode_varint_nobranch2(const char *p) { - uint64_t b = 0; - upb_decoderet r = {p, 0}; - memcpy(&b, r.p, 2); - if ((b & 0x80) == 0) { r.val = (b & 0x7f); r.p = p + 1; return r; } - r.val = (b & 0x7f) | ((b & 0x7f00) >> 1); - r.p = p + 2; - if ((b & 0x8000) == 0) return r; - - // >2-byte varint. +// Another implementation of the previous. +INLINE upb_decoderet upb_vdecode_max8_massimino(upb_decoderet r) { + uint64_t b; memcpy(&b, r.p, sizeof(b)); uint64_t cbits = b | 0x7f7f7f7f7f7f7f7fULL; uint64_t stop_bit = ~cbits & (cbits + 1); @@ -122,22 +106,46 @@ INLINE upb_decoderet upb_decode_varint_nobranch2(const char *p) { b += b & 0x007f007f007f007fULL; b += 3 * (b & 0x0000ffff0000ffffULL); b += 15 * (b & 0x00000000ffffffffULL); - r.val |= b << 7; - r.p += (__builtin_ctzll(stop_bit) + 1) / 8; if (stop_bit == 0) { // Error: unterminated varint. upb_decoderet err_r = {(void*)0, 0}; return err_r; } - return r; + upb_decoderet my_r = {r.p + ((__builtin_ctzll(stop_bit) + 1) / 8), + r.val | (b << 7)}; + return my_r; } -INLINE upb_decoderet upb_decode_varint_fast(const char *p) { +// Template for a function that checks the first two bytes with branching +// and dispatches 2-10 bytes with a separate function. +#define UPB_VARINT_DECODER_CHECK2(name, decode_max8_function) \ +INLINE upb_decoderet upb_vdecode_check2_ ## name(const char *p) { \ + uint64_t b = 0; \ + upb_decoderet r = {p, 0}; \ + memcpy(&b, r.p, 2); \ + if ((b & 0x80) == 0) { r.val = (b & 0x7f); r.p = p + 1; return r; } \ + r.val = (b & 0x7f) | ((b & 0x7f00) >> 1); \ + r.p = p + 2; \ + if ((b & 0x8000) == 0) return r; \ + return decode_max8_function(r); \ +} + +UPB_VARINT_DECODER_CHECK2(wright, upb_vdecode_max8_wright); +UPB_VARINT_DECODER_CHECK2(massimino, upb_vdecode_max8_massimino); +#undef UPB_VARINT_DECODER_CHECK2 + +// Our canonical functions for decoding varints, based on the currently +// favored best-performing implementations. +INLINE upb_decoderet upb_vdecode_fast(const char *p) { // Use nobranch2 on 64-bit, branch32 on 32-bit. if (sizeof(long) == 8) - return upb_decode_varint_nobranch2(p); + return upb_vdecode_check2_massimino(p); else - return upb_decode_varint_branch32(p); + return upb_vdecode_branch32(p); +} + +INLINE upb_decoderet upb_vdecode_max8_fast(upb_decoderet r) { + return upb_vdecode_max8_massimino(r); } #ifdef __cplusplus -- cgit v1.2.3