From 8ef6873e0e14309a1715a252a650bab0ae1a33ef Mon Sep 17 00:00:00 2001
From: Josh Haberman <jhaberman@gmail.com>
Date: Sun, 20 Mar 2011 13:13:51 -0700
Subject: upb_stream: all callbacks registered ahead-of-time.

This is a significant change to the upb_stream
protocol, and should hopefully be the last
significant change.

All callbacks are now registered ahead-of-time
instead of having delegated callbacks registered
at runtime, which makes it much easier to
aggressively optimize ahead-of-time (like with a
JIT).

Other impacts of this change:

- You no longer need to have loaded descriptor.proto
  as a upb_def to load other descriptors!  This means
  the special-case code we used for bootstrapping is
  no longer necessary, and we no longer need to link
  the descriptor for descriptor.proto into upb.

- A client can now register any upb_value to be
  delivered to its value callback, not just a
  upb_fielddef*.  This should let other clients
  get more out of the streaming decoder.

This change unfortunately causes a bit of a performance
regression -- I think largely due to highly
suboptimal code that GCC generates when structs
are returned by value.  See:
  http://blog.reverberate.org/2011/03/19/when-a-compilers-slow-code-actually-bites-you/

On the other hand, once we have a JIT this should
no longer matter.

Performance numbers (before -> after, percent change in parentheses):

plain.parsestream_googlemessage1.upb_table: 374 -> 396 (5.88)
plain.parsestream_googlemessage2.upb_table: 616 -> 449 (-27.11)
plain.parsetostruct_googlemessage1.upb_table_byref: 268 -> 269 (0.37)
plain.parsetostruct_googlemessage1.upb_table_byval: 215 -> 204 (-5.12)
plain.parsetostruct_googlemessage2.upb_table_byref: 307 -> 281 (-8.47)
plain.parsetostruct_googlemessage2.upb_table_byval: 297 -> 272 (-8.42)
omitfp.parsestream_googlemessage1.upb_table: 423 -> 410 (-3.07)
omitfp.parsestream_googlemessage2.upb_table: 679 -> 483 (-28.87)
omitfp.parsetostruct_googlemessage1.upb_table_byref: 287 -> 282 (-1.74)
omitfp.parsetostruct_googlemessage1.upb_table_byval: 226 -> 219 (-3.10)
omitfp.parsetostruct_googlemessage2.upb_table_byref: 315 -> 298 (-5.40)
omitfp.parsetostruct_googlemessage2.upb_table_byval: 297 -> 287 (-3.37)
---
 src/upb_decoder_x64.asm | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'src/upb_decoder_x64.asm')

diff --git a/src/upb_decoder_x64.asm b/src/upb_decoder_x64.asm
index 032ea86..c417644 100644
--- a/src/upb_decoder_x64.asm
+++ b/src/upb_decoder_x64.asm
@@ -34,8 +34,8 @@ SECTION .text
 %define BUF rbx       ; const char *p, current buf position.
 %define END rbp       ; const char *end, where the buf ends (either submsg end or buf end)
 %define STRING r12    ; unused
-%define FIELDDEF r13  ; upb_fielddef *f, needs to be preserved across varint decoding call.
-%define CALLBACK r14
+%define FVAL r13      ; upb_value fval, needs to be preserved across varint decoding call.
+%define UNUSED r14
 %define CLOSURE r15
 
 ; Stack layout: *tableptr, uint32_t maxfield_times_8
@@ -57,10 +57,10 @@ SECTION .text
 ;    path that goes into a tight loop if the encoding was packed).
 ; - check_6: the field is not a group or a message (or string, TODO)
 ;   (this could be relaxed, but due to delegation it's a bit tricky).
-; - if the value is a string, the entire string is available in
+; - check_7: if the value is a string, the entire string is available in
 ;   the buffer, and our cached string object can be recycled, and
 ;   our string object already references the source buffer, so
-;   absolutely no refcount twiddling is required.  (check_7)
+;   absolutely no refcount twiddling is required.
 
 
 %macro decode_and_dispatch_ 0
@@ -78,7 +78,7 @@ align 16
   ; Decode a 1 or 2-byte varint -> eax.
   mov cl, byte [BUF]
   lea rdi, [BUF+1]
-  movzx rax, cl    ; Need all of rax since we're doing a 64-bit lea later.
+  movzx eax, cl
   and eax, 0x7f
   test cl, cl
   jns .one_byte_tag ; Should be predictable if fields are in order.
-- 
cgit v1.2.3