summary | refs | log | tree | commit | diff
path: root/upb/pb/compile_decoder_x64.dasc
diff options
context:
space:
mode:
Diffstat (limited to 'upb/pb/compile_decoder_x64.dasc')
-rw-r--r-- upb/pb/compile_decoder_x64.dasc 1145
1 files changed, 0 insertions, 1145 deletions
diff --git a/upb/pb/compile_decoder_x64.dasc b/upb/pb/compile_decoder_x64.dasc
deleted file mode 100644
index 7fcd006..0000000
--- a/upb/pb/compile_decoder_x64.dasc
+++ /dev/null
@@ -1,1145 +0,0 @@
-|//
-|// upb - a minimalist implementation of protocol buffers.
-|//
-|// Copyright (c) 2011-2013 Google Inc. See LICENSE for details.
-|// Author: Josh Haberman <jhaberman@gmail.com>
-|//
-|// JIT compiler for upb_pbdecoder on x86-64. Generates machine code from the
-|// bytecode generated in compile_decoder.c.
-|
-|.arch x64
-|.actionlist upb_jit_actionlist
-|.globals UPB_JIT_GLOBAL_
-|.globalnames upb_jit_globalnames
-|
-|// Calling conventions: the registers used to pass arguments to external C functions. Note -- this will need to be changed for
-|// Windows, which uses a different calling convention!
-|.define ARG1_64, rdi // First integer/pointer argument.
-|.define ARG2_8, r6b // DynASM's equivalent to "sil" -- low byte of esi.
-|.define ARG2_32, esi // Second argument, 32-bit view.
-|.define ARG2_64, rsi // Second argument, 64-bit view.
-|.define ARG3_8, dl // Third argument, low byte.
-|.define ARG3_32, edx // Third argument, 32-bit view.
-|.define ARG3_64, rdx // Third argument, 64-bit view.
-|.define ARG4_64, rcx // Fourth argument.
-|.define ARG5_64, r8 // Fifth argument.
-|.define XMMARG1, xmm0 // Register that carries float/double values (see jitprimitive).
-|
-|// Register allocation / type map.
-|// ALL of the code in this file uses these register allocations.
-|// When we "call" within this file, we do not use regular calling
-|// conventions, but of course when calling to user callbacks we must.
-|.define PTR, rbx // DECODER->ptr (unsynced)
-|.define DATAEND, r12 // DECODER->data_end (unsynced)
-|.define CLOSURE, r13 // FRAME->closure (unsynced)
-|.type FRAME, upb_pbdecoder_frame, r14 // DECODER->top (unsynced)
-|.type DECODER, upb_pbdecoder, r15 // DECODER (immutable)
-|.define DELIMEND, rbp // DECODER->delim_end as a pointer; stored in FRAME->end_ofs as a stream offset (see commit_regs/load_regs).
-|
-| // Spills unsynced registers back to memory. Side effect: DELIMEND is converted in place to a stream offset (see the end of the macro).
-|.macro commit_regs
-| mov DECODER->top, FRAME
-| mov DECODER->ptr, PTR
-| mov DECODER->data_end, DATAEND
-| // We don't guarantee that delim_end is NULL when out of range like the
-| // interpreter does.
-| mov DECODER->delim_end, DELIMEND
-| sub DELIMEND, DECODER->buf // Pointer -> offset within the current buffer.
-| add DELIMEND, DECODER->bufstart_ofs // Buffer offset -> stream offset.
-| mov FRAME->end_ofs, DELIMEND // DELIMEND now holds an offset, not a pointer, until load_regs recomputes it.
-| mov FRAME->sink.closure, CLOSURE
-|.endmacro
-|
-| // Loads unsynced registers from memory back into registers (the inverse of commit_regs).
-|.macro load_regs
-| mov FRAME, DECODER->top
-| mov PTR, DECODER->ptr
-| mov DATAEND, DECODER->data_end
-| mov CLOSURE, FRAME->sink.closure
-| mov DELIMEND, FRAME->end_ofs // Start from the stored stream offset...
-| sub DELIMEND, DECODER->bufstart_ofs // ...make it relative to the current buffer...
-| add DELIMEND, DECODER->buf // ...and turn it back into a pointer.
-|.endmacro
-|
-| // Calls an external C function at address "addr". Clobbers rax; r12 is saved and restored around the call.
-|.macro callp, addr
-| mov64 rax, (uintptr_t)addr
-|
-| // Stack must be 16-byte aligned (x86-64 ABI requires this).
-| //
-| // OPT: possibly remove this by statically ensuring correct alignment.
-| //
-| // OPT: use "call rel32" where possible.
-| push r12 // Free up r12 so it can remember the pre-alignment rsp.
-| mov r12, rsp
-| and rsp, 0xfffffffffffffff0UL // Align stack.
-| call rax
-| mov rsp, r12 // Undo the alignment adjustment.
-| pop r12
-|.endmacro
-| // Loads the compile-time constant "val" into ARG2, choosing the instruction by the size of the value.
-|.macro ld64, val
-|| {
-|| uintptr_t v = (uintptr_t)val;
-|| if (v > 0xffffffff) {
-| mov64 ARG2_64, v // Does not fit in 32 bits: full 64-bit immediate.
-|| } else if (v) {
-| mov ARG2_32, v // Fits in 32 bits: 32-bit move.
-|| } else {
-| xor ARG2_32, ARG2_32 // Zero.
-|| }
-|| }
-|.endmacro
-| // Loads upb_handlers_gethandlerdata(h, arg) into ARG2 (via ld64), i.e. the "hd" argument of a handler call.
-|.macro load_handler_data, h, arg
-| ld64 upb_handlers_gethandlerdata(h, arg)
-|.endmacro
-| // "Check end of buffer": jumps to "target" if fewer than "bytes" bytes remain between PTR and DATAEND. Clobbers rcx when bytes != 1.
-|.macro chkeob, bytes, target
-|| if (bytes == 1) {
-| cmp PTR, DATAEND
-| je target // PTR == DATAEND: nothing left.
-|| } else {
-| mov rcx, DATAEND
-| sub rcx, PTR // rcx = bytes remaining.
-| cmp rcx, bytes
-| jb target
-|| }
-|.endmacro
-| // "Check not end of buffer": the inverse of chkeob; jumps to "target" if the buffer is not exhausted for "bytes" bytes. Clobbers rcx when bytes != 1.
-|.macro chkneob, bytes, target
-|| if (bytes == 1) {
-| cmp PTR, DATAEND
-| jne target // PTR != DATAEND.
-|| } else {
-| mov rcx, DATAEND
-| sub rcx, PTR // rcx = bytes remaining.
-| cmp rcx, bytes
-| jae target
-|| }
-|.endmacro
-/* sethas: sets bit "hasbit" in the byte array addressed by "reg"; emits nothing when hasbit is negative. */
-|.macro sethas, reg, hasbit
-|| if (hasbit >= 0) {
-| or byte [reg + ((uint32_t)hasbit / 8)], (1 << ((uint32_t)hasbit % 8))
-|| }
-|.endmacro
-|
-| // Decodes 32-bit varint into rdx, inlining 1 byte. PTR is advanced past the varint.
-|.macro dv32
-| chkeob 1, >7
-| movzx edx, byte [PTR]
-| test dl, dl
-| jns >8 // No continuation bit: the single byte is the whole value.
-|7:
-| call ->decodev32_fallback
-|8:
-| add PTR, 1
-|.endmacro
-
-#define DECODE_EOF -3 /* Returned in eax by ->checktag_fallback when PTR has reached DELIMEND; tested by the code jittag() emits. */
-
-/* Looks up the handler registered for selector `sel`, tolerating a NULL
- * handlers object (in which case there is no handler and NULL is returned). */
-static upb_func *gethandler(const upb_handlers *h, upb_selector_t sel) {
-  if (!h) {
-    return NULL;
-  }
-  return upb_handlers_gethandler(h, sel);
-}
-
-/* Defines an "assembly label" for the current code generation offset.
- * This label exists *purely* for debugging purposes: it is emitted into
- * the .so, and printed as part of JIT debugging output when UPB_JIT_LOAD_SO is
- * defined.
- *
- * We would define this in the .c file except that it conditionally defines a
- * pclabel.
- *
- * The label name is built printf-style from `fmt` and the following arguments.
- * Unless NDEBUG is defined, this also asserts that no two labels are defined
- * at the same code offset. */
-static void asmlabel(jitcompiler *jc, const char *fmt, ...) {
-#ifndef NDEBUG
- int ofs = jc->dynasm->section->ofs;
- UPB_ASSERT(ofs != jc->lastlabelofs); /* Some code must have been emitted since the previous label. */
- jc->lastlabelofs = ofs;
-#endif
-
-#ifndef UPB_JIT_LOAD_SO
- UPB_UNUSED(jc);
- UPB_UNUSED(fmt);
-#else
- va_list args;
- va_start(args, fmt);
- char *str = upb_vasprintf(fmt, args);
- va_end(args);
-
- int pclabel = alloc_pclabel(jc);
- /* Normally we would prefer to allocate this inline with the codegen,
- * ie.
- * |=>asmlabel(...)
- * But since we do this conditionally, only when UPB_JIT_LOAD_SO is defined,
- * we do it here instead. */
- |=>pclabel:
- upb_inttable_insert(&jc->asmlabels, pclabel, upb_value_ptr(str)); /* Record pclabel -> name. */
-#endif
-}
-
-/* Reports whether the handler for selector `sel` in `h` is marked "always ok".
- * Must only be called when that handler is known to exist: the attribute
- * lookup is asserted to succeed. */
-static bool alwaysok(const upb_handlers *h, upb_selector_t sel) {
-  upb_handlerattr attr = UPB_HANDLERATTR_INITIALIZER;
-  bool got_attr = upb_handlers_getattr(h, sel, &attr);
-  bool always_ok;
-
-  UPB_ASSERT(got_attr);
-  always_ok = upb_handlerattr_alwaysok(&attr);
-  upb_handlerattr_uninit(&attr);
-  return always_ok;
-}
-
-/* Emit static assembly routines; code that does not vary based on the message
- * schema. Since it's not input-dependent, we only need one single copy of it.
- * For the moment we generate a single copy per generated handlers. Eventually
- * we should generate this code at compile time and link it into the binary so
- * we have one copy total. To do that we'll want to be sure that it is within
- * 2GB of our JIT code, so that branches between the two are near (rel32).
- *
- * We'd put this assembly in a .s file directly, but DynASM's ability to
- * calculate structure offsets automatically is too useful to pass up (it's way
- * more convenient to write DECODER->sink than [r15 + 0x96], especially since
- * the latter would have to be changed whenever the structure is updated).
- *
- * Routines defined here (all as global "->" labels): enterjit, exitjit,
- * suspend, pushlendelim, parse_unknown, the skip and decode fallbacks for
- * f32/f64/v32/v64, decode_unknown_tag_fallback, decode_varint_slow,
- * checktag_fallback and hashlookup. */
-static void emit_static_asm(jitcompiler *jc) {
- | // Trampolines for entering/exiting the JIT. These are a bit tricky to
- | // support full resuming; when we suspend we copy the JIT's portion of
- | // the call stack into the upb_pbdecoder and restore it when we resume.
- asmlabel(jc, "enterjit");
- |->enterjit:
- |1:
- | push rbp
- | push r15
- | push r14
- | push r13
- | push r12
- | push rbx
- |
- | mov rbx, ARG2_64 // Preserve JIT method.
- |
- | mov DECODER, rdi
- | callp upb_pbdecoder_resume // Same args as us; reuse regs.
- | test eax, eax
- | jns >1 // Non-negative result: return it to the caller without entering JIT code.
- | mov DECODER->saved_rsp, rsp
- | mov rax, rbx // rax = JIT method; rbx is about to be reloaded as PTR.
- | load_regs
- |
- | // Test whether we have a saved stack to resume.
- | mov ARG3_64, DECODER->call_len
- | test ARG3_64, ARG3_64
- | jnz >2
- |
- | call rax
- |
- | mov rax, DECODER->size_param
- | mov qword DECODER->call_len, 0
- |1:
- | pop rbx
- | pop r12
- | pop r13
- | pop r14
- | pop r15
- | pop rbp
- | ret
- |
- |2:
- | // Resume decoder.
- | mov ARG2_64, DECODER->callstack
- | sub rsp, ARG3_64 // Make room for the saved stack (ARG3 = call_len bytes).
- | mov ARG1_64, rsp
- | callp memcpy // Restore stack.
- | ret // Return to resumed function (not ->enterjit caller).
- |
- | // Other code can call this to suspend the JIT.
- | // To the calling code, it will appear that the function returns when
- | // the JIT resumes, and more buffer space will be available.
- | // Args: eax=the value that decode() should return.
- asmlabel(jc, "exitjit");
- |->exitjit:
- | // Save the stack into DECODER->callstack.
- | mov ARG1_64, DECODER->callstack
- | mov ARG2_64, rsp
- | mov ARG3_64, DECODER->saved_rsp
- | sub ARG3_64, rsp // Length = everything pushed since ->enterjit saved rsp.
- | mov DECODER->call_len, ARG3_64 // Preserve len for next resume.
- | mov ebx, eax // Preserve return value across memcpy.
- | callp memcpy // Copy stack into decoder.
- | mov eax, ebx // This will be our return value.
- |
- | // Must NOT do this before the memcpy(), otherwise memcpy() will
- | // clobber the stack we are trying to save!
- | mov rsp, DECODER->saved_rsp
- | pop rbx
- | pop r12
- | pop r13
- | pop r14
- | pop r15
- | pop rbp
- | ret
- |
- | // Like suspend() in the C decoder, except that the function appears
- | // (from the caller's perspective) not to return until the decoder is
- | // resumed.
- asmlabel(jc, "suspend");
- |->suspend:
- | cmp DECODER->ptr, PTR
- | je >1 // DECODER->ptr == PTR signals that the checkpoint is already up to date.
- | mov DECODER->checkpoint, PTR
- |1:
- | commit_regs
- | mov rdi, DECODER
- | callp upb_pbdecoder_suspend
- | jmp ->exitjit
- |
- asmlabel(jc, "pushlendelim");
- |->pushlendelim:
- |1:
- | mov FRAME->sink.closure, CLOSURE
- | mov DECODER->checkpoint, PTR
- | dv32
- | mov rcx, DELIMEND
- | sub rcx, PTR
- | sub rcx, rdx // rcx = bytes left in the enclosing message after this submessage.
- | jb >4 // Len is greater than enclosing message.
- | mov FRAME->end_ofs, rcx
- | cmp FRAME, DECODER->limit
- | je >3 // Stack overflow
- | add FRAME, sizeof(upb_pbdecoder_frame)
- | mov DELIMEND, PTR
- | add DELIMEND, rdx // DELIMEND = end of the new delimited region.
- | mov dword FRAME->groupnum, 0
- | test rcx, rcx
- | jz >2
- | mov DATAEND, DECODER->end
- | cmp PTR, DELIMEND
- | ja >2
- | cmp DELIMEND, DATAEND
- | ja >2
- | mov DATAEND, DELIMEND // If DELIMEND >= PTR && DELIMEND <= DATAEND
- |2:
- | ret
- |3:
- | // Stack overflow error.
- | mov PTR, DECODER->checkpoint // Rollback to before the delim len.
- | // Prepare seterr args.
- | mov ARG1_64, DECODER
- | ld64 kPbDecoderStackOverflow
- | callp upb_pbdecoder_seterr
- | call ->suspend
- | jmp <1
- |4:
- | // Overextended len.
- | mov PTR, DECODER->checkpoint // Rollback to before the delim len.
- | // Prepare seterr args.
- | mov ARG1_64, DECODER
- | ld64 kPbDecoderSubmessageTooLong
- | callp upb_pbdecoder_seterr
- | call ->suspend
- | jmp <1
- |
- | // For getting a value that spans a buffer seam. Falls back to C.
- |.macro getvalue_slow, func, bytes
- | sub rsp, 8 // Need stack space for func to write value to.
- |1:
- | mov qword [rsp], 0 // For parsing routines that only parse 32 bits.
- | mov ARG1_64, DECODER
- | mov ARG2_64, rsp
- | mov DECODER->checkpoint, PTR
- | commit_regs
- | callp func
- | load_regs
- | test eax, eax
- | jns >2 // Non-negative result: not a plain success, so leave the JIT with it.
- | // Success; return parsed data (in rdx AND xmm0).
- | mov rdx, [rsp]
- | movsd xmm0, qword [rsp]
- | add rsp, 8
- | sub PTR, bytes // Bias our buffer pointer to rejoin the fast-path.
- | mov DECODER->ptr, PTR
- | ret
- |2:
- | call ->exitjit // Return eax from decode function.
- | jmp <1
- |.endmacro
- |
- asmlabel(jc, "parse_unknown");
- | // Args: edx=fieldnum, cl=wire type
- |->parse_unknown:
- | // OPT: handle directly instead of kicking to C.
- | // Check for ENDGROUP.
- | mov ARG1_64, DECODER
- | mov ARG2_32, edx
- | movzx ARG3_32, cl
- | commit_regs
- | callp upb_pbdecoder_skipunknown
- | load_regs
- | cmp eax, DECODE_ENDGROUP
- | jne >1
- | ret // Return eax=DECODE_ENDGROUP, not zero
- |1:
- | cmp eax, DECODE_OK
- | je >1
- | call ->exitjit // Return eax from decode function.
- |1:
- | xor eax, eax
- | ret
- |
- | // Fallback functions for parsing single values. These are used when the
- | // buffer doesn't contain enough remaining data for the fast path. Each
- | // primitive type (v32, v64, f32, f64) has two functions: decode & skip.
- | // Decode functions return their value in rdx/edx (the getvalue_slow paths also place it in xmm0).
- | //
- | // These functions leave PTR = value_end - fast_path_bytes, so that we can
- | // re-join the fast path which will add fast_path_bytes after the callback
- | // completes. We also set DECODER->ptr to this value which is a signal to
- | // ->suspend that DECODER->checkpoint is up to date.
- asmlabel(jc, "skip_decode_f32_fallback");
- |->skipf32_fallback:
- |->decodef32_fallback:
- | getvalue_slow upb_pbdecoder_decode_f32, 4
- |
- asmlabel(jc, "skip_decode_f64_fallback");
- |->skipf64_fallback:
- |->decodef64_fallback:
- | getvalue_slow upb_pbdecoder_decode_f64, 8
- |
- | // Called for varint >= 1 byte.
- asmlabel(jc, "skip_decode_v32_fallback");
- |->skipv32_fallback:
- |->skipv64_fallback:
- | chkeob 16, >1
- | // With at least 16 bytes left, we can do a branch-less SSE version.
- | movdqu xmm0, [PTR]
- | pmovmskb eax, xmm0 // bits 0-15 are continuation bits, 16-31 are 0.
- | not eax
- | bsf eax, eax // Index of the first byte without a continuation bit.
- | cmp al, 10
- | jae ->decode_varint_slow // Error (>10 byte varint).
- | add PTR, rax // bsf result is 0-based, so PTR=end-1, as desired.
- | ret
- |
- |1:
- | // With fewer than 16 bytes, we have to read byte by byte.
- | lea rcx, [PTR + 10]
- | mov rax, PTR // Preserve PTR in case of fallback to slow path.
- | cmp rcx, DATAEND
- | cmova rcx, DATAEND // rcx = MIN(DATAEND, PTR + 10)
- |2:
- | cmp rax, rcx
- | je ->decode_varint_slow
- | test byte [rax], 0x80
- | jz >3
- | add rax, 1
- | jmp <2
- |3:
- | mov PTR, rax // PTR = varint_end - 1, as desired
- | ret
- |
- | // Returns tag in edx
- asmlabel(jc, "decode_unknown_tag_fallback");
- |->decode_unknown_tag_fallback:
- | sub rsp, 16
- |1:
- | cmp PTR, DELIMEND
- | jne >2
- | add rsp, 16
- | xor eax, eax // eax == 0 tells the caller that DELIMEND was hit.
- | ret
- |2:
- | // OPT: Have a medium-fast path before falling back to _slow.
- | mov ARG1_64, DECODER
- | mov ARG2_64, rsp
- | commit_regs
- | callp upb_pbdecoder_decode_varint_slow
- | load_regs
- | cmp eax, 0
- | jge >3
- | mov edx, [rsp] // Success; return parsed data.
- | add rsp, 16
- | ret
- |3:
- | call ->exitjit // Return eax from decode function.
- | jmp <1
- |
- | // Called for varint >= 1 byte.
- asmlabel(jc, "decode_v32_v64_fallback");
- |->decodev32_fallback:
- |->decodev64_fallback:
- | chkeob 10, ->decode_varint_slow
- | // OPT: do something faster than just calling the C version.
- | mov rdi, PTR
- | callp upb_vdecode_fast
- | test rax, rax
- | je ->decode_varint_slow // Unterminated varint.
- | mov PTR, rax
- | sub PTR, 1 // Leave PTR = varint_end - 1 so the fast path's "add PTR, 1" lands on the end.
- | mov DECODER->ptr, PTR
- | ret
- |
- asmlabel(jc, "decode_varint_slow");
- |->decode_varint_slow:
- | // Slow path: end of buffer or error (varint length >= 10).
- | getvalue_slow upb_pbdecoder_decode_varint_slow, 1
- |
- | // Args: rsi=expected tag, return=rax (DECODE_{OK,MISMATCH})
- asmlabel(jc, "checktag_fallback");
- |->checktag_fallback:
- | sub rsp, 8
- | mov [rsp], rsi // Preserve expected tag.
- |1:
- | mov ARG1_64, DECODER
- | commit_regs
- | mov DECODER->checkpoint, PTR
- | callp upb_pbdecoder_checktag_slow
- | load_regs
- | cmp eax, 0
- | jge >2
- | add rsp, 8
- | ret
- |2:
- | call ->exitjit
- | mov rsi, [rsp] // Reload the expected tag after resuming.
- | cmp PTR, DELIMEND
- | jne <1
- | mov eax, DECODE_EOF
- | add rsp, 8
- | ret
- |
- | // Args: rsi=upb_inttable, rdx=key, return=rax (-1 if not found).
- | // Preserves: rcx, rdx
- | // OPT: Could write this in assembly if it's a hotspot.
- asmlabel(jc, "hashlookup");
- |->hashlookup:
- | push rcx
- | push rdx
- | sub rsp, 16 // Scratch slot that receives the looked-up value.
- | mov rdi, rsi
- | mov rsi, rdx
- | mov rdx, rsp
- | callp upb_inttable_lookup
- | add rsp, 16
- | pop rdx
- | pop rcx
- | test al, al
- | jz >2 // Unknown field.
- | mov rax, [rsp-32] // Value from table (read back from the scratch slot, which now lies below rsp).
- | ret
- |2:
- | xor rax, rax
- | not rax // rax = -1: not found.
- | ret
-}
-/* Emits code for one OP_PARSE_* instruction `op`.
- *
- * With a handler registered for `sel`: decodes one primitive value at PTR
- * (inline fast path when enough bytes remain, the ->decode*_fallback routine
- * otherwise), applies the second-stage transform some types need (zig-zag,
- * bool), then either stores the value straight into the closure (when
- * upb_msg_getscalarhandlerdata() succeeds) or calls the handler.
- *
- * Without a handler: emits code that only skips over the value. */
-static void jitprimitive(jitcompiler *jc, opcode op,
- const upb_handlers *h, upb_selector_t sel) {
- typedef enum { V32, V64, F32, F64, X } valtype_t;
- static valtype_t types[] = {
- X, F64, F32, V64, V64, V32, F64, F32, V64, X, X, X, X, V32, V32, F32, F64,
- V32, V64 }; /* Indexed by opcode; X marks opcodes that have no wire value type here. */
- static char fastpath_bytes[] = { 1, 1, 4, 8 }; /* Indexed by valtype_t (V32, V64, F32, F64); there is no entry for X. */
- const valtype_t vtype = types[op];
- const int fastbytes = fastpath_bytes[vtype];
- upb_func *handler = gethandler(h, sel);
- upb_fieldtype_t ftype;
- size_t offset;
- int32_t hasbit;
-
- if (handler) {
- |1:
- | chkneob fastbytes, >3
- |2:
- switch (vtype) {
- case V32:
- | call ->decodev32_fallback
- break;
- case V64:
- | call ->decodev64_fallback
- break;
- case F32:
- | call ->decodef32_fallback
- break;
- case F64:
- | call ->decodef64_fallback
- break;
- case X: break;
- }
- | jmp >4
-
- /* Fast path decode; for when check_bytes bytes are available. */
- |3:
- switch (op) {
- case OP_PARSE_SFIXED32:
- case OP_PARSE_FIXED32:
- | mov edx, dword [PTR]
- break;
- case OP_PARSE_SFIXED64:
- case OP_PARSE_FIXED64:
- | mov rdx, qword [PTR]
- break;
- case OP_PARSE_FLOAT:
- | movss xmm0, dword [PTR]
- break;
- case OP_PARSE_DOUBLE:
- | movsd xmm0, qword [PTR]
- break;
- default:
- /* Inline one byte of varint decoding. */
- | movzx edx, byte [PTR]
- | test dl, dl
- | js <2 // Fallback to slow path for >1 byte varint.
- break;
- }
-
- /* Second-stage decode; used for both fast and slow paths */
- /* (only needed for a few types). */
- |4:
- switch (op) {
- case OP_PARSE_SINT32:
- /* 32-bit zig-zag decode. */
- | mov eax, edx
- | shr edx, 1
- | and eax, 1
- | neg eax
- | xor edx, eax
- break;
- case OP_PARSE_SINT64:
- /* 64-bit zig-zag decode. */
- | mov rax, rdx
- | shr rdx, 1
- | and rax, 1
- | neg rax
- | xor rdx, rax
- break;
- case OP_PARSE_BOOL:
- | test rdx, rdx
- | setne dl // Normalize any non-zero varint to 1.
- break;
- default: break;
- }
-
- /* Call callback (or specialize if we can). */
- if (upb_msg_getscalarhandlerdata(h, sel, &ftype, &offset, &hasbit)) {
- switch (ftype) {
- case UPB_TYPE_INT64:
- case UPB_TYPE_UINT64:
- | mov [CLOSURE + offset], rdx
- break;
- case UPB_TYPE_INT32:
- case UPB_TYPE_UINT32:
- case UPB_TYPE_ENUM:
- | mov [CLOSURE + offset], edx
- break;
- case UPB_TYPE_DOUBLE:
- | movsd qword [CLOSURE + offset], XMMARG1
- break;
- case UPB_TYPE_FLOAT:
- | movss dword [CLOSURE + offset], XMMARG1
- break;
- case UPB_TYPE_BOOL:
- | mov [CLOSURE + offset], dl
- break;
- case UPB_TYPE_STRING:
- case UPB_TYPE_BYTES:
- case UPB_TYPE_MESSAGE:
- UPB_ASSERT(false); break;
- }
- | sethas CLOSURE, hasbit
- } else if (handler) {
- | mov ARG1_64, CLOSURE
- | load_handler_data h, sel
- | callp handler
- if (!alwaysok(h, sel)) {
- | test al, al
- | jnz >5
- | call ->suspend // Handler returned false: suspend, then retry from label 1 on resume.
- | jmp <1
- |5:
- }
- }
-
- /* We do this last so that the checkpoint is not advanced past the user's
- * data until the callback has returned success. */
- | add PTR, fastbytes
- } else {
- /* No handler registered for this value, just skip it. */
- | chkneob fastbytes, >3
- |2:
- switch (vtype) {
- case V32:
- | call ->skipv32_fallback
- break;
- case V64:
- | call ->skipv64_fallback
- break;
- case F32:
- | call ->skipf32_fallback
- break;
- case F64:
- | call ->skipf64_fallback
- break;
- case X: break;
- }
-
- /* Fast-path skip. */
- |3:
- if (vtype == V32 || vtype == V64) {
- | test byte [PTR], 0x80
- | jnz <2 // Multi-byte varint: let the skip fallback find its end.
- }
- | add PTR, fastbytes
- }
-}
-/* Emits the dispatch routine for `method`.  The generated code decodes the
- * field tag at PTR, looks the field number up in method->dispatch (array part
- * first, then the hash part if there is one) and returns in rax the address of
- * the code that handles the field.  It returns with eax == 0 when the tag
- * fallback reports that PTR reached DELIMEND.  Fields that are not in the
- * table are handed to ->parse_unknown. */
-static void jitdispatch(jitcompiler *jc,
- const upb_pbdecodermethod *method) {
- /* Lots of room for tweaking/optimization here. */
-
- const upb_inttable *dispatch = &method->dispatch;
- bool has_hash_entries = (dispatch->t.count > 0);
-
- /* Whether any of the fields for this message can have two wire types which
- * are both valid (packed & non-packed).
- *
- * OPT: populate this more precisely; not all messages with hash entries have
- * this characteristic. */
- bool has_multi_wiretype = has_hash_entries;
-
- |=>define_jmptarget(jc, &method->dispatch):
- |1:
- /* Decode the field tag. */
- | mov aword DECODER->checkpoint, PTR
- | chkeob 2, >6
- | movzx edx, byte [PTR]
- | test dl, dl
- | jns >7 // Jump if first byte has no continuation bit.
- | movzx ecx, byte [PTR + 1]
- | test cl, cl
- | js >6 // Jump if second byte has continuation bit.
- | // Confirmed two-byte varint.
- | shl ecx, 7
- | and edx, 0x7f
- | or edx, ecx
- | add PTR, 2
- | jmp >8
- |6:
- | call ->decode_unknown_tag_fallback
- | test eax, eax // Hit DELIMEND?
- | jnz >8
- | ret
- |7:
- | add PTR, 1
- |8:
- | mov ecx, edx
- | shr edx, 3 // edx = field number.
- | and cl, 7 // cl = wire type.
-
- /* See comment attached to upb_pbdecodermethod.dispatch for layout of the
- * dispatch table. */
- |2:
- | cmp edx, dispatch->array_size
- if (has_hash_entries) {
- | jae >7
- } else {
- | jae >5
- }
- | // OPT: Compact the lookup arr into 32-bit entries.
- if ((uintptr_t)dispatch->array > 0x7fffffff) {
- | mov64 rax, (uintptr_t)dispatch->array
- | mov rax, qword [rax + rdx * 8]
- } else {
- | mov rax, qword [rdx * 8 + dispatch->array]
- }
- |3:
- | // We take advantage of the fact that non-present entries are stored
- | // as -1, which will result in wire types that will never match.
- | cmp al, cl
- if (has_multi_wiretype) {
- | jne >6
- } else {
- | jne >5
- }
- | shr rax, 16
- |
- | // Load the machine code address from the table entry.
- | // The table entry is relative to the dispatch->array jmptarget
- | // (patchdispatch() took care of this) which is the same as
- | // local label "4". The "lea" is really just trying to do
- | // lea rax, [>4 + rax]
- | //
- | // But we can't write that directly for some reason, so we use
- | // rdx as a temporary.
- | lea rdx, [>4]
- |=>define_jmptarget(jc, dispatch->array):
- |4:
- | add rax, rdx
- | ret
- |
- |5:
- | // Field isn't in our table.
- |
- | // For pushing unknown fields to the unknown field handler.
- | mov64 rax, (uintptr_t)method->dest_handlers_
- | mov FRAME->sink.handlers, rax
- |
- | call ->parse_unknown
- | test eax, eax // ENDGROUP?
- | jz <1
- | lea rax, [>9] // ENDGROUP; Load address of OP_ENDMSG.
- | ret
-
- if (has_multi_wiretype) {
- |6:
- | // Primary wire type didn't match, check secondary wire type.
- | cmp ah, cl
- | jne <5
- | // Secondary wire type is a match, look up fn + UPB_MAX_FIELDNUMBER.
- | add rdx, UPB_MAX_FIELDNUMBER
- | // This key will never be in the array part, so do a hash lookup.
- UPB_ASSERT(has_hash_entries);
- | ld64 dispatch
- | jmp ->hashlookup // Tail call.
- }
-
- if (has_hash_entries) {
- |7:
- | // Hash table lookup.
- | ld64 dispatch
- | call ->hashlookup
- | jmp <3
- }
-}
-/* Emits code that checks whether the next `n` bytes at PTR equal the expected
- * encoded `tag`.  On a match PTR is advanced by `n`.  On a mismatch the code
- * either calls the dispatch routine of `method` (when ofs == 0) or jumps to
- * the bytecode target at jc->pc + ofs.  When fewer than `n` bytes are
- * available the check is delegated to ->checktag_fallback. */
-static void jittag(jitcompiler *jc, uint64_t tag, int n, int ofs,
- const upb_pbdecodermethod *method) {
- /* Internally we parse unknown fields; if this runs us into DELIMEND we jump
- * to the corresponding DELIMEND target (either msg end or repeated field
- * end), which we find from the OP_CHECKDELIM which must have necessarily
- * preceded us. */
- uint32_t last_instruction = *(jc->pc - 2);
- int last_arg = (int32_t)last_instruction >> 8;
- uint32_t *delimend = (jc->pc - 1) + last_arg;
- const size_t ptr_words = sizeof(void*) / sizeof(uint32_t);
-
- UPB_ASSERT((last_instruction & 0xff) == OP_CHECKDELIM);
-
- if (getop(*(jc->pc - 1)) == OP_TAGN) {
- jc->pc += ptr_words; /* Step over the tag value stored after the OP_TAGN instruction word. */
- }
-
- | chkneob n, >1
-
- | // OPT: this is way too much fallback code to put here.
- | // Reduce and/or move to a separate section to make better icache usage.
- | ld64 tag
- | call ->checktag_fallback
- | cmp eax, DECODE_MISMATCH
- | je >3
- | cmp eax, DECODE_EOF
- | je =>jmptarget(jc, delimend)
- | jmp >5
-
- |1:
- switch (n) {
- case 1:
- | cmp byte [PTR], tag
- break;
- case 2:
- | cmp word [PTR], tag
- break;
- case 3:
- | // OPT: Slightly more efficient code, but depends on an extra byte.
- | // mov eax, dword [PTR]
- | // shl eax, 8
- | // cmp eax, tag << 8
- | cmp word [PTR], (tag & 0xffff)
- | jne >2
- | cmp byte [PTR + 2], (tag >> 16)
- |2:
- break;
- case 4:
- | cmp dword [PTR], tag
- break;
- case 5:
- | cmp dword [PTR], (tag & 0xffffffff)
- | jne >3
- | cmp byte [PTR + 4], (tag >> 32)
- }
- | je >4
- |3:
- if (ofs == 0) {
- | call =>jmptarget(jc, &method->dispatch)
- | test rax, rax
- | jz =>jmptarget(jc, delimend)
- | jmp rax
- } else {
- | jmp =>jmptarget(jc, jc->pc + ofs)
- }
- |4:
- | add PTR, n
- |5:
-}
-
-/* Compile the bytecode to x64.  Walks jc->group->bytecode one instruction at a
- * time and emits the machine code for each opcode.  `method` and `h` track the
- * most recent OP_SETDISPATCH and are used by the instructions that follow it. */
-static void jitbytecode(jitcompiler *jc) {
- upb_pbdecodermethod *method = NULL;
- const upb_handlers *h = NULL;
- for (jc->pc = jc->group->bytecode; jc->pc < jc->group->bytecode_end; ) {
- int32_t instr = *jc->pc;
- opcode op = instr & 0xff; /* Low byte is the opcode... */
- uint32_t arg = instr >> 8; /* ...and the remaining bits are its argument. */
- int32_t longofs = arg;
-
- if (op != OP_SETDISPATCH) {
- /* Skipped for SETDISPATCH because it defines its own asmlabel for the
- * dispatch code it emits. */
- asmlabel(jc, "0x%lx.%s", pcofs(jc), upb_pbdecoder_getopname(op));
-
- /* Skipped for SETDISPATCH because it should point at the function
- * prologue, not the dispatch function that is emitted first.
- * TODO: optimize this to only define pclabels that are actually used. */
- |=>define_jmptarget(jc, jc->pc):
- }
-
- jc->pc++;
-
- switch (op) {
- case OP_STARTMSG: {
- upb_func *startmsg = gethandler(h, UPB_STARTMSG_SELECTOR);
- if (startmsg) {
- /* bool startmsg(void *closure, const void *hd) */
- |1:
- | mov ARG1_64, CLOSURE
- | load_handler_data h, UPB_STARTMSG_SELECTOR
- | callp startmsg
- if (!alwaysok(h, UPB_STARTMSG_SELECTOR)) {
- | test al, al
- | jnz >2
- | call ->suspend
- | jmp <1
- |2:
- }
- } else {
- | nop
- }
- break;
- }
- case OP_ENDMSG: {
- upb_func *endmsg = gethandler(h, UPB_ENDMSG_SELECTOR);
- |9:
- if (endmsg) {
- /* bool endmsg(void *closure, const void *hd, upb_status *status) */
- | mov ARG1_64, CLOSURE
- | load_handler_data h, UPB_ENDMSG_SELECTOR
- | mov ARG3_64, DECODER->status
- | callp endmsg
- }
- break;
- }
- case OP_SETDISPATCH: {
- uint32_t *op_pc = jc->pc - 1;
- const char *msgname;
- upb_inttable *dispatch;
-
- /* Load info for new method. */
- memcpy(&dispatch, jc->pc, sizeof(void*));
- jc->pc += sizeof(void*) / sizeof(uint32_t);
- /* The OP_SETDISPATCH bytecode contains a pointer that is
- * &method->dispatch; we want to go backwards and recover method. */
- method =
- (void*)((char*)dispatch - offsetof(upb_pbdecodermethod, dispatch));
- /* May be NULL, in which case no handlers for this message will be found.
- * OPT: we should do better by completely skipping the message in this
- * case instead of parsing it field by field. We should also do the skip
- * in the containing message's code. */
- h = method->dest_handlers_;
- msgname = upb_msgdef_fullname(upb_handlers_msgdef(h)); /* NOTE(review): h may be NULL per the comment above -- confirm upb_handlers_msgdef() tolerates that. */
-
- /* Emit dispatch code for new method. */
- asmlabel(jc, "0x%lx.dispatch.%s", pcofs(jc), msgname);
- jitdispatch(jc, method);
-
- /* Emit function prologue for new method. */
- asmlabel(jc, "0x%lx.parse.%s", pcofs(jc), msgname);
- |=>define_jmptarget(jc, op_pc):
- |=>define_jmptarget(jc, method):
- | sub rsp, 8 // Undone by "add rsp, 8" in OP_RET.
-
- break;
- }
- case OP_PARSE_DOUBLE:
- case OP_PARSE_FLOAT:
- case OP_PARSE_INT64:
- case OP_PARSE_UINT64:
- case OP_PARSE_INT32:
- case OP_PARSE_FIXED64:
- case OP_PARSE_FIXED32:
- case OP_PARSE_BOOL:
- case OP_PARSE_UINT32:
- case OP_PARSE_SFIXED32:
- case OP_PARSE_SFIXED64:
- case OP_PARSE_SINT32:
- case OP_PARSE_SINT64:
- jitprimitive(jc, op, h, arg);
- break;
- case OP_STARTSEQ:
- case OP_STARTSUBMSG:
- case OP_STARTSTR: {
- upb_func *start = gethandler(h, arg);
- if (start) {
- /* void *startseq(void *closure, const void *hd)
- * void *startsubmsg(void *closure, const void *hd)
- * void *startstr(void *closure, const void *hd, size_t size_hint) */
- |1:
- | mov ARG1_64, CLOSURE
- | load_handler_data h, arg
- if (op == OP_STARTSTR) {
- | mov ARG3_64, DELIMEND
- | sub ARG3_64, PTR // size_hint = bytes from PTR to DELIMEND.
- }
- | callp start
- if (!alwaysok(h, arg)) {
- | test rax, rax
- | jnz >2
- | call ->suspend
- | jmp <1
- |2:
- }
- | mov CLOSURE, rax // The handler's return value becomes the new closure.
- } else {
- /* TODO: nop is only required because of asmlabel(). */
- | nop
- }
- break;
- }
- case OP_ENDSEQ:
- case OP_ENDSUBMSG:
- case OP_ENDSTR: {
- upb_func *end = gethandler(h, arg);
- if (end) {
- /* bool endseq(void *closure, const void *hd)
- * bool endsubmsg(void *closure, const void *hd)
- * bool endstr(void *closure, const void *hd) */
- |1:
- | mov ARG1_64, CLOSURE
- | load_handler_data h, arg
- | callp end
- if (!alwaysok(h, arg)) {
- | test al, al
- | jnz >2
- | call ->suspend
- | jmp <1
- |2:
- }
- } else {
- /* TODO: nop is only required because of asmlabel(). */
- | nop
- }
- break;
- }
- case OP_STRING: {
- upb_func *str = gethandler(h, arg);
- | cmp PTR, DELIMEND
- | je >4
- |1:
- | cmp PTR, DATAEND
- | jne >2
- | call ->suspend
- | jmp <1
- |2:
- if (str) {
- /* size_t str(void *closure, const void *hd, const char *str,
- * size_t n) */
- | mov ARG1_64, CLOSURE
- | load_handler_data h, arg
- | mov ARG3_64, PTR
- | mov ARG4_64, DATAEND
- | sub ARG4_64, PTR // n = bytes from PTR to DATAEND.
- | mov ARG5_64, qword DECODER->handle
- | callp str
- | add PTR, rax // Advance by the handler's return value.
- if (!alwaysok(h, arg)) {
- | cmp PTR, DATAEND
- | je >3
- | call ->strret_fallback // NOTE(review): no ->strret_fallback label is defined in the visible file -- confirm where it comes from.
- |3:
- }
- } else {
- | mov PTR, DATAEND
- }
- | cmp PTR, DELIMEND
- | jne <1
- |4:
- break;
- }
- case OP_PUSHTAGDELIM:
- | mov FRAME->sink.closure, CLOSURE
- | // This shouldn't need to be read, because tag-delimited fields
- | // shouldn't have an OP_SETDELIM after them. But for the moment
- | // non-packed repeated fields do OP_SETDELIM so they can share more
- | // code with the packed code-path. If this is changed later, this
- | // store can be removed.
- | mov qword FRAME->end_ofs, 0
- | cmp FRAME, DECODER->limit
- | je ->err // NOTE(review): no ->err label is defined in the visible file -- confirm where it comes from.
- | add FRAME, sizeof(upb_pbdecoder_frame)
- | mov dword FRAME->groupnum, arg
- break;
- case OP_PUSHLENDELIM:
- | call ->pushlendelim
- break;
- case OP_POP:
- | sub FRAME, sizeof(upb_pbdecoder_frame)
- | mov CLOSURE, FRAME->sink.closure
- break;
- case OP_SETDELIM:
- /* OPT: experiment with testing vs old offset to optimize away. */
- | mov DATAEND, DECODER->end
- | add DELIMEND, FRAME->end_ofs
- | cmp DELIMEND, DECODER->buf
- | jb >1
- | cmp DELIMEND, DATAEND
- | ja >1 // OPT: try cmov.
- | mov DATAEND, DELIMEND
- |1:
- break;
- case OP_SETBIGGROUPNUM:
- | mov dword FRAME->groupnum, *jc->pc++
- break;
- case OP_CHECKDELIM:
- | cmp DELIMEND, PTR
- | je =>jmptarget(jc, jc->pc + longofs)
- break;
- case OP_CALL:
- | call =>jmptarget(jc, jc->pc + longofs)
- break;
- case OP_BRANCH:
- | jmp =>jmptarget(jc, jc->pc + longofs);
- break;
- case OP_RET:
- |9:
- | add rsp, 8
- | ret
- break;
- case OP_TAG1:
- jittag(jc, (arg >> 8) & 0xff, 1, (int8_t)arg, method);
- break;
- case OP_TAG2:
- jittag(jc, (arg >> 8) & 0xffff, 2, (int8_t)arg, method);
- break;
- case OP_TAGN: {
- uint64_t tag;
- memcpy(&tag, jc->pc, 8); /* jittag() advances jc->pc past these words itself. */
- jittag(jc, tag, arg >> 8, (int8_t)arg, method);
- break;
- }
- case OP_DISPATCH:
- | call =>jmptarget(jc, &method->dispatch)
- break;
- case OP_HALT:
- UPB_ASSERT(false);
- }
- }
-
- asmlabel(jc, "eof");
- | nop
-}
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback