|//
|// upb - a minimalist implementation of protocol buffers.
|//
|// Copyright (c) 2011-2013 Google Inc. See LICENSE for details.
|// Author: Josh Haberman
|//
|// JIT compiler for upb_pbdecoder on x86-64. Generates machine code from the
|// bytecode generated in compile_decoder.c.
|
|.arch x64
|.actionlist upb_jit_actionlist
|.globals UPB_JIT_GLOBAL_
|.globalnames upb_jit_globalnames
|
|// Calling conventions. Note -- this will need to be changed for
|// Windows, which uses a different calling convention!
|.define ARG1_64, rdi
|.define ARG2_8, r6b // DynASM's equivalent to "sil" -- low byte of esi.
|.define ARG2_32, esi
|.define ARG2_64, rsi
|.define ARG3_8, dl
|.define ARG3_32, edx
|.define ARG3_64, rdx
|.define ARG4_64, rcx
|.define ARG5_64, r8
|.define XMMARG1, xmm0
|
|// Register allocation / type map.
|// ALL of the code in this file uses these register allocations.
|// When we "call" within this file, we do not use regular calling
|// conventions, but of course when calling to user callbacks we must.
|.define PTR, rbx // DECODER->ptr (unsynced)
|.define DATAEND, r12 // DECODER->data_end (unsynced)
|.define CLOSURE, r13 // FRAME->closure (unsynced)
|.type FRAME, upb_pbdecoder_frame, r14 // DECODER->top (unsynced)
|.type DECODER, upb_pbdecoder, r15 // DECODER (immutable)
|.define DELIMEND, rbp
|
| // Spills unsynced registers back to memory.
/* Note: after storing DECODER->delim_end this macro rewrites the DELIMEND
 * register itself into a stream offset (DELIMEND - DECODER->buf +
 * DECODER->bufstart_ofs) in order to store FRAME->end_ofs, so DELIMEND no
 * longer holds a buffer pointer once the macro has run. */
|.macro commit_regs
|  mov DECODER->top, FRAME
|  mov DECODER->ptr, PTR
|  mov DECODER->data_end, DATAEND
|  // We don't guarantee that delim_end is NULL when out of range like the
|  // interpreter does.
|  mov DECODER->delim_end, DELIMEND
|  sub DELIMEND, DECODER->buf
|  add DELIMEND, DECODER->bufstart_ofs
|  mov FRAME->end_ofs, DELIMEND
|  mov FRAME->sink.closure, CLOSURE
|.endmacro
|
| // Loads unsynced registers from memory back into registers.
|.macro load_regs
|  mov FRAME, DECODER->top
|  mov PTR, DECODER->ptr
|  mov DATAEND, DECODER->data_end
|  mov CLOSURE, FRAME->sink.closure
|  mov DELIMEND, FRAME->end_ofs
|  sub DELIMEND, DECODER->bufstart_ofs
|  add DELIMEND, DECODER->buf
|.endmacro
|
| // Calls an external C function at address "addr".
/* Clobbers rax (holds the call target).  r12 is pushed, used to remember the
 * pre-alignment rsp across the call, and popped again afterwards. */
|.macro callp, addr
|  mov64 rax, (uintptr_t)addr
|
|  // Stack must be 16-byte aligned (x86-64 ABI requires this).
|  //
|  // OPT: possibly remove this by statically ensuring correct alignment.
|  //
|  // OPT: use "call rel32" where possible.
|  push r12
|  mov r12, rsp
|  and rsp, 0xfffffffffffffff0UL // Align stack.
|  call rax
|  mov rsp, r12
|  pop r12
|.endmacro
|
/* Loads the constant "val" into ARG2_64, emitting mov64, a 32-bit mov, or an
 * xor depending on whether the value exceeds 32 bits, is nonzero, or is 0. */
|.macro ld64, val
|| {
||   uintptr_t v = (uintptr_t)val;
||   if (v > 0xffffffff) {
|      mov64 ARG2_64, v
||   } else if (v) {
|      mov ARG2_32, v
||   } else {
|      xor ARG2_32, ARG2_32
||   }
|| }
|.endmacro
|
/* Loads the handler data registered for selector "arg" on handlers "h" into
 * the second argument register (via ld64). */
|.macro load_handler_data, h, arg
|  ld64 upb_handlers_gethandlerdata(h, arg)
|.endmacro
|
/* Jumps to "target" when fewer than "bytes" bytes remain between PTR and
 * DATAEND.  Clobbers rcx unless bytes == 1. */
|.macro chkeob, bytes, target
|| if (bytes == 1) {
|    cmp PTR, DATAEND
|    je target
|| } else {
|    mov rcx, DATAEND
|    sub rcx, PTR
|    cmp rcx, bytes
|    jb target
|| }
|.endmacro
|
/* Inverse of chkeob: jumps to "target" when at least "bytes" bytes remain
 * between PTR and DATAEND.  Clobbers rcx unless bytes == 1. */
|.macro chkneob, bytes, target
|| if (bytes == 1) {
|    cmp PTR, DATAEND
|    jne target
|| } else {
|    mov rcx, DATAEND
|    sub rcx, PTR
|    cmp rcx, bytes
|    jae target
|| }
|.endmacro

/* When hasbit >= 0, sets bit number "hasbit" in the byte array that starts at
 * the address held in "reg".  Emits nothing for a negative hasbit. */
|.macro sethas, reg, hasbit
|| if (hasbit >= 0) {
|    or byte [reg + ((uint32_t)hasbit / 8)], (1 << ((uint32_t)hasbit % 8))
|| }
|.endmacro
|
| // Decodes 32-bit varint into rdx, inlining 1 byte.
|.macro dv32
|  chkeob 1, >7
|  movzx edx, byte [PTR]
|  test dl, dl
|  jns >8
|7:
|  call ->decodev32_fallback
|8:
|  add PTR, 1
|.endmacro

#define DECODE_EOF -3

/* Returns the handler registered for selector "sel" on "h", or NULL when "h"
 * itself is NULL. */
static upb_func *gethandler(const upb_handlers *h, upb_selector_t sel) {
  return h ? upb_handlers_gethandler(h, sel) : NULL;
}

/* Defines an "assembly label" for the current code generation offset.
 * This label exists *purely* for debugging purposes: it is emitted into
 * the .so, and printed as part of JIT debugging output when UPB_JIT_LOAD_SO is
 * defined.
 *
 * We would define this in the .c file except that it conditionally defines a
 * pclabel.
 *
 * In builds without NDEBUG this also asserts that no two labels are defined
 * at the same code offset (tracked in jc->lastlabelofs).  "fmt" and the
 * variadic arguments are printf-style and are only consumed when
 * UPB_JIT_LOAD_SO is defined; the formatted string is then stored in
 * jc->asmlabels keyed by the newly allocated pclabel. */
static void asmlabel(jitcompiler *jc, const char *fmt, ...) {
#ifndef NDEBUG
  int ofs = jc->dynasm->section->ofs;
  UPB_ASSERT(ofs != jc->lastlabelofs);
  jc->lastlabelofs = ofs;
#endif

#ifndef UPB_JIT_LOAD_SO
  UPB_UNUSED(jc);
  UPB_UNUSED(fmt);
#else
  va_list args;
  va_start(args, fmt);
  char *str = upb_vasprintf(fmt, args);
  va_end(args);

  int pclabel = alloc_pclabel(jc);
  /* Normally we would prefer to allocate this inline with the codegen,
   * ie.
   *   |=>asmlabel(...)
   * But since we do this conditionally, only when UPB_JIT_LOAD_SO is defined,
   * we do it here instead. */
  |=>pclabel:
  upb_inttable_insert(&jc->asmlabels, pclabel, upb_value_ptr(str));
#endif
}

/* Should only be called when the associated handler is known to exist.
 *
 * Returns the "always ok" attribute of the handler registered for "sel" on
 * "h".  Callers in this file use a true result to omit the code that checks
 * the handler's return value. */
static bool alwaysok(const upb_handlers *h, upb_selector_t sel) {
  upb_handlerattr attr = UPB_HANDLERATTR_INITIALIZER;
  bool ok = upb_handlers_getattr(h, sel, &attr);
  bool ret;

  UPB_ASSERT(ok);
  ret = upb_handlerattr_alwaysok(&attr);
  upb_handlerattr_uninit(&attr);
  return ret;
}

/* Emit static assembly routines; code that does not vary based on the message
 * schema. Since it's not input-dependent, we only need one single copy of it.
 * For the moment we generate a single copy per generated handlers. Eventually
 * we should generate this code at compile time and link it into the binary so
 * we have one copy total. To do that we'll want to be sure that it is within
 * 2GB of our JIT code, so that branches between the two are near (rel32).
*
 * We'd put this assembly in a .s file directly, but DynASM's ability to
 * calculate structure offsets automatically is too useful to pass up (it's way
 * more convenient to write DECODER->sink than [rbx + 0x96], especially since
 * the latter would have to be changed whenever the structure is updated). */
static void emit_static_asm(jitcompiler *jc) {
  | // Trampolines for entering/exiting the JIT. These are a bit tricky to
  | // support full resuming; when we suspend we copy the JIT's portion of
  | // the call stack into the upb_pbdecoder and restore it when we resume.
  asmlabel(jc, "enterjit");
  |->enterjit:
  |1:
  |  push rbp
  |  push r15
  |  push r14
  |  push r13
  |  push r12
  |  push rbx
  |
  |  mov rbx, ARG2_64 // Preserve JIT method.
  |
  |  mov DECODER, rdi
  |  callp upb_pbdecoder_resume // Same args as us; reuse regs.
  |  test eax, eax
  |  jns >1
  |  mov DECODER->saved_rsp, rsp
  |  mov rax, rbx
  |  load_regs
  |
  |  // Test whether we have a saved stack to resume.
  |  mov ARG3_64, DECODER->call_len
  |  test ARG3_64, ARG3_64
  |  jnz >2
  |
  |  call rax
  |
  |  mov rax, DECODER->size_param
  |  mov qword DECODER->call_len, 0
  |1:
  |  pop rbx
  |  pop r12
  |  pop r13
  |  pop r14
  |  pop r15
  |  pop rbp
  |  ret
  |
  |2:
  |  // Resume decoder.
  |  mov ARG2_64, DECODER->callstack
  |  sub rsp, ARG3_64
  |  mov ARG1_64, rsp
  |  callp memcpy // Restore stack.
  |  ret // Return to resumed function (not ->enterjit caller).
  |
  |  // Other code can call this to suspend the JIT.
  |  // To the calling code, it will appear that the function returns when
  |  // the JIT resumes, and more buffer space will be available.
  |  // Args: eax=the value that decode() should return.
  asmlabel(jc, "exitjit");
  |->exitjit:
  |  // Save the stack into DECODER->callstack.
  |  mov ARG1_64, DECODER->callstack
  |  mov ARG2_64, rsp
  |  mov ARG3_64, DECODER->saved_rsp
  |  sub ARG3_64, rsp
  |  mov DECODER->call_len, ARG3_64 // Preserve len for next resume.
  |  mov ebx, eax // Preserve return value across memcpy.
  |  callp memcpy // Copy stack into decoder.
  |  mov eax, ebx // This will be our return value.
  |
  |  // Must NOT do this before the memcpy(), otherwise memcpy() will
  |  // clobber the stack we are trying to save!
  |  mov rsp, DECODER->saved_rsp
  |  pop rbx
  |  pop r12
  |  pop r13
  |  pop r14
  |  pop r15
  |  pop rbp
  |  ret
  |
  |  // Like suspend() in the C decoder, except that the function appears
  |  // (from the caller's perspective) not to return until the decoder is
  |  // resumed.
  asmlabel(jc, "suspend");
  |->suspend:
  |  cmp DECODER->ptr, PTR
  |  je >1
  |  mov DECODER->checkpoint, PTR
  |1:
  |  commit_regs
  |  mov rdi, DECODER
  |  callp upb_pbdecoder_suspend
  |  jmp ->exitjit
  |
  asmlabel(jc, "pushlendelim");
  |->pushlendelim:
  |1:
  |  mov FRAME->sink.closure, CLOSURE
  |  mov DECODER->checkpoint, PTR
  |  dv32
  |  mov rcx, DELIMEND
  |  sub rcx, PTR
  |  sub rcx, rdx
  |  jb >4 // Len is greater than enclosing message.
  |  mov FRAME->end_ofs, rcx
  |  cmp FRAME, DECODER->limit
  |  je >3 // Stack overflow
  |  add FRAME, sizeof(upb_pbdecoder_frame)
  |  mov DELIMEND, PTR
  |  add DELIMEND, rdx
  |  mov dword FRAME->groupnum, 0
  |  test rcx, rcx
  |  jz >2
  |  mov DATAEND, DECODER->end
  |  cmp PTR, DELIMEND
  |  ja >2
  |  cmp DELIMEND, DATAEND
  |  ja >2
  |  mov DATAEND, DELIMEND // If DELIMEND >= PTR && DELIMEND < DATAEND
  |2:
  |  ret
  |3:
  |  // Stack overflow error.
  |  mov PTR, DECODER->checkpoint // Rollback to before the delim len.
  |  // Prepare seterr args.
  |  mov ARG1_64, DECODER
  |  ld64 kPbDecoderStackOverflow
  |  callp upb_pbdecoder_seterr
  |  call ->suspend
  |  jmp <1
  |4:
  |  // Overextended len.
  |  mov PTR, DECODER->checkpoint // Rollback to before the delim len.
  |  // Prepare seterr args.
  |  mov ARG1_64, DECODER
  |  ld64 kPbDecoderSubmessageTooLong
  |  callp upb_pbdecoder_seterr
  |  call ->suspend
  |  jmp <1
  |
  |  // For getting a value that spans a buffer seam. Falls back to C.
  |.macro getvalue_slow, func, bytes
  |  sub rsp, 8 // Need stack space for func to write value to.
  |1:
  |  mov qword [rsp], 0 // For parsing routines that only parse 32 bits.
  |  mov ARG1_64, DECODER
  |  mov ARG2_64, rsp
  |  mov DECODER->checkpoint, PTR
  |  commit_regs
  |  callp func
  |  load_regs
  |  test eax, eax
  |  jns >2
  |  // Success; return parsed data (in rdx AND xmm0).
  |  mov rdx, [rsp]
  |  movsd xmm0, qword [rsp]
  |  add rsp, 8
  |  sub PTR, bytes // Bias our buffer pointer to rejoin the fast-path.
  |  mov DECODER->ptr, PTR
  |  ret
  |2:
  |  call ->exitjit // Return eax from decode function.
  |  jmp <1
  |.endmacro
  |
  asmlabel(jc, "parse_unknown");
  |  // Args: edx=fieldnum, cl=wire type
  |->parse_unknown:
  |  // OPT: handle directly instead of kicking to C.
  |  // Check for ENDGROUP.
  |  mov ARG1_64, DECODER
  |  mov ARG2_32, edx
  |  movzx ARG3_32, cl
  |  commit_regs
  |  callp upb_pbdecoder_skipunknown
  |  load_regs
  |  cmp eax, DECODE_ENDGROUP
  |  jne >1
  |  ret // Return eax=DECODE_ENDGROUP, not zero
  |1:
  |  cmp eax, DECODE_OK
  |  je >1
  |  call ->exitjit // Return eax from decode function.
  |1:
  |  xor eax, eax
  |  ret
  |
  |  // Fallback functions for parsing single values. These are used when the
  |  // buffer doesn't contain enough remaining data for the fast path. Each
  |  // primitive type (v32, v64, f32, f64) has two functions: decode & skip.
  |  // Decode functions return their value in rdx/edx (getvalue_slow also
  |  // leaves a copy of the value in xmm0).
  |  //
  |  // These functions leave PTR = value_end - fast_path_bytes, so that we can
  |  // re-join the fast path which will add fast_path_bytes after the callback
  |  // completes. We also set DECODER->ptr to this value which is a signal to
  |  // ->suspend that DECODER->checkpoint is up to date.
  asmlabel(jc, "skip_decode_f32_fallback");
  |->skipf32_fallback:
  |->decodef32_fallback:
  |  getvalue_slow upb_pbdecoder_decode_f32, 4
  |
  asmlabel(jc, "skip_decode_f64_fallback");
  |->skipf64_fallback:
  |->decodef64_fallback:
  |  getvalue_slow upb_pbdecoder_decode_f64, 8
  |
  |  // Called for varint >= 1 byte.
  asmlabel(jc, "skip_decode_v32_fallback");
  |->skipv32_fallback:
  |->skipv64_fallback:
  |  chkeob 16, >1
  |  // With at least 16 bytes left, we can do a branch-less SSE version.
  |  movdqu xmm0, [PTR]
  |  pmovmskb eax, xmm0 // bits 0-15 are continuation bits, 16-31 are 0.
  |  not eax
  |  bsf eax, eax
  |  cmp al, 10
  |  jae ->decode_varint_slow // Error (>10 byte varint).
  |  add PTR, rax // bsf result is 0-based, so PTR=end-1, as desired.
  |  ret
  |
  |1:
  |  // With fewer than 16 bytes, we have to read byte by byte.
  |  lea rcx, [PTR + 10]
  |  mov rax, PTR // Preserve PTR in case of fallback to slow path.
  |  cmp rcx, DATAEND
  |  cmova rcx, DATAEND // rcx = MIN(DATAEND, PTR + 10)
  |2:
  |  cmp rax, rcx
  |  je ->decode_varint_slow
  |  test byte [rax], 0x80
  |  jz >3
  |  add rax, 1
  |  jmp <2
  |3:
  |  mov PTR, rax // PTR = varint_end - 1, as desired
  |  ret
  |
  |  // Returns tag in edx
  asmlabel(jc, "decode_unknown_tag_fallback");
  |->decode_unknown_tag_fallback:
  |  sub rsp, 16
  |1:
  |  cmp PTR, DELIMEND
  |  jne >2
  |  add rsp, 16
  |  xor eax, eax
  |  ret
  |2:
  |  // OPT: Have a medium-fast path before falling back to _slow.
  |  mov ARG1_64, DECODER
  |  mov ARG2_64, rsp
  |  commit_regs
  |  callp upb_pbdecoder_decode_varint_slow
  |  load_regs
  |  cmp eax, 0
  |  jge >3
  |  mov edx, [rsp] // Success; return parsed data.
  |  add rsp, 16
  |  ret
  |3:
  |  call ->exitjit // Return eax from decode function.
  |  jmp <1
  |
  |  // Called for varint >= 1 byte.
  asmlabel(jc, "decode_v32_v64_fallback");
  |->decodev32_fallback:
  |->decodev64_fallback:
  |  chkeob 10, ->decode_varint_slow
  |  // OPT: do something faster than just calling the C version.
  |  mov rdi, PTR
  |  callp upb_vdecode_fast
  |  test rax, rax
  |  je ->decode_varint_slow // Unterminated varint.
  |  mov PTR, rax
  |  sub PTR, 1
  |  mov DECODER->ptr, PTR
  |  ret
  |
  asmlabel(jc, "decode_varint_slow");
  |->decode_varint_slow:
  |  // Slow path: end of buffer or error (varint length >= 10).
  |  getvalue_slow upb_pbdecoder_decode_varint_slow, 1
  |
  |  // Args: rsi=expected tag, return=rax (DECODE_{OK,MISMATCH})
  asmlabel(jc, "checktag_fallback");
  |->checktag_fallback:
  |  sub rsp, 8
  |  mov [rsp], rsi // Preserve expected tag.
  |1:
  |  mov ARG1_64, DECODER
  |  commit_regs
  |  mov DECODER->checkpoint, PTR
  |  callp upb_pbdecoder_checktag_slow
  |  load_regs
  |  cmp eax, 0
  |  jge >2
  |  add rsp, 8
  |  ret
  |2:
  |  call ->exitjit
  |  mov rsi, [rsp]
  |  cmp PTR, DELIMEND
  |  jne <1
  |  mov eax, DECODE_EOF
  |  add rsp, 8
  |  ret
  |
  |  // Args: rsi=upb_inttable, rdx=key, return=rax (-1 if not found).
  |  // Preserves: rcx, rdx
  |  // OPT: Could write this in assembly if it's a hotspot.
  asmlabel(jc, "hashlookup");
  |->hashlookup:
  |  push rcx
  |  push rdx
  |  sub rsp, 16
  |  mov rdi, rsi
  |  mov rsi, rdx
  |  mov rdx, rsp
  |  callp upb_inttable_lookup
  |  add rsp, 16
  |  pop rdx
  |  pop rcx
  |  test al, al
  |  jz >2 // Unknown field.
  |  mov rax, [rsp-32] // Value from table.
  |  ret
  |2:
  |  xor rax, rax
  |  not rax
  |  ret
}

/* Emits the code for one primitive-value opcode "op".
 *
 * When a handler is registered for selector "sel" on "h", the emitted code
 * decodes the value (inline when "fastbytes" bytes are available, otherwise
 * through the matching ->decode*_fallback routine), applies zig-zag or bool
 * normalization for the opcodes that need it, and then either stores the
 * value directly at CLOSURE + offset (when upb_msg_getscalarhandlerdata()
 * succeeds) or calls the handler.  When no handler is registered the emitted
 * code only skips over the value. */
static void jitprimitive(jitcompiler *jc, opcode op,
                         const upb_handlers *h, upb_selector_t sel) {
  typedef enum { V32, V64, F32, F64, X } valtype_t;
  /* Indexed by opcode; X marks opcodes that are not primitive values. */
  static valtype_t types[] = {
    X, F64, F32, V64, V64, V32, F64, F32, V64, X, X, X, X, V32, V32, F32, F64,
    V32, V64 };
  /* Indexed by valtype_t (V32, V64, F32, F64).
   * NOTE(review): there is no entry for X, so vtype == X would read past the
   * end of this table; presumably this function is only reached for
   * OP_PARSE_* opcodes, which map to a non-X type -- confirm. */
  static char fastpath_bytes[] = { 1, 1, 4, 8 };
  const valtype_t vtype = types[op];
  const int fastbytes = fastpath_bytes[vtype];
  upb_func *handler = gethandler(h, sel);
  upb_fieldtype_t ftype;
  size_t offset;
  int32_t hasbit;

  if (handler) {
    |1:
    |  chkneob fastbytes, >3
    |2:
    switch (vtype) {
      case V32:
        |  call ->decodev32_fallback
        break;
      case V64:
        |  call ->decodev64_fallback
        break;
      case F32:
        |  call ->decodef32_fallback
        break;
      case F64:
        |  call ->decodef64_fallback
        break;
      case X: break;
    }
    |  jmp >4

    /* Fast path decode; for when check_bytes bytes are available. */
    |3:
    switch (op) {
      case OP_PARSE_SFIXED32:
      case OP_PARSE_FIXED32:
        |  mov edx, dword [PTR]
        break;
      case OP_PARSE_SFIXED64:
      case OP_PARSE_FIXED64:
        |  mov rdx, qword [PTR]
        break;
      case OP_PARSE_FLOAT:
        |  movss xmm0, dword [PTR]
        break;
      case OP_PARSE_DOUBLE:
        |  movsd xmm0, qword [PTR]
        break;
      default:
        /* Inline one byte of varint decoding. */
        |  movzx edx, byte [PTR]
        |  test dl, dl
        |  js <2 // Fallback to slow path for >1 byte varint.
        break;
    }

    /* Second-stage decode; used for both fast and slow paths */
    /* (only needed for a few types). */
    |4:
    switch (op) {
      case OP_PARSE_SINT32:
        /* 32-bit zig-zag decode. */
        |  mov eax, edx
        |  shr edx, 1
        |  and eax, 1
        |  neg eax
        |  xor edx, eax
        break;
      case OP_PARSE_SINT64:
        /* 64-bit zig-zag decode. */
        |  mov rax, rdx
        |  shr rdx, 1
        |  and rax, 1
        |  neg rax
        |  xor rdx, rax
        break;
      case OP_PARSE_BOOL:
        |  test rdx, rdx
        |  setne dl
        break;
      default: break;
    }

    /* Call callback (or specialize if we can). */
    if (upb_msg_getscalarhandlerdata(h, sel, &ftype, &offset, &hasbit)) {
      switch (ftype) {
        case UPB_TYPE_INT64:
        case UPB_TYPE_UINT64:
          |  mov [CLOSURE + offset], rdx
          break;
        case UPB_TYPE_INT32:
        case UPB_TYPE_UINT32:
        case UPB_TYPE_ENUM:
          |  mov [CLOSURE + offset], edx
          break;
        case UPB_TYPE_DOUBLE:
          |  movsd qword [CLOSURE + offset], XMMARG1
          break;
        case UPB_TYPE_FLOAT:
          |  movss dword [CLOSURE + offset], XMMARG1
          break;
        case UPB_TYPE_BOOL:
          |  mov [CLOSURE + offset], dl
          break;
        case UPB_TYPE_STRING:
        case UPB_TYPE_BYTES:
        case UPB_TYPE_MESSAGE:
          UPB_ASSERT(false); break;
      }
      |  sethas CLOSURE, hasbit
    } else if (handler) {
      |  mov ARG1_64, CLOSURE
      |  load_handler_data h, sel
      |  callp handler
      if (!alwaysok(h, sel)) {
        /* Handler may fail: on a false return, suspend and then retry the
         * whole value from label 1. */
        |  test al, al
        |  jnz >5
        |  call ->suspend
        |  jmp <1
        |5:
      }
    }

    /* We do this last so that the checkpoint is not advanced past the user's
     * data until the callback has returned success. */
    |  add PTR, fastbytes
  } else {
    /* No handler registered for this value, just skip it. */
    |  chkneob fastbytes, >3
    |2:
    switch (vtype) {
      case V32:
        |  call ->skipv32_fallback
        break;
      case V64:
        |  call ->skipv64_fallback
        break;
      case F32:
        |  call ->skipf32_fallback
        break;
      case F64:
        |  call ->skipf64_fallback
        break;
      case X: break;
    }

    /* Fast-path skip. */
    |3:
    if (vtype == V32 || vtype == V64) {
      |  test byte [PTR], 0x80
      |  jnz <2
    }
    |  add PTR, fastbytes
  }
}

/* Emits the dispatch routine for "method" and defines the jump target
 * &method->dispatch at its entry.
 *
 * The emitted routine decodes a field tag, splits it into field number (edx)
 * and wire type (cl), and looks the field number up in method->dispatch.  On
 * a match it returns the code address for the field in rax.  It returns with
 * rax == 0 when the tag fallback reports that DELIMEND was hit.  Fields that
 * are not found are passed to ->parse_unknown; the routine then either loops
 * back to decode the next tag or, on ENDGROUP, returns the address of local
 * label 9. */
static void jitdispatch(jitcompiler *jc,
                        const upb_pbdecodermethod *method) {
  /* Lots of room for tweaking/optimization here. */

  const upb_inttable *dispatch = &method->dispatch;
  bool has_hash_entries = (dispatch->t.count > 0);

  /* Whether any of the fields for this message can have two wire types which
   * are both valid (packed & non-packed).
   *
   * OPT: populate this more precisely; not all messages with hash entries have
   * this characteristic. */
  bool has_multi_wiretype = has_hash_entries;

  |=>define_jmptarget(jc, &method->dispatch):
  |1:
  /* Decode the field tag. */
  |  mov aword DECODER->checkpoint, PTR
  |  chkeob 2, >6
  |  movzx edx, byte [PTR]
  |  test dl, dl
  |  jns >7 // Jump if first byte has no continuation bit.
  |  movzx ecx, byte [PTR + 1]
  |  test cl, cl
  |  js >6 // Jump if second byte has continuation bit.
  |  // Confirmed two-byte varint.
  |  shl ecx, 7
  |  and edx, 0x7f
  |  or edx, ecx
  |  add PTR, 2
  |  jmp >8
  |6:
  |  call ->decode_unknown_tag_fallback
  |  test eax, eax // Hit DELIMEND?
  |  jnz >8
  |  ret
  |7:
  |  add PTR, 1
  |8:
  |  mov ecx, edx
  |  shr edx, 3
  |  and cl, 7

  /* See comment attached to upb_pbdecodermethod.dispatch for layout of the
   * dispatch table. */
  |2:
  |  cmp edx, dispatch->array_size
  if (has_hash_entries) {
    |  jae >7
  } else {
    |  jae >5
  }
  |  // OPT: Compact the lookup arr into 32-bit entries.
  if ((uintptr_t)dispatch->array > 0x7fffffff) {
    |  mov64 rax, (uintptr_t)dispatch->array
    |  mov rax, qword [rax + rdx * 8]
  } else {
    |  mov rax, qword [rdx * 8 + dispatch->array]
  }
  |3:
  |  // We take advantage of the fact that non-present entries are stored
  |  // as -1, which will result in wire types that will never match.
  |  cmp al, cl
  if (has_multi_wiretype) {
    |  jne >6
  } else {
    |  jne >5
  }
  |  shr rax, 16
  |
  |  // Load the machine code address from the table entry.
  |  // The table entry is relative to the dispatch->array jmptarget
  |  // (patchdispatch() took care of this) which is the same as
  |  // local label "4". The "lea" is really just trying to do
  |  // lea rax, [>4 + rax]
  |  //
  |  // But we can't write that directly for some reason, so we use
  |  // rdx as a temporary.
  |  lea rdx, [>4]
  |=>define_jmptarget(jc, dispatch->array):
  |4:
  |  add rax, rdx
  |  ret
  |
  |5:
  |  // Field isn't in our table.
  |
  |  // For pushing unknown fields to the unknown field handler.
  |  mov64 rax, (uintptr_t)method->dest_handlers_
  |  mov FRAME->sink.handlers, rax
  |
  |  call ->parse_unknown
  |  test eax, eax // ENDGROUP?
  |  jz <1
  |  lea rax, [>9] // ENDGROUP; Load address of OP_ENDMSG.
  |  ret

  if (has_multi_wiretype) {
    |6:
    |  // Primary wire type didn't match, check secondary wire type.
    |  cmp ah, cl
    |  jne <5
    |  // Secondary wire type is a match, look up fn + UPB_MAX_FIELDNUMBER.
    |  add rdx, UPB_MAX_FIELDNUMBER
    |  // This key will never be in the array part, so do a hash lookup.
    UPB_ASSERT(has_hash_entries);
    |  ld64 dispatch
    |  jmp ->hashlookup // Tail call.
  }

  if (has_hash_entries) {
    |7:
    |  // Hash table lookup.
    |  ld64 dispatch
    |  call ->hashlookup
    |  jmp <3
  }
}

/* Emits code that checks whether the next "n" bytes at PTR equal the expected
 * "tag".  On a match PTR is advanced by n.  On a mismatch the emitted code
 * either calls the dispatch routine of "method" (when ofs == 0) or jumps to
 * the bytecode target jc->pc + ofs.  When fewer than n bytes are available
 * the check is delegated to ->checktag_fallback. */
static void jittag(jitcompiler *jc, uint64_t tag, int n, int ofs,
                   const upb_pbdecodermethod *method) {
  /* Internally we parse unknown fields; if this runs us into DELIMEND we jump
   * to the corresponding DELIMEND target (either msg end or repeated field
   * end), which we find from the OP_CHECKDELIM which must have necessarily
   * preceded us. */
  uint32_t last_instruction = *(jc->pc - 2);
  int last_arg = (int32_t)last_instruction >> 8;
  uint32_t *delimend = (jc->pc - 1) + last_arg;
  const size_t ptr_words = sizeof(void*) / sizeof(uint32_t);

  UPB_ASSERT((last_instruction & 0xff) == OP_CHECKDELIM);

  /* OP_TAGN carries its tag in the words following the instruction; step
   * past them. */
  if (getop(*(jc->pc - 1)) == OP_TAGN) {
    jc->pc += ptr_words;
  }

  |  chkneob n, >1
  |  // OPT: this is way too much fallback code to put here.
  |  // Reduce and/or move to a separate section to make better icache usage.
  |  ld64 tag
  |  call ->checktag_fallback
  |  cmp eax, DECODE_MISMATCH
  |  je >3
  |  cmp eax, DECODE_EOF
  |  je =>jmptarget(jc, delimend)
  |  jmp >5
  |1:
  switch (n) {
    case 1:
      |  cmp byte [PTR], tag
      break;
    case 2:
      |  cmp word [PTR], tag
      break;
    case 3:
      |  // OPT: Slightly more efficient code, but depends on an extra byte.
      |  // mov eax, dword [PTR]
      |  // shl eax, 8
      |  // cmp eax, tag << 8
      |  cmp word [PTR], (tag & 0xffff)
      |  jne >2
      |  cmp byte [PTR + 2], (tag >> 16)
      |2:
      break;
    case 4:
      |  cmp dword [PTR], tag
      break;
    case 5:
      |  cmp dword [PTR], (tag & 0xffffffff)
      |  jne >3
      |  cmp byte [PTR + 4], (tag >> 32)
  }
  |  je >4
  |3:
  if (ofs == 0) {
    |  call =>jmptarget(jc, &method->dispatch)
    |  test rax, rax
    |  jz =>jmptarget(jc, delimend)
    |  jmp rax
  } else {
    |  jmp =>jmptarget(jc, jc->pc + ofs)
  }
  |4:
  |  add PTR, n
  |5:
}

/* Compile the bytecode to x64. */
static void jitbytecode(jitcompiler *jc) {
  upb_pbdecodermethod *method = NULL;
  const upb_handlers *h = NULL;
  for (jc->pc = jc->group->bytecode; jc->pc < jc->group->bytecode_end; ) {
    /* Low 8 bits of each instruction word are the opcode; the remaining bits
     * are its argument. */
    int32_t instr = *jc->pc;
    opcode op = instr & 0xff;
    uint32_t arg = instr >> 8;
    int32_t longofs = arg;

    if (op != OP_SETDISPATCH) {
      /* Skipped for SETDISPATCH because it defines its own asmlabel for the
       * dispatch code it emits. */
      asmlabel(jc, "0x%lx.%s", pcofs(jc), upb_pbdecoder_getopname(op));

      /* Skipped for SETDISPATCH because it should point at the function
       * prologue, not the dispatch function that is emitted first.
       * TODO: optimize this to only define pclabels that are actually used. */
      |=>define_jmptarget(jc, jc->pc):
    }

    jc->pc++;

    switch (op) {
      case OP_STARTMSG: {
        upb_func *startmsg = gethandler(h, UPB_STARTMSG_SELECTOR);
        if (startmsg) {
          /* bool startmsg(void *closure, const void *hd) */
          |1:
          |  mov ARG1_64, CLOSURE
          |  load_handler_data h, UPB_STARTMSG_SELECTOR
          |  callp startmsg
          if (!alwaysok(h, UPB_STARTMSG_SELECTOR)) {
            |  test al, al
            |  jnz >2
            |  call ->suspend
            |  jmp <1
            |2:
          }
        } else {
          |  nop
        }
        break;
      }
      case OP_ENDMSG: {
        upb_func *endmsg = gethandler(h, UPB_ENDMSG_SELECTOR);
        |9:
        if (endmsg) {
          /* bool endmsg(void *closure, const void *hd, upb_status *status) */
          |  mov ARG1_64, CLOSURE
          |  load_handler_data h, UPB_ENDMSG_SELECTOR
          |  mov ARG3_64, DECODER->status
          |  callp endmsg
        }
        break;
      }
      case OP_SETDISPATCH: {
        uint32_t *op_pc = jc->pc - 1;
        const char *msgname;
        upb_inttable *dispatch;

        /* Load info for new method. */
        memcpy(&dispatch, jc->pc, sizeof(void*));
        jc->pc += sizeof(void*) / sizeof(uint32_t);
        /* The OP_SETDISPATCH bytecode contains a pointer that is
         * &method->dispatch; we want to go backwards and recover method. */
        method =
            (void*)((char*)dispatch - offsetof(upb_pbdecodermethod, dispatch));
        /* May be NULL, in which case no handlers for this message will be found.
         * OPT: we should do better by completely skipping the message in this
         * case instead of parsing it field by field. We should also do the skip
         * in the containing message's code. */
        h = method->dest_handlers_;
        /* NOTE(review): per the comment above h may be NULL, yet it is passed
         * to upb_handlers_msgdef() here without a check -- confirm that this
         * is safe or that h is never NULL when the JIT is used. */
        msgname = upb_msgdef_fullname(upb_handlers_msgdef(h));

        /* Emit dispatch code for new method. */
        asmlabel(jc, "0x%lx.dispatch.%s", pcofs(jc), msgname);
        jitdispatch(jc, method);

        /* Emit function prologue for new method. */
        asmlabel(jc, "0x%lx.parse.%s", pcofs(jc), msgname);
        |=>define_jmptarget(jc, op_pc):
        |=>define_jmptarget(jc, method):
        |  sub rsp, 8

        break;
      }
      case OP_PARSE_DOUBLE:
      case OP_PARSE_FLOAT:
      case OP_PARSE_INT64:
      case OP_PARSE_UINT64:
      case OP_PARSE_INT32:
      case OP_PARSE_FIXED64:
      case OP_PARSE_FIXED32:
      case OP_PARSE_BOOL:
      case OP_PARSE_UINT32:
      case OP_PARSE_SFIXED32:
      case OP_PARSE_SFIXED64:
      case OP_PARSE_SINT32:
      case OP_PARSE_SINT64:
        jitprimitive(jc, op, h, arg);
        break;
      case OP_STARTSEQ:
      case OP_STARTSUBMSG:
      case OP_STARTSTR: {
        upb_func *start = gethandler(h, arg);
        if (start) {
          /* void *startseq(void *closure, const void *hd)
           * void *startsubmsg(void *closure, const void *hd)
           * void *startstr(void *closure, const void *hd, size_t size_hint) */
          |1:
          |  mov ARG1_64, CLOSURE
          |  load_handler_data h, arg
          if (op == OP_STARTSTR) {
            /* size_hint = DELIMEND - PTR. */
            |  mov ARG3_64, DELIMEND
            |  sub ARG3_64, PTR
          }
          |  callp start
          if (!alwaysok(h, arg)) {
            |  test rax, rax
            |  jnz >2
            |  call ->suspend
            |  jmp <1
            |2:
          }
          /* The start handler's return value becomes the new closure. */
          |  mov CLOSURE, rax
        } else {
          /* TODO: nop is only required because of asmlabel(). */
          |  nop
        }
        break;
      }
      case OP_ENDSEQ:
      case OP_ENDSUBMSG:
      case OP_ENDSTR: {
        upb_func *end = gethandler(h, arg);
        if (end) {
          /* bool endseq(void *closure, const void *hd)
           * bool endsubmsg(void *closure, const void *hd)
           * bool endstr(void *closure, const void *hd) */
          |1:
          |  mov ARG1_64, CLOSURE
          |  load_handler_data h, arg
          |  callp end
          if (!alwaysok(h, arg)) {
            |  test al, al
            |  jnz >2
            |  call ->suspend
            |  jmp <1
            |2:
          }
        } else {
          /* TODO: nop is only required because of asmlabel(). */
          |  nop
        }
        break;
      }
      case OP_STRING: {
        upb_func *str = gethandler(h, arg);
        |  cmp PTR, DELIMEND
        |  je >4
        |1:
        |  cmp PTR, DATAEND
        |  jne >2
        |  call ->suspend
        |  jmp <1
        |2:
        if (str) {
          /* size_t str(void *closure, const void *hd, const char *str,
           *            size_t n) */
          |  mov ARG1_64, CLOSURE
          |  load_handler_data h, arg
          |  mov ARG3_64, PTR
          |  mov ARG4_64, DATAEND
          |  sub ARG4_64, PTR
          |  mov ARG5_64, qword DECODER->handle
          |  callp str
          |  add PTR, rax
          if (!alwaysok(h, arg)) {
            |  cmp PTR, DATAEND
            |  je >3
            |  call ->strret_fallback
            |3:
          }
        } else {
          |  mov PTR, DATAEND
        }
        |  cmp PTR, DELIMEND
        |  jne <1
        |4:
        break;
      }
      case OP_PUSHTAGDELIM:
        |  mov FRAME->sink.closure, CLOSURE
        |  // This shouldn't need to be read, because tag-delimited fields
        |  // shouldn't have an OP_SETDELIM after them. But for the moment
        |  // non-packed repeated fields do OP_SETDELIM so they can share more
        |  // code with the packed code-path. If this is changed later, this
        |  // store can be removed.
        |  mov qword FRAME->end_ofs, 0
        |  cmp FRAME, DECODER->limit
        |  je ->err
        |  add FRAME, sizeof(upb_pbdecoder_frame)
        |  mov dword FRAME->groupnum, arg
        break;
      case OP_PUSHLENDELIM:
        |  call ->pushlendelim
        break;
      case OP_POP:
        |  sub FRAME, sizeof(upb_pbdecoder_frame)
        |  mov CLOSURE, FRAME->sink.closure
        break;
      case OP_SETDELIM:
        /* OPT: experiment with testing vs old offset to optimize away. */
        |  mov DATAEND, DECODER->end
        |  add DELIMEND, FRAME->end_ofs
        |  cmp DELIMEND, DECODER->buf
        |  jb >1
        |  cmp DELIMEND, DATAEND
        |  ja >1 // OPT: try cmov.
        |  mov DATAEND, DELIMEND
        |1:
        break;
      case OP_SETBIGGROUPNUM:
        /* The group number is stored in the bytecode word that follows this
         * instruction. */
        |  mov dword FRAME->groupnum, *jc->pc++
        break;
      case OP_CHECKDELIM:
        |  cmp DELIMEND, PTR
        |  je =>jmptarget(jc, jc->pc + longofs)
        break;
      case OP_CALL:
        |  call =>jmptarget(jc, jc->pc + longofs)
        break;
      case OP_BRANCH:
        |  jmp =>jmptarget(jc, jc->pc + longofs);
        break;
      case OP_RET:
        |9:
        |  add rsp, 8
        |  ret
        break;
      case OP_TAG1:
        jittag(jc, (arg >> 8) & 0xff, 1, (int8_t)arg, method);
        break;
      case OP_TAG2:
        jittag(jc, (arg >> 8) & 0xffff, 2, (int8_t)arg, method);
        break;
      case OP_TAGN: {
        uint64_t tag;
        memcpy(&tag, jc->pc, 8);
        jittag(jc, tag, arg >> 8, (int8_t)arg, method);
        break;
      }
      case OP_DISPATCH:
        |  call =>jmptarget(jc, &method->dispatch)
        break;
      case OP_HALT:
        UPB_ASSERT(false);
    }
  }

  asmlabel(jc, "eof");
  |  nop
}