|// |// upb - a minimalist implementation of protocol buffers. |// |// Copyright (c) 2011 Google Inc. See LICENSE for details. |// Author: Josh Haberman |// |// JIT compiler for upb_pbdecoder on x86. Given a decoderplan object (which |// contains an embedded set of upb_handlers), generates code specialized to |// parsing the specific message and calling specific handlers. |// |// Since the JIT can call other functions (the JIT'ted code is not a leaf |// function) we must respect alignment rules. All x86-64 systems require |// 16-byte stack alignment. #include #include #include "dynasm/dasm_x86.h" #ifndef MAP_ANONYMOUS # define MAP_ANONYMOUS MAP_ANON #endif // We map into the low 32 bits when we can, but if this is not available // (like on OS X) we take what we can get. It's not required for correctness, // it's just a performance thing that makes it more likely that our jumps // can be rel32 (i.e. within 32-bits of our pc) instead of the longer // sequence required for other jumps (see callp). #ifndef MAP_32BIT #define MAP_32BIT 0 #endif // These are used to track jump targets for messages and fields. enum { STARTMSG = 0, AFTER_STARTMSG = 1, ENDOFBUF = 2, ENDOFMSG = 3, DYNDISPATCH = 4, TOTAL_MSG_PCLABELS = 5, }; enum { FIELD = 0, FIELD_NO_TYPECHECK = 1, TOTAL_FIELD_PCLABELS = 2, }; typedef struct { uint32_t max_field_number; // Currently keyed on field number. Could also try keying it // on encoded or decoded tag, or on encoded field number. void **tablearray; // Pointer to the JIT code for parsing this message. void *jit_func; } upb_jitmsginfo; static uint32_t upb_getpclabel(decoderplan *plan, const void *obj, int n) { upb_value v; bool found = upb_inttable_lookupptr(&plan->pclabels, obj, &v); UPB_ASSERT_VAR(found, found); return upb_value_getuint32(v) + n; } static upb_jitmsginfo *upb_getmsginfo(decoderplan *plan, const upb_handlers *h) { upb_value v; bool found = upb_inttable_lookupptr(&plan->msginfo, h, &v); UPB_ASSERT_VAR(found, found); return upb_value_getptr(v); } // To debug JIT-ted code with GDB we need to tell GDB about the JIT-ted code // at runtime. GDB 7.x+ has defined an interface for doing this, and these // structure/function defintions are copied out of gdb/jit.h // // We need to give GDB an ELF file at runtime describing the symbols we have // generated. To avoid implementing the ELF format, we generate an ELF file // at compile-time and compile it in as a character string. We can replace // a few key constants (address of JIT-ted function and its size) by looking // for a few magic numbers and doing a dumb string replacement. #ifndef __APPLE__ const unsigned char upb_jit_debug_elf_file[] = { #include "upb/pb/jit_debug_elf_file.h" }; typedef enum { GDB_JIT_NOACTION = 0, GDB_JIT_REGISTER, GDB_JIT_UNREGISTER } jit_actions_t; typedef struct gdb_jit_entry { struct gdb_jit_entry *next_entry; struct gdb_jit_entry *prev_entry; const char *symfile_addr; uint64_t symfile_size; } gdb_jit_entry; typedef struct { uint32_t version; uint32_t action_flag; gdb_jit_entry *relevant_entry; gdb_jit_entry *first_entry; } gdb_jit_descriptor; gdb_jit_descriptor __jit_debug_descriptor = {1, GDB_JIT_NOACTION, NULL, NULL}; void __attribute__((noinline)) __jit_debug_register_code() { __asm__ __volatile__(""); } void upb_reg_jit_gdb(decoderplan *plan) { // Create debug info. size_t elf_len = sizeof(upb_jit_debug_elf_file); plan->debug_info = malloc(elf_len); memcpy(plan->debug_info, upb_jit_debug_elf_file, elf_len); uint64_t *p = (void*)plan->debug_info; for (; (void*)(p+1) <= (void*)plan->debug_info + elf_len; ++p) { if (*p == 0x12345678) { *p = (uintptr_t)plan->jit_code; } if (*p == 0x321) { *p = plan->jit_size; } } // Register the JIT-ted code with GDB. gdb_jit_entry *e = malloc(sizeof(gdb_jit_entry)); e->next_entry = __jit_debug_descriptor.first_entry; e->prev_entry = NULL; if (e->next_entry) e->next_entry->prev_entry = e; e->symfile_addr = plan->debug_info; e->symfile_size = elf_len; __jit_debug_descriptor.first_entry = e; __jit_debug_descriptor.relevant_entry = e; __jit_debug_descriptor.action_flag = GDB_JIT_REGISTER; __jit_debug_register_code(); } #else void upb_reg_jit_gdb(decoderplan *plan) { (void)plan; } #endif // Has to be a separate function, otherwise GCC will complain about // expressions like (&foo != NULL) because they will never evaluate // to false. static void upb_assert_notnull(void *addr) { assert(addr != NULL); (void)addr; } |.arch x64 |.actionlist upb_jit_actionlist |.globals UPB_JIT_GLOBAL_ |.globalnames upb_jit_globalnames | |// Calling conventions. Note -- this will need to be changed for |// Windows, which uses a different calling convention! |.define ARG1_64, rdi |.define ARG2_8, r6b // DynASM's equivalent to "sil" -- low byte of esi. |.define ARG2_32, esi |.define ARG2_64, rsi |.define ARG3_32, edx |.define ARG3_64, rdx |.define ARG4_64, rcx |.define XMMARG1, xmm0 | |// Register allocation / type map. |// ALL of the code in this file uses these register allocations. |// When we "call" within this file, we do not use regular calling |// conventions, but of course when calling to user callbacks we must. |.define PTR, rbx // Writing this to DECODER->ptr commits our progress. |.define CLOSURE, r12 |.type SINKFRAME, upb_sinkframe, r13 |.type FRAME, frame, r14 |.type DECODER, upb_pbdecoder, r15 |.type SINK, upb_sink | |.macro callp, addr || upb_assert_notnull(addr); |// TODO(haberman): fix this. I believe the predicate we should actually be |// testing is whether the jump distance is greater than INT32_MAX, not the |// absolute address of the target. || if ((uintptr_t)addr < 0xffffffff) { | call &addr || } else { | mov64 rax, (uintptr_t)addr | call rax || } |.endmacro | |.macro load_handler_data, h, f, type ||{ || uintptr_t data = (uintptr_t)gethandlerdata(h, f, type); || if (data > 0xffffffff) { | mov64 rax, data | mov SINKFRAME->u.handler_data, rax || } else if (data > 0x7fffffff) { | mov eax, data | mov SINKFRAME->u.handler_data, rax || } else { | mov qword SINKFRAME->u.handler_data, data || } || } |.endmacro | |// Checkpoints our progress by writing PTR to DECODER, and |// checks for end-of-buffer. |.macro checkpoint, h | mov DECODER->ptr, PTR | cmp PTR, DECODER->effective_end | jae =>upb_getpclabel(plan, h, ENDOFBUF) |.endmacro | |.macro check_bool_ret | test al, al | jz ->exit_jit |.endmacro | |.macro check_ptr_ret | test rax, rax | jz ->exit_jit |.endmacro | |// Decodes varint into ARG2. |// Inputs: |// - ecx: first 4 bytes of varint |// - offset: offset from PTR where varint begins |// Outputs: |// - ARG2: contains decoded varint |// - rax: new PTR |.macro decode_loaded_varint, offset | // Check for <=2 bytes inline, otherwise jump to 2-10 byte decoder. | lea rax, [PTR + offset + 1] | mov ARG2_32, ecx | and ARG2_32, 0x7f | test cl, cl | jns >9 | lea rax, [PTR + offset + 2] | movzx edx, ch | and edx, 0x7f | shl edx, 7 | or ARG2_32, edx | test cx, cx | jns >9 | mov ARG1_64, rax |// XXX: I don't think this handles 64-bit values correctly. |// Test with UINT64_MAX | callp upb_vdecode_max8_fast |// rax return from function will contain new pointer | mov ARG2_64, rdx | check_ptr_ret // Check for unterminated, >10-byte varint. |9: |.endmacro | |.macro decode_varint, offset | mov ecx, dword [PTR + offset] | decode_loaded_varint offset | mov PTR, rax |.endmacro | |// Table-based field dispatch. |// Inputs: |// - ecx: first 4 bytes of tag |// Outputs: |// - edx: field number |// - esi: wire type |// Could specialize this by avoiding the value masking: could just key the |// table on the raw (length-masked) varint to save 3-4 cycles of latency. |// Currently only support tables where all entries are in the array part. |.macro dyndispatch_, h |=>upb_getpclabel(plan, h, DYNDISPATCH): | decode_loaded_varint, 0 | mov ecx, esi | shr ecx, 3 | and esi, 0x7 // Note: this value is used in the FIELD pclabel below. | cmp esi, UPB_WIRE_TYPE_END_GROUP | je >1 || upb_jitmsginfo *mi = upb_getmsginfo(plan, h); | cmp ecx, mi->max_field_number // Bounds-check the field. | ja ->exit_jit // In the future; could be unknown label || if ((uintptr_t)mi->tablearray < 0xffffffff) { | // TODO: support hybrid array/hash tables. | mov rax, qword [rcx*8 + mi->tablearray] || } else { | mov64 rax, (uintptr_t)mi->tablearray | mov rax, qword [rax + rcx*8] || } | jmp rax // Dispatch: unpredictable jump. |1: |// End group. | cmp ecx, FRAME->group_fieldnum | jne ->exit_jit // Unexpected END_GROUP tag. | mov PTR, rax // rax came from decode_loaded_varint | mov DECODER->ptr, PTR | jmp =>upb_getpclabel(plan, h, ENDOFMSG) |.endmacro | |.if 1 | // Replicated dispatch: larger code, but better branch prediction. | .define dyndispatch, dyndispatch_ |.else | // Single dispatch: smaller code, could be faster because of reduced | // icache usage. We keep this around to allow for easy comparison between | // the two. | .macro dyndispatch, h | jmp =>upb_getpclabel(plan, h, DYNDISPATCH) | .endmacro |.endif | |.macro pushsinkframe, handlers, field, endtype | mov rax, DECODER->sink | mov dword SINKFRAME->u.selector, getselector(field, endtype) | lea rcx, [SINKFRAME + sizeof(upb_sinkframe)] // rcx for short addressing | cmp rcx, SINK:rax->limit | jae ->exit_jit // Frame stack overflow. | mov64 r9, (uintptr_t)handlers | mov SINKFRAME:rcx->h, r9 | mov SINKFRAME:rcx->closure, CLOSURE | mov SINK:rax->top_, rcx | mov SINKFRAME:rcx->sink_, rax | mov SINKFRAME, rcx |.endmacro | |.macro popsinkframe | sub SINKFRAME, sizeof(upb_sinkframe) | mov rax, DECODER->sink | mov SINK:rax->top_, SINKFRAME | mov CLOSURE, SINKFRAME->closure |.endmacro | |// Push a stack frame (not the CPU stack, the upb_pbdecoder stack). |.macro pushframe, handlers, field, end_offset_, endtype |// Decoder Frame. | lea rax, [FRAME + sizeof(frame)] // rax for short addressing | cmp rax, DECODER->limit | jae ->exit_jit // Frame stack overflow. | mov64 r10, (uintptr_t)field | mov FRAME:rax->f, r10 | mov qword FRAME:rax->end_ofs, end_offset_ | mov byte FRAME:rax->is_sequence, (endtype == UPB_HANDLER_ENDSEQ) | mov byte FRAME:rax->is_packed, 0 || if (upb_fielddef_istagdelim(field) && endtype == UPB_HANDLER_ENDSUBMSG) { | mov dword FRAME:rax->group_fieldnum, upb_fielddef_number(field) || } else { | mov dword FRAME:rax->group_fieldnum, 0xffffffff || } | mov DECODER->top, rax | mov FRAME, rax | pushsinkframe handlers, field, endtype |.endmacro | |.macro popframe | sub FRAME, sizeof(frame) | mov DECODER->top, FRAME | popsinkframe | setmsgend |.endmacro | |.macro setmsgend | mov rsi, DECODER->jit_end | mov rax, qword FRAME->end_ofs // Will be UINT64_MAX for groups. | sub rax, qword DECODER->bufstart_ofs | add rax, qword DECODER->buf // rax = d->buf + f->end_ofs - d->bufstart_ofs | jc >8 // If the addition overflowed, use jit_end | cmp rax, rsi | ja >8 // If jit_end is less, use jit_end | mov rsi, rax // Use frame end. |8: | mov DECODER->effective_end, rsi |.endmacro | |// rcx contains the tag, compare it against "tag", but since it is a varint |// we must only compare as many bytes as actually have data. |.macro checktag, tag || switch (upb_value_size(tag)) { || case 1: | cmp cl, tag || break; || case 2: | cmp cx, tag || break; || case 3: | and ecx, 0xffffff // 3 bytes | cmp rcx, tag || case 4: | cmp ecx, tag || break; || case 5: | mov64 rdx, 0xffffffffff // 5 bytes | and rcx, rdx | cmp rcx, tag || break; || default: abort(); || } |.endmacro | |.macro sethas, reg, hasbit || if (hasbit >= 0) { | or byte [reg + ((uint32_t)hasbit / 8)], (1 << ((uint32_t)hasbit % 8)) || } |.endmacro #include #include "upb/pb/varint.h" static upb_func *gethandler(const upb_handlers *h, const upb_fielddef *f, upb_handlertype_t type) { return upb_handlers_gethandler(h, getselector(f, type)); } static uintptr_t gethandlerdata(const upb_handlers *h, const upb_fielddef *f, upb_handlertype_t type) { return (uintptr_t)upb_handlers_gethandlerdata(h, getselector(f, type)); } // Decodes the next val into ARG2, advances PTR. static void upb_decoderplan_jit_decodefield(decoderplan *plan, size_t tag_size, const upb_handlers *h, const upb_fielddef *f) { // Decode the value into arg 3 for the callback. switch (upb_fielddef_descriptortype(f)) { case UPB_DESCRIPTOR_TYPE_DOUBLE: | movsd XMMARG1, qword [PTR + tag_size] | add PTR, 8 + tag_size break; case UPB_DESCRIPTOR_TYPE_FIXED64: case UPB_DESCRIPTOR_TYPE_SFIXED64: | mov ARG2_64, qword [PTR + tag_size] | add PTR, 8 + tag_size break; case UPB_DESCRIPTOR_TYPE_FLOAT: | movss XMMARG1, dword [PTR + tag_size] | add PTR, 4 + tag_size break; case UPB_DESCRIPTOR_TYPE_FIXED32: case UPB_DESCRIPTOR_TYPE_SFIXED32: | mov ARG2_32, dword [PTR + tag_size] | add PTR, 4 + tag_size break; case UPB_DESCRIPTOR_TYPE_BOOL: // Can't assume it's one byte long, because bool must be wire-compatible // with all of the varint integer types. | decode_varint tag_size | test ARG2_64, ARG2_64 | setne al | movzx ARG2_32, al break; case UPB_DESCRIPTOR_TYPE_INT64: case UPB_DESCRIPTOR_TYPE_UINT64: case UPB_DESCRIPTOR_TYPE_INT32: case UPB_DESCRIPTOR_TYPE_UINT32: case UPB_DESCRIPTOR_TYPE_ENUM: | decode_varint tag_size break; case UPB_DESCRIPTOR_TYPE_SINT64: // 64-bit zig-zag decoding. | decode_varint tag_size | mov rax, ARG2_64 | shr ARG2_64, 1 | and rax, 1 | neg rax | xor ARG2_64, rax break; case UPB_DESCRIPTOR_TYPE_SINT32: // 32-bit zig-zag decoding. | decode_varint tag_size | mov eax, ARG2_32 | shr ARG2_32, 1 | and eax, 1 | neg eax | xor ARG2_32, eax break; case UPB_DESCRIPTOR_TYPE_STRING: case UPB_DESCRIPTOR_TYPE_BYTES: { // We only handle the case where the entire string is in our current // buf, which sidesteps any security problems. The C path has more // robust checks. | mov ecx, dword [PTR + tag_size] | decode_loaded_varint tag_size | mov rdi, DECODER->end | sub rdi, rax | cmp ARG2_64, rdi // if (len > d->end - str) | ja ->exit_jit // Can't deliver, whole string not in buf. | mov PTR, rax upb_func *handler = gethandler(h, f, UPB_HANDLER_STARTSTR); if (handler) { | mov DECODER->tmp_len, ARG2_32 | mov ARG1_64, SINKFRAME | load_handler_data h, f, UPB_HANDLER_STARTSTR | callp handler | check_ptr_ret | mov CLOSURE, rax | mov ARG3_32, DECODER->tmp_len } else { | mov ARG3_64, ARG2_64 } handler = gethandler(h, f, UPB_HANDLER_STRING); if (handler) { // TODO: push a real frame so we can resume into the string. // (but maybe do this only if the string breaks). | pushsinkframe h, f, UPB_HANDLER_ENDSTR // size_t str(const upb_sinkframe *frame, const char *buf, size_t len) | mov ARG1_64, SINKFRAME | load_handler_data h, f, UPB_HANDLER_STRING | mov ARG2_64, PTR | callp handler // TODO: properly handle returns other than "n" (the whole string). | add PTR, rax | popsinkframe } else { | add PTR, ARG3_64 } handler = gethandler(h, f, UPB_HANDLER_ENDSTR); if (handler) { // bool endstr(const upb_sinkframe *frame); | mov ARG1_64, SINKFRAME | load_handler_data h, f, UPB_HANDLER_ENDSTR | callp handler | check_bool_ret } break; } // Will dispatch callbacks and call submessage in a second. case UPB_DESCRIPTOR_TYPE_MESSAGE: | decode_varint tag_size break; case UPB_DESCRIPTOR_TYPE_GROUP: | add PTR, tag_size break; default: abort(); } } static void upb_decoderplan_jit_callcb(decoderplan *plan, const upb_handlers *h, const upb_fielddef *f) { // Call callbacks. Specializing the append accessors didn't yield a speed // increase in benchmarks. if (upb_fielddef_issubmsg(f)) { // Call startsubmsg handler (if any). upb_func *startsubmsg = gethandler(h, f, UPB_HANDLER_STARTSUBMSG); if (startsubmsg) { // upb_sflow_t startsubmsg(const upb_sinkframe *frame) | mov DECODER->tmp_len, ARG2_32 | mov ARG1_64, SINKFRAME | load_handler_data h, f, UPB_HANDLER_STARTSUBMSG | callp startsubmsg | check_ptr_ret | mov CLOSURE, rax } const upb_handlers *sub_h = upb_handlers_getsubhandlers(h, f); if (sub_h) { if (upb_fielddef_istagdelim(f)) { | mov rdx, UPB_NONDELIMITED } else { | mov esi, DECODER->tmp_len | mov rdx, PTR | sub rdx, DECODER->buf | add rdx, DECODER->bufstart_ofs | add rdx, rsi // = d->bufstart_ofs + (d->ptr - d->buf) + delim_len } | pushframe sub_h, f, rdx, UPB_HANDLER_ENDSUBMSG | call =>upb_getpclabel(plan, sub_h, STARTMSG) | popframe } else { if (upb_fielddef_istagdelim(f)) { // Groups with no handlers not supported yet. assert(false); } else { | mov esi, DECODER->tmp_len | add PTR, rsi } } // Call endsubmsg handler (if any). upb_func *endsubmsg = gethandler(h, f, UPB_HANDLER_ENDSUBMSG); if (endsubmsg) { // upb_flow_t endsubmsg(void *closure, upb_value fval); | mov ARG1_64, SINKFRAME | load_handler_data h, f, UPB_HANDLER_ENDSUBMSG | callp endsubmsg | check_bool_ret } } else if (!upb_fielddef_isstring(f)) { upb_handlertype_t handlertype = upb_handlers_getprimitivehandlertype(f); upb_func *handler = gethandler(h, f, handlertype); const upb_stdmsg_fval *fv = (void*)gethandlerdata(h, f, handlertype); // Test for callbacks we can specialize. // Can't switch() on function pointers. if (handler == (void*)&upb_stdmsg_setint64 || handler == (void*)&upb_stdmsg_setuint64) { | mov [CLOSURE + fv->offset], ARG2_64 | sethas CLOSURE, fv->hasbit } else if (handler == (void*)&upb_stdmsg_setdouble) { | movsd qword [CLOSURE + fv->offset], XMMARG1 | sethas CLOSURE, fv->hasbit } else if (handler == (void*)&upb_stdmsg_setint32 || handler == (void*)&upb_stdmsg_setuint32) { | mov [CLOSURE + fv->offset], ARG2_32 | sethas CLOSURE, fv->hasbit } else if (handler == (void*)&upb_stdmsg_setfloat) { | movss dword [CLOSURE + fv->offset], XMMARG1 | sethas CLOSURE, fv->hasbit } else if (handler == (void*)&upb_stdmsg_setbool) { | mov [CLOSURE + fv->offset], ARG2_8 | sethas CLOSURE, fv->hasbit } else if (handler) { // bool value(const upb_sinkframe* frame, ctype val) | mov ARG1_64, SINKFRAME | load_handler_data h, f, handlertype | callp handler | check_bool_ret } } } static uint64_t upb_get_encoded_tag(const upb_fielddef *f) { uint32_t tag = (upb_fielddef_number(f) << 3) | upb_decoder_types[upb_fielddef_descriptortype(f)].native_wire_type; uint64_t encoded_tag = upb_vencode32(tag); // No tag should be greater than 5 bytes. assert(encoded_tag <= 0xffffffffff); return encoded_tag; } static void upb_decoderplan_jit_endseq(decoderplan *plan, const upb_handlers *h, const upb_fielddef *f) { | popframe upb_func *endseq = gethandler(h, f, UPB_HANDLER_ENDSEQ); if (endseq) { | mov ARG1_64, SINKFRAME | load_handler_data h, f, UPB_HANDLER_ENDSEQ | callp endseq } } // PTR should point to the beginning of the tag. static void upb_decoderplan_jit_field(decoderplan *plan, const upb_handlers *h, const upb_fielddef *f, const upb_fielddef *next_f) { uint64_t tag = upb_get_encoded_tag(f); uint64_t next_tag = next_f ? upb_get_encoded_tag(next_f) : 0; int tag_size = upb_value_size(tag); // PC-label for the dispatch table. // We check the wire type (which must be loaded in edi) because the // table is keyed on field number, not type. |=>upb_getpclabel(plan, f, FIELD): | cmp esi, (tag & 0x7) | jne ->exit_jit // In the future: could be an unknown field or packed. |=>upb_getpclabel(plan, f, FIELD_NO_TYPECHECK): if (upb_fielddef_isseq(f)) { upb_func *startseq = gethandler(h, f, UPB_HANDLER_STARTSEQ); if (startseq) { | mov ARG1_64, SINKFRAME | load_handler_data h, f, UPB_HANDLER_STARTSEQ | callp startseq | check_ptr_ret | mov CLOSURE, rax } | mov rsi, FRAME->end_ofs | pushframe h, f, rsi, UPB_HANDLER_ENDSEQ } |1: // Label for repeating this field. upb_decoderplan_jit_decodefield(plan, tag_size, h, f); upb_decoderplan_jit_callcb(plan, h, f); // This is kind of gross; future redesign should take into account how to // make this work nicely. The difficult part is that the sequence can be // broken either by end-of-message or by seeing a different field; in both // cases we need to call the endseq handler, but what we do after that // depends on which case triggered the end-of-sequence. | mov DECODER->ptr, PTR | cmp PTR, DECODER->jit_end | jae ->exit_jit | cmp PTR, DECODER->effective_end | jb >2 if (upb_fielddef_isseq(f)) { upb_decoderplan_jit_endseq(plan, h, f); } | jmp =>upb_getpclabel(plan, h, ENDOFMSG) |2: | mov rcx, qword [PTR] if (upb_fielddef_isseq(f)) { | checktag tag | je <1 upb_decoderplan_jit_endseq(plan, h, f); // Load next tag again (popframe/endseq clobbered it). | mov rcx, qword [PTR] } if (next_tag != 0) { | checktag next_tag | je =>upb_getpclabel(plan, next_f, FIELD_NO_TYPECHECK) } // Fall back to dynamic dispatch. | dyndispatch h } static int upb_compare_uint32(const void *a, const void *b) { return *(uint32_t*)a - *(uint32_t*)b; } static void upb_decoderplan_jit_msg(decoderplan *plan, const upb_handlers *h) { |=>upb_getpclabel(plan, h, AFTER_STARTMSG): | push rbp | mov rbp, rsp | jmp >1 |=>upb_getpclabel(plan, h, STARTMSG): | push rbp | mov rbp, rsp // Call startmsg handler (if any): upb_startmsg_handler *startmsg = upb_handlers_getstartmsg(h); if (startmsg) { // upb_flow_t startmsg(void *closure); | mov ARG1_64, SINKFRAME | callp startmsg | check_bool_ret } |1: | setmsgend | checkpoint h | mov ecx, dword [PTR] | dyndispatch_ h // --------- New code section (does not fall through) ------------------------ // Emit code for parsing each field (dynamic dispatch contains pointers to // all of these). // Create an ordering over the fields in field number order. // Parsing will theoretically be fastest if we emit code in the same // order as field numbers are seen on-the-wire because of an optimization // in the generated code that skips dynamic dispatch if the next field is // as expected. const upb_msgdef *md = upb_handlers_msgdef(h); int num_keys = upb_msgdef_numfields(md); uint32_t *keys = malloc(num_keys * sizeof(*keys)); int idx = 0; upb_msg_iter i; for(upb_msg_begin(&i, md); !upb_msg_done(&i); upb_msg_next(&i)) { keys[idx++] = upb_fielddef_number(upb_msg_iter_field(&i)); } qsort(keys, num_keys, sizeof(uint32_t), &upb_compare_uint32); for(int i = 0; i < num_keys; i++) { const upb_fielddef *f = upb_msgdef_itof(md, keys[i]); const upb_fielddef *next_f = (i + 1 < num_keys) ? upb_msgdef_itof(md, keys[i + 1]) : NULL; upb_decoderplan_jit_field(plan, h, f, next_f); } free(keys); // --------- New code section (does not fall through) ------------------------ // End-of-buf / end-of-message. // We hit a buffer limit; either we hit jit_end or end-of-submessage. |=>upb_getpclabel(plan, h, ENDOFBUF): | cmp PTR, DECODER->jit_end | jae ->exit_jit |=>upb_getpclabel(plan, h, ENDOFMSG): // We are at end-of-submsg: call endmsg handler (if any): upb_endmsg_handler *endmsg = upb_handlers_getendmsg(h); if (endmsg) { // void endmsg(void *closure, upb_status *status) { | mov ARG1_64, SINKFRAME | mov ARG2_64, DECODER->sink | mov ARG2_64, SINK:ARG2_64->pipeline_ | add ARG2_64, offsetof(upb_pipeline, status_) | callp endmsg } | leave | ret } static void upb_decoderplan_jit(decoderplan *plan) { // The JIT prologue/epilogue trampoline that is generated in this function // does not depend on the handlers, so it will never vary. Ideally we would // put it in an object file and just link it into upb so we could have only a // single copy of it instead of one copy for each decoderplan. But our // options for doing that are undesirable: GCC inline assembly is // complicated, not portable to other compilers, and comes with subtle // caveats about incorrect things what the optimizer might do if you eg. // execute non-local jumps. Putting this code in a .s file would force us to // calculate the structure offsets ourself instead of symbolically // (ie. [r15 + 0xcd] instead of DECODER->ptr). So we tolerate a bit of // unnecessary duplication/redundancy. | push rbp | mov rbp, rsp | push r15 | push r14 | push r13 | push r12 | push rbx // Align stack. | sub rsp, 8 | mov DECODER, ARG1_64 | mov DECODER->saved_rbp, rbp | mov FRAME, DECODER:ARG1_64->top | mov rax, DECODER:ARG1_64->sink | mov SINKFRAME, SINK:rax->top_ | mov CLOSURE, SINKFRAME->closure | mov PTR, DECODER->ptr // TODO: push return addresses for re-entry (will be necessary for multiple // buffer support). | call ARG2_64 |->exit_jit: | mov rbp, DECODER->saved_rbp | lea rsp, [rbp - 48] // Counter previous alignment. | add rsp, 8 | pop rbx | pop r12 | pop r13 | pop r14 | pop r15 | leave | ret upb_inttable_iter i; upb_inttable_begin(&i, &plan->msginfo); for(; !upb_inttable_done(&i); upb_inttable_next(&i)) { const upb_handlers *h = (const upb_handlers*)upb_inttable_iter_key(&i); upb_decoderplan_jit_msg(plan, h); } } static void upb_decoderplan_jit_assignpclabels(decoderplan *plan, const upb_handlers *h) { // Limit the DFS. if (upb_inttable_lookupptr(&plan->pclabels, h, NULL)) return; upb_inttable_insertptr(&plan->pclabels, h, upb_value_uint32(plan->pclabel_count)); plan->pclabel_count += TOTAL_MSG_PCLABELS; upb_jitmsginfo *info = malloc(sizeof(*info)); info->max_field_number = 0; upb_inttable_insertptr(&plan->msginfo, h, upb_value_ptr(info)); upb_msg_iter i; upb_msg_begin(&i, upb_handlers_msgdef(h)); for(; !upb_msg_done(&i); upb_msg_next(&i)) { const upb_fielddef *f = upb_msg_iter_field(&i); info->max_field_number = UPB_MAX(info->max_field_number, upb_fielddef_number(f)); upb_inttable_insertptr(&plan->pclabels, f, upb_value_uint32(plan->pclabel_count)); plan->pclabel_count += TOTAL_FIELD_PCLABELS; // Discover the whole graph of handlers depth-first. We will probably // revise this later to be more explicit about the list of handlers that // the plan should include. if (upb_fielddef_issubmsg(f)) { const upb_handlers *subh = upb_handlers_getsubhandlers(h, f); if (subh) upb_decoderplan_jit_assignpclabels(plan, subh); } } // TODO: support large field numbers by either using a hash table or // generating code for a binary search. For now large field numbers // will just fall back to the table decoder. info->max_field_number = UPB_MIN(info->max_field_number, 16000); info->tablearray = malloc((info->max_field_number + 1) * sizeof(void*)); } static void upb_decoderplan_makejit(decoderplan *plan) { upb_inttable_init(&plan->msginfo, UPB_CTYPE_PTR); plan->debug_info = NULL; // Assign pclabels. plan->pclabel_count = 0; upb_inttable_init(&plan->pclabels, UPB_CTYPE_UINT32); upb_decoderplan_jit_assignpclabels(plan, plan->dest_handlers); void **globals = malloc(UPB_JIT_GLOBAL__MAX * sizeof(*globals)); dasm_init(plan, 1); dasm_setupglobal(plan, globals, UPB_JIT_GLOBAL__MAX); dasm_growpc(plan, plan->pclabel_count); dasm_setup(plan, upb_jit_actionlist); upb_decoderplan_jit(plan); int dasm_status = dasm_link(plan, &plan->jit_size); (void)dasm_status; assert(dasm_status == DASM_S_OK); plan->jit_code = mmap(NULL, plan->jit_size, PROT_READ | PROT_WRITE, MAP_32BIT | MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); upb_reg_jit_gdb(plan); dasm_encode(plan, plan->jit_code); // Create dispatch tables. upb_inttable_iter i; upb_inttable_begin(&i, &plan->msginfo); for(; !upb_inttable_done(&i); upb_inttable_next(&i)) { const upb_handlers *h = (const upb_handlers*)upb_inttable_iter_key(&i); upb_jitmsginfo *mi = upb_getmsginfo(plan, h); // We jump to after the startmsg handler since it is called before entering // the JIT (either by upb_pbdecoder or by a previous call to the JIT). mi->jit_func = plan->jit_code + dasm_getpclabel(plan, upb_getpclabel(plan, h, AFTER_STARTMSG)); for (uint32_t j = 0; j <= mi->max_field_number; j++) { const upb_fielddef *f = upb_msgdef_itof(upb_handlers_msgdef(h), j); if (f) { mi->tablearray[j] = plan->jit_code + dasm_getpclabel(plan, upb_getpclabel(plan, f, FIELD)); } else { // TODO: extend the JIT to handle unknown fields. // For the moment we exit the JIT for any unknown field. mi->tablearray[j] = globals[UPB_JIT_GLOBAL_exit_jit]; } } } upb_inttable_uninit(&plan->pclabels); dasm_free(plan); free(globals); mprotect(plan->jit_code, plan->jit_size, PROT_EXEC | PROT_READ); #ifndef NDEBUG // View with: objdump -M intel -D -b binary -mi386 -Mx86-64 /tmp/machine-code // Or: ndisasm -b 64 /tmp/machine-code FILE *f = fopen("/tmp/machine-code", "wb"); fwrite(plan->jit_code, plan->jit_size, 1, f); fclose(f); #endif } static void upb_decoderplan_freejit(decoderplan *plan) { upb_inttable_iter i; upb_inttable_begin(&i, &plan->msginfo); for(; !upb_inttable_done(&i); upb_inttable_next(&i)) { upb_jitmsginfo *mi = upb_value_getptr(upb_inttable_iter_value(&i)); free(mi->tablearray); free(mi); } upb_inttable_uninit(&plan->msginfo); munmap(plan->jit_code, plan->jit_size); free(plan->debug_info); // TODO: unregister } static void upb_decoder_enterjit(upb_pbdecoder *d, decoderplan *plan) { if (plan->jit_code && d->top == d->stack && d->sink->top_ == d->sink->stack && d->ptr && d->ptr < d->jit_end) { #ifndef NDEBUG register uint64_t rbx asm ("rbx") = 11; register uint64_t r12 asm ("r12") = 12; register uint64_t r13 asm ("r13") = 13; register uint64_t r14 asm ("r14") = 14; register uint64_t r15 asm ("r15") = 15; #endif // Decodes as many fields as possible, updating d->ptr appropriately, // before falling through to the slow(er) path. void (*upb_jit_decode)(upb_pbdecoder *d, void*) = (void*)plan->jit_code; upb_jitmsginfo *mi = upb_getmsginfo(plan, plan->dest_handlers); assert(mi); upb_jit_decode(d, mi->jit_func); assert(d->ptr <= d->end); // Test that callee-save registers were properly restored. assert(rbx == 11); assert(r12 == 12); assert(r13 == 13); assert(r14 == 14); assert(r15 == 15); } }