From 7d986946b7dca88823dcc3fbe004c14dd2dc863f Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Mon, 19 Sep 2011 17:28:13 -0700 Subject: Rename x86 -> x64, since JIT is x64-only. --- Makefile | 10 +- upb/pb/decoder_x64.dasc | 805 ++++++++++++++++++++++++++++++++++++++++++++++++ upb/pb/decoder_x86.dasc | 805 ------------------------------------------------ 3 files changed, 810 insertions(+), 810 deletions(-) create mode 100644 upb/pb/decoder_x64.dasc delete mode 100644 upb/pb/decoder_x86.dasc diff --git a/Makefile b/Makefile index 143673c..00b9d3a 100644 --- a/Makefile +++ b/Makefile @@ -105,7 +105,7 @@ PB= \ clean_leave_profile: rm -rf $(LIBUPB) $(LIBUPB_PIC) rm -rf $(call rwildcard,,*.o) $(call rwildcard,,*.lo) $(call rwildcard,,*.dSYM) - rm -rf upb/pb/decoder_x86.h + rm -rf upb/pb/decoder_x64.h rm -rf benchmark/google_messages.proto.pb benchmark/google_messages.pb.* benchmarks/b.* benchmarks/*.pb* rm -rf upb/pb/jit_debug_elf_file.o rm -rf upb/pb/jit_debug_elf_file.h @@ -129,7 +129,7 @@ OBJ=$(patsubst %.c,%.o,$(SRC)) PICOBJ=$(patsubst %.c,%.lo,$(SRC)) ifdef USE_JIT -upb/pb/decoder.o upb/pb/decoder.lo: upb/pb/decoder_x86.h +upb/pb/decoder.o upb/pb/decoder.lo: upb/pb/decoder_x64.h endif $(LIBUPB): $(OBJ) $(E) AR $(LIBUPB) @@ -156,9 +156,9 @@ upb/def.lo: upb/def.c $(E) 'CC -fPIC' $< $(Q) $(CC) $(CFLAGS) $(CPPFLAGS) $(DEF_OPT) -c -o $@ $< -fPIC -upb/pb/decoder_x86.h: upb/pb/decoder_x86.dasc +upb/pb/decoder_x64.h: upb/pb/decoder_x64.dasc $(E) DYNASM $< - $(Q) lua dynasm/dynasm.lua upb/pb/decoder_x86.dasc > upb/pb/decoder_x86.h + $(Q) lua dynasm/dynasm.lua upb/pb/decoder_x64.dasc > upb/pb/decoder_x64.h ifneq ($(shell uname), Darwin) upb/pb/jit_debug_elf_file.o: upb/pb/jit_debug_elf_file.s @@ -168,7 +168,7 @@ upb/pb/jit_debug_elf_file.o: upb/pb/jit_debug_elf_file.s upb/pb/jit_debug_elf_file.h: upb/pb/jit_debug_elf_file.o $(E) XXD $< $(Q) xxd -i upb/pb/jit_debug_elf_file.o > upb/pb/jit_debug_elf_file.h -upb/pb/decoder_x86.h: upb/pb/jit_debug_elf_file.h 
+upb/pb/decoder_x64.h: upb/pb/jit_debug_elf_file.h endif # Function to expand a wildcard pattern recursively. diff --git a/upb/pb/decoder_x64.dasc b/upb/pb/decoder_x64.dasc new file mode 100644 index 0000000..c413ce5 --- /dev/null +++ b/upb/pb/decoder_x64.dasc @@ -0,0 +1,805 @@ +|// +|// upb - a minimalist implementation of protocol buffers. +|// +|// Copyright (c) 2011 Google Inc. See LICENSE for details. +|// Author: Josh Haberman +|// +|// JIT compiler for upb_decoder on x86-64. Given a upb_handlers object, +|// generates code specialized to parsing the specific message and +|// calling specific handlers. +|// +|// Since the JIT can call other functions (the JIT'ted code is not a leaf +|// function) we must respect alignment rules. On OS X, this means aligning +|// the stack to 16 bytes. + +#define UPB_NONE -1 +#define UPB_MULTIPLE -2 +#define UPB_TOPLEVEL_ONE -3 + +#include <sys/mman.h> +#include "dynasm/dasm_proto.h" +#include "dynasm/dasm_x86.h" + +#ifndef MAP_ANONYMOUS +# define MAP_ANONYMOUS MAP_ANON +#endif + +// We map into the low 32 bits when we can, but if this is not available +// (like on OS X) we take what we can get. It's not required for correctness, +// it's just a performance thing that makes it more likely that our jumps +// can be rel32 (i.e. within 32-bits of our pc) instead of the longer +// sequence required for other jumps (see callp). +#ifndef MAP_32BIT +#define MAP_32BIT 0 +#endif + +// To debug JIT-ted code with GDB we need to tell GDB about the JIT-ted code +// at runtime. GDB 7.x+ has defined an interface for doing this, and these +// structure/function definitions are copied out of gdb/jit.h +// +// We need to give GDB an ELF file at runtime describing the symbols we have +// generated. To avoid implementing the ELF format, we generate an ELF file +// at compile-time and compile it in as a character string.
We can replace +// a few key constants (address of JIT-ted function and its size) by looking +// for a few magic numbers and doing a dumb string replacement. + +#ifndef __APPLE__ +#include "upb/pb/jit_debug_elf_file.h" + +typedef enum +{ + GDB_JIT_NOACTION = 0, + GDB_JIT_REGISTER, + GDB_JIT_UNREGISTER +} jit_actions_t; + +typedef struct gdb_jit_entry { + struct gdb_jit_entry *next_entry; + struct gdb_jit_entry *prev_entry; + const char *symfile_addr; + uint64_t symfile_size; +} gdb_jit_entry; + +typedef struct { + uint32_t version; + uint32_t action_flag; + gdb_jit_entry *relevant_entry; + gdb_jit_entry *first_entry; +} gdb_jit_descriptor; + +gdb_jit_descriptor __jit_debug_descriptor = {1, GDB_JIT_NOACTION, NULL, NULL}; + +void __attribute__((noinline)) __jit_debug_register_code() { __asm__ __volatile__(""); } + +void upb_reg_jit_gdb(upb_decoder *d) { + // Create debug info. + size_t elf_len = upb_pb_jit_debug_elf_file_o_len; + d->debug_info = malloc(elf_len); + memcpy(d->debug_info, upb_pb_jit_debug_elf_file_o, elf_len); + uint64_t *p = (void*)d->debug_info; + for (; (void*)(p+1) <= (void*)d->debug_info + elf_len; ++p) { + if (*p == 0x12345678) { *p = (uintptr_t)d->jit_code; } + if (*p == 0x321) { *p = d->jit_size; } + } + + // Register the JIT-ted code with GDB. + gdb_jit_entry *e = malloc(sizeof(gdb_jit_entry)); + e->next_entry = __jit_debug_descriptor.first_entry; + e->prev_entry = NULL; + if (e->next_entry) e->next_entry->prev_entry = e; + e->symfile_addr = d->debug_info; + e->symfile_size = elf_len; + __jit_debug_descriptor.first_entry = e; + __jit_debug_descriptor.relevant_entry = e; + __jit_debug_descriptor.action_flag = GDB_JIT_REGISTER; + __jit_debug_register_code(); +} + +#else + +void upb_reg_jit_gdb(upb_decoder *d) { + (void)d; +} + +#endif + +|.arch x64 +|.actionlist upb_jit_actionlist +|.globals UPB_JIT_GLOBAL_ +|.globalnames upb_jit_globalnames +| +|// Calling conventions. 
+|// Names for the first three integer argument registers (rdi, rsi, rdx) in
+|// the widths this file uses when calling out to C functions.
+|.define ARG1_64, rdi
+|.define ARG2_8, sil
+|.define ARG2_32, esi
+|.define ARG2_64, rsi
+|.define ARG3_8, dl
+|.define ARG3_32, edx
+|.define ARG3_64, rdx
+|
+|// Register allocation / type map.
+|// ALL of the code in this file uses these register allocations.
+|// When we "call" within this file, we do not use regular calling
+|// conventions, but of course when calling to user callbacks we must.
+|//
+|// PTR     - current read position in the input buffer.
+|// CLOSURE - closure passed as the first argument to handlers.
+|// FRAME   - current upb_dispatcher_frame.
+|// STRREF  - the decoder's upb_strref, filled in for string/bytes fields.
+|// DECODER - the upb_decoder being run.
+|// STDARRAY has no register bound to it; it is only used as a type overlay
+|// on another register (STDARRAY:reg->field).
+|.define PTR, rbx
+|.define CLOSURE, r12
+|.type FRAME, upb_dispatcher_frame, r13
+|.type STRREF, upb_strref, r14
+|.type DECODER, upb_decoder, r15
+|.type STDARRAY, upb_stdarray
+|
+|// Emits a call to the C function at "addr".  The choice is made when the
+|// code is generated: a direct "call &addr" if the address is below
+|// 0xffffffff, otherwise the address is loaded into rax (clobbering it) and
+|// called indirectly.
+|.macro callp, addr
+|| if ((uintptr_t)addr < 0xffffffff) {
+  |  call &addr
+|| } else {
+  |  mov64 rax, (uintptr_t)addr
+  |  call rax
+|| }
+|.endmacro
+|
+|// Checks PTR for end-of-buffer.
+|// If PTR >= DECODER->effective_end, control leaves the field loop: for a
+|// group message it jumps to exit_jit, otherwise to the message's
+|// jit_endofbuf_pclabel.  Which jump is emitted is decided at code-generation
+|// time from m->is_group.
+|.macro check_eob, m
+|  cmp PTR, DECODER->effective_end
+|| if (m->is_group) {
+  |  jae ->exit_jit
+|| } else {
+  |  jae =>m->jit_endofbuf_pclabel
+|| }
+|.endmacro
+|
+|// Decodes varint from [PTR + offset] -> ARG3.
+|// Saves new pointer as rax.
+|//
+|// Precondition: ecx already holds the dword at [PTR + offset] (the macro
+|// reads cl/ch/cx but never loads them; see decode_varint).
+|// Clobbers esi, and additionally rdi plus whatever the callee clobbers when
+|// the slow path is taken.  PTR itself is not modified.
+|.macro decode_loaded_varint, offset
+|  // Check for <=2 bytes inline, otherwise jump to 2-10 byte decoder.
+|  // One-byte case: value is the low 7 bits, new pointer is one past it.
+|  lea rax, [PTR + offset + 1]
+|  mov ARG3_32, ecx
+|  and ARG3_32, 0x7f
+|  test cl, cl                // Sign flag = continuation bit of byte 0.
+|  jns >9
+|  // Two-byte case: merge bits 7..13 from the second byte.
+|  lea rax, [PTR + offset + 2]
+|  movzx esi, ch
+|  and esi, 0x7f
+|  shl esi, 7
+|  or ARG3_32, esi
+|  test cx, cx                // Sign flag = continuation bit of byte 1.
+|  jns >9
+|  // Slow path: hand the pointer past the two consumed bytes and the partial
+|  // value to the C helper.  A zero rax on return is treated as failure.
+|  // NOTE(review): presumably the helper returns the new pointer in rax and
+|  // the completed value in rdx (ARG3) -- confirm against upb/pb/varint.h.
+|  mov ARG1_64, rax
+|  mov ARG2_32, ARG3_32
+|  callp upb_vdecode_max8_fast
+|  test rax, rax
+|  jz ->exit_jit   // >10-byte varint.
+|9:
+|.endmacro
+|
+|// Loads the dword at [PTR + offset] into ecx, decodes a varint from it into
+|// ARG3 and advances PTR to just past the varint.
+|.macro decode_varint, offset
+|  mov ecx, dword [PTR + offset]
+|  decode_loaded_varint offset
+|  mov PTR, rax
+|.endmacro
+|
+|// Decode the tag -> edx.
+|// Could specialize this by avoiding the value masking: could just key the
+|// table on the raw (length-masked) varint to save 3-4 cycles of latency.
+|// Currently only support tables where all entries are in the array part.
+|.macro dyndispatch_, m +|=>m->jit_dyndispatch_pclabel: +| decode_loaded_varint, 0 +| mov ecx, edx +| shr ecx, 3 +| and edx, 0x7 +| cmp ecx, m->max_field_number // Bounds-check the field. +| ja ->exit_jit // In the future; could be unknown label +|| if ((uintptr_t)m->tablearray < 0xffffffff) { +| mov rax, qword [rcx*8 + m->tablearray] // TODO: support hybrid array/hash tables. +|| } else { +| mov64 rax, (uintptr_t)m->tablearray +| mov rax, qword [rax + rcx*8] +|| } +| jmp rax // Dispatch: unpredictable jump. +|.endmacro +| +|.if 1 +| // Replicated dispatch: larger code, but better branch prediction. +| .define dyndispatch, dyndispatch_ +|.else +| .macro dyndispatch, m +| jmp =>m->jit_dyndispatch_pclabel +| .endmacro +|.endif +| +|// Push a stack frame (not the CPU stack, the upb_decoder stack). +|.macro pushframe, f, closure_, end_offset_, is_sequence_ +| lea rax, [FRAME + sizeof(upb_dispatcher_frame)] // rax for shorter addressing. +| cmp rax, qword DECODER->dispatcher.limit +| jae ->exit_jit // Frame stack overflow. +| mov qword FRAME:rax->f, f +| mov qword FRAME:rax->closure, closure_ +| mov dword FRAME:rax->end_ofs, end_offset_ +| mov byte FRAME:rax->is_sequence, is_sequence_ +| mov CLOSURE, rdx +| mov DECODER->dispatcher.top, rax +| mov FRAME, rax +|.endmacro +| +|.macro popframe, m +| sub FRAME, sizeof(upb_dispatcher_frame) +| mov DECODER->dispatcher.top, FRAME +| setmsgend m +| mov CLOSURE, FRAME->closure +|.endmacro +| +|.macro setmsgend, m +| mov rsi, DECODER->jit_end +|| if (m->is_group) { +| mov64 rax, 0xffffffffffffffff +| mov qword DECODER->delim_end, rax +| mov DECODER->effective_end, rsi +|| } else { +| // Could store a correctly-biased version in the frame, at the cost of +| // a larger stack. 
+| mov eax, dword FRAME->end_ofs +| add rax, qword DECODER->buf +| mov DECODER->delim_end, rax // delim_end = d->buf + f->end_ofs +| cmp rax, rsi +| jb >8 +| mov rax, rsi // effective_end = min(d->delim_end, d->jit_end) +|8: +| mov DECODER->effective_end, rax +|| } +|.endmacro +| +|// rax contains the tag, compare it against "tag", but since it is a varint +|// we must only compare as many bytes as actually have data. +|.macro checktag, tag +|| switch (upb_value_size(tag)) { +|| case 1: +| cmp cl, tag +|| break; +|| case 2: +| cmp cx, tag +|| break; +|| case 3: +| and ecx, 0xffffff // 3 bytes +| cmp rcx, tag +|| case 4: +| cmp ecx, tag +|| break; +|| case 5: +| mov64 rdx, 0xffffffffff // 5 bytes +| and rcx, rdx +| cmp rcx, tag +|| break; +|| default: abort(); +|| } +|.endmacro +| +|// TODO: optimize for 0 (xor) and 32-bits. +|.macro loadfval, f +|| if (f->fval.val.uint64 == 0) { +| xor ARG2_32, ARG2_32 +|| } else if (f->fval.val.uint64 < 0xffffffff) { +| mov ARG2_32, f->fval.val.uint64 +|| } else { +| mov64 ARG2_64, f->fval.val.uint64 +|| } +|.endmacro +| +|.macro sethas, reg, hasbit +|| if (hasbit >= 0) { +| or byte [reg + (hasbit / 8)], (1 << (hasbit % 8)) +|| } +|.endmacro + + +#include +#include "upb/pb/varint.h" +#include "upb/msg.h" + +// Decodes the next val into ARG3, advances PTR. +static void upb_decoder_jit_decodefield(upb_decoder *d, upb_mhandlers *m, + uint8_t type, size_t tag_size) { + // Decode the value into arg 3 for the callback. + switch (type) { + case UPB_TYPE(DOUBLE): + case UPB_TYPE(FIXED64): + case UPB_TYPE(SFIXED64): + | mov ARG3_64, qword [PTR + tag_size] + | add PTR, 8 + tag_size + break; + + case UPB_TYPE(FLOAT): + case UPB_TYPE(FIXED32): + case UPB_TYPE(SFIXED32): + | mov ARG3_32, dword [PTR + tag_size] + | add PTR, 4 + tag_size + break; + + case UPB_TYPE(BOOL): + // Can't assume it's one byte long, because bool must be wire-compatible + // with all of the varint integer types. 
+ | decode_varint tag_size + | test ARG3_64, ARG3_64 + | setne ARG3_8 // Other bytes left with val, should be ok. + break; + + case UPB_TYPE(INT64): + case UPB_TYPE(UINT64): + case UPB_TYPE(INT32): + case UPB_TYPE(UINT32): + case UPB_TYPE(ENUM): + | decode_varint tag_size + break; + + case UPB_TYPE(SINT64): + // 64-bit zig-zag decoding. + | decode_varint tag_size + | mov rax, ARG3_64 + | shr ARG3_64, 1 + | and rax, 1 + | neg rax + | xor ARG3_64, rax + break; + + case UPB_TYPE(SINT32): + // 32-bit zig-zag decoding. + | decode_varint tag_size + | mov eax, ARG3_32 + | shr ARG3_32, 1 + | and eax, 1 + | neg eax + | xor ARG3_32, eax + break; + + case UPB_TYPE(STRING): + case UPB_TYPE(BYTES): + // We only handle the case where the entire string is in our current + // buf, which sidesteps any security problems. The C path has more + // robust checks. + | mov ecx, dword [PTR + tag_size] + | decode_loaded_varint tag_size + | mov rdi, rax + | add rdi, ARG3_64 + | mov STRREF->len, ARG3_32 + | mov STRREF->ptr, rax + | sub rax, DECODER->buf + | add eax, DECODER->bufstart_ofs // = d->ptr - d->buf + d->bufstart_ofs + | mov STRREF->stream_offset, eax + | mov ARG3_64, STRREF + | cmp rdi, DECODER->effective_end + | ja ->exit_jit // Can't deliver, whole string not in buf. + | mov PTR, rdi + break; + + case UPB_TYPE_ENDGROUP: // A pseudo-type. + | add PTR, tag_size + | jmp =>m->jit_endofmsg_pclabel + return; + + // Will dispatch callbacks and call submessage in a second. + case UPB_TYPE(MESSAGE): + | decode_varint tag_size + break; + case UPB_TYPE(GROUP): + | add PTR, tag_size + break; + + default: abort(); + } +} + +#if 0 +// These appear not to speed things up, but keeping around for +// further experimentation. +static void upb_decoder_jit_doappend(upb_decoder *d, uint8_t size, + upb_fhandlers *f) { + | mov eax, STDARRAY:ARG1_64->len + | cmp eax, STDARRAY:ARG1_64->size + | jne >2 + // If array is full, fall back to actual function. 
+ | loadfval f + | callp f->value + | jmp >3 + |2: + | mov rcx, STDARRAY:ARG1_64->ptr + | mov esi, eax + | add eax, 1 + + switch (size) { + case 8: + | mov [rcx + rsi * 8], ARG3_64 + break; + + case 4: + | mov [rcx + rsi * 4], ARG3_32 + break; + + case 1: + | mov [rcx + rsi * 4], ARG3_8 + break; + } + + | mov STDARRAY:ARG1_64->len, eax + |3: +} +#endif + +static void upb_decoder_jit_callcb(upb_decoder *d, upb_fhandlers *f) { + const upb_fielddef *fd = upb_value_getfielddef(f->fval); + // Call callbacks. + if (upb_issubmsgtype(f->type)) { + // Load closure and fval into arg registers. + | mov ARG1_64, CLOSURE + | loadfval f + + // Call startsubmsg handler (if any). + if (f->startsubmsg) { + // upb_sflow_t startsubmsg(void *closure, upb_value fval) + | mov r12d, ARG3_32 + | callp f->startsubmsg + } else { + | mov rdx, CLOSURE + | mov r12d, ARG3_32 + } + if (f->type == UPB_TYPE(MESSAGE)) { + | mov rsi, PTR + | sub rsi, DECODER->buf + | add esi, r12d // = (d->ptr - d->buf) + delim_len + } else { + assert(f->type == UPB_TYPE(GROUP)); + | mov esi, UPB_NONDELIMITED + } + | pushframe f, rdx, esi, false + + const upb_mhandlers *sub_m = upb_fhandlers_getsubmsg(f); + if (sub_m->jit_parent_field_done_pclabel != UPB_MULTIPLE) { + | jmp =>sub_m->jit_startmsg_pclabel; + } else { + | call =>sub_m->jit_startmsg_pclabel; + } + + |=>f->jit_submsg_done_pclabel: + | popframe upb_fhandlers_getmsg(f) + + // Call endsubmsg handler (if any). + if (f->endsubmsg) { + // upb_flow_t endsubmsg(void *closure, upb_value fval); + | mov ARG1_64, CLOSURE + | loadfval f + | callp f->endsubmsg + } + } else { + | mov ARG1_64, CLOSURE + // Test for callbacks we can specialize. + // Can't switch() on function pointers. 
+ if (f->value == &upb_stdmsg_setint64 || + f->value == &upb_stdmsg_setuint64 || + f->value == &upb_stdmsg_setptr || + f->value == &upb_stdmsg_setdouble) { + | mov [ARG1_64 + fd->offset], ARG3_64 + } else if (f->value == &upb_stdmsg_setint32 || + f->value == &upb_stdmsg_setuint32 || + f->value == &upb_stdmsg_setfloat) { + | mov [ARG1_64 + fd->offset], ARG3_32 + } else if (f->value == &upb_stdmsg_setbool) { + | mov [ARG1_64 + fd->offset], ARG3_8 +#if 0 + // These appear not to speed things up, but keeping around for + // further experimentation. + } else if (f->value == &upb_stdmsg_setint64_r || + f->value == &upb_stdmsg_setuint64_r || + f->value == &upb_stdmsg_setptr_r || + f->value == &upb_stdmsg_setdouble_r) { + upb_decoder_jit_doappend(d, 8, f); + } else if (f->value == &upb_stdmsg_setint32_r || + f->value == &upb_stdmsg_setuint32_r || + f->value == &upb_stdmsg_setfloat_r) { + upb_decoder_jit_doappend(d, 4, f); + } else if (f->value == &upb_stdmsg_setbool_r) { + upb_decoder_jit_doappend(d, 1, f); +#endif + } else { + // Load closure and fval into arg registers. + | loadfval f + | callp f->value + } + | sethas CLOSURE, f->valuehasbit + } + // TODO: Handle UPB_SKIPSUBMSG, UPB_BREAK +} + +// PTR should point to the beginning of the tag. +static void upb_decoder_jit_field(upb_decoder *d, uint32_t tag, uint32_t next_tag, + upb_mhandlers *m, + upb_fhandlers *f, upb_fhandlers *next_f) { + // PC-label for the dispatch table. + // We check the wire type (which must be loaded in edx) because the + // table is keyed on field number, not type. + |=>f->jit_pclabel: + | cmp edx, (tag & 0x7) + | jne ->exit_jit // In the future: could be an unknown field or packed. + |=>f->jit_pclabel_notypecheck: + if (f->repeated) { + if (f->startseq) { + | mov ARG1_64, CLOSURE + | loadfval f + | callp f->startseq + } else { + | mov rdx, CLOSURE + } + | mov esi, FRAME->end_ofs + | pushframe f, rdx, esi, true + } + + |1: // Label for repeating this field. 
+ + upb_decoder_jit_decodefield(d, m, f->type, upb_value_size(tag)); + upb_decoder_jit_callcb(d, f); + + // Epilogue: load next tag, check for repeated field. + | check_eob m + | mov rcx, qword [PTR] + if (f->repeated) { + | checktag tag + | je <1 + | popframe m + if (f->endseq) { + | mov ARG1_64, CLOSURE + | loadfval f + | callp f->endseq + } + } + if (next_tag != 0) { + | checktag next_tag + | je =>next_f->jit_pclabel_notypecheck + } + + // Fall back to dynamic dispatch. + | dyndispatch m + |1: +} + +static int upb_compare_uint32(const void *a, const void *b) { + // TODO: always put ENDGROUP at the end. + return *(uint32_t*)a - *(uint32_t*)b; +} + +static void upb_decoder_jit_msg(upb_decoder *d, upb_mhandlers *m) { + |=>m->jit_startmsg_pclabel: + + if (m->jit_parent_field_done_pclabel == UPB_MULTIPLE) { + // There was a call to get here, so we need to align the stack. + | sub rsp, 8 + } + // Call startmsg handler (if any): + if (m->startmsg) { + // upb_flow_t startmsg(void *closure); + | mov ARG1_64, FRAME->closure + | callp m->startmsg + // TODO: Handle UPB_SKIPSUBMSG, UPB_BREAK + } + + | setmsgend m + | check_eob m + | mov ecx, dword [PTR] + | dyndispatch_ m + + // --------- New code section (does not fall through) ------------------------ + + // Emit code for parsing each field (dynamic dispatch contains pointers to + // all of these). + + // Create an ordering over the fields (inttable ordering is undefined). 
+ int num_keys = upb_inttable_count(&m->fieldtab); + uint32_t *keys = malloc(num_keys * sizeof(*keys)); + int idx = 0; + for(upb_inttable_iter i = upb_inttable_begin(&m->fieldtab); !upb_inttable_done(i); + i = upb_inttable_next(&m->fieldtab, i)) { + keys[idx++] = upb_inttable_iter_key(i); + } + qsort(keys, num_keys, sizeof(uint32_t), &upb_compare_uint32); + + upb_fhandlers *last_f = NULL; + uint32_t last_tag = 0; + for(int i = 0; i < num_keys; i++) { + uint32_t key = keys[i]; + upb_fhandlers *f = upb_inttable_lookup(&m->fieldtab, key); + uint32_t tag = upb_vencode32(key); + if (last_f) upb_decoder_jit_field(d, last_tag, tag, m, last_f, f); + last_tag = tag; + last_f = f; + } + upb_decoder_jit_field(d, last_tag, 0, m, last_f, NULL); + + free(keys); + + // --------- New code section (does not fall through) ------------------------ + + // End-of-buf / end-of-message. + if (!m->is_group) { + // This case doesn't exist for groups, because there eob really means + // eob, so that case just exits the jit directly. + |=>m->jit_endofbuf_pclabel: + | cmp PTR, DECODER->delim_end + | jb ->exit_jit // We are at eob, but not end-of-submsg. + } + + |=>m->jit_endofmsg_pclabel: + // We are at end-of-submsg: call endmsg handler (if any): + if (m->endmsg) { + // void endmsg(void *closure, upb_status *status) { + | mov ARG1_64, FRAME->closure + | lea ARG2_64, DECODER->dispatcher.status + | callp m->endmsg + } + + if (m->jit_parent_field_done_pclabel == UPB_MULTIPLE) { + // Counter previous alignment. + | add rsp, 8 + | ret + } else if (m->jit_parent_field_done_pclabel == UPB_TOPLEVEL_ONE) { + | jmp ->exit_jit + } else { + | jmp =>m->jit_parent_field_done_pclabel + } + +} + +static const char *dbgfmt = + "JIT encountered unknown field! wt=%d, fn=%d\n"; + +static void upb_decoder_jit(upb_decoder *d) { + | push rbp + | mov rbp, rsp + | push r15 + | push r14 + | push r13 + | push r12 + | push rbx + // Align stack. 
+ | sub rsp, 8 + | mov DECODER, ARG1_64 + | mov FRAME, DECODER:ARG1_64->dispatcher.top + | lea STRREF, DECODER:ARG1_64->strref + | mov CLOSURE, FRAME->closure + | mov PTR, DECODER->ptr + + upb_handlers *h = d->dispatcher.handlers; + if (h->msgs[0]->jit_parent_field_done_pclabel == UPB_MULTIPLE) { + | call =>h->msgs[0]->jit_startmsg_pclabel + | jmp ->exit_jit + } + + // TODO: push return addresses for re-entry (will be necessary for multiple + // buffer support). + for (int i = 0; i < h->msgs_len; i++) upb_decoder_jit_msg(d, h->msgs[i]); + + |->exit_jit: + | mov DECODER->ptr, PTR + // Counter previous alignment. + | add rsp, 8 + | pop rbx + | pop r12 + | pop r13 + | pop r14 + | pop r15 + | leave + | ret + |=>0: + | mov rdi, stderr + | mov rsi, dbgfmt + | callp fprintf + | callp abort +} + +void upb_decoder_jit_assignfieldlabs(upb_fhandlers *f, + uint32_t *pclabel_count) { + f->jit_pclabel = (*pclabel_count)++; + f->jit_pclabel_notypecheck = (*pclabel_count)++; + f->jit_submsg_done_pclabel = (*pclabel_count)++; +} + +void upb_decoder_jit_assignmsglabs(upb_mhandlers *m, uint32_t *pclabel_count) { + m->jit_startmsg_pclabel = (*pclabel_count)++; + m->jit_endofbuf_pclabel = (*pclabel_count)++; + m->jit_endofmsg_pclabel = (*pclabel_count)++; + m->jit_dyndispatch_pclabel = (*pclabel_count)++; + m->jit_unknownfield_pclabel = (*pclabel_count)++; + m->jit_parent_field_done_pclabel = UPB_NONE; + m->max_field_number = 0; + upb_inttable_iter i; + for(i = upb_inttable_begin(&m->fieldtab); !upb_inttable_done(i); + i = upb_inttable_next(&m->fieldtab, i)) { + uint32_t key = upb_inttable_iter_key(i); + m->max_field_number = UPB_MAX(m->max_field_number, key); + upb_fhandlers *f = upb_inttable_iter_value(i); + upb_decoder_jit_assignfieldlabs(f, pclabel_count); + } + // XXX: Won't work for large field numbers; will need to use a upb_table. 
+ m->tablearray = malloc((m->max_field_number + 1) * sizeof(void*)); +} + +// Second pass: for messages that have only one parent, link them to the field +// from which they are called. +void upb_decoder_jit_assignmsglabs2(upb_mhandlers *m) { + upb_inttable_iter i; + for(i = upb_inttable_begin(&m->fieldtab); !upb_inttable_done(i); + i = upb_inttable_next(&m->fieldtab, i)) { + upb_fhandlers *f = upb_inttable_iter_value(i); + if (upb_issubmsgtype(f->type)) { + upb_mhandlers *sub_m = upb_fhandlers_getsubmsg(f); + if (sub_m->jit_parent_field_done_pclabel == UPB_NONE) { + sub_m->jit_parent_field_done_pclabel = f->jit_submsg_done_pclabel; + } else { + sub_m->jit_parent_field_done_pclabel = UPB_MULTIPLE; + } + } + } +} + +void upb_decoder_makejit(upb_decoder *d) { + d->debug_info = NULL; + + // Assign pclabels. + uint32_t pclabel_count = 1; + upb_handlers *h = d->dispatcher.handlers; + for (int i = 0; i < h->msgs_len; i++) + upb_decoder_jit_assignmsglabs(h->msgs[i], &pclabel_count); + for (int i = 0; i < h->msgs_len; i++) + upb_decoder_jit_assignmsglabs2(h->msgs[i]); + + if (h->msgs[0]->jit_parent_field_done_pclabel == UPB_NONE) { + h->msgs[0]->jit_parent_field_done_pclabel = UPB_TOPLEVEL_ONE; + } + + void **globals = malloc(UPB_JIT_GLOBAL__MAX * sizeof(*globals)); + dasm_init(d, 1); + dasm_setupglobal(d, globals, UPB_JIT_GLOBAL__MAX); + dasm_growpc(d, pclabel_count); + dasm_setup(d, upb_jit_actionlist); + + upb_decoder_jit(d); + + dasm_link(d, &d->jit_size); + + d->jit_code = mmap(NULL, d->jit_size, PROT_READ | PROT_WRITE, + MAP_32BIT | MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + + upb_reg_jit_gdb(d); + + dasm_encode(d, d->jit_code); + + // Create dispatch tables. 
+ for (int i = 0; i < h->msgs_len; i++) { + upb_mhandlers *m = h->msgs[i]; + for (uint32_t j = 0; j <= m->max_field_number; j++) { + upb_fhandlers *f = NULL; + for (int k = 0; k < 8; k++) { + f = upb_inttable_lookup(&m->fieldtab, (j << 3) | k); + if (f) break; + } + if (f) { + m->tablearray[j] = d->jit_code + dasm_getpclabel(d, f->jit_pclabel); + } else { + // Don't handle unknown fields yet. + m->tablearray[j] = d->jit_code + dasm_getpclabel(d, 0); + } + } + } + + dasm_free(d); + free(globals); + + mprotect(d->jit_code, d->jit_size, PROT_EXEC | PROT_READ); + + // View with: objdump -M intel -D -b binary -mi386 -Mx86-64 /tmp/machine-code + // Or: ndisasm -b 64 /tmp/machine-code + FILE *f = fopen("/tmp/machine-code", "wb"); + fwrite(d->jit_code, d->jit_size, 1, f); + fclose(f); +} + +void upb_decoder_freejit(upb_decoder *d) { + munmap(d->jit_code, d->jit_size); + free(d->debug_info); + // TODO: unregister +} diff --git a/upb/pb/decoder_x86.dasc b/upb/pb/decoder_x86.dasc deleted file mode 100644 index c413ce5..0000000 --- a/upb/pb/decoder_x86.dasc +++ /dev/null @@ -1,805 +0,0 @@ -|// -|// upb - a minimalist implementation of protocol buffers. -|// -|// Copyright (c) 2011 Google Inc. See LICENSE for details. -|// Author: Josh Haberman -|// -|// JIT compiler for upb_decoder on x86. Given a upb_handlers object, -|// generates code specialized to parsing the specific message and -|// calling specific handlers. -|// -|// Since the JIT can call other functions (the JIT'ted code is not a leaf -|// function) we must respect alignment rules. On OS X, this means aligning -|// the stack to 16 bytes. - -#define UPB_NONE -1 -#define UPB_MULTIPLE -2 -#define UPB_TOPLEVEL_ONE -3 - -#include -#include "dynasm/dasm_proto.h" -#include "dynasm/dasm_x86.h" - -#ifndef MAP_ANONYMOUS -# define MAP_ANONYMOUS MAP_ANON -#endif - -// We map into the low 32 bits when we can, but if this is not available -// (like on OS X) we take what we can get. 
It's not required for correctness, -// it's just a performance thing that makes it more likely that our jumps -// can be rel32 (i.e. within 32-bits of our pc) instead of the longer -// sequence required for other jumps (see callp). -#ifndef MAP_32BIT -#define MAP_32BIT 0 -#endif - -// To debug JIT-ted code with GDB we need to tell GDB about the JIT-ted code -// at runtime. GDB 7.x+ has defined an interface for doing this, and these -// structure/function defintions are copied out of gdb/jit.h -// -// We need to give GDB an ELF file at runtime describing the symbols we have -// generated. To avoid implementing the ELF format, we generate an ELF file -// at compile-time and compile it in as a character string. We can replace -// a few key constants (address of JIT-ted function and its size) by looking -// for a few magic numbers and doing a dumb string replacement. - -#ifndef __APPLE__ -#include "upb/pb/jit_debug_elf_file.h" - -typedef enum -{ - GDB_JIT_NOACTION = 0, - GDB_JIT_REGISTER, - GDB_JIT_UNREGISTER -} jit_actions_t; - -typedef struct gdb_jit_entry { - struct gdb_jit_entry *next_entry; - struct gdb_jit_entry *prev_entry; - const char *symfile_addr; - uint64_t symfile_size; -} gdb_jit_entry; - -typedef struct { - uint32_t version; - uint32_t action_flag; - gdb_jit_entry *relevant_entry; - gdb_jit_entry *first_entry; -} gdb_jit_descriptor; - -gdb_jit_descriptor __jit_debug_descriptor = {1, GDB_JIT_NOACTION, NULL, NULL}; - -void __attribute__((noinline)) __jit_debug_register_code() { __asm__ __volatile__(""); } - -void upb_reg_jit_gdb(upb_decoder *d) { - // Create debug info. 
- size_t elf_len = upb_pb_jit_debug_elf_file_o_len; - d->debug_info = malloc(elf_len); - memcpy(d->debug_info, upb_pb_jit_debug_elf_file_o, elf_len); - uint64_t *p = (void*)d->debug_info; - for (; (void*)(p+1) <= (void*)d->debug_info + elf_len; ++p) { - if (*p == 0x12345678) { *p = (uintptr_t)d->jit_code; } - if (*p == 0x321) { *p = d->jit_size; } - } - - // Register the JIT-ted code with GDB. - gdb_jit_entry *e = malloc(sizeof(gdb_jit_entry)); - e->next_entry = __jit_debug_descriptor.first_entry; - e->prev_entry = NULL; - if (e->next_entry) e->next_entry->prev_entry = e; - e->symfile_addr = d->debug_info; - e->symfile_size = elf_len; - __jit_debug_descriptor.first_entry = e; - __jit_debug_descriptor.relevant_entry = e; - __jit_debug_descriptor.action_flag = GDB_JIT_REGISTER; - __jit_debug_register_code(); -} - -#else - -void upb_reg_jit_gdb(upb_decoder *d) { - (void)d; -} - -#endif - -|.arch x64 -|.actionlist upb_jit_actionlist -|.globals UPB_JIT_GLOBAL_ -|.globalnames upb_jit_globalnames -| -|// Calling conventions. -|.define ARG1_64, rdi -|.define ARG2_8, sil -|.define ARG2_32, esi -|.define ARG2_64, rsi -|.define ARG3_8, dl -|.define ARG3_32, edx -|.define ARG3_64, rdx -| -|// Register allocation / type map. -|// ALL of the code in this file uses these register allocations. -|// When we "call" within this file, we do not use regular calling -|// conventions, but of course when calling to user callbacks we must. -|.define PTR, rbx -|.define CLOSURE, r12 -|.type FRAME, upb_dispatcher_frame, r13 -|.type STRREF, upb_strref, r14 -|.type DECODER, upb_decoder, r15 -|.type STDARRAY, upb_stdarray -| -|.macro callp, addr -|| if ((uintptr_t)addr < 0xffffffff) { - | call &addr -|| } else { - | mov64 rax, (uintptr_t)addr - | call rax -|| } -|.endmacro -| -|// Checks PTR for end-of-buffer. 
-|.macro check_eob, m -| cmp PTR, DECODER->effective_end -|| if (m->is_group) { - | jae ->exit_jit -|| } else { - | jae =>m->jit_endofbuf_pclabel -|| } -|.endmacro -| -|// Decodes varint from [PTR + offset] -> ARG3. -|// Saves new pointer as rax. -|.macro decode_loaded_varint, offset -| // Check for <=2 bytes inline, otherwise jump to 2-10 byte decoder. -| lea rax, [PTR + offset + 1] -| mov ARG3_32, ecx -| and ARG3_32, 0x7f -| test cl, cl -| jns >9 -| lea rax, [PTR + offset + 2] -| movzx esi, ch -| and esi, 0x7f -| shl esi, 7 -| or ARG3_32, esi -| test cx, cx -| jns >9 -| mov ARG1_64, rax -| mov ARG2_32, ARG3_32 -| callp upb_vdecode_max8_fast -| test rax, rax -| jz ->exit_jit // >10-byte varint. -|9: -|.endmacro -| -|.macro decode_varint, offset -| mov ecx, dword [PTR + offset] -| decode_loaded_varint offset -| mov PTR, rax -|.endmacro -| -|// Decode the tag -> edx. -|// Could specialize this by avoiding the value masking: could just key the -|// table on the raw (length-masked) varint to save 3-4 cycles of latency. -|// Currently only support tables where all entries are in the array part. -|.macro dyndispatch_, m -|=>m->jit_dyndispatch_pclabel: -| decode_loaded_varint, 0 -| mov ecx, edx -| shr ecx, 3 -| and edx, 0x7 -| cmp ecx, m->max_field_number // Bounds-check the field. -| ja ->exit_jit // In the future; could be unknown label -|| if ((uintptr_t)m->tablearray < 0xffffffff) { -| mov rax, qword [rcx*8 + m->tablearray] // TODO: support hybrid array/hash tables. -|| } else { -| mov64 rax, (uintptr_t)m->tablearray -| mov rax, qword [rax + rcx*8] -|| } -| jmp rax // Dispatch: unpredictable jump. -|.endmacro -| -|.if 1 -| // Replicated dispatch: larger code, but better branch prediction. -| .define dyndispatch, dyndispatch_ -|.else -| .macro dyndispatch, m -| jmp =>m->jit_dyndispatch_pclabel -| .endmacro -|.endif -| -|// Push a stack frame (not the CPU stack, the upb_decoder stack). 
-|.macro pushframe, f, closure_, end_offset_, is_sequence_ -| lea rax, [FRAME + sizeof(upb_dispatcher_frame)] // rax for shorter addressing. -| cmp rax, qword DECODER->dispatcher.limit -| jae ->exit_jit // Frame stack overflow. -| mov qword FRAME:rax->f, f -| mov qword FRAME:rax->closure, closure_ -| mov dword FRAME:rax->end_ofs, end_offset_ -| mov byte FRAME:rax->is_sequence, is_sequence_ -| mov CLOSURE, rdx -| mov DECODER->dispatcher.top, rax -| mov FRAME, rax -|.endmacro -| -|.macro popframe, m -| sub FRAME, sizeof(upb_dispatcher_frame) -| mov DECODER->dispatcher.top, FRAME -| setmsgend m -| mov CLOSURE, FRAME->closure -|.endmacro -| -|.macro setmsgend, m -| mov rsi, DECODER->jit_end -|| if (m->is_group) { -| mov64 rax, 0xffffffffffffffff -| mov qword DECODER->delim_end, rax -| mov DECODER->effective_end, rsi -|| } else { -| // Could store a correctly-biased version in the frame, at the cost of -| // a larger stack. -| mov eax, dword FRAME->end_ofs -| add rax, qword DECODER->buf -| mov DECODER->delim_end, rax // delim_end = d->buf + f->end_ofs -| cmp rax, rsi -| jb >8 -| mov rax, rsi // effective_end = min(d->delim_end, d->jit_end) -|8: -| mov DECODER->effective_end, rax -|| } -|.endmacro -| -|// rax contains the tag, compare it against "tag", but since it is a varint -|// we must only compare as many bytes as actually have data. -|.macro checktag, tag -|| switch (upb_value_size(tag)) { -|| case 1: -| cmp cl, tag -|| break; -|| case 2: -| cmp cx, tag -|| break; -|| case 3: -| and ecx, 0xffffff // 3 bytes -| cmp rcx, tag -|| case 4: -| cmp ecx, tag -|| break; -|| case 5: -| mov64 rdx, 0xffffffffff // 5 bytes -| and rcx, rdx -| cmp rcx, tag -|| break; -|| default: abort(); -|| } -|.endmacro -| -|// TODO: optimize for 0 (xor) and 32-bits. 
-|.macro loadfval, f -|| if (f->fval.val.uint64 == 0) { -| xor ARG2_32, ARG2_32 -|| } else if (f->fval.val.uint64 < 0xffffffff) { -| mov ARG2_32, f->fval.val.uint64 -|| } else { -| mov64 ARG2_64, f->fval.val.uint64 -|| } -|.endmacro -| -|.macro sethas, reg, hasbit -|| if (hasbit >= 0) { -| or byte [reg + (hasbit / 8)], (1 << (hasbit % 8)) -|| } -|.endmacro - - -#include -#include "upb/pb/varint.h" -#include "upb/msg.h" - -// Decodes the next val into ARG3, advances PTR. -static void upb_decoder_jit_decodefield(upb_decoder *d, upb_mhandlers *m, - uint8_t type, size_t tag_size) { - // Decode the value into arg 3 for the callback. - switch (type) { - case UPB_TYPE(DOUBLE): - case UPB_TYPE(FIXED64): - case UPB_TYPE(SFIXED64): - | mov ARG3_64, qword [PTR + tag_size] - | add PTR, 8 + tag_size - break; - - case UPB_TYPE(FLOAT): - case UPB_TYPE(FIXED32): - case UPB_TYPE(SFIXED32): - | mov ARG3_32, dword [PTR + tag_size] - | add PTR, 4 + tag_size - break; - - case UPB_TYPE(BOOL): - // Can't assume it's one byte long, because bool must be wire-compatible - // with all of the varint integer types. - | decode_varint tag_size - | test ARG3_64, ARG3_64 - | setne ARG3_8 // Other bytes left with val, should be ok. - break; - - case UPB_TYPE(INT64): - case UPB_TYPE(UINT64): - case UPB_TYPE(INT32): - case UPB_TYPE(UINT32): - case UPB_TYPE(ENUM): - | decode_varint tag_size - break; - - case UPB_TYPE(SINT64): - // 64-bit zig-zag decoding. - | decode_varint tag_size - | mov rax, ARG3_64 - | shr ARG3_64, 1 - | and rax, 1 - | neg rax - | xor ARG3_64, rax - break; - - case UPB_TYPE(SINT32): - // 32-bit zig-zag decoding. - | decode_varint tag_size - | mov eax, ARG3_32 - | shr ARG3_32, 1 - | and eax, 1 - | neg eax - | xor ARG3_32, eax - break; - - case UPB_TYPE(STRING): - case UPB_TYPE(BYTES): - // We only handle the case where the entire string is in our current - // buf, which sidesteps any security problems. The C path has more - // robust checks. 
- | mov ecx, dword [PTR + tag_size] - | decode_loaded_varint tag_size - | mov rdi, rax - | add rdi, ARG3_64 - | mov STRREF->len, ARG3_32 - | mov STRREF->ptr, rax - | sub rax, DECODER->buf - | add eax, DECODER->bufstart_ofs // = d->ptr - d->buf + d->bufstart_ofs - | mov STRREF->stream_offset, eax - | mov ARG3_64, STRREF - | cmp rdi, DECODER->effective_end - | ja ->exit_jit // Can't deliver, whole string not in buf. - | mov PTR, rdi - break; - - case UPB_TYPE_ENDGROUP: // A pseudo-type. - | add PTR, tag_size - | jmp =>m->jit_endofmsg_pclabel - return; - - // Will dispatch callbacks and call submessage in a second. - case UPB_TYPE(MESSAGE): - | decode_varint tag_size - break; - case UPB_TYPE(GROUP): - | add PTR, tag_size - break; - - default: abort(); - } -} - -#if 0 -// These appear not to speed things up, but keeping around for -// further experimentation. -static void upb_decoder_jit_doappend(upb_decoder *d, uint8_t size, - upb_fhandlers *f) { - | mov eax, STDARRAY:ARG1_64->len - | cmp eax, STDARRAY:ARG1_64->size - | jne >2 - // If array is full, fall back to actual function. - | loadfval f - | callp f->value - | jmp >3 - |2: - | mov rcx, STDARRAY:ARG1_64->ptr - | mov esi, eax - | add eax, 1 - - switch (size) { - case 8: - | mov [rcx + rsi * 8], ARG3_64 - break; - - case 4: - | mov [rcx + rsi * 4], ARG3_32 - break; - - case 1: - | mov [rcx + rsi * 4], ARG3_8 - break; - } - - | mov STDARRAY:ARG1_64->len, eax - |3: -} -#endif - -static void upb_decoder_jit_callcb(upb_decoder *d, upb_fhandlers *f) { - const upb_fielddef *fd = upb_value_getfielddef(f->fval); - // Call callbacks. - if (upb_issubmsgtype(f->type)) { - // Load closure and fval into arg registers. - | mov ARG1_64, CLOSURE - | loadfval f - - // Call startsubmsg handler (if any). 
- if (f->startsubmsg) { - // upb_sflow_t startsubmsg(void *closure, upb_value fval) - | mov r12d, ARG3_32 - | callp f->startsubmsg - } else { - | mov rdx, CLOSURE - | mov r12d, ARG3_32 - } - if (f->type == UPB_TYPE(MESSAGE)) { - | mov rsi, PTR - | sub rsi, DECODER->buf - | add esi, r12d // = (d->ptr - d->buf) + delim_len - } else { - assert(f->type == UPB_TYPE(GROUP)); - | mov esi, UPB_NONDELIMITED - } - | pushframe f, rdx, esi, false - - const upb_mhandlers *sub_m = upb_fhandlers_getsubmsg(f); - if (sub_m->jit_parent_field_done_pclabel != UPB_MULTIPLE) { - | jmp =>sub_m->jit_startmsg_pclabel; - } else { - | call =>sub_m->jit_startmsg_pclabel; - } - - |=>f->jit_submsg_done_pclabel: - | popframe upb_fhandlers_getmsg(f) - - // Call endsubmsg handler (if any). - if (f->endsubmsg) { - // upb_flow_t endsubmsg(void *closure, upb_value fval); - | mov ARG1_64, CLOSURE - | loadfval f - | callp f->endsubmsg - } - } else { - | mov ARG1_64, CLOSURE - // Test for callbacks we can specialize. - // Can't switch() on function pointers. - if (f->value == &upb_stdmsg_setint64 || - f->value == &upb_stdmsg_setuint64 || - f->value == &upb_stdmsg_setptr || - f->value == &upb_stdmsg_setdouble) { - | mov [ARG1_64 + fd->offset], ARG3_64 - } else if (f->value == &upb_stdmsg_setint32 || - f->value == &upb_stdmsg_setuint32 || - f->value == &upb_stdmsg_setfloat) { - | mov [ARG1_64 + fd->offset], ARG3_32 - } else if (f->value == &upb_stdmsg_setbool) { - | mov [ARG1_64 + fd->offset], ARG3_8 -#if 0 - // These appear not to speed things up, but keeping around for - // further experimentation. 
- } else if (f->value == &upb_stdmsg_setint64_r || - f->value == &upb_stdmsg_setuint64_r || - f->value == &upb_stdmsg_setptr_r || - f->value == &upb_stdmsg_setdouble_r) { - upb_decoder_jit_doappend(d, 8, f); - } else if (f->value == &upb_stdmsg_setint32_r || - f->value == &upb_stdmsg_setuint32_r || - f->value == &upb_stdmsg_setfloat_r) { - upb_decoder_jit_doappend(d, 4, f); - } else if (f->value == &upb_stdmsg_setbool_r) { - upb_decoder_jit_doappend(d, 1, f); -#endif - } else { - // Load closure and fval into arg registers. - | loadfval f - | callp f->value - } - | sethas CLOSURE, f->valuehasbit - } - // TODO: Handle UPB_SKIPSUBMSG, UPB_BREAK -} - -// PTR should point to the beginning of the tag. -static void upb_decoder_jit_field(upb_decoder *d, uint32_t tag, uint32_t next_tag, - upb_mhandlers *m, - upb_fhandlers *f, upb_fhandlers *next_f) { - // PC-label for the dispatch table. - // We check the wire type (which must be loaded in edx) because the - // table is keyed on field number, not type. - |=>f->jit_pclabel: - | cmp edx, (tag & 0x7) - | jne ->exit_jit // In the future: could be an unknown field or packed. - |=>f->jit_pclabel_notypecheck: - if (f->repeated) { - if (f->startseq) { - | mov ARG1_64, CLOSURE - | loadfval f - | callp f->startseq - } else { - | mov rdx, CLOSURE - } - | mov esi, FRAME->end_ofs - | pushframe f, rdx, esi, true - } - - |1: // Label for repeating this field. - - upb_decoder_jit_decodefield(d, m, f->type, upb_value_size(tag)); - upb_decoder_jit_callcb(d, f); - - // Epilogue: load next tag, check for repeated field. - | check_eob m - | mov rcx, qword [PTR] - if (f->repeated) { - | checktag tag - | je <1 - | popframe m - if (f->endseq) { - | mov ARG1_64, CLOSURE - | loadfval f - | callp f->endseq - } - } - if (next_tag != 0) { - | checktag next_tag - | je =>next_f->jit_pclabel_notypecheck - } - - // Fall back to dynamic dispatch. 
- | dyndispatch m - |1: -} - -static int upb_compare_uint32(const void *a, const void *b) { - // TODO: always put ENDGROUP at the end. - return *(uint32_t*)a - *(uint32_t*)b; -} - -static void upb_decoder_jit_msg(upb_decoder *d, upb_mhandlers *m) { - |=>m->jit_startmsg_pclabel: - - if (m->jit_parent_field_done_pclabel == UPB_MULTIPLE) { - // There was a call to get here, so we need to align the stack. - | sub rsp, 8 - } - // Call startmsg handler (if any): - if (m->startmsg) { - // upb_flow_t startmsg(void *closure); - | mov ARG1_64, FRAME->closure - | callp m->startmsg - // TODO: Handle UPB_SKIPSUBMSG, UPB_BREAK - } - - | setmsgend m - | check_eob m - | mov ecx, dword [PTR] - | dyndispatch_ m - - // --------- New code section (does not fall through) ------------------------ - - // Emit code for parsing each field (dynamic dispatch contains pointers to - // all of these). - - // Create an ordering over the fields (inttable ordering is undefined). - int num_keys = upb_inttable_count(&m->fieldtab); - uint32_t *keys = malloc(num_keys * sizeof(*keys)); - int idx = 0; - for(upb_inttable_iter i = upb_inttable_begin(&m->fieldtab); !upb_inttable_done(i); - i = upb_inttable_next(&m->fieldtab, i)) { - keys[idx++] = upb_inttable_iter_key(i); - } - qsort(keys, num_keys, sizeof(uint32_t), &upb_compare_uint32); - - upb_fhandlers *last_f = NULL; - uint32_t last_tag = 0; - for(int i = 0; i < num_keys; i++) { - uint32_t key = keys[i]; - upb_fhandlers *f = upb_inttable_lookup(&m->fieldtab, key); - uint32_t tag = upb_vencode32(key); - if (last_f) upb_decoder_jit_field(d, last_tag, tag, m, last_f, f); - last_tag = tag; - last_f = f; - } - upb_decoder_jit_field(d, last_tag, 0, m, last_f, NULL); - - free(keys); - - // --------- New code section (does not fall through) ------------------------ - - // End-of-buf / end-of-message. - if (!m->is_group) { - // This case doesn't exist for groups, because there eob really means - // eob, so that case just exits the jit directly. 
- |=>m->jit_endofbuf_pclabel: - | cmp PTR, DECODER->delim_end - | jb ->exit_jit // We are at eob, but not end-of-submsg. - } - - |=>m->jit_endofmsg_pclabel: - // We are at end-of-submsg: call endmsg handler (if any): - if (m->endmsg) { - // void endmsg(void *closure, upb_status *status) { - | mov ARG1_64, FRAME->closure - | lea ARG2_64, DECODER->dispatcher.status - | callp m->endmsg - } - - if (m->jit_parent_field_done_pclabel == UPB_MULTIPLE) { - // Counter previous alignment. - | add rsp, 8 - | ret - } else if (m->jit_parent_field_done_pclabel == UPB_TOPLEVEL_ONE) { - | jmp ->exit_jit - } else { - | jmp =>m->jit_parent_field_done_pclabel - } - -} - -static const char *dbgfmt = - "JIT encountered unknown field! wt=%d, fn=%d\n"; - -static void upb_decoder_jit(upb_decoder *d) { - | push rbp - | mov rbp, rsp - | push r15 - | push r14 - | push r13 - | push r12 - | push rbx - // Align stack. - | sub rsp, 8 - | mov DECODER, ARG1_64 - | mov FRAME, DECODER:ARG1_64->dispatcher.top - | lea STRREF, DECODER:ARG1_64->strref - | mov CLOSURE, FRAME->closure - | mov PTR, DECODER->ptr - - upb_handlers *h = d->dispatcher.handlers; - if (h->msgs[0]->jit_parent_field_done_pclabel == UPB_MULTIPLE) { - | call =>h->msgs[0]->jit_startmsg_pclabel - | jmp ->exit_jit - } - - // TODO: push return addresses for re-entry (will be necessary for multiple - // buffer support). - for (int i = 0; i < h->msgs_len; i++) upb_decoder_jit_msg(d, h->msgs[i]); - - |->exit_jit: - | mov DECODER->ptr, PTR - // Counter previous alignment. 
- | add rsp, 8 - | pop rbx - | pop r12 - | pop r13 - | pop r14 - | pop r15 - | leave - | ret - |=>0: - | mov rdi, stderr - | mov rsi, dbgfmt - | callp fprintf - | callp abort -} - -void upb_decoder_jit_assignfieldlabs(upb_fhandlers *f, - uint32_t *pclabel_count) { - f->jit_pclabel = (*pclabel_count)++; - f->jit_pclabel_notypecheck = (*pclabel_count)++; - f->jit_submsg_done_pclabel = (*pclabel_count)++; -} - -void upb_decoder_jit_assignmsglabs(upb_mhandlers *m, uint32_t *pclabel_count) { - m->jit_startmsg_pclabel = (*pclabel_count)++; - m->jit_endofbuf_pclabel = (*pclabel_count)++; - m->jit_endofmsg_pclabel = (*pclabel_count)++; - m->jit_dyndispatch_pclabel = (*pclabel_count)++; - m->jit_unknownfield_pclabel = (*pclabel_count)++; - m->jit_parent_field_done_pclabel = UPB_NONE; - m->max_field_number = 0; - upb_inttable_iter i; - for(i = upb_inttable_begin(&m->fieldtab); !upb_inttable_done(i); - i = upb_inttable_next(&m->fieldtab, i)) { - uint32_t key = upb_inttable_iter_key(i); - m->max_field_number = UPB_MAX(m->max_field_number, key); - upb_fhandlers *f = upb_inttable_iter_value(i); - upb_decoder_jit_assignfieldlabs(f, pclabel_count); - } - // XXX: Won't work for large field numbers; will need to use a upb_table. - m->tablearray = malloc((m->max_field_number + 1) * sizeof(void*)); -} - -// Second pass: for messages that have only one parent, link them to the field -// from which they are called. 
-void upb_decoder_jit_assignmsglabs2(upb_mhandlers *m) { - upb_inttable_iter i; - for(i = upb_inttable_begin(&m->fieldtab); !upb_inttable_done(i); - i = upb_inttable_next(&m->fieldtab, i)) { - upb_fhandlers *f = upb_inttable_iter_value(i); - if (upb_issubmsgtype(f->type)) { - upb_mhandlers *sub_m = upb_fhandlers_getsubmsg(f); - if (sub_m->jit_parent_field_done_pclabel == UPB_NONE) { - sub_m->jit_parent_field_done_pclabel = f->jit_submsg_done_pclabel; - } else { - sub_m->jit_parent_field_done_pclabel = UPB_MULTIPLE; - } - } - } -} - -void upb_decoder_makejit(upb_decoder *d) { - d->debug_info = NULL; - - // Assign pclabels. - uint32_t pclabel_count = 1; - upb_handlers *h = d->dispatcher.handlers; - for (int i = 0; i < h->msgs_len; i++) - upb_decoder_jit_assignmsglabs(h->msgs[i], &pclabel_count); - for (int i = 0; i < h->msgs_len; i++) - upb_decoder_jit_assignmsglabs2(h->msgs[i]); - - if (h->msgs[0]->jit_parent_field_done_pclabel == UPB_NONE) { - h->msgs[0]->jit_parent_field_done_pclabel = UPB_TOPLEVEL_ONE; - } - - void **globals = malloc(UPB_JIT_GLOBAL__MAX * sizeof(*globals)); - dasm_init(d, 1); - dasm_setupglobal(d, globals, UPB_JIT_GLOBAL__MAX); - dasm_growpc(d, pclabel_count); - dasm_setup(d, upb_jit_actionlist); - - upb_decoder_jit(d); - - dasm_link(d, &d->jit_size); - - d->jit_code = mmap(NULL, d->jit_size, PROT_READ | PROT_WRITE, - MAP_32BIT | MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); - - upb_reg_jit_gdb(d); - - dasm_encode(d, d->jit_code); - - // Create dispatch tables. - for (int i = 0; i < h->msgs_len; i++) { - upb_mhandlers *m = h->msgs[i]; - for (uint32_t j = 0; j <= m->max_field_number; j++) { - upb_fhandlers *f = NULL; - for (int k = 0; k < 8; k++) { - f = upb_inttable_lookup(&m->fieldtab, (j << 3) | k); - if (f) break; - } - if (f) { - m->tablearray[j] = d->jit_code + dasm_getpclabel(d, f->jit_pclabel); - } else { - // Don't handle unknown fields yet. 
- m->tablearray[j] = d->jit_code + dasm_getpclabel(d, 0); - } - } - } - - dasm_free(d); - free(globals); - - mprotect(d->jit_code, d->jit_size, PROT_EXEC | PROT_READ); - - // View with: objdump -M intel -D -b binary -mi386 -Mx86-64 /tmp/machine-code - // Or: ndisasm -b 64 /tmp/machine-code - FILE *f = fopen("/tmp/machine-code", "wb"); - fwrite(d->jit_code, d->jit_size, 1, f); - fclose(f); -} - -void upb_decoder_freejit(upb_decoder *d) { - munmap(d->jit_code, d->jit_size); - free(d->debug_info); - // TODO: unregister -} -- cgit v1.2.3