summaryrefslogtreecommitdiff
path: root/upb/pb/decoder_x64.dasc
blob: cd09cfeb99f6a8ffde6ef77c90d7208f2f8a20e2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
|//
|// upb - a minimalist implementation of protocol buffers.
|//
|// Copyright (c) 2011 Google Inc.  See LICENSE for details.
|// Author: Josh Haberman <jhaberman@gmail.com>
|//
|// JIT compiler for upb_decoder on x86.  Given a upb_decoderplan object (which
|// contains an embedded set of upb_handlers), generates code specialized to
|// parsing the specific message and calling specific handlers.
|//
|// Since the JIT can call other functions (the JIT'ted code is not a leaf
|// function) we must respect alignment rules.  All x86-64 systems require
|// 16-byte stack alignment.

#include <stdio.h>
#include <sys/mman.h>
#include "dynasm/dasm_x86.h"

#ifndef MAP_ANONYMOUS
# define MAP_ANONYMOUS MAP_ANON
#endif

// We map into the low 32 bits when we can, but if this is not available
// (like on OS X) we take what we can get.  It's not required for correctness,
// it's just a performance thing that makes it more likely that our jumps
// can be rel32 (i.e. within 32-bits of our pc) instead of the longer
// sequence required for other jumps (see callp).
#ifndef MAP_32BIT
#define MAP_32BIT 0
#endif

// These are used to track jump targets for messages and fields.
// Per-message pc-labels.  Each message (upb_handlers) reserves
// TOTAL_MSG_PCLABELS consecutive DynASM pc-labels; these values are the
// offsets passed as "n" to upb_getpclabel().
enum {
  // Entry point that first calls the startmsg handler (if any).
  STARTMSG = 0,
  // Entry point that jumps past the startmsg handler call.
  AFTER_STARTMSG = 1,
  // Target of "checkpoint" when PTR has reached DECODER->effective_end.
  ENDOFBUF = 2,
  // Calls the endmsg handler (if any) and returns from the message's code.
  ENDOFMSG = 3,
  // Start of the dynamic (table-based) dispatch sequence.
  DYNDISPATCH = 4,
  // Number of labels reserved per message.
  TOTAL_MSG_PCLABELS = 5,
};

// Per-field pc-labels.  Each field (upb_fielddef) reserves
// TOTAL_FIELD_PCLABELS consecutive DynASM pc-labels.
enum {
  // Entry point that first checks the wire type held in edx.
  FIELD = 0,
  // Entry point located just after the wire type check.
  FIELD_NO_TYPECHECK = 1,
  // Number of labels reserved per field.
  TOTAL_FIELD_PCLABELS = 2,
};

// Per-message JIT bookkeeping, stored in plan->msginfo keyed by the
// message's upb_handlers pointer.
typedef struct {
  // Largest field number covered by tablearray (capped at 16000 when the
  // table is allocated).
  uint32_t max_field_number;
  // Currently keyed on field number.  Could also try keying it
  // on encoded or decoded tag, or on encoded field number.
  // Holds max_field_number + 1 code addresses, indexed by field number.
  void **tablearray;
  // Pointer to the JIT code for parsing this message.
  void *jit_func;
} upb_jitmsginfo;

// Returns the DynASM pc-label number for label kind "n" of "obj".
//
// "obj" is the upb_handlers or upb_fielddef pointer that was registered in
// plan->pclabels; the stored value is the first label of its reserved block
// and "n" is the offset within that block.  "obj" must have been registered.
static uint32_t upb_getpclabel(upb_decoderplan *plan, const void *obj, int n) {
  const upb_value *base = upb_inttable_lookupptr(&plan->pclabels, obj);
  assert(base);
  uint32_t first_label = upb_value_getuint32(*base);
  return first_label + n;
}

// Looks up the upb_jitmsginfo registered in plan->msginfo for handlers "h".
// "h" must have been registered.
static upb_jitmsginfo *upb_getmsginfo(upb_decoderplan *plan,
                                      const upb_handlers *h) {
  const upb_value *entry = upb_inttable_lookupptr(&plan->msginfo, h);
  assert(entry);
  upb_jitmsginfo *info = upb_value_getptr(*entry);
  return info;
}

// To debug JIT-ted code with GDB we need to tell GDB about the JIT-ted code
// at runtime.  GDB 7.x+ has defined an interface for doing this, and these
// structure/function definitions are copied out of gdb/jit.h
//
// We need to give GDB an ELF file at runtime describing the symbols we have
// generated.  To avoid implementing the ELF format, we generate an ELF file
// at compile-time and compile it in as a character string.  We can replace
// a few key constants (address of JIT-ted function and its size) by looking
// for a few magic numbers and doing a dumb string replacement.

#ifndef __APPLE__
// Template ELF image, pulled in as a byte array from the included header.
// upb_reg_jit_gdb() copies it and patches the magic placeholder values.
const unsigned char upb_jit_debug_elf_file[] = {
#include "upb/pb/jit_debug_elf_file.h"
};

// Values stored in gdb_jit_descriptor.action_flag.
typedef enum
{
  GDB_JIT_NOACTION = 0,
  GDB_JIT_REGISTER,
  GDB_JIT_UNREGISTER
} jit_actions_t;

// One registered symbol file; entries form a doubly-linked list headed by
// gdb_jit_descriptor.first_entry.
typedef struct gdb_jit_entry {
  struct gdb_jit_entry *next_entry;
  struct gdb_jit_entry *prev_entry;
  // In-memory symbol file (here: the patched ELF image) and its size in bytes.
  const char *symfile_addr;
  uint64_t symfile_size;
} gdb_jit_entry;

// Descriptor through which registered entries are published to the debugger.
typedef struct {
  uint32_t version;
  // One of the jit_actions_t values.
  uint32_t action_flag;
  // Entry the current action applies to.
  gdb_jit_entry *relevant_entry;
  // Head of the list of registered entries.
  gdb_jit_entry *first_entry;
} gdb_jit_descriptor;

// The single descriptor instance: version 1, no pending action, empty list.
// The symbol name is part of the GDB JIT interface and must not change.
gdb_jit_descriptor __jit_debug_descriptor = {1, GDB_JIT_NOACTION, NULL, NULL};

// Called after __jit_debug_descriptor has been updated.  The body is only an
// empty volatile asm statement; together with noinline this keeps the
// function and calls to it from being optimized away.  Per the GDB JIT
// interface, the debugger hooks this symbol to notice new registrations.
void __attribute__((noinline)) __jit_debug_register_code() {
  __asm__ __volatile__("");
}

// Registers the JIT code of "plan" with GDB.
//
// Copies the template ELF image into plan->debug_info, patches the magic
// placeholder words with the address and size of the generated code, links a
// new gdb_jit_entry at the head of the descriptor's list and notifies the
// debugger.  plan->jit_code and plan->jit_size must already be set.
//
// Registration is a debugging aid only: if memory cannot be allocated the
// function returns without registering.  plan->debug_info then holds either
// NULL or the unregistered copy of the image.
void upb_reg_jit_gdb(upb_decoderplan *plan) {
  // Create debug info.
  size_t elf_len = sizeof(upb_jit_debug_elf_file);
  plan->debug_info = malloc(elf_len);
  if (!plan->debug_info) return;
  memcpy(plan->debug_info, upb_jit_debug_elf_file, elf_len);

  // Scan the image one 8-byte word at a time (at 8-byte offsets) for the
  // placeholders.  Words are read and written with memcpy so the byte buffer
  // is never accessed through a uint64_t lvalue.
  char *image = (char*)plan->debug_info;
  for (size_t ofs = 0; ofs + sizeof(uint64_t) <= elf_len;
       ofs += sizeof(uint64_t)) {
    uint64_t word;
    memcpy(&word, image + ofs, sizeof(word));
    if (word == 0x12345678) {
      word = (uintptr_t)plan->jit_code;
    } else if (word == 0x321) {
      word = plan->jit_size;
    } else {
      continue;
    }
    memcpy(image + ofs, &word, sizeof(word));
  }

  // Register the JIT-ted code with GDB.
  gdb_jit_entry *e = malloc(sizeof(gdb_jit_entry));
  if (!e) return;
  e->next_entry = __jit_debug_descriptor.first_entry;
  e->prev_entry = NULL;
  if (e->next_entry) e->next_entry->prev_entry = e;
  e->symfile_addr = plan->debug_info;
  e->symfile_size = elf_len;
  __jit_debug_descriptor.first_entry = e;
  __jit_debug_descriptor.relevant_entry = e;
  __jit_debug_descriptor.action_flag = GDB_JIT_REGISTER;
  __jit_debug_register_code();
}

#else

// GDB registration is compiled out when __APPLE__ is defined; this stub keeps
// the call site in upb_decoderplan_makejit() unconditional.
void upb_reg_jit_gdb(upb_decoderplan *plan) {
  (void)plan;
}

#endif

// Has to be a separate function, otherwise GCC will complain about
// expressions like (&foo != NULL) because they will never evaluate
// to false.
static void upb_assert_notnull(void *addr) {
  // Keeps "addr" referenced when NDEBUG compiles the assert away.
  (void)addr;
  assert(addr != NULL);
}

|// DynASM setup.  upb_jit_actionlist is the action list handed to
|// dasm_setup(); global labels (such as ->exit_jit) get enum names with the
|// UPB_JIT_GLOBAL_ prefix.
|.arch x64
|.actionlist upb_jit_actionlist
|.globals UPB_JIT_GLOBAL_
|.globalnames upb_jit_globalnames
|
|// Calling conventions.  Note -- this will need to be changed for
|// Windows, which uses a different calling convention!
|// ARGn_* name the register (at the given width) that carries the n-th
|// integer argument of a called C function; XMMARG1 carries a float/double.
|.define ARG1_64,   rdi
|.define ARG2_8,    sil
|.define ARG2_32,   esi
|.define ARG2_64,   rsi
|.define ARG3_8,    dl
|.define ARG3_32,   edx
|.define ARG3_64,   rdx
|.define ARG4_64,   rcx
|.define XMMARG1,   xmm0

|
|// Register allocation / type map.
|// ALL of the code in this file uses these register allocations.
|// When we "call" within this file, we do not use regular calling
|// conventions, but of course when calling to user callbacks we must.
|.define PTR,       rbx  // Writing this to DECODER->ptr commits our progress.
|.define CLOSURE,   r12
|.type   SINKFRAME, upb_sink_frame, r13
|.type   FRAME,     upb_decoder_frame, r14
|.type   DECODER,   upb_decoder, r15
|
|// Emits a call to the C function at "addr", which must be non-NULL and known
|// at code-generation time.  Emits a direct call when the absolute address is
|// below 0xffffffff; otherwise loads the address into rax (clobbering it) and
|// calls through the register.
|.macro callp, addr
|| upb_assert_notnull(addr);
|// TODO(haberman): fix this.  I believe the predicate we should actually be
|// testing is whether the jump distance is greater than INT32_MAX, not the
|// absolute address of the target.
|| if ((uintptr_t)addr < 0xffffffff) {
     |  call   &addr
|| } else {
     |  mov64  rax, (uintptr_t)addr
     |  call   rax
|| }
|.endmacro
|
|// Checkpoints our progress by writing PTR to DECODER, and
|// checks for end-of-buffer.
|// "h" is the upb_handlers of the current message; when PTR has reached
|// DECODER->effective_end, control branches to that message's ENDOFBUF label.
|.macro checkpoint, h
|  mov   DECODER->ptr, PTR
|  cmp   PTR, DECODER->effective_end
|  jae   =>upb_getpclabel(plan, h, ENDOFBUF)
|.endmacro
|
|// Leaves the JIT (jumps to ->exit_jit) if the low byte of the return
|// register (al) is zero; used after handlers that return a boolean.
|.macro check_bool_ret
|  test  al, al
|  jz    ->exit_jit
|.endmacro
|
|// Leaves the JIT (jumps to ->exit_jit) if the 64-bit return register (rax)
|// is zero; used after calls that return a pointer.
|.macro check_ptr_ret
|  test  rax, rax
|  jz    ->exit_jit
|.endmacro
|
|// Decodes varint from [PTR + offset] -> ARG3.
|// Saves new pointer as rax.
|// Precondition: ecx already holds the dword loaded from [PTR + offset].
|// One- and two-byte varints are decoded inline; esi is clobbered on the
|// two-byte path.  Longer varints are handed to upb_vdecode_max8_fast with
|// the pointer past the first two bytes in ARG1 and the partial value in ARG2;
|// a zero rax after that call exits the JIT.
|// NOTE(review): the slow path presumably relies on upb_vdecode_max8_fast
|// returning the new pointer in rax and the value in rdx -- confirm against
|// its declaration in upb/pb/varint.h.
|.macro decode_loaded_varint, offset
|  // Check for <=2 bytes inline, otherwise jump to 2-10 byte decoder.
|  lea    rax, [PTR + offset + 1]
|  mov    ARG3_32, ecx
|  and    ARG3_32, 0x7f
|  test   cl, cl
|  jns    >9
|  lea    rax, [PTR + offset + 2]
|  movzx  esi, ch
|  and    esi, 0x7f
|  shl    esi, 7
|  or     ARG3_32, esi
|  test   cx, cx
|  jns    >9
|  mov    ARG1_64, rax
|  mov    ARG2_32, ARG3_32
|  callp  upb_vdecode_max8_fast
|  check_ptr_ret  // Check for unterminated, >10-byte varint.
|9:
|.endmacro
|
|// Loads the dword at [PTR + offset] into ecx, decodes the varint found there
|// into ARG3 (see decode_loaded_varint) and advances PTR past it.
|.macro decode_varint, offset
|  mov    ecx, dword [PTR + offset]
|  decode_loaded_varint offset
|  mov    PTR, rax
|.endmacro
|
|// Decode the tag -> edx.
|// Could specialize this by avoiding the value masking: could just key the
|// table on the raw (length-masked) varint to save 3-4 cycles of latency.
|// Currently only support tables where all entries are in the array part.
|//
|// Precondition: ecx holds the dword at [PTR] (the start of the tag).
|// After decoding, ecx = field number and edx = wire type.  A field number
|// above mi->max_field_number exits the JIT; otherwise control jumps through
|// the message's tablearray.  An END_GROUP tag is checked against
|// FRAME->group_fieldnum and, if it matches, commits PTR and jumps to the
|// message's ENDOFMSG label.
|.macro dyndispatch_, h
|=>upb_getpclabel(plan, h, DYNDISPATCH):
|  decode_loaded_varint, 0
|  mov  ecx, edx
|  shr  ecx, 3
|  and  edx, 0x7   // Note: this value is used in the FIELD pclabel below.
|  cmp  edx, UPB_WIRE_TYPE_END_GROUP
|  je   >1
|| upb_jitmsginfo *mi = upb_getmsginfo(plan, h);
|  cmp  ecx, mi->max_field_number  // Bounds-check the field.
|  ja   ->exit_jit                 // In the future; could be unknown label
|// The table address is embedded as a displacement when it is below
|// 0xffffffff; otherwise it is first loaded into rax.
|| if ((uintptr_t)mi->tablearray < 0xffffffff) {
|    // TODO: support hybrid array/hash tables.
|    mov  rax, qword [rcx*8 + mi->tablearray]
|| } else {
|    mov64  rax, (uintptr_t)mi->tablearray
|    mov  rax, qword [rax + rcx*8]
|| }
|  jmp  rax  // Dispatch: unpredictable jump.
|1:
|// End group.
|  cmp  ecx, FRAME->group_fieldnum
|  jne  ->exit_jit         // Unexpected END_GROUP tag.
|  mov  PTR, rax   // rax came from decode_loaded_varint
|  mov  DECODER->ptr, PTR
|  jmp  =>upb_getpclabel(plan, h, ENDOFMSG)
|.endmacro
|
|// Selects what "dyndispatch" expands to: the full dispatch sequence at every
|// use (enabled), or a jump to the message's DYNDISPATCH label.
|.if 1
|  // Replicated dispatch: larger code, but better branch prediction.
|  .define dyndispatch, dyndispatch_
|.else
|  // Single dispatch: smaller code, could be faster because of reduced
|  // icache usage.  We keep this around to allow for easy comparison between
|  // the two.
|  .macro dyndispatch, h
|    jmp =>upb_getpclabel(plan, h, DYNDISPATCH)
|  .endmacro
|.endif
|
|// Push a stack frame (not the CPU stack, the upb_decoder stack).
|//
|// h, field and endtype are C expressions evaluated while generating code;
|// end_offset_ is an assembly operand stored into the new frame's end_ofs.
|// Pushes one upb_decoder_frame and one upb_sink_frame, updating FRAME,
|// SINKFRAME, DECODER->top and DECODER->sink.top.  Exits the JIT if either
|// stack would reach its limit.  Clobbers rax, rcx, r9 and r10.
|.macro pushframe, h, field, end_offset_, endtype
|// Decoder Frame.
|  lea   rax, [FRAME + sizeof(upb_decoder_frame)]  // rax for short addressing
|  cmp   rax, DECODER->limit
|  jae   ->exit_jit  // Frame stack overflow.
|  mov64 r10, (uintptr_t)field
|  mov   FRAME:rax->f, r10
|  mov   qword FRAME:rax->end_ofs, end_offset_
|  mov   byte FRAME:rax->is_sequence, (endtype == UPB_HANDLER_ENDSEQ)
|  mov   byte FRAME:rax->is_packed, 0
|// Only a group submessage frame records its field number; every other frame
|// stores 0xffffffff in group_fieldnum.
|| if (upb_fielddef_type(field) == UPB_TYPE_GROUP &&
||     endtype == UPB_HANDLER_ENDSUBMSG) {
|    mov dword FRAME:rax->group_fieldnum, upb_fielddef_number(field)
|| } else {
|    mov dword FRAME:rax->group_fieldnum, 0xffffffff
|| }
|  mov   DECODER->top, rax
|  mov   FRAME, rax
|// Sink Frame.
|  lea   rcx, [SINKFRAME + sizeof(upb_sink_frame)]  // rcx for short addressing
|  cmp   rcx, DECODER->sink.limit
|  jae   ->exit_jit  // Frame stack overflow.
|  mov   dword SINKFRAME:rcx->end, getselector(field, endtype)
|// The new sink frame uses the sub-handlers for submessage fields and the
|// current handlers otherwise.
|| if (upb_fielddef_issubmsg(field)) {
|    mov64 r9, (uintptr_t)upb_handlers_getsubhandlers(h, field)
|| } else {
|    mov64 r9, (uintptr_t)h
|| }
|  mov   SINKFRAME:rcx->h, r9
|  mov   DECODER->sink.top, rcx
|  mov   SINKFRAME, rcx
|.endmacro
|
|// Pops one decoder frame and one sink frame, recomputes
|// DECODER->effective_end for the frame that is now on top (setmsgend) and
|// reloads CLOSURE from the new top sink frame.  Clobbers rax and rsi.
|.macro popframe
|  sub   FRAME, sizeof(upb_decoder_frame)
|  mov   DECODER->top, FRAME
|  sub   SINKFRAME, sizeof(upb_sink_frame)
|  mov   DECODER->sink.top, SINKFRAME
|  setmsgend
|  mov   CLOSURE, SINKFRAME->closure
|.endmacro
|
|// Sets DECODER->effective_end to the smaller of DECODER->jit_end and the
|// buffer address corresponding to FRAME->end_ofs; jit_end is used when the
|// address computation carries out of 64 bits.  Clobbers rax and rsi.
|.macro setmsgend
|  mov   rsi, DECODER->jit_end
|  mov   rax, qword FRAME->end_ofs  // Will be UINT64_MAX for groups.
|  sub   rax, qword DECODER->bufstart_ofs
|  add   rax, qword DECODER->buf  // rax = d->buf + f->end_ofs - d->bufstart_ofs
|  jc    >8        // If the addition overflowed, use jit_end
|  cmp   rax, rsi
|  ja    >8        // If jit_end is less, use jit_end
|  mov   rsi, rax  // Use frame end.
|8:
|  mov   DECODER->effective_end, rsi
|.endmacro
|
|// rcx contains the tag, compare it against "tag", but since it is a varint
|// we must only compare as many bytes as actually have data.
|//
|// "tag" is the encoded (varint) tag as a C integer known at code-generation
|// time; upb_value_size(tag) selects how many bytes are compared.  Sets the
|// flags for a following je/jne.  The 3-byte case masks ecx (clearing the
|// upper half of rcx); the 5-byte case masks rcx and clobbers rdx.
|.macro checktag, tag
|| switch (upb_value_size(tag)) {
||    case 1:
|       cmp   cl, tag
||      break;
||    case 2:
|       cmp   cx, tag
||      break;
||    case 3:
|       and   ecx, 0xffffff  // 3 bytes
|       cmp   rcx, tag
||      break;
||    case 4:
|       cmp   ecx, tag
||      break;
||    case 5:
|       mov64 rdx, 0xffffffffff  // 5 bytes
|       and   rcx, rdx
|       // A 5-byte tag does not fit the sign-extended 32-bit immediate that
|       // cmp accepts, so compare against a register instead.
|       mov64 rdx, tag
|       cmp   rcx, rdx
||      break;
||    default: abort();
||  }
|.endmacro
|
|// Sets has-bit number "hasbit" in the bit array based at register "reg":
|// bit (hasbit % 8) of the byte at offset (hasbit / 8).  Emits nothing when
|// "hasbit" is negative.
|.macro sethas, reg, hasbit
|| if (hasbit >= 0) {
|    or   byte [reg + ((uint32_t)hasbit / 8)], (1 << ((uint32_t)hasbit % 8))
|| }
|.endmacro


#include <stdlib.h>
#include "upb/pb/varint.h"

// Returns the handler selector for field "f" and handler type "type".
// The lookup is expected to succeed (checked via UPB_ASSERT_VAR).
static upb_selector_t getselector(const upb_fielddef *f,
                                  upb_handlertype_t type) {
  upb_selector_t sel;
  bool found = upb_getselector(f, type, &sel);
  UPB_ASSERT_VAR(found, found);
  return sel;
}

// Returns the handler that "h" has registered for field "f" and handler type
// "type"; callers treat a NULL result as "no handler".
static upb_func *gethandler(const upb_handlers *h, const upb_fielddef *f,
                            upb_handlertype_t type) {
  upb_selector_t sel = getselector(f, type);
  return upb_handlers_gethandler(h, sel);
}

// Returns, as an integer, the handler data that "h" has registered for field
// "f" and handler type "type".
static uintptr_t gethandlerdata(const upb_handlers *h, const upb_fielddef *f,
                                upb_handlertype_t type) {
  upb_selector_t sel = getselector(f, type);
  return (uintptr_t)upb_handlers_gethandlerdata(h, sel);
}

// Decodes the next val into ARG3, advances PTR.
//
// Emits the code that decodes one value of field "f" (of type "type"), whose
// tag occupies "tag_size" bytes at PTR.  Integer values end up in ARG3 and
// float/double values in XMMARG1.  For strings and bytes the string handlers
// of "h" are called directly from here.  For MESSAGE only the length prefix
// is decoded and for GROUP only the tag is skipped; the submessage itself is
// handled by upb_decoderplan_jit_callcb().  Aborts on an unknown type.
static void upb_decoderplan_jit_decodefield(upb_decoderplan *plan,
                                            uint8_t type, size_t tag_size,
                                            const upb_handlers *h,
                                            const upb_fielddef *f) {
  // Decode the value into arg 3 for the callback.
  switch (type) {
    case UPB_TYPE(DOUBLE):
      |  movsd  XMMARG1, qword [PTR + tag_size]
      |  add    PTR, 8 + tag_size
      break;

    case UPB_TYPE(FIXED64):
    case UPB_TYPE(SFIXED64):
      |  mov  ARG3_64, qword [PTR + tag_size]
      |  add  PTR, 8 + tag_size
      break;

    case UPB_TYPE(FLOAT):
      |  movss  XMMARG1, dword [PTR + tag_size]
      |  add    PTR, 4 + tag_size
      break;

    case UPB_TYPE(FIXED32):
    case UPB_TYPE(SFIXED32):
      |  mov  ARG3_32, dword [PTR + tag_size]
      |  add  PTR, 4 + tag_size
      break;

    case UPB_TYPE(BOOL):
      // Can't assume it's one byte long, because bool must be wire-compatible
      // with all of the varint integer types.
      |  decode_varint  tag_size
      |  test  ARG3_64, ARG3_64
      |  setne ARG3_8   // Other bytes left with val, should be ok.
      break;

    case UPB_TYPE(INT64):
    case UPB_TYPE(UINT64):
    case UPB_TYPE(INT32):
    case UPB_TYPE(UINT32):
    case UPB_TYPE(ENUM):
      |  decode_varint  tag_size
      break;

    case UPB_TYPE(SINT64):
      // 64-bit zig-zag decoding.
      // value = (raw >> 1) ^ -(raw & 1)
      |  decode_varint  tag_size
      |  mov  rax, ARG3_64
      |  shr  ARG3_64, 1
      |  and  rax, 1
      |  neg  rax
      |  xor  ARG3_64, rax
      break;

    case UPB_TYPE(SINT32):
      // 32-bit zig-zag decoding.
      // value = (raw >> 1) ^ -(raw & 1)
      |  decode_varint  tag_size
      |  mov  eax, ARG3_32
      |  shr  ARG3_32, 1
      |  and  eax, 1
      |  neg  eax
      |  xor  ARG3_32, eax
      break;

    case UPB_TYPE(STRING):
    case UPB_TYPE(BYTES): {
      // We only handle the case where the entire string is in our current
      // buf, which sidesteps any security problems.  The C path has more
      // robust checks.
      // Decode the length into ARG3; rax points at the first string byte.
      |  mov  ecx, dword [PTR + tag_size]
      |  decode_loaded_varint tag_size
      |  mov  rdi, DECODER->end
      |  sub  rdi, rax
      |  cmp  ARG3_64, rdi  // if (len > d->end - str)
      |  ja   ->exit_jit    // Can't deliver, whole string not in buf.
      |  mov  PTR, rax

      // STARTSTR: the length is parked in DECODER->tmp_len across the call
      // and the handler's return value becomes the closure (ARG1) for the
      // STRING handler.  Without a STARTSTR handler CLOSURE is used directly.
      // Either way ARG4 ends up holding the string length.
      upb_func *handler = gethandler(h, f, UPB_HANDLER_STARTSTR);
      if (handler) {
        |  mov  DECODER->tmp_len, ARG3_64
        |  mov  ARG1_64, CLOSURE
        |  mov64 ARG2_64, gethandlerdata(h, f, UPB_HANDLER_STARTSTR)
        |  callp handler
        |  check_ptr_ret
        |  mov  ARG1_64, rax   // sub-closure
        |  mov  ARG4_64, DECODER->tmp_len
      } else {
        |  mov  ARG1_64, CLOSURE
        |  mov  ARG4_64, ARG3_64
      }

      // STRING: called with (closure, handler data, pointer, length).  PTR is
      // advanced by the handler's return value, or by the full length when
      // there is no handler.
      handler = gethandler(h, f, UPB_HANDLER_STRING);
      if (handler) {
        |  mov64 ARG2_64, gethandlerdata(h, f, UPB_HANDLER_STRING)
        |  mov   ARG3_64, PTR
        |  callp handler
        // TODO: properly handle returns other than "n" (the whole string).
        |  add   PTR, rax
      } else {
        |  add   PTR, ARG4_64
      }

      // ENDSTR: called with CLOSURE; a false return exits the JIT.
      handler = gethandler(h, f, UPB_HANDLER_ENDSTR);
      if (handler) {
        |  mov    ARG1_64, CLOSURE
        |  mov64  ARG2_64, gethandlerdata(h, f, UPB_HANDLER_ENDSTR)
        |  callp  handler
        |  check_bool_ret
      }
      break;
    }

    // Will dispatch callbacks and call submessage in a second.
    case UPB_TYPE(MESSAGE):
      |  decode_varint  tag_size
      break;
    case UPB_TYPE(GROUP):
      |  add  PTR, tag_size
      break;

    default: abort();
  }
}

// Emits the code that delivers the value just decoded for field "f".
//
// Submessage fields: pushes a frame, calls the STARTSUBMSG handler (if any),
// calls the JIT code of the sub-message, pops the frame and calls the
// ENDSUBMSG handler (if any).
// Other non-string fields: either stores the value inline when the handler is
// one of the known upb_stdmsg_set* functions, or calls the handler.
// String fields emit nothing here.
static void upb_decoderplan_jit_callcb(upb_decoderplan *plan,
                                       const upb_handlers *h,
                                       const upb_fielddef *f) {
  // Call callbacks.  Specializing the append accessors didn't yield a speed
  // increase in benchmarks.
  if (upb_fielddef_issubmsg(f)) {
    // rsi = end offset stored in the new frame: the delimited end for
    // messages, UPB_NONDELIMITED for groups.
    if (upb_fielddef_type(f) == UPB_TYPE(MESSAGE)) {
      |   mov   rsi, PTR
      |   sub   rsi, DECODER->buf
      |   add   rsi, ARG3_64   // = (d->ptr - d->buf) + delim_len
    } else {
      assert(upb_fielddef_type(f) == UPB_TYPE(GROUP));
      |   mov   rsi, UPB_NONDELIMITED
    }
    |  pushframe  h, f, rsi, UPB_HANDLER_ENDSUBMSG

    // Call startsubmsg handler (if any).
    upb_func *startsubmsg = gethandler(h, f, UPB_HANDLER_STARTSUBMSG);
    if (startsubmsg) {
      // upb_sflow_t startsubmsg(void *closure, upb_value fval)
      |  mov   ARG1_64, CLOSURE
      |  mov64 ARG2_64, gethandlerdata(h, f, UPB_HANDLER_STARTSUBMSG);
      |  callp startsubmsg
      |  check_ptr_ret
      |  mov  CLOSURE, rax
    }
    // Record the (possibly new) closure in the frame that was just pushed.
    |  mov   qword SINKFRAME->closure, CLOSURE

    // TODO: have to decide what to do with NULLs subhandlers (or whether to
    // disallow them and require a full handlers tree to match the def tree).
    const upb_handlers *sub_h = upb_handlers_getsubhandlers(h, f);
    assert(sub_h);
    |  call  =>upb_getpclabel(plan, sub_h, STARTMSG)
    |  popframe

    // Call endsubmsg handler (if any).
    upb_func *endsubmsg = gethandler(h, f, UPB_HANDLER_ENDSUBMSG);
    if (endsubmsg) {
      // upb_flow_t endsubmsg(void *closure, upb_value fval);
      |  mov   ARG1_64, CLOSURE
      |  mov64 ARG2_64, gethandlerdata(h, f, UPB_HANDLER_ENDSUBMSG);
      |  callp endsubmsg
      |  check_bool_ret
    }
  } else if (!upb_fielddef_isstring(f)) {
    |  mov ARG1_64, CLOSURE
    upb_handlertype_t handlertype = upb_handlers_getprimitivehandlertype(f);
    upb_func *handler = gethandler(h, f, handlertype);
    const upb_stdmsg_fval *fv = (void*)gethandlerdata(h, f, handlertype);
    // Test for callbacks we can specialize.
    // Can't switch() on function pointers.
    // Each specialization stores the value at closure + fv->offset and sets
    // the has-bit fv->hasbit instead of calling the handler.
    if (handler == (void*)&upb_stdmsg_setint64 ||
        handler == (void*)&upb_stdmsg_setuint64) {
      |  mov   [ARG1_64 + fv->offset], ARG3_64
      |  sethas CLOSURE, fv->hasbit
    } else if (handler == (void*)&upb_stdmsg_setdouble) {
      |  movsd  qword [ARG1_64 + fv->offset], XMMARG1
      |  sethas CLOSURE, fv->hasbit
    } else if (handler == (void*)&upb_stdmsg_setint32 ||
               handler == (void*)&upb_stdmsg_setuint32) {
      |  mov   [ARG1_64 + fv->offset], ARG3_32
      |  sethas CLOSURE, fv->hasbit
    } else if (handler == (void*)&upb_stdmsg_setfloat) {
      |  movss  dword [ARG1_64 + fv->offset], XMMARG1
      |  sethas CLOSURE, fv->hasbit
    } else if (handler == (void*)&upb_stdmsg_setbool) {
      |  mov   [ARG1_64 + fv->offset], ARG3_8
      |  sethas CLOSURE, fv->hasbit
    } else if (handler) {
      // Load closure and fval into arg registers.
      |  mov64  ARG2_64, gethandlerdata(h, f, handlertype);
      |  callp  handler
      |  check_bool_ret
    }
  }
}

// Returns the varint-encoded tag of field "f", i.e. the encoding of
// (field number << 3) | native wire type, packed into an integer as produced
// by upb_vencode32().
static uint64_t upb_get_encoded_tag(const upb_fielddef *f) {
  uint32_t raw_tag = upb_fielddef_number(f) << 3;
  raw_tag |= upb_decoder_types[upb_fielddef_type(f)].native_wire_type;
  const uint64_t varint = upb_vencode32(raw_tag);
  // No tag should be greater than 5 bytes.
  assert(varint <= 0xffffffffff);
  return varint;
}

// PTR should point to the beginning of the tag.
//
// Emits the code for parsing field "f" of the message handled by "h":
// the FIELD / FIELD_NO_TYPECHECK entry labels, sequence start handling for
// repeated fields, value decoding, handler invocation, and an epilogue that
// loads the next tag.  The epilogue loops while a repeated field's own tag
// follows, jumps straight to "next_f" (the field emitted after this one, or
// NULL) when its tag follows, and otherwise falls back to dynamic dispatch.
static void upb_decoderplan_jit_field(upb_decoderplan *plan,
                                      const upb_handlers *h,
                                      const upb_fielddef *f,
                                      const upb_fielddef *next_f) {
  uint64_t tag = upb_get_encoded_tag(f);
  // 0 means "there is no next field to try".
  uint64_t next_tag = next_f ? upb_get_encoded_tag(next_f) : 0;
  int tag_size = upb_value_size(tag);

  // PC-label for the dispatch table.
  // We check the wire type (which must be loaded in edx) because the
  // table is keyed on field number, not type.
  |=>upb_getpclabel(plan, f, FIELD):
  |  cmp  edx, (tag & 0x7)
  |  jne  ->exit_jit     // In the future: could be an unknown field or packed.
  |=>upb_getpclabel(plan, f, FIELD_NO_TYPECHECK):
  if (upb_fielddef_isseq(f)) {
    // Repeated field: push a sequence frame that inherits the current
    // frame's end offset, then call the STARTSEQ handler (if any).
    |  mov   rsi, FRAME->end_ofs
    |  pushframe  h, f, rsi, UPB_HANDLER_ENDSEQ
    upb_func *startseq = gethandler(h, f, UPB_HANDLER_STARTSEQ);
    if (startseq) {
      |  mov    ARG1_64, CLOSURE
      |  mov64  ARG2_64, gethandlerdata(h, f, UPB_HANDLER_STARTSEQ);
      |  callp  startseq
      |  check_ptr_ret
      |  mov    CLOSURE, rax
    }
    |  mov   qword SINKFRAME->closure, CLOSURE
  }

  |1:  // Label for repeating this field.

  upb_decoderplan_jit_decodefield(plan, upb_fielddef_type(f), tag_size, h, f);
  upb_decoderplan_jit_callcb(plan, h, f);

  // Epilogue: load next tag, check for repeated field.
  |  checkpoint  h
  |  mov         rcx, qword [PTR]
  if (upb_fielddef_isseq(f)) {
    |  checktag  tag
    |  je  <1
    // A different tag follows: the sequence is over.
    upb_func *endseq = gethandler(h, f, UPB_HANDLER_ENDSEQ);
    if (endseq) {
      |  mov   ARG1_64, CLOSURE
      |  mov64  ARG2_64, gethandlerdata(h, f, UPB_HANDLER_ENDSEQ);
      |  callp endseq
    }
    |  popframe
    // Load next tag again (popframe clobbered it).
    |  mov         rcx, qword [PTR]
  }

  if (next_tag != 0) {
    // Fast path: skip dynamic dispatch (and the wire type check) when the
    // next tag on the wire is exactly next_f's tag.
    |  checktag  next_tag
    |  je  =>upb_getpclabel(plan, next_f, FIELD_NO_TYPECHECK)
  }

  // Fall back to dynamic dispatch.
  |  dyndispatch  h
}

// qsort() comparator that orders uint32_t values ascending.
//
// Returns a negative, zero or positive value when *a is less than, equal to
// or greater than *b.  The result is built from comparisons rather than a
// subtraction: an unsigned difference converted to int reports the wrong
// sign whenever the operands are more than INT_MAX apart.
static int upb_compare_uint32(const void *a, const void *b) {
  const uint32_t lhs = *(const uint32_t*)a;
  const uint32_t rhs = *(const uint32_t*)b;
  return (lhs > rhs) - (lhs < rhs);
}

// Emits all code for the message handled by "h": the two entry points
// (AFTER_STARTMSG and STARTMSG), the initial dispatch, one code section per
// field in ascending field-number order, and the end-of-buffer /
// end-of-message section that returns to the caller.
static void upb_decoderplan_jit_msg(upb_decoderplan *plan,
                                    const upb_handlers *h) {
  |=>upb_getpclabel(plan, h, AFTER_STARTMSG):
  // There was a call to get here, so we need to align the stack.
  |  sub  rsp, 8
  |  jmp  >1

  |=>upb_getpclabel(plan, h, STARTMSG):
  // There was a call to get here, so we need to align the stack.
  |  sub  rsp, 8

  // Call startmsg handler (if any):
  upb_startmsg_handler *startmsg = upb_handlers_getstartmsg(h);
  if (startmsg) {
    // upb_flow_t startmsg(void *closure);
    |  mov   ARG1_64, SINKFRAME->closure
    |  callp startmsg
    |  check_bool_ret
  }

  // Both entry points meet here: compute the effective end for this frame,
  // commit PTR, load the first tag and dispatch on it.
  |1:
  |  setmsgend
  |  checkpoint h
  |  mov    ecx, dword [PTR]
  |  dyndispatch_ h

  // --------- New code section (does not fall through) ------------------------

  // Emit code for parsing each field (dynamic dispatch contains pointers to
  // all of these).

  // Create an ordering over the fields in field number order.
  // Parsing will theoretically be fastest if we emit code in the same
  // order as field numbers are seen on-the-wire because of an optimization
  // in the generated code that skips dynamic dispatch if the next field is
  // as expected.
  const upb_msgdef *md = upb_handlers_msgdef(h);
  int num_keys = upb_msgdef_numfields(md);
  uint32_t *keys = malloc(num_keys * sizeof(*keys));
  int idx = 0;
  upb_msg_iter i;
  for(upb_msg_begin(&i, md); !upb_msg_done(&i); upb_msg_next(&i)) {
    keys[idx++] = upb_fielddef_number(upb_msg_iter_field(&i));
  }
  qsort(keys, num_keys, sizeof(uint32_t), &upb_compare_uint32);

  // Note: the loop index below shadows the iterator "i" declared above.
  for(int i = 0; i < num_keys; i++) {
    const upb_fielddef *f = upb_msgdef_itof(md, keys[i]);
    // The field emitted next, or NULL for the last one.
    const upb_fielddef *next_f =
        (i + 1 < num_keys) ? upb_msgdef_itof(md, keys[i + 1]) : NULL;
    upb_decoderplan_jit_field(plan, h, f, next_f);
  }

  free(keys);

  // --------- New code section (does not fall through) ------------------------

  // End-of-buf / end-of-message.
  // We hit a buffer limit; either we hit jit_end or end-of-submessage.
  |=>upb_getpclabel(plan, h, ENDOFBUF):
  |  cmp  PTR, DECODER->jit_end
  |  jae  ->exit_jit

  |=>upb_getpclabel(plan, h, ENDOFMSG):
  // We are at end-of-submsg: call endmsg handler (if any):
  upb_endmsg_handler *endmsg = upb_handlers_getendmsg(h);
  if (endmsg) {
    // void endmsg(void *closure, upb_status *status) {
    |  mov   ARG1_64, SINKFRAME->closure
    |  lea   ARG2_64, DECODER->sink.status
    |  callp endmsg
  }

  // Counter previous alignment.
  |  add  rsp, 8
  |  ret
}

// Emits the whole program for "plan": first the entry/exit trampoline, then
// the code for every message registered in plan->msginfo.
//
// The trampoline is entered as a C function with the decoder in ARG1 and the
// address of the message code to run in ARG2.  It saves the callee-saved
// registers, loads the fixed register assignments from the decoder and calls
// the message code.  ->exit_jit restores the stack pointer and the saved
// registers and returns to the C caller.
static void upb_decoderplan_jit(upb_decoderplan *plan) {
  // The JIT prologue/epilogue trampoline that is generated in this function
  // does not depend on the handlers, so it will never vary.  Ideally we would
  // put it in an object file and just link it into upb so we could have only a
  // single copy of it instead of one copy for each decoderplan.  But our
  // options for doing that are undesirable: GCC inline assembly is
  // complicated, not portable to other compilers, and comes with subtle
  // caveats about incorrect things that the optimizer might do if you e.g.
  // execute non-local jumps.  Putting this code in a .s file would force us to
  // calculate the structure offsets ourselves instead of symbolically
  // (i.e. [r15 + 0xcd] instead of DECODER->ptr).  So we tolerate a bit of
  // unnecessary duplication/redundancy.
  |  push  rbp
  |  mov   rbp, rsp
  |  push  r15
  |  push  r14
  |  push  r13
  |  push  r12
  |  push  rbx
  // Align stack.
  |  sub   rsp, 8
  |  mov   DECODER, ARG1_64
  |  mov   FRAME, DECODER:ARG1_64->top
  |  mov   SINKFRAME, DECODER:ARG1_64->sink.top
  |  mov   CLOSURE, SINKFRAME->closure
  |  mov   PTR, DECODER->ptr

  // TODO: push return addresses for re-entry (will be necessary for multiple
  // buffer support).
  |  call  ARG2_64

  |->exit_jit:
  // Restore stack pointer to where it was before any "call" instructions
  // inside our generated code.
  // rbp - 48 = five 8-byte pushes after rbp plus the 8-byte alignment slot.
  |  lea   rsp, [rbp - 48]
  // Counter previous alignment.
  |  add   rsp, 8
  |  pop   rbx
  |  pop   r12
  |  pop   r13
  |  pop   r14
  |  pop   r15
  |  leave
  |  ret

  // Emit the code for every message discovered by
  // upb_decoderplan_jit_assignpclabels().
  upb_inttable_iter i;
  upb_inttable_begin(&i, &plan->msginfo);
  for(; !upb_inttable_done(&i); upb_inttable_next(&i)) {
    const upb_handlers *h = (const upb_handlers*)upb_inttable_iter_key(&i);
    upb_decoderplan_jit_msg(plan, h);
  }
}

// Walks the handlers graph reachable from "h" depth-first.  Every message
// visited for the first time gets a block of TOTAL_MSG_PCLABELS pc-labels and
// a upb_jitmsginfo (with its dispatch table allocated but not yet filled);
// every field gets a block of TOTAL_FIELD_PCLABELS pc-labels.
// plan->pclabel_count is advanced accordingly.
static void upb_decoderplan_jit_assignpclabels(upb_decoderplan *plan,
                                               const upb_handlers *h) {
  // Limit the DFS: handlers that already own labels were visited before.
  if (upb_inttable_lookupptr(&plan->pclabels, h) != NULL) return;

  // Reserve this message's labels.
  upb_inttable_insertptr(&plan->pclabels, h,
                         upb_value_uint32(plan->pclabel_count));
  plan->pclabel_count += TOTAL_MSG_PCLABELS;

  upb_jitmsginfo *msginfo = malloc(sizeof(*msginfo));
  msginfo->max_field_number = 0;
  upb_inttable_insertptr(&plan->msginfo, h, upb_value_ptr(msginfo));

  upb_msg_iter iter;
  for (upb_msg_begin(&iter, upb_handlers_msgdef(h)); !upb_msg_done(&iter);
       upb_msg_next(&iter)) {
    const upb_fielddef *field = upb_msg_iter_field(&iter);
    msginfo->max_field_number =
        UPB_MAX(msginfo->max_field_number, upb_fielddef_number(field));

    // Reserve this field's labels.
    upb_inttable_insertptr(&plan->pclabels, field,
                           upb_value_uint32(plan->pclabel_count));
    plan->pclabel_count += TOTAL_FIELD_PCLABELS;

    // Discover the whole graph of handlers depth-first.  We will probably
    // revise this later to be more explicit about the list of handlers that
    // the plan should include.
    if (!upb_fielddef_issubmsg(field)) continue;
    const upb_handlers *subh = upb_handlers_getsubhandlers(h, field);
    if (subh) upb_decoderplan_jit_assignpclabels(plan, subh);
  }

  // TODO: support large field numbers by either using a hash table or
  // generating code for a binary search.  For now large field numbers
  // will just fall back to the table decoder.
  msginfo->max_field_number = UPB_MIN(msginfo->max_field_number, 16000);
  msginfo->tablearray =
      malloc((msginfo->max_field_number + 1) * sizeof(void*));
}

// Generates, links and installs the JIT code for "plan".
//
// Steps: assign pc-labels for every message and field, emit the code through
// DynASM, map memory for it, register it with GDB, encode it, fill in each
// message's dispatch table and entry point, and finally make the mapping
// executable.
//
// On return plan->jit_code is either the executable code or NULL.  NULL (with
// plan->jit_size reset to 0) means the memory could not be mapped or could
// not be made executable; upb_decoder_enterjit() skips the JIT when
// plan->jit_code is NULL.  plan->msginfo stays initialized in either case.
static void upb_decoderplan_makejit(upb_decoderplan *plan) {
  upb_inttable_init(&plan->msginfo, UPB_CTYPE_PTR);
  plan->debug_info = NULL;

  // Assign pclabels.
  plan->pclabel_count = 0;
  upb_inttable_init(&plan->pclabels, UPB_CTYPE_UINT32);
  upb_decoderplan_jit_assignpclabels(plan, plan->handlers);

  void **globals = malloc(UPB_JIT_GLOBAL__MAX * sizeof(*globals));
  dasm_init(plan, 1);
  dasm_setupglobal(plan, globals, UPB_JIT_GLOBAL__MAX);
  dasm_growpc(plan, plan->pclabel_count);
  dasm_setup(plan, upb_jit_actionlist);

  upb_decoderplan_jit(plan);

  int dasm_status = dasm_link(plan, &plan->jit_size);
  (void)dasm_status;
  assert(dasm_status == DASM_S_OK);

  // Anonymous mappings take fd -1: some systems require it.
  void *code = mmap(NULL, plan->jit_size, PROT_READ | PROT_WRITE,
                    MAP_32BIT | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
  if (code == MAP_FAILED) {
    // No memory for the code: run without the JIT.
    plan->jit_code = NULL;
    plan->jit_size = 0;
    upb_inttable_uninit(&plan->pclabels);
    dasm_free(plan);
    free(globals);
    return;
  }
  plan->jit_code = code;

  upb_reg_jit_gdb(plan);

  dasm_encode(plan, plan->jit_code);

  // Create dispatch tables.
  upb_inttable_iter i;
  upb_inttable_begin(&i, &plan->msginfo);
  for(; !upb_inttable_done(&i); upb_inttable_next(&i)) {
    const upb_handlers *h = (const upb_handlers*)upb_inttable_iter_key(&i);
    upb_jitmsginfo *mi = upb_getmsginfo(plan, h);
    // We jump to after the startmsg handler since it is called before entering
    // the JIT (either by upb_decoder or by a previous call to the JIT).
    mi->jit_func = plan->jit_code +
        dasm_getpclabel(plan, upb_getpclabel(plan, h, AFTER_STARTMSG));
    for (uint32_t j = 0; j <= mi->max_field_number; j++) {
      const upb_fielddef *f = upb_msgdef_itof(upb_handlers_msgdef(h), j);
      if (f) {
        mi->tablearray[j] = plan->jit_code +
            dasm_getpclabel(plan, upb_getpclabel(plan, f, FIELD));
      } else {
        // TODO: extend the JIT to handle unknown fields.
        // For the moment we exit the JIT for any unknown field.
        mi->tablearray[j] = globals[UPB_JIT_GLOBAL_exit_jit];
      }
    }
  }

  upb_inttable_uninit(&plan->pclabels);

  dasm_free(plan);
  free(globals);

  if (mprotect(plan->jit_code, plan->jit_size, PROT_EXEC | PROT_READ) != 0) {
    // The code cannot be made executable; entering it would fault, so drop
    // the mapping and run without the JIT.
    munmap(plan->jit_code, plan->jit_size);
    plan->jit_code = NULL;
    plan->jit_size = 0;
    return;
  }

#ifndef NDEBUG
  // View with: objdump -M intel -D -b binary -mi386 -Mx86-64 /tmp/machine-code
  // Or: ndisasm -b 64 /tmp/machine-code
  // The dump is a debugging convenience: skip it if the file cannot be opened.
  FILE *f = fopen("/tmp/machine-code", "wb");
  if (f) {
    fwrite(plan->jit_code, plan->jit_size, 1, f);
    fclose(f);
  }
#endif
}

// Releases everything the JIT created for "plan": each message's dispatch
// table and upb_jitmsginfo, the msginfo table itself, the code mapping and
// the GDB debug info buffer.
static void upb_decoderplan_freejit(upb_decoderplan *plan) {
  upb_inttable_iter it;
  for (upb_inttable_begin(&it, &plan->msginfo); !upb_inttable_done(&it);
       upb_inttable_next(&it)) {
    upb_jitmsginfo *info = upb_value_getptr(upb_inttable_iter_value(&it));
    void **table = info->tablearray;
    free(table);
    free(info);
  }
  upb_inttable_uninit(&plan->msginfo);

  munmap(plan->jit_code, plan->jit_size);
  free(plan->debug_info);
  // TODO: unregister
}

// Runs the JIT-ted decoder on "d" when it is usable: the plan has JIT code,
// the sink stack is at its base, and d->ptr is set and lies before
// d->jit_end.  Otherwise does nothing.
static void upb_decoder_enterjit(upb_decoder *d) {
  if (d->plan->jit_code &&
      d->sink.top == d->sink.stack &&
      d->ptr && d->ptr < d->jit_end) {
#ifndef NDEBUG
    // Debug builds pin sentinel values into the callee-saved registers that
    // the generated code uses, so the asserts below can verify that they are
    // restored.
    register uint64_t rbx asm ("rbx") = 11;
    register uint64_t r12 asm ("r12") = 12;
    register uint64_t r13 asm ("r13") = 13;
    register uint64_t r14 asm ("r14") = 14;
    register uint64_t r15 asm ("r15") = 15;
#endif
    // Decodes as many fields as possible, updating d->ptr appropriately,
    // before falling through to the slow(er) path.
    // The code starts with the trampoline, which takes the decoder and the
    // address of the top-level message's code.
    void (*upb_jit_decode)(upb_decoder *d, void*) = (void*)d->plan->jit_code;
    upb_jitmsginfo *mi = upb_getmsginfo(d->plan, d->plan->handlers);
    assert(mi);
    upb_jit_decode(d, mi->jit_func);
    assert(d->ptr <= d->end);

    // Test that callee-save registers were properly restored.
    assert(rbx == 11);
    assert(r12 == 12);
    assert(r13 == 13);
    assert(r14 == 14);
    assert(r15 == 15);
  }
}
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback