summaryrefslogtreecommitdiff
path: root/src/upb_decoder_x64.asm
blob: c4176445f97037ee9ccff0437cf3cbbc74ff2d99 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
DEFAULT REL  ; Default to RIP-relative addressing instead of absolute.

extern _upb_decode_varint_fast64

SECTION .data

; Handler dispatch table for the fast-path decoder, keyed on the field's type.
;
; decode_and_dispatch_ loads a one-byte field type from the per-field lookup
; entry and performs "jmp [dispatch_table + type*8]", so every slot is one
; 8-byte code address and slot N must hold the handler for type value N.
; Do not reorder, insert or remove rows without changing the type numbering.
; Types the fast path does not handle point at .cant_fast_path, which makes
; _upb_fastdecode return to its caller.
dispatch_table:
  dq _upb_fastdecode.cant_fast_path  ; field not in table (type == 0).  (check_4).
  dq _upb_fastdecode.fixed64  ; double
  dq _upb_fastdecode.fixed32  ; float
  dq _upb_fastdecode.varint   ; int64
  dq _upb_fastdecode.varint   ; uint64
  dq _upb_fastdecode.varint   ; int32
  dq _upb_fastdecode.fixed64  ; fixed64
  dq _upb_fastdecode.fixed32  ; fixed32
  dq _upb_fastdecode.varint   ; bool
  dq _upb_fastdecode.string   ; string (the .string label has no body of its own and falls through to .cant_fast_path)
  dq _upb_fastdecode.cant_fast_path  ; group (check_6)
  dq _upb_fastdecode.cant_fast_path  ; message
  dq _upb_fastdecode.string   ; bytes (same handler as string)
  dq _upb_fastdecode.varint   ; uint32
  dq _upb_fastdecode.varint   ; enum
  dq _upb_fastdecode.fixed32  ; sfixed32
  dq _upb_fastdecode.fixed64  ; sfixed64
  dq _upb_fastdecode.varint_sint32 ; sint32
  dq _upb_fastdecode.varint_sint64 ; sint64

  ; NOTE(review): no label named _upb_decode_fast is defined anywhere in this
  ; file; the entry point in .text is named _upb_fastdecode and carries its own
  ; "global" declaration.  Confirm whether this declaration is stale.
  GLOBAL _upb_decode_fast

SECTION .text
; Register allocation.
;
; Every register named here is callee-saved in the System V AMD64 ABI, so the
; values survive the calls to the varint decoder and to the user callback.
; _upb_fastdecode pushes all six callee-saved registers in its prologue and
; pops them in its epilogue.
%define BUF rbx       ; const char *p, current buf position.
%define END rbp       ; const char *end, where the buf ends (either submsg end or buf end)
%define STRING r12    ; unused
%define FIELDDEF r13  ; field definition pointer loaded from the lookup table; needs to be preserved across varint decoding call.
%define CALLBACK r14  ; value callback (third argument of _upb_fastdecode), called once per decoded value.
%define CLOSURE r15   ; closure pointer, passed as the callback's first argument.

; Stack layout (offsets from rsp once the prologue has run):
;   [rsp]      void *table             -- lookup table, indexed by field number.
;   [rsp+8]    const char *committed   -- buf position after the last value
;                                         handed to the callback.
;   [rsp+16]   maxfield_times_8        -- table size * 8; written as a qword,
;                                         read back as a dword.
;
; The prologue pushes six registers (48 bytes) on top of the 8-byte return
; address, so rsp is 8 mod 16 before the "sub rsp, STACK_SPACE"; STACK_SPACE
; must therefore also be 8 mod 16 to keep rsp 16-byte aligned at every call.
%define STACK_SPACE 24      ; this value + 8 must be a multiple of 16.
%define TABLE_SPILL [rsp]   ; our lookup table, indexed by field number.
%define COMMITTED_BUF_SPILL [rsp+8]
%define MAXFIELD_TIMES_8_SPILL [rsp+16]


; Executing the fast path requires the following conditions:
; - check_1: there are >=12 bytes left (<=2 byte tag and <=10 byte varint).
; - check_2: the tag is <= 2 bytes.
; - check_3: the field number is <= the table size
;   (ie. it must be an array lookup, not a hash lookup).
; - check_4: the field is known (found in the table).
; - check_5: the wire type we read is correct for the field number,
;   ("packed" fields are not accepted, yet.  this could be handled
;    efficiently by doing an extra check on the "type check failed"
;    path that goes into a tight loop if the encoding was packed).
; - check_6: the field is not a group or a message (or string, TODO)
;   (this could be relaxed, but due to delegation it's a bit tricky).
; - check_7: if the value is a string, the entire string is available in
;   the buffer, and our cached string object can be recycled, and
;   our string object already references the source buffer, so
;   absolutely no refcount twiddling is required.


; decode_and_dispatch_ -- expands to the shared ".decode_and_dispatch" loop
; head: read the next tag, look the field up and jump to its type handler.
;
; The expansion defines a local label, so it must appear exactly once inside
; _upb_fastdecode; all other handlers get here through the decode_and_dispatch
; macro, which is a plain jmp to that label.
;
; In:       BUF, END, TABLE_SPILL, MAXFIELD_TIMES_8_SPILL.
; Out (on arrival at the handler):
;           BUF = rdi = first byte after the tag,
;           ecx = wire type (tag & 7),
;           FIELDDEF = qword at offset 8 of the field's 16-byte table entry.
; Clobbers: rax, rcx, rdx, rdi, r8, r9, r11, flags.
; Any precondition that fails jumps to _upb_fastdecode.cant_fast_path.
%macro decode_and_dispatch_ 0
align 16
.decode_and_dispatch:
  ; Load a few values we'll need in a sec.
  mov r8, TABLE_SPILL
  mov r9d, MAXFIELD_TIMES_8_SPILL

  mov rax, END
  sub rax, BUF
  cmp rax, 12
  jb _upb_fastdecode.cant_fast_path ; check_1 (<12 bytes left).

  ; Decode a 1 or 2-byte varint -> eax.
  mov cl, byte [BUF]
  lea rdi, [BUF+1]        ; rdi = position after a one-byte tag.
  movzx eax, cl
  and eax, 0x7f           ; eax = low 7 payload bits.
  test cl, cl
  jns .one_byte_tag ; Should be predictable if fields are in order.
  movzx ecx, byte [BUF+1]
  lea rdi, [BUF+2]        ; rdi = position after a two-byte tag.
  mov edx, ecx
  and edx, 0x7f
  shl edx, 7
  or eax, edx             ; eax = 14-bit tag value.
  ; The continuation bit lives in bit 7 of the second byte, which is still in
  ; cl.  (After the merge above, bit 7 of al is payload bit 0 of the second
  ; byte, so al must not be used for this test.)
  test cl, cl
  js _upb_fastdecode.cant_fast_path ; check_2 (tag was >2 bytes).
.one_byte_tag:
  mov BUF, rdi

  ; Decode tag and dispatch.
  mov ecx, eax
  and eax, 0x3ff8 ; eax now contains field number * 8
  lea r11, [r8+rax*2]   ; *2 is really *16, since rax is already *8.
  and ecx, 0x7    ; ecx now contains wire type.
  cmp eax, r9d
  jae _upb_fastdecode.cant_fast_path  ; check_3 (field number > table size)
  mov FIELDDEF, [r11+8]     ; Lookup fielddef (upb_itof_ent.f)
  movzx rdx, BYTE [r11+1]   ; Lookup field type.
  ; RIP-relative address of the table; an index register cannot be combined
  ; with RIP-relative addressing, so the base is materialised first.
  lea rax, [rel dispatch_table]
  jmp [rax+rdx*8]
%endmacro

; decode_and_dispatch -- finish a handler by jumping back to the shared
; .decode_and_dispatch label (defined by the decode_and_dispatch_ macro) to
; process the next tag.  Takes no arguments and emits a single jmp.
%macro decode_and_dispatch 0
  jmp .decode_and_dispatch
%endmacro

; call_callback -- hand one decoded value to the user callback.
;
; In:       rdx = value (must already be loaded), CLOSURE, FIELDDEF, CALLBACK,
;           BUF = position just past the value.
; Call:     CALLBACK(rdi = CLOSURE, rsi = FIELDDEF, rdx = value, rcx = 33).
; Effect:   stores BUF into COMMITTED_BUF_SPILL after the callback returns,
;           then jumps to .done if the callback returned non-zero in eax
;           (rax is left untouched so .done can return it).
; Clobbers: rdi, rsi, rcx, flags, plus whatever the callback clobbers.
%macro call_callback 0
  ; Value arg must already be in rdx when macro is called.
  mov rdi, CLOSURE
  mov rsi, FIELDDEF
  mov ecx, 33      ; RAW; we could pass the correct type, or only do this in non-debug modes.
  call CALLBACK
  mov COMMITTED_BUF_SPILL, BUF
  test eax, eax
  jnz .done    ; Caller requested BREAK or SKIPSUBMSG.
%endmacro

; check_type expected_wire_type
;
; Leaves the fast path unless ecx equals %1.  ecx is where
; decode_and_dispatch_ leaves the wire type (tag & 7) when it jumps to a
; handler, so this must run before the handler overwrites ecx.
; Only the flags are modified.
%macro check_type 1
  cmp ecx, %1
  jne _upb_fastdecode.cant_fast_path  ; check_5 (wire type check failed).
%endmacro

;-----------------------------------------------------------------------
; _upb_fastdecode -- table-driven fast path for decoding protobuf values.
;
; extern upb_flow_t upb_fastdecode(const char *p, const char *end,
;                                  upb_value_handler_t value_cb, void *closure,
;                                  void *table, int table_size);
;
; NOTE(review): this comment previously declared the first parameter as
; "const char **p".  The code reads tag bytes directly through rdi, i.e. it
; uses the argument as "const char *"; confirm against the C declaration.
;
; In (register arguments):
;   rdi = p           first byte to decode
;   rsi = end         end of the buffer (or of the current submessage)
;   rdx = value_cb    called as value_cb(closure, fielddef, value, 33)
;   rcx = closure
;   r8  = table       16-byte entries indexed by field number
;   r9d = table_size  field numbers >= table_size leave the fast path
; Out:
;   rax = 0 when the fast path simply cannot continue (the caller is expected
;         to carry on from the returned position), otherwise the non-zero
;         value returned by value_cb.
;   rdx = committed buf pointer: the position just past the last value that
;         was handed to value_cb (equal to p if none was).
; Stack: six pushes + STACK_SPACE bytes of spill slots; rsp is 16-byte
;        aligned at every call.
;-----------------------------------------------------------------------

; Protobuf wire types accepted by the handlers below (check_5).
%define WIRE_TYPE_VARINT 0
%define WIRE_TYPE_64BIT  1
%define WIRE_TYPE_32BIT  5

align 16
global _upb_fastdecode
_upb_fastdecode:
  ; We use all callee-save regs.
  push rbx
  push rbp
  push r12
  push r13
  push r14
  push r15
  sub rsp, STACK_SPACE

  ; Parse arguments into reg vals and stack.
  mov BUF, rdi
  mov COMMITTED_BUF_SPILL, rdi   ; Nothing consumed yet.
  mov END, rsi
  mov CALLBACK, rdx
  mov CLOSURE, rcx
  mov TABLE_SPILL, r8
  shl r9, 3                      ; Only the low 32 bits are read back later.
  mov MAXFIELD_TIMES_8_SPILL, r9

  decode_and_dispatch

; Each handler below is entered from the dispatch jump with
;   BUF = rdi = first byte after the tag, ecx = wire type.
; A handler first verifies the wire type (check_5); on mismatch it leaves the
; fast path before anything has been committed for this field.

align 16
.varint:
  check_type WIRE_TYPE_VARINT
  ; The decoder is used as: rax = position after the varint (0 on failure),
  ; rdx = decoded value.
  call _upb_decode_varint_fast64  ; BUF is already in rdi.
  test rax, rax
  jz _upb_fastdecode.cant_fast_path  ; Varint was unterminated, slow path will handle error.
  mov BUF, rax
  call_callback      ; rdx already holds value.
  decode_and_dispatch_   ; The one and only expansion of the loop head.

align 16
.fixed32:
  check_type WIRE_TYPE_32BIT
  mov edx, DWORD [BUF]  ; Might be unaligned, but that's ok.
  add BUF, 4
  call_callback
  decode_and_dispatch

align 16
.fixed64:
  check_type WIRE_TYPE_64BIT
  mov rdx, QWORD [BUF]   ; Might be unaligned, but that's ok.
  add BUF, 8
  call_callback
  decode_and_dispatch

align 16
.varint_sint32:
  check_type WIRE_TYPE_VARINT
  call _upb_decode_varint_fast64  ; BUF is already in rdi.
  test rax, rax
  jz _upb_fastdecode.cant_fast_path  ; Varint was unterminated, slow path will handle error.
  mov BUF, rax

  ; Perform 32-bit zig-zag decoding: value = (n >> 1) ^ -(n & 1).
  mov ecx, edx
  shr edx, 1
  and ecx, 0x1
  neg ecx
  xor edx, ecx
  call_callback
  decode_and_dispatch

align 16
.varint_sint64:
  check_type WIRE_TYPE_VARINT
  call _upb_decode_varint_fast64  ; BUF is already in rdi.
  test rax, rax
  jz _upb_fastdecode.cant_fast_path  ; Varint was unterminated, slow path will handle error.
  mov BUF, rax

  ; Perform 64-bit zig-zag decoding: value = (n >> 1) ^ -(n & 1).
  mov rcx, rdx
  shr rdx, 1
  and ecx, 0x1     ; Writing ecx zero-extends, so rcx = n & 1.
  neg rcx
  xor rdx, rcx
  call_callback
  decode_and_dispatch

align 16
.string:
  ; Strings are not handled on the fast path yet; fall through.

.cant_fast_path:
  xor eax, eax   ; UPB_CONTINUE -- continue as before.
.done:
  ; If coming via done, preserve the user callback's return in rax.

  ; Return committed buf pointer as second parameter.
  mov rdx, COMMITTED_BUF_SPILL
  add rsp, STACK_SPACE
  pop r15
  pop r14
  pop r13
  pop r12
  pop rbp
  pop rbx
  ret
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback