summaryrefslogtreecommitdiff
path: root/upb/pb/decoder.int.h
blob: 3ed50dfcbc559f7503673395f57a17698613fa4f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
/*
** Internal-only definitions for the decoder.
*/

#ifndef UPB_DECODER_INT_H_
#define UPB_DECODER_INT_H_

#include "upb/def.h"
#include "upb/handlers.h"
#include "upb/pb/decoder.h"
#include "upb/sink.h"
#include "upb/table.int.h"

#include "upb/port_def.inc"

/* Opcode definitions.  The canonical meaning of each opcode is its
 * implementation in the interpreter (the JIT is written to match this).
 *
 * All instructions have the opcode in the low byte.
 * Instruction format for most instructions is:
 *
 * +-------------------+--------+
 * |     arg (24)      | op (8) |
 * +-------------------+--------+
 *
 * Exceptions are indicated below.  A few opcodes are multi-word. */
typedef enum {
  /* Opcodes 1-8, 13, 15-18 parse their respective descriptor types.
   * Arg for all of these is the upb selector for this field. */
#define T(type) OP_PARSE_ ## type = UPB_DESCRIPTOR_TYPE_ ## type
  T(DOUBLE), T(FLOAT), T(INT64), T(UINT64), T(INT32), T(FIXED64), T(FIXED32),
  T(BOOL), T(UINT32), T(SFIXED32), T(SFIXED64), T(SINT32), T(SINT64),
#undef T
  OP_STARTMSG       = 9,   /* No arg. */
  OP_ENDMSG         = 10,  /* No arg. */
  OP_STARTSEQ       = 11,
  OP_ENDSEQ         = 12,
  OP_STARTSUBMSG    = 14,
  OP_ENDSUBMSG      = 19,
  OP_STARTSTR       = 20,
  OP_STRING         = 21,
  OP_ENDSTR         = 22,

  OP_PUSHTAGDELIM   = 23,  /* No arg. */
  OP_PUSHLENDELIM   = 24,  /* No arg. */
  OP_POP            = 25,  /* No arg. */
  OP_SETDELIM       = 26,  /* No arg. */
  OP_SETBIGGROUPNUM = 27,  /* two words:
                            *   | unused (24)     | opc (8) |
                            *   |        groupnum (32)      | */
  OP_CHECKDELIM     = 28,
  OP_CALL           = 29,
  OP_RET            = 30,
  OP_BRANCH         = 31,

  /* Different opcodes depending on how many bytes expected. */
  OP_TAG1           = 32,  /* | match tag (16) | jump target (8) | opc (8) | */
  OP_TAG2           = 33,  /* | match tag (16) | jump target (8) | opc (8) | */
  OP_TAGN           = 34,  /* three words: */
                           /*   | unused (16) | jump target(8) | opc (8) | */
                           /*   |           match tag 1 (32)             | */
                           /*   |           match tag 2 (32)             | */

  OP_SETDISPATCH    = 35,  /* N words: */
                           /*   | unused (24)         | opc | */
                           /*   | upb_inttable* (32 or 64)  | */

  OP_DISPATCH       = 36,  /* No arg. */

  OP_HALT           = 37   /* No arg. */
} opcode;

#define OP_MAX OP_HALT

UPB_INLINE opcode getop(uint32_t instr) { return (opcode)(instr & 0xff); }

struct upb_pbcodecache {
  upb_arena *arena;
  upb_handlercache *dest;
  bool allow_jit;
  bool lazy;

  /* Array of mgroups. */
  upb_inttable groups;
};

/* Method group; represents a set of decoder methods that had their code
 * emitted together.  Immutable once created.  */
typedef struct {
  /* Maps upb_msgdef/upb_handlers -> upb_pbdecodermethod.  Owned by us.
   *
   * Ideally this would be on pbcodecache (if we were actually caching code).
   * Right now we don't actually cache anything, which is wasteful. */
  upb_inttable methods;

  /* The bytecode for our methods, if any exists.  Owned by us. */
  uint32_t *bytecode;
  uint32_t *bytecode_end;

#ifdef UPB_USE_JIT_X64
  /* JIT-generated machine code, if any. */
  upb_string_handlerfunc *jit_code;
  /* The size of the jit_code (required to munmap()). */
  size_t jit_size;
  char *debug_info;
  void *dl;
#endif
} mgroup;

/* The maximum that any submessages can be nested.  Matches proto2's limit.
 * This specifies the size of the decoder's statically-sized array and therefore
 * setting it high will cause the upb::pb::Decoder object to be larger.
 *
 * If necessary we can add a runtime-settable property to Decoder that allow
 * this to be larger than the compile-time setting, but this would add
 * complexity, particularly since we would have to decide how/if to give users
 * the ability to set a custom memory allocation function. */
#define UPB_DECODER_MAX_NESTING 64

/* Internal-only struct used by the decoder. */
typedef struct {
  /* Space optimization note: we store two pointers here that the JIT
   * doesn't need at all; the upb_handlers* inside the sink and
   * the dispatch table pointer.  We can optimze so that the JIT uses
   * smaller stack frames than the interpreter.  The only thing we need
   * to guarantee is that the fallback routines can find end_ofs. */
  upb_sink sink;

  /* The absolute stream offset of the end-of-frame delimiter.
   * Non-delimited frames (groups and non-packed repeated fields) reuse the
   * delimiter of their parent, even though the frame may not end there.
   *
   * NOTE: the JIT stores a slightly different value here for non-top frames.
   * It stores the value relative to the end of the enclosed message.  But the
   * top frame is still stored the same way, which is important for ensuring
   * that calls from the JIT into C work correctly. */
  uint64_t end_ofs;
  const uint32_t *base;

  /* 0 indicates a length-delimited field.
   * A positive number indicates a known group.
   * A negative number indicates an unknown group. */
  int32_t groupnum;
  upb_inttable *dispatch;  /* Not used by the JIT. */
} upb_pbdecoder_frame;

struct upb_pbdecodermethod {
  /* While compiling, the base is relative in "ofs", after compiling it is
   * absolute in "ptr". */
  union {
    uint32_t ofs;     /* PC offset of method. */
    void *ptr;        /* Pointer to bytecode or machine code for this method. */
  } code_base;

  /* The decoder method group to which this method belongs. */
  const mgroup *group;

  /* Whether this method is native code or bytecode. */
  bool is_native_;

  /* The handler one calls to invoke this method. */
  upb_byteshandler input_handler_;

  /* The destination handlers this method is bound to.  We own a ref. */
  const upb_handlers *dest_handlers_;

  /* Dispatch table -- used by both bytecode decoder and JIT when encountering a
   * field number that wasn't the one we were expecting to see.  See
   * decoder.int.h for the layout of this table. */
  upb_inttable dispatch;
};

struct upb_pbdecoder {
  upb_arena *arena;

  /* Our input sink. */
  upb_bytessink input_;

  /* The decoder method we are parsing with (owned). */
  const upb_pbdecodermethod *method_;

  size_t call_len;
  const uint32_t *pc, *last;

  /* Current input buffer and its stream offset. */
  const char *buf, *ptr, *end, *checkpoint;

  /* End of the delimited region, relative to ptr, NULL if not in this buf. */
  const char *delim_end;

  /* End of the delimited region, relative to ptr, end if not in this buf. */
  const char *data_end;

  /* Overall stream offset of "buf." */
  uint64_t bufstart_ofs;

  /* Buffer for residual bytes not parsed from the previous buffer. */
  char residual[UPB_DECODER_MAX_RESIDUAL_BYTES];
  char *residual_end;

  /* Bytes of data that should be discarded from the input beore we start
   * parsing again.  We set this when we internally determine that we can
   * safely skip the next N bytes, but this region extends past the current
   * user buffer. */
  size_t skip;

  /* Stores the user buffer passed to our decode function. */
  const char *buf_param;
  size_t size_param;
  const upb_bufhandle *handle;

  /* Our internal stack. */
  upb_pbdecoder_frame *stack, *top, *limit;
  const uint32_t **callstack;
  size_t stack_size;

  upb_status *status;

#ifdef UPB_USE_JIT_X64
  /* Used momentarily by the generated code to store a value while a user
   * function is called. */
  uint32_t tmp_len;

  const void *saved_rsp;
#endif
};

/* Decoder entry points; used as handlers. */
void *upb_pbdecoder_startbc(void *closure, const void *pc, size_t size_hint);
void *upb_pbdecoder_startjit(void *closure, const void *hd, size_t size_hint);
size_t upb_pbdecoder_decode(void *closure, const void *hd, const char *buf,
                            size_t size, const upb_bufhandle *handle);
bool upb_pbdecoder_end(void *closure, const void *handler_data);

/* Decoder-internal functions that the JIT calls to handle fallback paths. */
int32_t upb_pbdecoder_resume(upb_pbdecoder *d, void *p, const char *buf,
                             size_t size, const upb_bufhandle *handle);
size_t upb_pbdecoder_suspend(upb_pbdecoder *d);
int32_t upb_pbdecoder_skipunknown(upb_pbdecoder *d, int32_t fieldnum,
                                  uint8_t wire_type);
int32_t upb_pbdecoder_checktag_slow(upb_pbdecoder *d, uint64_t expected);
int32_t upb_pbdecoder_decode_varint_slow(upb_pbdecoder *d, uint64_t *u64);
int32_t upb_pbdecoder_decode_f32(upb_pbdecoder *d, uint32_t *u32);
int32_t upb_pbdecoder_decode_f64(upb_pbdecoder *d, uint64_t *u64);
void upb_pbdecoder_seterr(upb_pbdecoder *d, const char *msg);

/* Error messages that are shared between the bytecode and JIT decoders. */
extern const char *kPbDecoderStackOverflow;
extern const char *kPbDecoderSubmessageTooLong;

/* Access to decoderplan members needed by the decoder. */
const char *upb_pbdecoder_getopname(unsigned int op);

/* JIT codegen entry point. */
void upb_pbdecoder_jit(mgroup *group);
void upb_pbdecoder_freejit(mgroup *group);

/* A special label that means "do field dispatch for this message and branch to
 * wherever that takes you." */
#define LABEL_DISPATCH 0

/* A special slot in the dispatch table that stores the epilogue (ENDMSG and/or
 * RET) for branching to when we find an appropriate ENDGROUP tag. */
#define DISPATCH_ENDMSG 0

/* It's important to use this invalid wire type instead of 0 (which is a valid
 * wire type). */
#define NO_WIRE_TYPE 0xff

/* The dispatch table layout is:
 *   [field number] -> [ 48-bit offset ][ 8-bit wt2 ][ 8-bit wt1 ]
 *
 * If wt1 matches, jump to the 48-bit offset.  If wt2 matches, lookup
 * (UPB_MAX_FIELDNUMBER + fieldnum) and jump there.
 *
 * We need two wire types because of packed/non-packed compatibility.  A
 * primitive repeated field can use either wire type and be valid.  While we
 * could key the table on fieldnum+wiretype, the table would be 8x sparser.
 *
 * Storing two wire types in the primary value allows us to quickly rule out
 * the second wire type without needing to do a separate lookup (this case is
 * less common than an unknown field). */
UPB_INLINE uint64_t upb_pbdecoder_packdispatch(uint64_t ofs, uint8_t wt1,
                                               uint8_t wt2) {
  return (ofs << 16) | (wt2 << 8) | wt1;
}

UPB_INLINE void upb_pbdecoder_unpackdispatch(uint64_t dispatch, uint64_t *ofs,
                                             uint8_t *wt1, uint8_t *wt2) {
  *wt1 = (uint8_t)dispatch;
  *wt2 = (uint8_t)(dispatch >> 8);
  *ofs = dispatch >> 16;
}

/* All of the functions in decoder.c that return int32_t return values according
 * to the following scheme:
 *   1. negative values indicate a return code from the following list.
 *   2. positive values indicate that error or end of buffer was hit, and
 *      that the decode function should immediately return the given value
 *      (the decoder state has already been suspended and is ready to be
 *      resumed). */
#define DECODE_OK -1
#define DECODE_MISMATCH -2  /* Used only from checktag_slow(). */
#define DECODE_ENDGROUP -3  /* Used only from checkunknown(). */

#define CHECK_RETURN(x) { int32_t ret = x; if (ret >= 0) return ret; }

#include "upb/port_undef.inc"

#endif  /* UPB_DECODER_INT_H_ */
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback