src/upb_msg.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355

/*
 * upb - a minimalist implementation of protocol buffers.
 *
 * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
 *
 * A upb_msg provides a full description of a message as defined in a .proto
 * file.  It supports many features and operations for dealing with proto
 * messages:
 * - reflection over .proto types at runtime (list fields, get names, etc).
 * - an in-memory byte-level format for efficiently storing and accessing msgs.
 * - serializing and deserializing from the in-memory format to a protobuf.
 * - optional memory management for handling strings, arrays, and submessages.
 *
 * Throughout this file, the following convention is used:
 * - "struct upb_msg *m" describes a message type (name, list of fields, etc).
 * - "void *data" is an actual message stored using the in-memory format.
 *
 * The in-memory format is very much like a C struct that you can define at
 * run-time, but also supports reflection.  Like C structs it supports
 * offset-based access, as opposed to the much slower name-based lookup.  The
 * format stores both the values themselves and bits describing whether each
 * field is set or not.  For example:
 *
 * message Foo {
 *   optional bool a = 1;
 *   repeated uint32 b = 2;
 *   optional Bar c = 3;
 * }
 *
 * The in-memory layout for this message on a 32-bit machine will be something
 * like:
 *
 *  Foo
 * +------------------------+
 * | set_flags a:1, b:1, c:1|
 * +------------------------+
 * | bool a (1 byte)        |
 * +------------------------+
 * | padding (3 bytes)      |
 * +------------------------+         upb_array
 * | upb_array* b (4 bytes) | ---->  +----------------------------+
 * +------------------------+        | uint32* elements (4 bytes) | ---+
 * | Bar* c (4 bytes)       |        +----------------------------+    |
 * +------------------------+        | uint32 size (4 bytes)      |    |
 *                                   +----------------------------+    |
 *                                                                     |
 *    -----------------------------------------------------------------+
 *    |
 *    V
 *  uint32 array
 * +----+----+----+----+----+----+
 * | e1 | e2 | e3 | e4 | e5 | e6 |
 * +----+----+----+----+----+----+
 *
 * And the corresponding C structure (as emitted by the proto compiler) would be:
 *
 * struct Foo {
 *   union {
 *     uint8_t bytes[1];
 *     struct {
 *       bool a:1;
 *       bool b:1;
 *       bool c:1;
 *     } has;
 *   } set_flags;
 *   bool a;
 *   upb_uint32_array *b;
 *   Bar *c;
 * }
 *
 * Because the C struct emitted by the upb compiler uses exactly the same
 * byte-level format as the reflection interface, you can access the same hunk
 * of memory either way.  The C struct provides maximum performance and static
 * type safety; upb_msg provides flexibility.
 *
 * The in-memory format has no interoperability guarantees whatsoever, except
 * that a single version of upb will interoperate with itself.  Don't even
 * think about persisting the in-memory format or sending it anywhere.  That's
 * what serialized protobufs are for!  The in-memory format is just that -- an
 * in-memory representation that allows for fast access.
 *
 * The in-memory format is carefully designed to *not* mandate any particular
 * memory management scheme.  This should make it easier to integrate with
 * existing memory management schemes, or to perform advanced techniques like
 * reference counting, garbage collection, and string references.  Different
 * clients can read each other's messages regardless of what memory management
 * scheme each is using.
 *
 * A memory management scheme is provided for convenience, and it is used by
 * default by the stock message parser.  Clients can substitute their own
 * memory management scheme into this parser without any loss of generality
 * or performance.
 */

#ifndef UPB_MSG_H_
#define UPB_MSG_H_

#include <stdbool.h>
#include <stdint.h>

#include "upb.h"
#include "upb_table.h"
#include "upb_parse.h"

#ifdef __cplusplus
extern "C" {
#endif

/* Message definition. ********************************************************/

/* Structure that describes a single field in a message.  This structure is very
 * consciously designed to fit into 12/16 bytes (32/64 bit, respectively),
 * because copies of this struct are in the hash table that is read in the
 * critical path of parsing.  Minimizing the size of this struct increases
 * cache-friendliness. */
struct upb_msg_field {
  /* Symbol reference for this field.  upb_msg_init() leaves it unresolved; it
   * is filled in afterwards through upb_msg_ref(). */
  union upb_symbol_ref ref;
  uint32_t byte_offset;     /* Where to find the data: byte offset from the
                             * start of the message data (see
                             * upb_msg_getptr()). */
  uint16_t field_index;     /* Indexes upb_msg.fields and
                             * upb_msg.field_descriptors.  Also indicates set
                             * bit (see upb_isset_offset()/upb_isset_mask()). */
  upb_field_type_t type;    /* Copied from descriptor for cache-friendliness. */
  upb_label_t label;        /* Compared against the REPEATED label constant by
                             * upb_isarray(). */
};

/* Structure that describes a single .proto message type. */
struct upb_msg {
  /* NOTE(review): presumably the descriptor "d" handed to upb_msg_init(), which
   * the caller keeps ownership of -- confirm in upb_msg.c. */
  struct google_protobuf_DescriptorProto *descriptor;
  struct upb_string fqname;      /* Fully qualified. */
  /* NOTE(review): presumably the size in bytes of one message in the in-memory
   * format -- confirm in upb_msg.c. */
  size_t size;
  /* NOTE(review): presumably the length of fields[] and field_descriptors[]
   * -- confirm in upb_msg.c. */
  uint32_t num_fields;
  /* Number of bytes of "set" flags at the start of the message data;
   * upb_msg_clear() zeroes exactly this many bytes. */
  uint32_t set_flags_bytes;
  /* Required fields occupy the lowest "set" flag bits;
   * upb_msg_all_required_fields_set() checks the first num_required_fields
   * bits of the message data. */
  uint32_t num_required_fields;
  struct upb_inttable fields_by_num;   /* Queried by upb_msg_fieldbynum(). */
  struct upb_strtable fields_by_name;  /* Queried by upb_msg_fieldbyname(). */
  struct upb_msg_field *fields;  /* Indexed by upb_msg_field.field_index. */
  /* Indexed by upb_msg_field.field_index (see upb_msg_field_descriptor()). */
  struct google_protobuf_FieldDescriptorProto **field_descriptors;
};

/* The num->field and name->field maps in upb_msg allow fast lookup of fields
 * by number or name.  These lookups are in the critical path of parsing and
 * field lookup, so they must be as fast as possible.  To make these more
 * cache-friendly, we put the data in the table by value. */

/* Entry type for upb_msg.fields_by_num.  upb_msg_fieldbynum() casts the
 * table's lookup result to this struct and hands out a pointer to "f". */
struct upb_fieldsbynum_entry {
  struct upb_inttable_entry e;  /* Table entry header. */
  struct upb_msg_field f;       /* Field description, stored by value. */
};

/* Entry type for upb_msg.fields_by_name.  upb_msg_fieldbyname() casts the
 * table's lookup result to this struct and hands out a pointer to "f". */
struct upb_fieldsbyname_entry {
  struct upb_strtable_entry e;  /* Table entry header. */
  struct upb_msg_field f;       /* Field description, stored by value. */
};

/* Returns the field descriptor that belongs to field f of message type m.
 * The descriptor is found by using f->field_index as an index into
 * m->field_descriptors; no bounds checking is performed. */
INLINE struct google_protobuf_FieldDescriptorProto *upb_msg_field_descriptor(
    struct upb_msg_field *f, struct upb_msg *m) {
  struct google_protobuf_FieldDescriptorProto **descriptors =
      m->field_descriptors;
  return descriptors[f->field_index];
}

/* Initializes/frees a upb_msg.  Usually this will be called by upb_context, and
 * clients will not have to construct one directly.
 *
 * Caller retains ownership of d, but the msg will contain references to it, so
 * it must outlive the msg.  Note that init does not resolve
 * upb_msg_field.ref; the caller should do that post-initialization by calling
 * upb_msg_ref() below.
 *
 * fqname indicates the fully-qualified name of this message.  Ownership of
 * fqname passes to the msg, but the msg will contain references to it, so it
 * must outlive the msg.
 * NOTE(review): the fqname wording is self-contradictory (ownership passes to
 * the msg, yet the caller must keep it alive) -- confirm the real contract
 * against upb_msg.c.
 *
 * sort indicates whether or not it is safe to reorder the fields from the order
 * they appear in d.  This should be false if code has been compiled against a
 * header for this type that expects the given order.
 *
 * NOTE(review): the meaning of upb_msg_init()'s bool result is not stated
 * here; presumably true on success -- confirm in upb_msg.c. */
bool upb_msg_init(struct upb_msg *m, struct google_protobuf_DescriptorProto *d,
                  struct upb_string fqname, bool sort);
void upb_msg_free(struct upb_msg *m);

/* Clients use this function on a previously initialized upb_msg to resolve the
 * "ref" field in the upb_msg_field.  Since messages can refer to each other in
 * mutually-recursive ways, this step must be separated from initialization.
 *
 * m:   message type that owns the field.
 * f:   field whose "ref" is being resolved.
 * ref: the resolved symbol reference to store.
 * NOTE(review): upb_msg_field values are also stored by value in the lookup
 * tables (see upb_fieldsbynum_entry/upb_fieldsbyname_entry); presumably this
 * call updates those copies too -- confirm in upb_msg.c. */
void upb_msg_ref(struct upb_msg *m, struct upb_msg_field *f, union upb_symbol_ref ref);

/* Looks up a field by name or number.  While these are written to be as fast
 * as possible, it will still be faster to cache the results of this lookup if
 * possible.  These return NULL if no such field is found. */
/* Number-keyed lookup: queries m->fields_by_num for "number" and returns a
 * pointer to the field stored inside the matching table entry, or NULL when
 * the table has no entry for that number. */
INLINE struct upb_msg_field *upb_msg_fieldbynum(struct upb_msg *m,
                                                uint32_t number) {
  struct upb_fieldsbynum_entry *entry;
  entry = (struct upb_fieldsbynum_entry*)upb_inttable_fast_lookup(
      &m->fields_by_num, number, sizeof(struct upb_fieldsbynum_entry));
  if (!entry) return NULL;
  return &entry->f;
}
/* Name-keyed lookup: queries m->fields_by_name for "name" and returns a
 * pointer to the field stored inside the matching table entry, or NULL when
 * the table has no entry for that name. */
INLINE struct upb_msg_field *upb_msg_fieldbyname(struct upb_msg *m,
                                                 struct upb_string *name) {
  struct upb_fieldsbyname_entry *entry;
  entry = (struct upb_fieldsbyname_entry*)upb_strtable_lookup(
      &m->fields_by_name, name);
  if (!entry) return NULL;
  return &entry->f;
}

INLINE bool upb_issubmsg(struct upb_msg_field *f) {
  return upb_issubmsgtype(f->type);
}
INLINE bool upb_isstring(struct upb_msg_field *f) {
  return upb_isstringtype(f->type);
}
INLINE bool upb_isarray(struct upb_msg_field *f) {
  return f->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REPEATED;
}

/* "Set" flag reading and writing.  *******************************************/

/* Byte offset, from the start of the message data, of the byte holding the
 * "set" flag for the field with the given index (eight flags per byte). */
INLINE size_t upb_isset_offset(uint32_t field_index) {
  const uint32_t byte_index = field_index >> 3;  /* same as field_index / 8 */
  return byte_index;
}

/* Single-bit mask selecting the "set" flag of the field with the given index
 * within its flag byte (bit number is field_index modulo 8). */
INLINE uint8_t upb_isset_mask(uint32_t field_index) {
  const uint32_t bit = field_index & 7u;  /* same as field_index % 8 */
  return (uint8_t)(1u << bit);
}

/* Functions for reading and writing the "set" flags in the msg.  Note that
 * these do not perform memory management associated with any dynamic memory
 * these fields may be referencing. These *only* set and test the flags. */
/* Marks field f as set in the message data s by turning on its bit in the
 * "set" flags at the start of the message.  Only the flag is touched; the
 * field's value is not written. */
INLINE void upb_msg_set(void *s, struct upb_msg_field *f)
{
  /* The flags are raw bytes, so go through uint8_t rather than plain char:
   * with a signed char, storing a value with bit 7 set is an out-of-range
   * conversion whose result is implementation-defined. */
  uint8_t *flags = (uint8_t*)s;
  flags[upb_isset_offset(f->field_index)] |= upb_isset_mask(f->field_index);
}

/* Marks field f as not set in the message data s by clearing its bit in the
 * "set" flags at the start of the message.  Only the flag is touched; any
 * memory the field refers to is left alone. */
INLINE void upb_msg_unset(void *s, struct upb_msg_field *f)
{
  /* The flags are raw bytes, so go through uint8_t rather than plain char to
   * keep the read-modify-write free of signed-char conversions.  The
   * complement is computed in int after promotion, hence the explicit
   * narrowing back to one byte. */
  uint8_t *flags = (uint8_t*)s;
  const uint8_t keep = (uint8_t)~upb_isset_mask(f->field_index);
  flags[upb_isset_offset(f->field_index)] &= keep;
}

/* Reports whether field f's "set" flag is on in the message data s.  This
 * reads the flag only; it does not look at the field's value. */
INLINE bool upb_msg_is_set(void *s, struct upb_msg_field *f)
{
  const size_t flag_byte = upb_isset_offset(f->field_index);
  const uint8_t flag_bit = upb_isset_mask(f->field_index);
  return (((char*)s)[flag_byte] & flag_bit) != 0;
}

/* Returns true when every required field of message type m has its "set" flag
 * on in the message data s.
 *
 * Required fields occupy the lowest m->num_required_fields flag bits, so this
 * checks whole 0xFF bytes first and then the low bits of one trailing byte.
 * The trailing byte is masked before comparing, because its upper bits belong
 * to non-required fields and must not influence the answer.  When the
 * required count is a multiple of 8 (including 0) no trailing byte is read at
 * all. */
INLINE bool upb_msg_all_required_fields_set(void *s, struct upb_msg *m)
{
  const uint8_t *flags = (const uint8_t*)s;
  uint32_t remaining = m->num_required_fields;
  size_t i = 0;

  /* Bytes made up entirely of required-field flags must be all ones. */
  for(; remaining >= 8; remaining -= 8) {
    if(flags[i++] != 0xFF) return false;
  }

  /* Partial byte: only the low "remaining" bits are required-field flags. */
  if(remaining > 0) {
    const uint8_t required_mask = (uint8_t)((1u << remaining) - 1u);
    if((flags[i] & required_mask) != required_mask) return false;
  }
  return true;
}

/* Marks every field of the message data s as not set by zeroing the first
 * m->set_flags_bytes bytes of s.  Field values are not touched.
 * NOTE(review): memset() is declared in <string.h>, which this header does not
 * include directly; presumably it arrives through upb.h -- confirm. */
INLINE void upb_msg_clear(void *s, struct upb_msg *m)
{
  const size_t flag_bytes = m->set_flags_bytes;
  memset(s, 0, flag_bytes);
}

/* Scalar (non-array) data access. ********************************************/

/* Returns a pointer to a specific field in a message. */
INLINE union upb_value_ptr upb_msg_getptr(void *data, struct upb_msg_field *f) {
  union upb_value_ptr p;
  p._void = ((char*)data + f->byte_offset);
  return p;
}

/* Memory management  *********************************************************/

/* One important note about these memory management routines: they must be used
 * completely or not at all (for each message).  In other words, you can't
 * allocate your own message and then free it with upb_msgdata_free.  As
 * another example, you can't point a field to your own string and then call
 * upb_msg_reuse_str. */

/* Allocates and frees message data, respectively.  Newly allocated data is
 * initialized to empty.  Freeing a message always frees string data, but
 * the client can decide whether or not submessages should be deleted.
 *
 * m:            message type describing the data's layout.
 * data:         message data to free.
 * free_submsgs: whether submessages are deleted along with the message.
 * NOTE(review): the failure behaviour of upb_msgdata_new() (e.g. a NULL
 * return) is not stated here -- confirm in upb_msg.c. */
void *upb_msgdata_new(struct upb_msg *m);
void upb_msgdata_free(void *data, struct upb_msg *m, bool free_submsgs);

/* Given a pointer to the appropriate field of the message or array, these
 * functions will lazily allocate memory for a string, array, or submessage.
 * If the previously allocated memory is big enough, it will reuse it without
 * re-allocating.  See upb_msg.c for example usage.
 *
 * Each function receives the address of the pointer held in the field.
 * NOTE(review): presumably so the held pointer can be replaced when new memory
 * has to be allocated -- confirm in upb_msg.c. */

/* Reuse a string of at least the given size. */
void upb_msg_reuse_str(struct upb_string **str, uint32_t size);
/* Like the previous, but assumes that the string will be by reference, so
 * doesn't allocate memory for the string itself. */
void upb_msg_reuse_strref(struct upb_string **str);

/* Reuse an array of at least the given size, with the given type.
 * NOTE(review): presumably "size" counts elements of type t rather than
 * bytes -- confirm in upb_msg.c. */
void upb_msg_reuse_array(struct upb_array **arr, uint32_t size,
                         upb_field_type_t t);

/* Reuse a submessage of the given type. */
void upb_msg_reuse_submsg(void **msg, struct upb_msg *m);

/* Serialization/Deserialization.  ********************************************/

/* This is all just a layer on top of the stream-oriented facility in
 * upb_parse.h. */

/* One element of upb_msg_parse_state.stack: a message type paired with the
 * message data being worked on. */
struct upb_msg_parse_frame {
  struct upb_msg *m;  /* Message type (see the conventions at top of file). */
  void *data;         /* Message data in the in-memory format. */
};

//#include "upb_text.h"
/* State for parsing a protobuf into message data; set up by
 * upb_msg_parse_init() and consumed by upb_msg_parse(). */
struct upb_msg_parse_state {
  struct upb_parse_state s;  /* State of the stream parser from upb_parse.h. */
  bool merge;  /* See the "Merge" description on upb_msg_parse_init(). */
  bool byref;  /* See the "Byref" description on upb_msg_parse_init(). */
  /* NOTE(review): presumably the message type passed to
   * upb_msg_parse_init() -- confirm in upb_msg.c. */
  struct upb_msg *m;
  /* Frame stack with room for UPB_MAX_NESTING frames.
   * NOTE(review): "top" presumably points at the frame currently being
   * parsed -- confirm in upb_msg.c. */
  struct upb_msg_parse_frame stack[UPB_MAX_NESTING], *top;
  //struct upb_text_printer p;
};

/* Initializes/frees a message parser.  The parser will write the data to the
 * message data "data", which the caller must have previously allocated (the
 * parser will allocate submsgs, strings, and arrays as needed, however).
 *
 * "Merge" controls whether the parser will append to data instead of
 * overwriting.  Merging concatenates arrays and merges submessages instead
 * of clearing both.
 *
 * "Byref" controls whether the new message data copies or references strings
 * it encounters.  If byref == true, then all strings supplied to upb_msg_parse
 * must remain unchanged and must outlive data.
 *
 * upb_msg_parse_reset() takes the same arguments as upb_msg_parse_init().
 * NOTE(review): presumably it re-targets an already initialized parser state
 * without a free/init cycle -- confirm in upb_msg.c. */
void upb_msg_parse_init(struct upb_msg_parse_state *s, void *data,
                        struct upb_msg *m, bool merge, bool byref);
void upb_msg_parse_reset(struct upb_msg_parse_state *s, void *data,
                         struct upb_msg *m, bool merge, bool byref);
void upb_msg_parse_free(struct upb_msg_parse_state *s);

/* Parses a protobuf fragment, writing the data to the message that was passed
 * to upb_msg_parse_init.  This function can be called multiple times as more
 * data becomes available.
 *
 * NOTE(review): unlike elsewhere in this file, "data" here is presumably the
 * serialized input fragment of "len" bytes (not message data), and "*read"
 * presumably receives the number of bytes consumed -- confirm in upb_msg.c. */
upb_status_t upb_msg_parse(struct upb_msg_parse_state *s,
                           void *data, size_t len, size_t *read);

/* Parses the protobuf in s (which is expected to be complete) and allocates
 * new message data to hold it.  This is an alternative to the streaming API
 * above.  "byref" works as in upb_msg_parse_init().
 *
 * NOTE(review): the result on a parse failure (presumably NULL) and how the
 * returned data must be released (presumably upb_msgdata_free()) are not
 * stated here -- confirm in upb_msg.c. */
void *upb_alloc_and_parse(struct upb_msg *m, struct upb_string *s, bool byref);


/* Text dump  *****************************************************************/

/* NOTE(review): presumably compares two messages of type m for equality, with
 * "recursive" controlling whether submessages are compared as well -- confirm
 * in upb_msg.c. */
bool upb_msg_eql(void *data1, void *data2, struct upb_msg *m, bool recursive);
/* NOTE(review): presumably writes a text rendering of the message data (of
 * type m) to "stream".  FILE comes from <stdio.h>, which this header does not
 * include directly -- confirm it arrives through upb.h. */
void upb_msg_print(void *data, struct upb_msg *m, FILE *stream);

#ifdef __cplusplus
}  /* extern "C" */
#endif

#endif  /* UPB_MSG_H_ */