summaryrefslogtreecommitdiff
path: root/upb/bytestream.h
blob: 3b339f17d75d75b49ef4a9b880d74f22376c75da (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
/*
 * upb - a minimalist implementation of protocol buffers.
 *
 * Copyright (c) 2011 Google Inc.  See LICENSE for details.
 * Author: Josh Haberman <jhaberman@gmail.com>
 *
 * This file defines three core interfaces:
 * - upb_bytesink: for writing streams of data.
 * - upb_bytesrc: for reading streams of data.
 * - upb_byteregion: for reading from a specific region of a upb_bytesrc;
 *   should be used by decoders instead of using upb_bytesrc directly.
 *
 * These interfaces are used by streaming encoders and decoders: for example, a
 * protobuf parser gets its input from a upb_byteregion.  They are virtual base
 * classes so concrete implementations can get the data from a fd, a FILE*, a
 * string, etc.
 */

// A upb_byteregion represents a region of data from a bytesrc.
//
// Parsers get data from this interface instead of a bytesrc because we often
// want to parse only a specific region of the input.  For example, if we parse
// a string from our input but know that the string represents a protobuf, we
// can pass its upb_byteregion to an appropriate protobuf parser.
//
// Since the bytes may be coming from a file or network socket, bytes must be
// fetched before they can be read (though in some cases this fetch may be a
// no-op).  "fetch" is the only operation on a byteregion that could fail or
// block, because it is the only operation that actually performs I/O.
//
// Bytes can be discarded when they are no longer needed.  Parsers should
// always discard bytes they no longer need, both so the buffers can be freed
// when possible and to give better visibility into what bytes the parser is
// still using.
//
// start      discard                     read             fetch             end
// ofs          ofs                       ofs               ofs              ofs
// |             |--->discard()            |                 |--->fetch()      |
// V             V                         V                 V                 V
// +-------------+-------------------------+-----------------+-----------------+
// |  discarded  |                         |                 |    fetchable    |
// +-------------+-------------------------+-----------------+-----------------+
//               | <------------- loaded ------------------> |
//                                         | <- available -> |
//                                         | <---------- remaining ----------> |
//
// Note that the start offset may be something other than zero!  A byteregion
// is a view into an underlying bytesrc stream, and the region may start
// somewhere other than the beginning of that stream.
//
// The region can be either delimited or nondelimited.  A non-delimited region
// will keep returning data until the underlying data source returns EOF.  A
// delimited region will return EOF at a predetermined offset.
//
//                       end
//                       ofs
//                         |
//                         V
// +-----------------------+
// |  delimited region     |   <-- hard EOF, even if data source has more data.
// +-----------------------+
//
// +------------------------
// | nondelimited region   Z   <-- won't return EOF until data source hits EOF.
// +------------------------


#ifndef UPB_BYTESTREAM_H
#define UPB_BYTESTREAM_H

#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "upb.h"

#ifdef __cplusplus
extern "C" {
#endif


/* upb_bytesrc ****************************************************************/

// A upb_bytesrc allows the consumer of a stream of bytes to obtain buffers as
// they become available, and to preserve some trailing amount of data before
// it is discarded.  Consumers should not use upb_bytesrc directly, but rather
// should use a upb_byteregion (which allows access to a region of a bytesrc).
//
// upb_bytesrc is a virtual base class with implementations that get data from
// eg. a string, a cord, a file descriptor, a FILE*, etc.

typedef uint32_t upb_bytesrc_fetch_func(void*, uint64_t, upb_status*);
typedef void upb_bytesrc_discard_func(void*, uint64_t);
typedef void upb_bytesrc_copy_func(const void*, uint64_t, uint32_t, char*);
typedef const char *upb_bytesrc_getptr_func(const void*, uint64_t, uint32_t*);
typedef struct _upb_bytesrc_vtbl {
  upb_bytesrc_fetch_func     *fetch;
  upb_bytesrc_discard_func   *discard;
  upb_bytesrc_copy_func      *copy;
  upb_bytesrc_getptr_func    *getptr;
} upb_bytesrc_vtbl;

typedef struct {
  upb_bytesrc_vtbl  *vtbl;
} upb_bytesrc;

INLINE void upb_bytesrc_init(upb_bytesrc *src, upb_bytesrc_vtbl *vtbl) {
  src->vtbl = vtbl;
}

// Fetches at least one byte starting at ofs, returning the actual number of
// bytes fetched (or 0 on EOF or error: see *s for details).  Some bytesrc's
// may set EOF on *s after a successful read if no further data is available,
// but not all bytesrc's support this.  It is valid for bytes to be fetched
// multiple times, as long as the bytes have not been previously discarded.
INLINE uint32_t upb_bytesrc_fetch(upb_bytesrc *src, uint64_t ofs,
                                  upb_status *s) {
  return src->vtbl->fetch(src, ofs, s);
}

// Discards all data prior to ofs (except data that is pinned, if pinning
// support is added -- see TODO below).
INLINE void upb_bytesrc_discard(upb_bytesrc *src, uint64_t ofs) {
  src->vtbl->discard(src, ofs);
}

// Copies "len" bytes of data from ofs to "dst", which must be at least "len"
// bytes long.  The given region must not be discarded.
INLINE void upb_bytesrc_copy(const upb_bytesrc *src, uint64_t ofs, uint32_t len,
                             char *dst) {
  src->vtbl->copy(src, ofs, len, dst);
}

// Returns a pointer to the bytesrc's internal buffer, storing in *len how much
// data is available.  The given offset must not be discarded.  The returned
// buffer is valid for as long as its bytes are not discarded (in the case that
// part of the returned buffer is discarded, only the non-discarded bytes
// remain valid).
INLINE const char *upb_bytesrc_getptr(const upb_bytesrc *src, uint64_t ofs,
                                      uint32_t *len) {
  return src->vtbl->getptr(src, ofs, len);
}

// TODO: Add if/when there is a demonstrated need:
//
// // When the caller pins a region (which must not be already discarded), it
// // is guaranteed that the region will not be discarded (nor will the bytesrc
// // be destroyed) until the region is unpinned.  However, not all bytesrc's
// // support pinning; a false return indicates that a pin was not possible.
// INLINE bool upb_bytesrc_pin(upb_bytesrc *src, uint64_t ofs, uint32_t len) {
//   return src->vtbl->refregion(src, ofs, len);
// }
//
// // Releases some number of pinned bytes from the beginning of a pinned
// // region (which may be fewer than the total number of bytes pinned).
// INLINE void upb_bytesrc_unpin(upb_bytesrc *src, uint64_t ofs, uint32_t len,
//                               uint32_t bytes_to_release) {
//   src->vtbl->unpin(src, ofs, len);
// }
//
// Adding pinning support would also involve adding a "pin_ofs" parameter to
// upb_bytesrc_fetch, so that the fetch can extend an already-pinned region.


/* upb_byteregion *************************************************************/

#define UPB_NONDELIMITED (0xffffffffffffffffULL)

typedef struct _upb_byteregion {
  uint64_t start;
  uint64_t discard;
  uint64_t fetch;
  uint64_t end;         // UPB_NONDELIMITED if nondelimited.
  upb_bytesrc *bytesrc;
  bool toplevel;        // If true, discards hit the underlying byteregion.
} upb_byteregion;

// Initializes a byteregion.  Its initial value will be empty.  No methods may
// be called on an empty byteregion except upb_byteregion_reset().
void upb_byteregion_init(upb_byteregion *r);
void upb_byteregion_uninit(upb_byteregion *r);

// Accessors for the regions bounds -- the meaning of these is described in the
// diagram above.
INLINE uint64_t upb_byteregion_startofs(const upb_byteregion *r) {
  return r->start;
}
INLINE uint64_t upb_byteregion_discardofs(const upb_byteregion *r) {
  return r->discard;
}
INLINE uint64_t upb_byteregion_fetchofs(const upb_byteregion *r) {
  return r->fetch;
}
INLINE uint64_t upb_byteregion_endofs(const upb_byteregion *r) {
  return r->end;
}

// Returns how many bytes are fetched and available for reading starting
// from offset "o".
INLINE uint64_t upb_byteregion_available(const upb_byteregion *r, uint64_t o) {
  assert(o >= upb_byteregion_discardofs(r));
  assert(o <= r->fetch);  // Could relax this.
  return r->fetch - o;
}

// Returns the total number of bytes remaining after offset "o", or
// UPB_NONDELIMITED if the byteregion is non-delimited.
INLINE uint64_t upb_byteregion_remaining(const upb_byteregion *r, uint64_t o) {
  return r->end == UPB_NONDELIMITED ? UPB_NONDELIMITED : r->end - o;
}

INLINE uint64_t upb_byteregion_len(const upb_byteregion *r) {
  return upb_byteregion_remaining(r, r->start);
}

// Sets the value of this byteregion to be a subset of the given byteregion's
// data.  The caller is responsible for releasing this region before the src
// region is released (unless the region is first pinned, if pinning support is
// added.  see below).
void upb_byteregion_reset(upb_byteregion *r, const upb_byteregion *src,
                          uint64_t ofs, uint64_t len);
void upb_byteregion_release(upb_byteregion *r);

// Attempts to fetch more data, extending the fetched range of this byteregion.
// Returns true if the fetched region was extended by at least one byte, false
// on EOF or error (see *s for details).
bool upb_byteregion_fetch(upb_byteregion *r, upb_status *s);

// Fetches all remaining data for "r", returning false if the operation failed
// (see "*s" for details).  May only be used on delimited byteregions.
INLINE bool upb_byteregion_fetchall(upb_byteregion *r, upb_status *s) {
  assert(upb_byteregion_len(r) != UPB_NONDELIMITED);
  while (upb_byteregion_fetch(r, s)) ;  // Empty body.
  return upb_eof(s);
}

// Discards bytes from the byteregion up until ofs (which must be greater or
// equal to upb_byteregion_discardofs()).  It is valid to discard bytes that
// have not been fetched (such bytes will never be fetched) but it is an error
// to discard past the end of a delimited byteregion.
INLINE void upb_byteregion_discard(upb_byteregion *r, uint64_t ofs) {
  assert(ofs >= upb_byteregion_discardofs(r));
  assert(ofs <= upb_byteregion_endofs(r));
  r->discard = ofs;
  if (r->toplevel) upb_bytesrc_discard(r->bytesrc, ofs);
}

// Copies "len" bytes of data into "dst", starting at ofs.  The specified
// region must be available.
INLINE void upb_byteregion_copy(const upb_byteregion *r, uint64_t ofs,
                                uint32_t len, char *dst) {
  assert(ofs >= upb_byteregion_discardofs(r));
  assert(len <= upb_byteregion_available(r, ofs));
  upb_bytesrc_copy(r->bytesrc, ofs, len, dst);
}

// Copies all bytes from the byteregion into dst.  Requires that the entire
// byteregion is fetched and that none has been discarded.
INLINE void upb_byteregion_copyall(const upb_byteregion *r, char *dst) {
  assert(r->start == r->discard && r->end == r->fetch);
  upb_byteregion_copy(r, r->start, upb_byteregion_len(r), dst);
}

// Returns a pointer to the internal buffer for the byteregion starting at
// offset "ofs." Stores the number of bytes available in this buffer in *len.
// The returned buffer is invalidated when the byteregion is reset or released,
// or when the bytes are discarded.  If the byteregion is not currently pinned,
// the pointer is only valid for the lifetime of the parent byteregion.
INLINE const char *upb_byteregion_getptr(const upb_byteregion *r,
                                         uint64_t ofs, uint32_t *len) {
  assert(ofs >= upb_byteregion_discardofs(r));
  const char *ret = upb_bytesrc_getptr(r->bytesrc, ofs, len);
  *len = UPB_MIN(*len, upb_byteregion_available(r, ofs));
  return ret;
}

// TODO: add if/when there is a demonstrated need.
//
// // Pins this byteregion's bytes in memory, allowing it to outlive its parent
// // byteregion.  Normally a byteregion may only be used while its parent is
// // still valid, but a pinned byteregion may continue to be used until it is
// // reset or released.  A byteregion must be fully fetched to be pinned
// // (this implies that the byteregion must be delimited).
// //
// // In some cases this operation may cause the input data to be copied.
// //
// // void upb_byteregion_pin(upb_byteregion *r);

// Convenience functions for creating and destroying a byteregion with a simple
// string as its data.  These are relatively inefficient compared with creating
// your own bytesrc (they call malloc() and copy the string data) so should not
// be used on any critical path.
//
// The string data in the returned region is guaranteed to be contiguous and
// NULL-terminated.
upb_byteregion *upb_byteregion_new(const void *str);
upb_byteregion *upb_byteregion_newl(const void *str, uint32_t len);
// May *only* be called on a byteregion created with upb_byteregion_new[l]()!
void upb_byteregion_free(upb_byteregion *r);

// Copies the contents of the byteregion into a newly-allocated, NULL-terminated
// string.  Requires that the byteregion is fully fetched.
char *upb_byteregion_strdup(const upb_byteregion *r);


/* upb_bytesink ***************************************************************/

// A bytesink is an interface that allows the caller to push byte-wise data.
// It is very simple -- the only special capability is the ability to "rewind"
// the stream, which is really only a mechanism of having the bytesink ignore
// some subsequent calls.
typedef int upb_bytesink_write_func(void*, const void*, int);
typedef int upb_bytesink_vprintf_func(void*, const char *fmt, va_list args);

typedef struct {
  upb_bytesink_write_func   *write;
  upb_bytesink_vprintf_func *vprintf;
} upb_bytesink_vtbl;

typedef struct {
  upb_bytesink_vtbl *vtbl;
  upb_status status;
  uint64_t offset;
} upb_bytesink;

// Should be called by derived classes.
void upb_bytesink_init(upb_bytesink *sink, upb_bytesink_vtbl *vtbl);
void upb_bytesink_uninit(upb_bytesink *sink);

INLINE int upb_bytesink_write(upb_bytesink *s, const void *buf, int len) {
  return s->vtbl->write(s, buf, len);
}

INLINE int upb_bytesink_writestr(upb_bytesink *sink, const char *str) {
  return upb_bytesink_write(sink, str, strlen(str));
}

// Returns the number of bytes written or -1 on error.
INLINE int upb_bytesink_printf(upb_bytesink *sink, const char *fmt, ...) {
  va_list args;
  va_start(args, fmt);
  uint32_t ret = sink->vtbl->vprintf(sink, fmt, args);
  va_end(args);
  return ret;
}

INLINE int upb_bytesink_putc(upb_bytesink *sink, char ch) {
  return upb_bytesink_write(sink, &ch, 1);
}

INLINE int upb_bytesink_putrepeated(upb_bytesink *sink, char ch, int len) {
  for (int i = 0; i < len; i++)
    if (upb_bytesink_write(sink, &ch, 1) < 0)
      return -1;
  return len;
}

INLINE uint64_t upb_bytesink_getoffset(upb_bytesink *sink) {
  return sink->offset;
}

// Rewinds the stream to the given offset.  This cannot actually "unput" any
// data, it is for situations like:
//
// // If false is returned (because of error), call again later to resume.
// bool write_some_data(upb_bytesink *sink, int indent) {
//   uint64_t start_offset = upb_bytesink_getoffset(sink);
//   if (upb_bytesink_writestr(sink, "Some data") < 0) goto err;
//   if (upb_bytesink_putrepeated(sink, ' ', indent) < 0) goto err;
//   return true;
//  err:
//   upb_bytesink_rewind(sink, start_offset);
//   return false;
// }
//
// The subsequent bytesink writes *must* be identical to the writes that were
// rewinded past.
INLINE void upb_bytesink_rewind(upb_bytesink *sink, uint64_t offset) {
  // TODO
  (void)sink;
  (void)offset;
}

// OPT: add getappendbuf()
// OPT: add writefrombytesrc()
// TODO: add flush()


/* upb_stdio ******************************************************************/

// bytesrc/bytesink for ANSI C stdio, which is less efficient than posixfd, but
// more portable.
//
// Specifically, stdio functions acquire locks on every operation (unless you
// use the f{read,write,...}_unlocked variants, which are not standard) and
// performs redundant buffering (unless you disable it with setvbuf(), but we
// can only do this on newly-opened filehandles).

typedef struct {
  uint64_t ofs;
  uint32_t len;
  uint32_t refcount;
  char data[];
} upb_stdio_buf;

// We use a single object for both bytesrc and bytesink for simplicity.
// The object is still not thread-safe, and may only be used by one reader
// and one writer at a time.
typedef struct {
  upb_bytesrc src;
  upb_bytesink sink;
  FILE *file;
  bool should_close;
  upb_stdio_buf **bufs;
  uint32_t nbuf, szbuf;
  upb_byteregion byteregion;
} upb_stdio;

void upb_stdio_init(upb_stdio *stdio);
// Caller should call upb_stdio_flush prior to calling this to ensure that
// all data is flushed, otherwise data can be silently dropped if an error
// occurs flushing the remaining buffers.
void upb_stdio_uninit(upb_stdio *stdio);

// Resets the object to read/write to the given "file."  The caller is
// responsible for closing the file, which must outlive this object.
void upb_stdio_reset(upb_stdio *stdio, FILE *file);

// As an alternative to upb_stdio_reset(), initializes the object by opening a
// file, and will handle closing it.  This may result in more efficient I/O
// than the previous since we can call setvbuf() to disable buffering.
void upb_stdio_open(upb_stdio *stdio, const char *filename, const char *mode,
                    upb_status *s);

upb_byteregion *upb_stdio_allbytes(upb_stdio *stdio);
upb_bytesink *upb_stdio_bytesink(upb_stdio *stdio);


/* upb_stringsrc **************************************************************/

// bytesrc/bytesink for a simple contiguous string.

typedef struct {
  upb_bytesrc bytesrc;
  const char *str;
  uint32_t len;
  upb_byteregion byteregion;
} upb_stringsrc;

// Create/free a stringsrc.
void upb_stringsrc_init(upb_stringsrc *s);
void upb_stringsrc_uninit(upb_stringsrc *s);

// Resets the stringsrc to a state where it will vend the given string.  The
// string data must be valid until the stringsrc is reset again or destroyed.
void upb_stringsrc_reset(upb_stringsrc *s, const char *str, uint32_t len);

// Returns the top-level upb_byteregion* for this stringsrc.  Invalidated when
// the stringsrc is reset.
INLINE upb_byteregion *upb_stringsrc_allbytes(upb_stringsrc *s) {
  return &s->byteregion;
}


/* upb_stringsink *************************************************************/

struct _upb_stringsink {
  upb_bytesink bytesink;
  char *str;
  uint32_t len, size;
};
typedef struct _upb_stringsink upb_stringsink;

// Create/free a stringsrc.
void upb_stringsink_init(upb_stringsink *s);
void upb_stringsink_uninit(upb_stringsink *s);

// Resets the sink's string to "str", which the sink takes ownership of.
// "str" may be NULL, which will make the sink allocate a new string.
void upb_stringsink_reset(upb_stringsink *s, char *str, uint32_t len);

// Releases ownership of the returned string (which is "len" bytes long) and
// resets the internal string to be empty again (as if reset were called with
// NULL).
const char *upb_stringsink_release(upb_stringsink *s, uint32_t *len);

// Returns the upb_bytesink* for this stringsrc.  Invalidated by reset above.
upb_bytesink *upb_stringsink_bytesink(upb_stringsink *s);

#ifdef __cplusplus
}  /* extern "C" */
#endif

#endif
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback