From b5f5ee867e6c91b77490dc8894236f17a47bde00 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Wed, 23 Nov 2011 16:19:22 -0800 Subject: Refinement of upb_bytesrc interface. Added a upb_byteregion that tracks a region of the input buffer; decoders use this instead of using a upb_bytesrc directly. upb_byteregion is also used as the way of passing a string to a upb_handlers callback. This symmetry makes decoders compose better; if you want to take a parsed string and decode it as something else, you can take the string directly from the callback and feed it as input to another parser. A commented-out version of a pinning interface is present; I decline to actually implement it (and accept its extra complexity) until/unless it is clear that it is actually a win. But it is included as a proof-of-concept, to show that it fits well with the existing interface. --- upb/bytestream.h | 378 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 258 insertions(+), 120 deletions(-) (limited to 'upb/bytestream.h') diff --git a/upb/bytestream.h b/upb/bytestream.h index cbaef48..3b339f1 100644 --- a/upb/bytestream.h +++ b/upb/bytestream.h @@ -4,19 +4,73 @@ * Copyright (c) 2011 Google Inc. See LICENSE for details. * Author: Josh Haberman * - * This file contains upb_bytesrc and upb_bytesink, which are abstractions of - * stdio (fread()/fwrite()/etc) that provide useful buffering/sharing - * semantics. They are virtual base classes so concrete implementations - * can get the data from a fd, a string, a cord, etc. + * This file defines three core interfaces: + * - upb_bytesink: for writing streams of data. + * - upb_bytesrc: for reading streams of data. + * - upb_byteregion: for reading from a specific region of a upb_bytesrc; + * should be used by decoders instead of using upb_bytesrc directly. * - * Byte streams are NOT thread-safe! (Like f{read,write}_unlocked()) - * This may change (in particular, bytesrc objects may be better thread-safe). 
+ * These interfaces are used by streaming encoders and decoders: for example, a + * protobuf parser gets its input from a upb_byteregion. They are virtual base + * classes so concrete implementations can get the data from a fd, a FILE*, a + * string, etc. */ +// A upb_byteregion represents a region of data from a bytesrc. +// +// Parsers get data from this interface instead of a bytesrc because we often +// want to parse only a specific region of the input. For example, if we parse +// a string from our input but know that the string represents a protobuf, we +// can pass its upb_byteregion to an appropriate protobuf parser. +// +// Since the bytes may be coming from a file or network socket, bytes must be +// fetched before they can be read (though in some cases this fetch may be a +// no-op). "fetch" is the only operation on a byteregion that could fail or +// block, because it is the only operation that actually performs I/O. +// +// Bytes can be discarded when they are no longer needed. Parsers should +// always discard bytes they no longer need, both so the buffers can be freed +// when possible and to give better visibility into what bytes the parser is +// still using. +// +// start discard read fetch end +// ofs ofs ofs ofs ofs +// | |--->discard() | |--->fetch() | +// V V V V V +// +-------------+-------------------------+-----------------+-----------------+ +// | discarded | | | fetchable | +// +-------------+-------------------------+-----------------+-----------------+ +// | <------------- loaded ------------------> | +// | <- available -> | +// | <---------- remaining ----------> | +// +// Note that the start offset may be something other than zero! A byteregion +// is a view into an underlying bytesrc stream, and the region may start +// somewhere other than the beginning of that stream. +// +// The region can be either delimited or nondelimited. A non-delimited region +// will keep returning data until the underlying data source returns EOF. 
A +// delimited region will return EOF at a predetermined offset. +// +// end +// ofs +// | +// V +// +-----------------------+ +// | delimited region | <-- hard EOF, even if data source has more data. +// +-----------------------+ +// +// +------------------------ +// | nondelimited region Z <-- won't return EOF until data source hits EOF. +// +------------------------ + + #ifndef UPB_BYTESTREAM_H #define UPB_BYTESTREAM_H #include +#include +#include #include #include #include "upb.h" @@ -29,25 +83,22 @@ extern "C" { /* upb_bytesrc ****************************************************************/ // A upb_bytesrc allows the consumer of a stream of bytes to obtain buffers as -// they become available, and to preserve some trailing amount of data, which -// is useful for lazy parsing (among other things). If there is a submessage -// that we want to parse later we can take a reference on that region of the -// input buffer. This will guarantee that the bytesrc keeps the submessage -// data around for later use, without requiring a copy out of the input -// buffers. -typedef size_t upb_bytesrc_fetch_func(void*, uint64_t, upb_status*); -typedef void upb_bytesrc_read_func(const void*, uint64_t, size_t, char*); -typedef const char *upb_bytesrc_getptr_func(void*, uint64_t, size_t*); -typedef void upb_bytesrc_refregion_func(void*, uint64_t, size_t); -typedef void upb_bytesrc_ref_func(void*); +// they become available, and to preserve some trailing amount of data before +// it is discarded. Consumers should not use upb_bytesrc directly, but rather +// should use a upb_byteregion (which allows access to a region of a bytesrc). +// +// upb_bytesrc is a virtual base class with implementations that get data from +// eg. a string, a cord, a file descriptor, a FILE*, etc. 
+ +typedef uint32_t upb_bytesrc_fetch_func(void*, uint64_t, upb_status*); +typedef void upb_bytesrc_discard_func(void*, uint64_t); +typedef void upb_bytesrc_copy_func(const void*, uint64_t, uint32_t, char*); +typedef const char *upb_bytesrc_getptr_func(const void*, uint64_t, uint32_t*); typedef struct _upb_bytesrc_vtbl { upb_bytesrc_fetch_func *fetch; - upb_bytesrc_read_func *read; + upb_bytesrc_discard_func *discard; + upb_bytesrc_copy_func *copy; upb_bytesrc_getptr_func *getptr; - upb_bytesrc_refregion_func *refregion; - upb_bytesrc_refregion_func *unrefregion; - upb_bytesrc_ref_func *ref; - upb_bytesrc_ref_func *unref; } upb_bytesrc_vtbl; typedef struct { @@ -59,114 +110,198 @@ INLINE void upb_bytesrc_init(upb_bytesrc *src, upb_bytesrc_vtbl *vtbl) { } // Fetches at least one byte starting at ofs, returning the actual number of -// bytes fetched (or 0 on error: see "s" for details). A successful return -// gives caller a ref on the fetched region. -// -// If "ofs" may be greater or equal than the end of the already-fetched region. -// It may also be less than the end of the already-fetch region *if* either of -// the following is true: -// -// * the region is ref'd (this implies that the data is still in-memory) -// * the bytesrc is seekable (this implies that the data can be fetched again). -INLINE size_t upb_bytesrc_fetch(upb_bytesrc *src, uint64_t ofs, upb_status *s) { +// bytes fetched (or 0 on EOF or error: see *s for details). Some bytesrc's +// may set EOF on *s after a successful read if no further data is available, +// but not all bytesrc's support this. It is valid for bytes to be fetched +// multiple times, as long as the bytes have not been previously discarded. +INLINE uint32_t upb_bytesrc_fetch(upb_bytesrc *src, uint64_t ofs, + upb_status *s) { return src->vtbl->fetch(src, ofs, s); } -// Copies "len" bytes of data from offset src_ofs to "dst", which must be at -// least "len" bytes long. The caller must own a ref on the given region. 
-INLINE void upb_bytesrc_read(const upb_bytesrc *src, uint64_t src_ofs, - size_t len, char *dst) { - src->vtbl->read(src, src_ofs, len, dst); +// Discards all data prior to ofs (except data that is pinned, if pinning +// support is added -- see TODO below). +INLINE void upb_bytesrc_discard(upb_bytesrc *src, uint64_t ofs) { + src->vtbl->discard(src, ofs); +} + +// Copies "len" bytes of data from ofs to "dst", which must be at least "len" +// bytes long. The given region must not be discarded. +INLINE void upb_bytesrc_copy(const upb_bytesrc *src, uint64_t ofs, uint32_t len, + char *dst) { + src->vtbl->copy(src, ofs, len, dst); } // Returns a pointer to the bytesrc's internal buffer, storing in *len how much -// data is available. The caller must own refs on the given region. The -// returned buffer is valid for as long as the region remains ref'd. -// -// TODO: if more data is available than the caller has ref'd is it ok for the -// caller to read *len bytes? -INLINE const char *upb_bytesrc_getptr(upb_bytesrc *src, uint64_t ofs, - size_t *len) { +// data is available. The given offset must not be discarded. The returned +// buffer is valid for as long as its bytes are not discarded (in the case that +// part of the returned buffer is discarded, only the non-discarded bytes +// remain valid). +INLINE const char *upb_bytesrc_getptr(const upb_bytesrc *src, uint64_t ofs, + uint32_t *len) { return src->vtbl->getptr(src, ofs, len); } -// Gives the caller a ref on the given region. The caller must know that the -// given region is already ref'd (for example, inside a upb_handlers callback -// that receives a upb_strref, the region is guaranteed to be ref'd -- this -// function allows that handler to take its own ref). 
-INLINE void upb_bytesrc_refregion(upb_bytesrc *src, uint64_t ofs, size_t len) { - src->vtbl->refregion(src, ofs, len); -} +// TODO: Add if/when there is a demonstrated need: +// +// // When the caller pins a region (which must not be already discarded), it +// // is guaranteed that the region will not be discarded (nor will the bytesrc +// // be destroyed) until the region is unpinned. However, not all bytesrc's +// // support pinning; a false return indicates that a pin was not possible. +// INLINE bool upb_bytesrc_pin(upb_bytesrc *src, uint64_t ofs, uint32_t len) { +// return src->vtbl->refregion(src, ofs, len); +// } +// +// // Releases some number of pinned bytes from the beginning of a pinned +// // region (which may be fewer than the total number of bytes pinned). +// INLINE void upb_bytesrc_unpin(upb_bytesrc *src, uint64_t ofs, uint32_t len, +// uint32_t bytes_to_release) { +// src->vtbl->unpin(src, ofs, len); +// } +// +// Adding pinning support would also involve adding a "pin_ofs" parameter to +// upb_bytesrc_fetch, so that the fetch can extend an already-pinned region. -// Releases a ref on the given region, which the caller must have previously -// ref'd. -INLINE void upb_bytesrc_unrefregion(upb_bytesrc *src, uint64_t ofs, size_t len) { - src->vtbl->unrefregion(src, ofs, len); -} -// Attempts to ref the bytesrc itself, returning false if this bytesrc is -// not ref-able. -INLINE bool upb_bytesrc_tryref(upb_bytesrc *src) { - if (src->vtbl->ref) { - src->vtbl->ref(src); - return true; - } else { - return false; - } -} +/* upb_byteregion *************************************************************/ -// Unref's the bytesrc itself. May only be called when upb_bytesrc_tryref() -// has previously returned true. 
-INLINE void upb_bytesrc_unref(upb_bytesrc *src) { - assert(src->vtbl->unref); - src->vtbl->unref(src); -} +#define UPB_NONDELIMITED (0xffffffffffffffffULL) +typedef struct _upb_byteregion { + uint64_t start; + uint64_t discard; + uint64_t fetch; + uint64_t end; // UPB_NONDELIMITED if nondelimited. + upb_bytesrc *bytesrc; + bool toplevel; // If true, discards hit the underlying bytesrc. +} upb_byteregion; + +// Initializes a byteregion. Its initial value will be empty. No methods may +// be called on an empty byteregion except upb_byteregion_reset(). +void upb_byteregion_init(upb_byteregion *r); +void upb_byteregion_uninit(upb_byteregion *r); + +// Accessors for the region's bounds -- the meaning of these is described in the +// diagram above. +INLINE uint64_t upb_byteregion_startofs(const upb_byteregion *r) { + return r->start; +} +INLINE uint64_t upb_byteregion_discardofs(const upb_byteregion *r) { + return r->discard; +} +INLINE uint64_t upb_byteregion_fetchofs(const upb_byteregion *r) { + return r->fetch; +} +INLINE uint64_t upb_byteregion_endofs(const upb_byteregion *r) { + return r->end; +} -/* upb_strref *****************************************************************/ +// Returns how many bytes are fetched and available for reading starting +// from offset "o". +INLINE uint64_t upb_byteregion_available(const upb_byteregion *r, uint64_t o) { + assert(o >= upb_byteregion_discardofs(r)); + assert(o <= r->fetch); // Could relax this. + return r->fetch - o; +} -// The structure we pass to upb_handlers for a string value. -typedef struct _upb_strref { - // Pointer to the string data. NULL if the string spans multiple input - // buffers (in which case upb_bytesrc_getptr() must be called to obtain - // the actual pointers). - const char *ptr; +// Returns the total number of bytes remaining after offset "o", or +// UPB_NONDELIMITED if the byteregion is non-delimited.
+INLINE uint64_t upb_byteregion_remaining(const upb_byteregion *r, uint64_t o) { + return r->end == UPB_NONDELIMITED ? UPB_NONDELIMITED : r->end - o; +} - // Total length of the string. - uint32_t len; +INLINE uint64_t upb_byteregion_len(const upb_byteregion *r) { + return upb_byteregion_remaining(r, r->start); +} - // Offset in the bytesrc that represents the beginning of this string. - uint32_t stream_offset; +// Sets the value of this byteregion to be a subset of the given byteregion's +// data. The caller is responsible for releasing this region before the src +// region is released (unless the region is first pinned, if pinning support is +// added; see below). +void upb_byteregion_reset(upb_byteregion *r, const upb_byteregion *src, + uint64_t ofs, uint64_t len); +void upb_byteregion_release(upb_byteregion *r); + +// Attempts to fetch more data, extending the fetched range of this byteregion. +// Returns true if the fetched region was extended by at least one byte, false +// on EOF or error (see *s for details). +bool upb_byteregion_fetch(upb_byteregion *r, upb_status *s); + +// Fetches all remaining data for "r", returning false if the operation failed +// (see "*s" for details). May only be used on delimited byteregions. +INLINE bool upb_byteregion_fetchall(upb_byteregion *r, upb_status *s) { + assert(upb_byteregion_len(r) != UPB_NONDELIMITED); + while (upb_byteregion_fetch(r, s)) ; // Empty body. + return upb_eof(s); +} - // Bytesrc from which this string data comes. May be NULL if ptr is set. If - // non-NULL, the bytesrc is only guaranteed to be alive from inside the - // callback; however if the handler knows more about its type and how to - // prolong its life, it may do so. - upb_bytesrc *bytesrc; +// Discards bytes from the byteregion up until ofs (which must be greater than +// or equal to upb_byteregion_discardofs()).
It is valid to discard bytes that +// have not been fetched (such bytes will never be fetched) but it is an error +// to discard past the end of a delimited byteregion. +INLINE void upb_byteregion_discard(upb_byteregion *r, uint64_t ofs) { + assert(ofs >= upb_byteregion_discardofs(r)); + assert(ofs <= upb_byteregion_endofs(r)); + r->discard = ofs; + if (r->toplevel) upb_bytesrc_discard(r->bytesrc, ofs); +} - // Possibly add optional members here like start_line, start_column, etc. -} upb_strref; +// Copies "len" bytes of data into "dst", starting at ofs. The specified +// region must be available. +INLINE void upb_byteregion_copy(const upb_byteregion *r, uint64_t ofs, + uint32_t len, char *dst) { + assert(ofs >= upb_byteregion_discardofs(r)); + assert(len <= upb_byteregion_available(r, ofs)); + upb_bytesrc_copy(r->bytesrc, ofs, len, dst); +} -// Copies the contents of the strref into a newly-allocated, NULL-terminated -// string. -char *upb_strref_dup(const struct _upb_strref *r); +// Copies all bytes from the byteregion into dst. Requires that the entire +// byteregion is fetched and that none has been discarded. +INLINE void upb_byteregion_copyall(const upb_byteregion *r, char *dst) { + assert(r->start == r->discard && r->end == r->fetch); + upb_byteregion_copy(r, r->start, upb_byteregion_len(r), dst); +} -INLINE void upb_strref_read(const struct _upb_strref *r, char *buf) { - if (r->ptr) { - memcpy(buf, r->ptr, r->len); - } else { - assert(r->bytesrc); - upb_bytesrc_read(r->bytesrc, r->stream_offset, r->len, buf); - } +// Returns a pointer to the internal buffer for the byteregion starting at +// offset "ofs." Stores the number of bytes available in this buffer in *len. +// The returned buffer is invalidated when the byteregion is reset or released, +// or when the bytes are discarded. If the byteregion is not currently pinned, +// the pointer is only valid for the lifetime of the parent byteregion. 
+INLINE const char *upb_byteregion_getptr(const upb_byteregion *r, + uint64_t ofs, uint32_t *len) { + assert(ofs >= upb_byteregion_discardofs(r)); + const char *ret = upb_bytesrc_getptr(r->bytesrc, ofs, len); + *len = UPB_MIN(*len, upb_byteregion_available(r, ofs)); + return ret; } -// Dynamically allocates a upb_strref object whose contents are the given -// string. The given string data is copied into the strref, which makes these -// functions unsuitable for tight loops (in those cases a strref should be made -// to point to existing string data). -upb_strref *upb_strref_new(const char *str); -upb_strref *upb_strref_newl(const void *str, size_t len); -void upb_strref_free(upb_strref *ref); +// TODO: add if/when there is a demonstrated need. +// +// // Pins this byteregion's bytes in memory, allowing it to outlive its parent +// // byteregion. Normally a byteregion may only be used while its parent is +// // still valid, but a pinned byteregion may continue to be used until it is +// // reset or released. A byteregion must be fully fetched to be pinned +// // (this implies that the byteregion must be delimited). +// // +// // In some cases this operation may cause the input data to be copied. +// // +// // void upb_byteregion_pin(upb_byteregion *r); + +// Convenience functions for creating and destroying a byteregion with a simple +// string as its data. These are relatively inefficient compared with creating +// your own bytesrc (they call malloc() and copy the string data) so should not +// be used on any critical path. +// +// The string data in the returned region is guaranteed to be contiguous and +// NULL-terminated. +upb_byteregion *upb_byteregion_new(const void *str); +upb_byteregion *upb_byteregion_newl(const void *str, uint32_t len); +// May *only* be called on a byteregion created with upb_byteregion_new[l]()! +void upb_byteregion_free(upb_byteregion *r); + +// Copies the contents of the byteregion into a newly-allocated, NULL-terminated +// string. 
Requires that the byteregion is fully fetched. +char *upb_byteregion_strdup(const upb_byteregion *r); /* upb_bytesink ***************************************************************/ @@ -279,6 +414,7 @@ typedef struct { bool should_close; upb_stdio_buf **bufs; uint32_t nbuf, szbuf; + upb_byteregion byteregion; } upb_stdio; void upb_stdio_init(upb_stdio *stdio); @@ -297,7 +433,7 @@ void upb_stdio_reset(upb_stdio *stdio, FILE *file); void upb_stdio_open(upb_stdio *stdio, const char *filename, const char *mode, upb_status *s); -upb_bytesrc *upb_stdio_bytesrc(upb_stdio *stdio); +upb_byteregion *upb_stdio_allbytes(upb_stdio *stdio); upb_bytesink *upb_stdio_bytesink(upb_stdio *stdio); @@ -305,24 +441,26 @@ upb_bytesink *upb_stdio_bytesink(upb_stdio *stdio); // bytesrc/bytesink for a simple contiguous string. -struct _upb_stringsrc { +typedef struct { upb_bytesrc bytesrc; const char *str; - size_t len; -}; -typedef struct _upb_stringsrc upb_stringsrc; + uint32_t len; + upb_byteregion byteregion; +} upb_stringsrc; // Create/free a stringsrc. void upb_stringsrc_init(upb_stringsrc *s); void upb_stringsrc_uninit(upb_stringsrc *s); // Resets the stringsrc to a state where it will vend the given string. The -// stringsrc will take a reference on the string, so the caller need not ensure -// that it outlives the stringsrc. A stringsrc can be reset multiple times. -void upb_stringsrc_reset(upb_stringsrc *s, const char *str, size_t len); +// string data must be valid until the stringsrc is reset again or destroyed. +void upb_stringsrc_reset(upb_stringsrc *s, const char *str, uint32_t len); -// Returns the upb_bytesrc* for this stringsrc. -upb_bytesrc *upb_stringsrc_bytesrc(upb_stringsrc *s); +// Returns the top-level upb_byteregion* for this stringsrc. Invalidated when +// the stringsrc is reset. 
+INLINE upb_byteregion *upb_stringsrc_allbytes(upb_stringsrc *s) { + return &s->byteregion; +} /* upb_stringsink *************************************************************/ @@ -330,7 +468,7 @@ upb_bytesrc *upb_stringsrc_bytesrc(upb_stringsrc *s); struct _upb_stringsink { upb_bytesink bytesink; char *str; - size_t len, size; + uint32_t len, size; }; typedef struct _upb_stringsink upb_stringsink; @@ -340,12 +478,12 @@ void upb_stringsink_uninit(upb_stringsink *s); // Resets the sink's string to "str", which the sink takes ownership of. // "str" may be NULL, which will make the sink allocate a new string. -void upb_stringsink_reset(upb_stringsink *s, char *str, size_t size); +void upb_stringsink_reset(upb_stringsink *s, char *str, uint32_t len); // Releases ownership of the returned string (which is "len" bytes long) and // resets the internal string to be empty again (as if reset were called with // NULL). -const char *upb_stringsink_release(upb_stringsink *s, size_t *len); +const char *upb_stringsink_release(upb_stringsink *s, uint32_t *len); // Returns the upb_bytesink* for this stringsrc. Invalidated by reset above. upb_bytesink *upb_stringsink_bytesink(upb_stringsink *s); -- cgit v1.2.3