diff options
Diffstat (limited to 'stream')
-rw-r--r-- | stream/upb_byteio.h | 43 | ||||
-rw-r--r-- | stream/upb_decoder.c | 577 | ||||
-rw-r--r-- | stream/upb_decoder.h | 53 | ||||
-rw-r--r-- | stream/upb_encoder.c | 420 | ||||
-rw-r--r-- | stream/upb_encoder.h | 56 | ||||
-rw-r--r-- | stream/upb_text.c | 121 | ||||
-rw-r--r-- | stream/upb_text.h | 36 |
7 files changed, 1306 insertions, 0 deletions
diff --git a/stream/upb_byteio.h b/stream/upb_byteio.h new file mode 100644 index 0000000..69a28b3 --- /dev/null +++ b/stream/upb_byteio.h @@ -0,0 +1,43 @@ +/* + * upb - a minimalist implementation of protocol buffers. + * + * This file contains upb_bytesrc and upb_bytesink implementations for common + * interfaces like strings, UNIX fds, and FILE*. + * + * Copyright (c) 2009-2010 Joshua Haberman. See LICENSE for details. + */ + +#ifndef UPB_BYTEIO_H +#define UPB_BYTEIO_H + +#include "upb_srcsink.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* upb_stringsrc **************************************************************/ + +struct upb_stringsrc; +typedef struct upb_stringsrc upb_stringsrc; + +// Create/free a stringsrc. +upb_stringsrc *upb_stringsrc_new(); +void upb_stringsrc_free(upb_stringsrc *s); + +// Resets the stringsrc to a state where it will vend the given string. The +// stringsrc will take a reference on the string, so the caller need not ensure +// that it outlives the stringsrc. A stringsrc can be reset multiple times. +void upb_stringsrc_reset(upb_stringsrc *s, upb_string *str); + +// Returns the upb_bytesrc* for this stringsrc. Invalidated by reset above. +upb_bytesrc *upb_stringsrc_bytesrc(); + + +/* upb_fdsrc ******************************************************************/ + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/stream/upb_decoder.c b/stream/upb_decoder.c new file mode 100644 index 0000000..e3fdc49 --- /dev/null +++ b/stream/upb_decoder.c @@ -0,0 +1,577 @@ +/* + * upb - a minimalist implementation of protocol buffers. + * + * Copyright (c) 2008-2009 Joshua Haberman. See LICENSE for details. + */ + +#include "upb_decoder.h" + +#include <inttypes.h> +#include <stddef.h> +#include <stdlib.h> + +#define UPB_GROUP_END_OFFSET UINT32_MAX + +// Returns true if the give wire type and field type combination is valid, +// taking into account both packed and non-packed encodings. 
+static bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) { + return (1 << wt) & upb_types[ft].allowed_wire_types; +} + +// Performs zig-zag decoding, which is used by sint32 and sint64. +static int32_t upb_zzdec_32(uint32_t n) { return (n >> 1) ^ -(int32_t)(n & 1); } +static int64_t upb_zzdec_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); } + + +/* upb_decoder ****************************************************************/ + +// The decoder keeps a stack with one entry per level of recursion. +// upb_decoder_frame is one frame of that stack. +typedef struct { + upb_msgdef *msgdef; + upb_fielddef *field; + upb_strlen_t end_offset; // For groups, -1. +} upb_decoder_frame; + +struct upb_decoder { + upb_src src; // upb_decoder is a upb_src. + + upb_msgdef *toplevel_msgdef; + upb_bytesrc *bytesrc; + + // The buffer of input data. NULL is equivalent to the empty string. + upb_string *buf; + + // Holds residual bytes when fewer than UPB_MAX_ENCODED_SIZE bytes remain. + uint8_t tmpbuf[UPB_MAX_ENCODED_SIZE]; + + // The number of bytes we have yet to consume from "buf" or tmpbuf. This is + // always >= 0 unless we were just reset or are eof. + int32_t buf_bytesleft; + + // The offset within "buf" from where we are currently reading. This can be + // <0 if we are reading some residual bytes from the previous buffer, which + // are stored in tmpbuf and combined with bytes from "buf". + int32_t buf_offset; + + // The overall stream offset of the beginning of "buf". + uint32_t buf_stream_offset; + + // Fielddef for the key we just read. + upb_fielddef *field; + + // Wire type of the key we just read. + upb_wire_type_t wire_type; + + // Delimited length of the string field we are reading. + upb_strlen_t delimited_len; + + upb_strlen_t packed_end_offset; + + // We keep a stack of messages we have recursed into. + upb_decoder_frame *top, *limit, stack[UPB_MAX_NESTING]; +}; + + +/* upb_decoder buffering. 
*****************************************************/ + +static upb_strlen_t upb_decoder_offset(upb_decoder *d) +{ + return d->buf_stream_offset - d->buf_offset; +} + +static bool upb_decoder_nextbuf(upb_decoder *d) +{ + assert(d->buf_bytesleft < UPB_MAX_ENCODED_SIZE); + + // Copy residual bytes to temporary buffer. + if(d->buf_bytesleft > 0) { + memcpy(d->tmpbuf, upb_string_getrobuf(d->buf) + d->buf_offset, + d->buf_bytesleft); + } + + // Recycle old buffer. + if(d->buf) { + d->buf = upb_string_tryrecycle(d->buf); + d->buf_offset -= upb_string_len(d->buf); + d->buf_stream_offset += upb_string_len(d->buf); + } + + // Pull next buffer. + if(upb_bytesrc_get(d->bytesrc, d->buf, UPB_MAX_ENCODED_SIZE)) { + d->buf_bytesleft += upb_string_len(d->buf); + return true; + } else { + // Error or EOF. + if(!upb_bytesrc_eof(d->bytesrc)) { + // Error from bytesrc. + upb_copyerr(&d->src.status, upb_bytesrc_status(d->bytesrc)); + return false; + } else if(d->buf_bytesleft == 0) { + // EOF from bytesrc and we don't have any residual bytes left. + d->src.eof = true; + return false; + } else { + // No more data left from the bytesrc, but we still have residual bytes. + return true; + } + } +} + +static const uint8_t *upb_decoder_getbuf_full(upb_decoder *d, uint32_t *bytes) +{ + if(d->buf_bytesleft < UPB_MAX_ENCODED_SIZE) { + // GCC is currently complaining about use of an uninitialized value if we + // don't set this now. I think this is incorrect, but leaving this in + // to suppress the warning for now. + *bytes = 0; + if(!upb_decoder_nextbuf(d)) return NULL; + } + + assert(d->buf_bytesleft >= UPB_MAX_ENCODED_SIZE); + + if(d->buf_offset >= 0) { + // Common case: the main buffer contains at least UPB_MAX_ENCODED_SIZE + // contiguous bytes, so we can read directly out of it. + *bytes = d->buf_bytesleft; + return (uint8_t*)upb_string_getrobuf(d->buf) + d->buf_offset; + } else { + // We need to accumulate UPB_MAX_ENCODED_SIZE bytes; len is how many we + // have so far. 
+ upb_strlen_t len = -d->buf_offset; + if(d->buf) { + upb_strlen_t to_copy = + UPB_MIN(UPB_MAX_ENCODED_SIZE - len, upb_string_len(d->buf)); + memcpy(d->tmpbuf + len, upb_string_getrobuf(d->buf), to_copy); + len += to_copy; + } + // Pad the buffer out to UPB_MAX_ENCODED_SIZE. + memset(d->tmpbuf + len, 0x80, UPB_MAX_ENCODED_SIZE - len); + *bytes = len; + return d->tmpbuf; + } +} + +// Returns a pointer to a buffer of data that is at least UPB_MAX_ENCODED_SIZE +// bytes long. This buffer contains the next bytes in the stream (even if +// those bytes span multiple buffers). *bytes is set to the number of actual +// stream bytes that are available in the returned buffer. If +// *bytes < UPB_MAX_ENCODED_SIZE, the buffer is padded with 0x80 bytes. +// +// After the data has been read, upb_decoder_consume() should be called to +// indicate how many bytes were consumed. +static const uint8_t *upb_decoder_getbuf(upb_decoder *d, uint32_t *bytes) +{ + if(d->buf_bytesleft >= UPB_MAX_ENCODED_SIZE && d->buf_offset >= 0) { + // Common case: the main buffer contains at least UPB_MAX_ENCODED_SIZE + // contiguous bytes, so we can read directly out of it. + *bytes = d->buf_bytesleft; + return (uint8_t*)upb_string_getrobuf(d->buf) + d->buf_offset; + } else { + return upb_decoder_getbuf_full(d, bytes); + } +} + +static bool upb_decoder_consume(upb_decoder *d, uint32_t bytes) +{ + assert(bytes <= UPB_MAX_ENCODED_SIZE); + d->buf_offset += bytes; + d->buf_bytesleft -= bytes; + if(d->buf_offset < 0) { + // We still have residual bytes we have not consumed. + memmove(d->tmpbuf, d->tmpbuf + bytes, -d->buf_offset); + } + assert(d->buf_bytesleft >= 0); + return true; +} + +static bool upb_decoder_skipbytes(upb_decoder *d, int32_t bytes) +{ + d->buf_offset += bytes; + d->buf_bytesleft -= bytes; + while(d->buf_bytesleft < 0) { + if(!upb_decoder_nextbuf(d)) return false; + } + return true; +} + + +/* Functions to read wire values. 
*********************************************/ + +// Parses remining bytes of a 64-bit varint that has already had its first byte +// parsed. +INLINE bool upb_decoder_readv64(upb_decoder *d, uint32_t *low, uint32_t *high) +{ + upb_strlen_t bytes_available; + const uint8_t *buf = upb_decoder_getbuf(d, &bytes_available); + const uint8_t *start = buf; + if(!buf) return false; + + *high = 0; + uint32_t b; + b = *(buf++); *low = (b & 0x7f) ; if(!(b & 0x80)) goto done; + b = *(buf++); *low |= (b & 0x7f) << 7; if(!(b & 0x80)) goto done; + b = *(buf++); *low |= (b & 0x7f) << 14; if(!(b & 0x80)) goto done; + b = *(buf++); *low |= (b & 0x7f) << 21; if(!(b & 0x80)) goto done; + b = *(buf++); *low |= (b & 0x7f) << 28; + *high = (b & 0x7f) >> 3; if(!(b & 0x80)) goto done; + b = *(buf++); *high |= (b & 0x7f) << 4; if(!(b & 0x80)) goto done; + b = *(buf++); *high |= (b & 0x7f) << 11; if(!(b & 0x80)) goto done; + b = *(buf++); *high |= (b & 0x7f) << 18; if(!(b & 0x80)) goto done; + b = *(buf++); *high |= (b & 0x7f) << 25; if(!(b & 0x80)) goto done; + + if(bytes_available >= 10) { + upb_seterr(&d->src.status, UPB_STATUS_ERROR, "Varint was unterminated " + "after 10 bytes, stream offset: %u", upb_decoder_offset(d)); + } else { + upb_seterr(&d->src.status, UPB_STATUS_ERROR, "Stream ended in the middle " + "of a varint, stream offset: %u", upb_decoder_offset(d)); + } + return false; + +done: + return upb_decoder_consume(d, buf - start); +} + +// Gets a varint -- called when we only need 32 bits of it. Note that a 32-bit +// varint is not a true wire type. +static bool upb_decoder_readv32(upb_decoder *d, uint32_t *val) +{ + uint32_t high; + if(!upb_decoder_readv64(d, val, &high)) return false; + + // We expect the high bits to be zero, except that signed 32-bit values are + // first sign-extended to be wire-compatible with 64 bits, in which case we + // expect the high bits to be all one. 
+ // + // We could perform a slightly more sophisticated check by having the caller + // indicate whether a signed or unsigned value is being read. We could check + // that the high bits are all zeros for unsigned, and properly sign-extended + // for signed. + if(high != 0 && ~high != 0) { + upb_seterr(&d->src.status, UPB_STATUS_ERROR, "Read a 32-bit varint, but " + "the high bits contained data we should not truncate: " + "%ux, stream offset: %u", high, upb_decoder_offset(d)); + return false; + } + return true; +} + +// Gets a fixed-length 32-bit integer (wire type: UPB_WIRE_TYPE_32BIT). Caller +// promises that 4 bytes are available at buf. +static bool upb_decoder_readf32(upb_decoder *d, uint32_t *val) +{ + upb_strlen_t bytes_available; + const uint8_t *buf = upb_decoder_getbuf(d, &bytes_available); + if(!buf) return false; + if(bytes_available < 4) { + upb_seterr(&d->src.status, UPB_STATUS_ERROR, + "Stream ended in the middle of a 32-bit value"); + return false; + } + memcpy(val, buf, 4); + // TODO: byte swap if big-endian. + return upb_decoder_consume(d, 4); +} + +// Gets a fixed-length 64-bit integer (wire type: UPB_WIRE_TYPE_64BIT). Caller +// promises that 8 bytes are available at buf. +static bool upb_decoder_readf64(upb_decoder *d, uint64_t *val) +{ + upb_strlen_t bytes_available; + const uint8_t *buf = upb_decoder_getbuf(d, &bytes_available); + if(!buf) return false; + if(bytes_available < 8) { + upb_seterr(&d->src.status, UPB_STATUS_ERROR, + "Stream ended in the middle of a 64-bit value"); + return false; + } + memcpy(val, buf, 8); + // TODO: byte swap if big-endian. + return upb_decoder_consume(d, 8); +} + +// Returns the length of a varint (wire type: UPB_WIRE_TYPE_VARINT), allowing +// it to be easily skipped. Caller promises that 10 bytes are available at +// "buf". The function will return a maximum of 11 bytes before quitting. 
+static uint8_t upb_decoder_skipv64(upb_decoder *d) +{ + uint32_t bytes_available; + const uint8_t *buf = upb_decoder_getbuf(d, &bytes_available); + if(!buf) return false; + uint8_t i; + for(i = 0; i < 10 && buf[i] & 0x80; i++) + ; // empty loop body. + if(i > 10) { + upb_seterr(&d->src.status, UPB_STATUS_ERROR, "Unterminated varint."); + return false; + } + return upb_decoder_consume(d, i); +} + + +/* upb_src implementation for upb_decoder. ************************************/ + +bool upb_decoder_skipval(upb_decoder *d); + +upb_fielddef *upb_decoder_getdef(upb_decoder *d) +{ + // Detect end-of-submessage. + if(upb_decoder_offset(d) >= d->top->end_offset) { + d->src.eof = true; + return NULL; + } + + // Handles the packed field case. + if(d->field) return d->field; + + uint32_t key = 0; +again: + if(!upb_decoder_readv32(d, &key)) return NULL; + upb_wire_type_t wire_type = key & 0x7; + int32_t field_number = key >> 3; + + if(wire_type == UPB_WIRE_TYPE_DELIMITED) { + // For delimited wire values we parse the length now, since we need it in + // all cases. + if(!upb_decoder_readv32(d, &d->delimited_len)) return NULL; + } else if(wire_type == UPB_WIRE_TYPE_END_GROUP) { + if(d->top->end_offset == UPB_GROUP_END_OFFSET) { + d->src.eof = true; + } else { + upb_seterr(&d->src.status, UPB_STATUS_ERROR, "End group seen but current " + "message is not a group, byte offset: %zd", + upb_decoder_offset(d)); + } + return NULL; + } + + // Look up field by tag number. + upb_fielddef *f = upb_msg_itof(d->top->msgdef, field_number); + + if (!f) { + // Unknown field. If/when the upb_src interface supports reporting + // unknown fields we will implement that here. + upb_decoder_skipval(d); + goto again; + } else if (!upb_check_type(wire_type, f->type)) { + // This is a recoverable error condition. We skip the value but also + // return NULL and report the error. + upb_decoder_skipval(d); + // TODO: better error message. 
+ upb_seterr(&d->src.status, UPB_STATUS_ERROR, "Incorrect wire type.\n"); + return NULL; + } + d->field = f; + d->wire_type = wire_type; + return f; +} + +bool upb_decoder_getval(upb_decoder *d, upb_valueptr val) +{ + switch(upb_types[d->field->type].native_wire_type) { + case UPB_WIRE_TYPE_VARINT: { + uint32_t low, high; + if(!upb_decoder_readv64(d, &low, &high)) return false; + uint64_t u64 = ((uint64_t)high << 32) | low; + if(d->field->type == UPB_TYPE(SINT64)) + *val.int64 = upb_zzdec_64(u64); + else + *val.uint64 = u64; + break; + } + case UPB_WIRE_TYPE_32BIT_VARINT: { + uint32_t u32; + if(!upb_decoder_readv32(d, &u32)) return false; + if(d->field->type == UPB_TYPE(SINT32)) + *val.int32 = upb_zzdec_32(u32); + else + *val.uint32 = u32; + break; + } + case UPB_WIRE_TYPE_64BIT: + if(!upb_decoder_readf64(d, val.uint64)) return false; + break; + case UPB_WIRE_TYPE_32BIT: + if(!upb_decoder_readf32(d, val.uint32)) return false; + break; + default: + upb_seterr(&d->src.status, UPB_STATUS_ERROR, + "Attempted to call getval on a group."); + return false; + } + // For a packed field where we have not reached the end, we leave the field + // in the decoder so we will return it again without parsing a key. + if(d->wire_type != UPB_WIRE_TYPE_DELIMITED || + upb_decoder_offset(d) >= d->packed_end_offset) { + d->field = NULL; + } + return true; +} + +bool upb_decoder_getstr(upb_decoder *d, upb_string *str) { + // A string, bytes, or a length-delimited submessage. The latter isn't + // technically a string, but can be gotten as one to perform lazy parsing. + const int32_t total_len = d->delimited_len; + if (d->buf_offset >= 0 && (int32_t)total_len <= d->buf_bytesleft) { + // The entire string is inside our current buffer, so we can just + // return a substring of the buffer without copying. 
+ upb_string_substr(str, d->buf, + upb_string_len(d->buf) - d->buf_bytesleft, + total_len); + upb_decoder_skipbytes(d, total_len); + } else { + // The string spans buffers, so we must copy from the residual buffer + // (if any bytes are there), then the buffer, and finally from the bytesrc. + uint8_t *ptr = (uint8_t*)upb_string_getrwbuf( + str, UPB_MIN(total_len, d->buf_bytesleft)); + int32_t len = 0; + if(d->buf_offset < 0) { + // Residual bytes we need to copy from tmpbuf. + memcpy(ptr, d->tmpbuf, -d->buf_offset); + len += -d->buf_offset; + } + if(d->buf) { + // Bytes from the buffer. + memcpy(ptr + len, upb_string_getrobuf(d->buf) + d->buf_offset, + upb_string_len(str) - len); + } + upb_decoder_skipbytes(d, upb_string_len(str)); + if(len < total_len) { + // Bytes from the bytesrc. + if(!upb_bytesrc_append(d->bytesrc, str, total_len - len)) { + upb_copyerr(&d->src.status, upb_bytesrc_status(d->bytesrc)); + return false; + } + // Have to advance this since the buffering layer of the decoder will + // never see these bytes. 
+ d->buf_stream_offset += total_len - len; + } + } + d->field = NULL; + return true; +} + +static bool upb_decoder_skipgroup(upb_decoder *d); + +bool upb_decoder_startmsg(upb_decoder *d) { + d->top->field = d->field; + if(++d->top >= d->limit) { + upb_seterr(&d->src.status, UPB_ERROR_MAX_NESTING_EXCEEDED, + "Nesting exceeded maximum (%d levels)\n", + UPB_MAX_NESTING); + return false; + } + upb_decoder_frame *frame = d->top; + frame->msgdef = upb_downcast_msgdef(d->field->def); + if(d->field->type == UPB_TYPE(GROUP)) { + frame->end_offset = UPB_GROUP_END_OFFSET; + } else { + frame->end_offset = upb_decoder_offset(d) + d->delimited_len; + } + return true; +} + +bool upb_decoder_endmsg(upb_decoder *d) { + if(d->top > d->stack) { + --d->top; + if(!d->src.eof) { + if(d->top->field->type == UPB_TYPE(GROUP)) + upb_decoder_skipgroup(d); + else + upb_decoder_skipbytes(d, d->top->end_offset - upb_decoder_offset(d)); + } + d->src.eof = false; + return true; + } else { + return false; + } +} + +bool upb_decoder_skipval(upb_decoder *d) { + upb_strlen_t bytes_to_skip; + switch(d->wire_type) { + case UPB_WIRE_TYPE_VARINT: { + return upb_decoder_skipv64(d); + } + case UPB_WIRE_TYPE_START_GROUP: + if(!upb_decoder_startmsg(d)) return false; + if(!upb_decoder_skipgroup(d)) return false; + if(!upb_decoder_endmsg(d)) return false; + return true; + default: + // Including UPB_WIRE_TYPE_END_GROUP. + assert(false); + upb_seterr(&d->src.status, UPB_STATUS_ERROR, "Tried to skip an end group"); + return false; + case UPB_WIRE_TYPE_64BIT: + bytes_to_skip = 8; + break; + case UPB_WIRE_TYPE_32BIT: + bytes_to_skip = 4; + break; + case UPB_WIRE_TYPE_DELIMITED: + // Works for both string/bytes *and* submessages. + bytes_to_skip = d->delimited_len; + break; + } + return upb_decoder_skipbytes(d, bytes_to_skip); +} + +static bool upb_decoder_skipgroup(upb_decoder *d) +{ + // This will be mututally recursive with upb_decoder_skipval() if the group + // has sub-groups. 
If we wanted to handle EAGAIN in the future, this + // approach would not work; we would need to track the group depth + // explicitly. + while(upb_decoder_getdef(d)) { + if(!upb_decoder_skipval(d)) return false; + } + // If we are at the end of the group like we want to be, then + // upb_decoder_getdef() returned NULL because of eof, not error. + if(!&d->src.eof) return false; + return true; +} + +upb_src_vtable upb_decoder_src_vtbl = { + (upb_src_getdef_fptr)&upb_decoder_getdef, + (upb_src_getval_fptr)&upb_decoder_getval, + (upb_src_skipval_fptr)&upb_decoder_skipval, + (upb_src_startmsg_fptr)&upb_decoder_startmsg, + (upb_src_endmsg_fptr)&upb_decoder_endmsg, +}; + + +/* upb_decoder construction/destruction. **************************************/ + +upb_decoder *upb_decoder_new(upb_msgdef *msgdef) +{ + upb_decoder *d = malloc(sizeof(*d)); + d->toplevel_msgdef = msgdef; + d->limit = &d->stack[UPB_MAX_NESTING]; + d->buf = NULL; + upb_src_init(&d->src, &upb_decoder_src_vtbl); + return d; +} + +void upb_decoder_free(upb_decoder *d) +{ + upb_string_unref(d->buf); + free(d); +} + +void upb_decoder_reset(upb_decoder *d, upb_bytesrc *bytesrc) +{ + upb_string_unref(d->buf); + d->top = d->stack; + d->top->msgdef = d->toplevel_msgdef; + // The top-level message is not delimited (we can keep receiving data for it + // indefinitely), so we set the end offset as high as possible, but not equal + // to UINT32_MAX so it doesn't equal UPB_GROUP_END_OFFSET. + d->top->end_offset = UINT32_MAX - 1; + d->bytesrc = bytesrc; + d->buf = NULL; + d->buf_bytesleft = 0; + d->buf_stream_offset = 0; + d->buf_offset = 0; +} diff --git a/stream/upb_decoder.h b/stream/upb_decoder.h new file mode 100644 index 0000000..dde61fc --- /dev/null +++ b/stream/upb_decoder.h @@ -0,0 +1,53 @@ +/* + * upb - a minimalist implementation of protocol buffers. 
+ * + * upb_decoder implements a high performance, streaming decoder for protobuf + * data that works by implementing upb_src and getting its data from a + * upb_bytesrc. + * + * The decoder does not currently support non-blocking I/O, in the sense that + * if the bytesrc returns UPB_STATUS_TRYAGAIN it is not possible to resume the + * decoder when data becomes available again. Support for this could be added, + * but it would add complexity and perhaps cost efficiency also. + * + * Copyright (c) 2009-2010 Joshua Haberman. See LICENSE for details. + */ + +#ifndef UPB_DECODER_H_ +#define UPB_DECODER_H_ + +#include <stdbool.h> +#include <stdint.h> +#include "upb_def.h" +#include "upb_stream.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* upb_decoder *****************************************************************/ + +// A upb_decoder decodes the binary protocol buffer format, writing the data it +// decodes to a upb_sink. +struct upb_decoder; +typedef struct upb_decoder upb_decoder; + +// Allocates and frees a upb_decoder, respectively. +upb_decoder *upb_decoder_new(upb_msgdef *md); +void upb_decoder_free(upb_decoder *d); + +// Resets the internal state of an already-allocated decoder. This puts it in a +// state where it has not seen any data, and expects the next data to be from +// the beginning of a new protobuf. Parsers must be reset before they can be +// used. A decoder can be reset multiple times. +void upb_decoder_reset(upb_decoder *d, upb_bytesrc *bytesrc); + +// Returns a upb_src pointer by which the decoder can be used. The returned +// upb_src is invalidated by upb_decoder_reset() or upb_decoder_free(). +upb_src *upb_decoder_getsrc(upb_decoder *d); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* UPB_DECODER_H_ */ diff --git a/stream/upb_encoder.c b/stream/upb_encoder.c new file mode 100644 index 0000000..304a423 --- /dev/null +++ b/stream/upb_encoder.c @@ -0,0 +1,420 @@ +/* + * upb - a minimalist implementation of protocol buffers. 
+ * + * Copyright (c) 2009 Joshua Haberman. See LICENSE for details. + */ + +#include "upb_encoder.h" + +#include <stdlib.h> +#include "descriptor.h" + +/* Functions for calculating sizes of wire values. ****************************/ + +static size_t upb_v_uint64_t_size(uint64_t val) { +#ifdef __GNUC__ + int high_bit = 63 - __builtin_clzll(val); // 0-based, undef if val == 0. +#else + int high_bit = 0; + uint64_t tmp = val; + while(tmp >>= 1) high_bit++; +#endif + return val == 0 ? 1 : high_bit / 7 + 1; +} + +static size_t upb_v_int32_t_size(int32_t val) { + // v_uint32's are sign-extended to maintain wire compatibility with int64s. + return upb_v_uint64_t_size((int64_t)val); +} +static size_t upb_v_uint32_t_size(uint32_t val) { + return upb_v_uint64_t_size(val); +} +static size_t upb_f_uint64_t_size(uint64_t val) { + (void)val; // Length is independent of value. + return sizeof(uint64_t); +} +static size_t upb_f_uint32_t_size(uint32_t val) { + (void)val; // Length is independent of value. + return sizeof(uint32_t); +} + + +/* Functions to write wire values. ********************************************/ + +// Since we know in advance the longest that the value could be, we always make +// sure that our buffer is long enough. This saves us from having to perform +// bounds checks. + +// Puts a varint (wire type: UPB_WIRE_TYPE_VARINT). +static uint8_t *upb_put_v_uint64_t(uint8_t *buf, uint64_t val) +{ + do { + uint8_t byte = val & 0x7f; + val >>= 7; + if(val) byte |= 0x80; + *buf++ = byte; + } while(val); + return buf; +} + +// Puts an unsigned 32-bit varint, verbatim. Never uses the high 64 bits. +static uint8_t *upb_put_v_uint32_t(uint8_t *buf, uint32_t val) +{ + return upb_put_v_uint64_t(buf, val); +} + +// Puts a signed 32-bit varint, first sign-extending to 64-bits. We do this to +// maintain wire-compatibility with 64-bit signed integers. 
+static uint8_t *upb_put_v_int32_t(uint8_t *buf, int32_t val) +{ + return upb_put_v_uint64_t(buf, (int64_t)val); +} + +static void upb_put32(uint8_t *buf, uint32_t val) { + buf[0] = val & 0xff; + buf[1] = (val >> 8) & 0xff; + buf[2] = (val >> 16) & 0xff; + buf[3] = (val >> 24); +} + +// Puts a fixed-length 32-bit integer (wire type: UPB_WIRE_TYPE_32BIT). +static uint8_t *upb_put_f_uint32_t(uint8_t *buf, uint32_t val) +{ + uint8_t *uint32_end = buf + sizeof(uint32_t); +#if UPB_UNALIGNED_READS_OK + *(uint32_t*)buf = val; +#else + upb_put32(buf, val); +#endif + return uint32_end; +} + +// Puts a fixed-length 64-bit integer (wire type: UPB_WIRE_TYPE_64BIT). +static uint8_t *upb_put_f_uint64_t(uint8_t *buf, uint64_t val) +{ + uint8_t *uint64_end = buf + sizeof(uint64_t); +#if UPB_UNALIGNED_READS_OK + *(uint64_t*)buf = val; +#else + upb_put32(buf, (uint32_t)val); + upb_put32(buf, (uint32_t)(val >> 32)); +#endif + return uint64_end; +} + +/* Functions to write and calculate sizes for .proto values. ******************/ + +// Performs zig-zag encoding, which is used by sint32 and sint64. +static uint32_t upb_zzenc_32(int32_t n) { return (n << 1) ^ (n >> 31); } +static uint64_t upb_zzenc_64(int64_t n) { return (n << 1) ^ (n >> 63); } + +/* Use macros to define a set of two functions for each .proto type: + * + * // Converts and writes a .proto value into buf. "end" indicates the end + * // of the current available buffer (if the buffer does not contain enough + * // space UPB_STATUS_NEED_MORE_DATA is returned). On success, *outbuf will + * // point one past the data that was written. + * uint8_t *upb_put_INT32(uint8_t *buf, int32_t val); + * + * // Returns the number of bytes required to encode val. + * size_t upb_get_INT32_size(int32_t val); + * + * // Given a .proto value s (source) convert it to a wire value. 
+ * uint32_t upb_vtowv_INT32(int32_t s); + */ + +#define VTOWV(type, wire_t, val_t) \ + static wire_t upb_vtowv_ ## type(val_t s) + +#define PUT(type, v_or_f, wire_t, val_t, member_name) \ + static uint8_t *upb_put_ ## type(uint8_t *buf, val_t val) { \ + wire_t tmp = upb_vtowv_ ## type(val); \ + return upb_put_ ## v_or_f ## _ ## wire_t(buf, tmp); \ + } + +#define T(type, v_or_f, wire_t, val_t, member_name) \ + static size_t upb_get_ ## type ## _size(val_t val) { \ + return upb_ ## v_or_f ## _ ## wire_t ## _size(val); \ + } \ + VTOWV(type, wire_t, val_t); /* prototype for PUT below */ \ + PUT(type, v_or_f, wire_t, val_t, member_name) \ + VTOWV(type, wire_t, val_t) + +T(INT32, v, int32_t, int32_t, int32) { return (uint32_t)s; } +T(INT64, v, uint64_t, int64_t, int64) { return (uint64_t)s; } +T(UINT32, v, uint32_t, uint32_t, uint32) { return s; } +T(UINT64, v, uint64_t, uint64_t, uint64) { return s; } +T(SINT32, v, uint32_t, int32_t, int32) { return upb_zzenc_32(s); } +T(SINT64, v, uint64_t, int64_t, int64) { return upb_zzenc_64(s); } +T(FIXED32, f, uint32_t, uint32_t, uint32) { return s; } +T(FIXED64, f, uint64_t, uint64_t, uint64) { return s; } +T(SFIXED32, f, uint32_t, int32_t, int32) { return (uint32_t)s; } +T(SFIXED64, f, uint64_t, int64_t, int64) { return (uint64_t)s; } +T(BOOL, v, uint32_t, bool, _bool) { return (uint32_t)s; } +T(ENUM, v, uint32_t, int32_t, int32) { return (uint32_t)s; } +T(DOUBLE, f, uint64_t, double, _double) { + upb_value v; + v._double = s; + return v.uint64; +} +T(FLOAT, f, uint32_t, float, _float) { + upb_value v; + v._float = s; + return v.uint32; +} +#undef VTOWV +#undef PUT +#undef T + +static uint8_t *upb_encode_value(uint8_t *buf, upb_field_type_t ft, upb_value v) +{ +#define CASE(t, member_name) \ + case UPB_TYPE(t): return upb_put_ ## t(buf, v.member_name); + switch(ft) { + CASE(DOUBLE, _double) + CASE(FLOAT, _float) + CASE(INT32, int32) + CASE(INT64, int64) + CASE(UINT32, uint32) + CASE(UINT64, uint64) + CASE(SINT32, int32) + 
CASE(SINT64, int64) + CASE(FIXED32, uint32) + CASE(FIXED64, uint64) + CASE(SFIXED32, int32) + CASE(SFIXED64, int64) + CASE(BOOL, _bool) + CASE(ENUM, int32) + default: assert(false); return buf; + } +#undef CASE +} + +static uint32_t _upb_get_value_size(upb_field_type_t ft, upb_value v) +{ +#define CASE(t, member_name) \ + case UPB_TYPE(t): return upb_get_ ## t ## _size(v.member_name); + switch(ft) { + CASE(DOUBLE, _double) + CASE(FLOAT, _float) + CASE(INT32, int32) + CASE(INT64, int64) + CASE(UINT32, uint32) + CASE(UINT64, uint64) + CASE(SINT32, int32) + CASE(SINT64, int64) + CASE(FIXED32, uint32) + CASE(FIXED64, uint64) + CASE(SFIXED32, int32) + CASE(SFIXED64, int64) + CASE(BOOL, _bool) + CASE(ENUM, int32) + default: assert(false); return 0; + } +#undef CASE +} + +static uint8_t *_upb_put_tag(uint8_t *buf, upb_field_number_t num, + upb_wire_type_t wt) +{ + return upb_put_UINT32(buf, wt | (num << 3)); +} + +static uint32_t _upb_get_tag_size(upb_field_number_t num) +{ + return upb_get_UINT32_size(num << 3); +} + + +/* upb_sizebuilder ************************************************************/ + +struct upb_sizebuilder { + // Accumulating size for the current level. + uint32_t size; + + // Stack of sizes for our current nesting. + uint32_t stack[UPB_MAX_NESTING], *top; + + // Vector of sizes. + uint32_t *sizes; + int sizes_len; + int sizes_size; + + upb_status status; +}; + +// upb_sink callbacks. +static upb_sink_status _upb_sizebuilder_valuecb(upb_sink *sink, upb_fielddef *f, + upb_value val, + upb_status *status) +{ + (void)status; + upb_sizebuilder *sb = (upb_sizebuilder*)sink; + uint32_t size = 0; + size += _upb_get_tag_size(f->number); + size += _upb_get_value_size(f->type, val); + sb->size += size; + return UPB_SINK_CONTINUE; +} + +static upb_sink_status _upb_sizebuilder_strcb(upb_sink *sink, upb_fielddef *f, + upb_strptr str, + int32_t start, uint32_t end, + upb_status *status) +{ + (void)status; + (void)str; // String data itself is not used. 
+ upb_sizebuilder *sb = (upb_sizebuilder*)sink; + if(start >= 0) { + uint32_t size = 0; + size += _upb_get_tag_size(f->number); + size += upb_get_UINT32_size(end - start); + sb->size += size; + } + return UPB_SINK_CONTINUE; +} + +static upb_sink_status _upb_sizebuilder_startcb(upb_sink *sink, upb_fielddef *f, + upb_status *status) +{ + (void)status; + (void)f; // Unused (we calculate tag size and delimiter in endcb). + upb_sizebuilder *sb = (upb_sizebuilder*)sink; + if(f->type == UPB_TYPE(MESSAGE)) { + *sb->top = sb->size; + sb->top++; + sb->size = 0; + } else { + assert(f->type == UPB_TYPE(GROUP)); + sb->size += _upb_get_tag_size(f->number); + } + return UPB_SINK_CONTINUE; +} + +static upb_sink_status _upb_sizebuilder_endcb(upb_sink *sink, upb_fielddef *f, + upb_status *status) +{ + (void)status; + upb_sizebuilder *sb = (upb_sizebuilder*)sink; + if(f->type == UPB_TYPE(MESSAGE)) { + sb->top--; + if(sb->sizes_len == sb->sizes_size) { + sb->sizes_size *= 2; + sb->sizes = realloc(sb->sizes, sb->sizes_size * sizeof(*sb->sizes)); + } + uint32_t child_size = sb->size; + uint32_t parent_size = *sb->top; + sb->sizes[sb->sizes_len++] = child_size; + // The size according to the parent includes the tag size and delimiter of + // the submessage. + parent_size += upb_get_UINT32_size(child_size); + parent_size += _upb_get_tag_size(f->number); + // Include size accumulated in parent before child began. + sb->size = child_size + parent_size; + } else { + assert(f->type == UPB_TYPE(GROUP)); + // As an optimization, we could just add this number twice in startcb, to + // avoid having to recalculate it. 
+ sb->size += _upb_get_tag_size(f->number); + } + return UPB_SINK_CONTINUE; +} + +upb_sink_callbacks _upb_sizebuilder_sink_vtbl = { + _upb_sizebuilder_valuecb, + _upb_sizebuilder_strcb, + _upb_sizebuilder_startcb, + _upb_sizebuilder_endcb +}; + + +/* upb_sink callbacks *********************************************************/ + +struct upb_encoder { + upb_sink base; + //upb_bytesink *bytesink; + uint32_t *sizes; + int size_offset; +}; + + +// Within one callback we may need to encode up to two separate values. +#define UPB_ENCODER_BUFSIZE (UPB_MAX_ENCODED_SIZE * 2) + +static upb_sink_status _upb_encoder_push_buf(upb_encoder *s, const uint8_t *buf, + size_t len, upb_status *status) +{ + // TODO: conjure a upb_strptr that points to buf. + //upb_strptr ptr; + (void)s; + (void)buf; + (void)status; + size_t written = 5;// = upb_bytesink_onbytes(s->bytesink, ptr); + if(written < len) { + // TODO: mark to skip "written" bytes next time. + return UPB_SINK_STOP; + } else { + return UPB_SINK_CONTINUE; + } +} + +static upb_sink_status _upb_encoder_valuecb(upb_sink *sink, upb_fielddef *f, + upb_value val, upb_status *status) +{ + upb_encoder *s = (upb_encoder*)sink; + uint8_t buf[UPB_ENCODER_BUFSIZE], *ptr = buf; + upb_wire_type_t wt = upb_types[f->type].expected_wire_type; + // TODO: handle packed encoding. + ptr = _upb_put_tag(ptr, f->number, wt); + ptr = upb_encode_value(ptr, f->type, val); + return _upb_encoder_push_buf(s, buf, ptr - buf, status); +} + +static upb_sink_status _upb_encoder_strcb(upb_sink *sink, upb_fielddef *f, + upb_strptr str, + int32_t start, uint32_t end, + upb_status *status) +{ + upb_encoder *s = (upb_encoder*)sink; + uint8_t buf[UPB_ENCODER_BUFSIZE], *ptr = buf; + if(start >= 0) { + ptr = _upb_put_tag(ptr, f->number, UPB_WIRE_TYPE_DELIMITED); + ptr = upb_put_UINT32(ptr, end - start); + } + // TODO: properly handle partially consumed strings and partially supplied + // strings. 
+ _upb_encoder_push_buf(s, buf, ptr - buf, status); + return _upb_encoder_push_buf(s, (uint8_t*)upb_string_getrobuf(str), end - start, status); +} + +static upb_sink_status _upb_encoder_startcb(upb_sink *sink, upb_fielddef *f, + upb_status *status) +{ + upb_encoder *s = (upb_encoder*)sink; + uint8_t buf[UPB_ENCODER_BUFSIZE], *ptr = buf; + if(f->type == UPB_TYPE(GROUP)) { + ptr = _upb_put_tag(ptr, f->number, UPB_WIRE_TYPE_START_GROUP); + } else { + ptr = _upb_put_tag(ptr, f->number, UPB_WIRE_TYPE_DELIMITED); + ptr = upb_put_UINT32(ptr, s->sizes[--s->size_offset]); + } + return _upb_encoder_push_buf(s, buf, ptr - buf, status); +} + +static upb_sink_status _upb_encoder_endcb(upb_sink *sink, upb_fielddef *f, + upb_status *status) +{ + upb_encoder *s = (upb_encoder*)sink; + uint8_t buf[UPB_ENCODER_BUFSIZE], *ptr = buf; + if(f->type != UPB_TYPE(GROUP)) return UPB_SINK_CONTINUE; + ptr = _upb_put_tag(ptr, f->number, UPB_WIRE_TYPE_END_GROUP); + return _upb_encoder_push_buf(s, buf, ptr - buf, status); +} + +upb_sink_callbacks _upb_encoder_sink_vtbl = { + _upb_encoder_valuecb, + _upb_encoder_strcb, + _upb_encoder_startcb, + _upb_encoder_endcb +}; + diff --git a/stream/upb_encoder.h b/stream/upb_encoder.h new file mode 100644 index 0000000..e879b0b --- /dev/null +++ b/stream/upb_encoder.h @@ -0,0 +1,56 @@ +/* + * upb - a minimalist implementation of protocol buffers. + * + * Implements a upb_sink that writes protobuf data to the binary wire format. + * + * For messages that have any submessages, the encoder needs a buffer + * containing the submessage sizes, so they can be properly written at the + * front of each message. Note that groups do *not* have this requirement. + * + * Copyright (c) 2009-2010 Joshua Haberman. See LICENSE for details. 
+ */ + +#ifndef UPB_ENCODER_H_ +#define UPB_ENCODER_H_ + +#include "upb.h" +#include "upb_srcsink.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* upb_encoder ****************************************************************/ + +// A upb_encoder is a upb_sink that emits data to a upb_bytesink in the protocol +// buffer binary wire format. +struct upb_encoder; +typedef struct upb_encoder upb_encoder; + +upb_encoder *upb_encoder_new(upb_msgdef *md); +void upb_encoder_free(upb_encoder *e); + +// Resets the given upb_encoder such that it is ready to begin encoding, +// outputting data to "bytesink" (which must live until the encoder is +// reset or destroyed). +void upb_encoder_reset(upb_encoder *e, upb_bytesink *bytesink); + +// Returns the upb_sink to which data can be written. The sink is invalidated +// when the encoder is reset or destroyed. Note that if the client wants to +// encode any length-delimited submessages it must first call +// upb_encoder_buildsizes() below. +upb_sink *upb_encoder_sink(upb_encoder *e); + +// Call prior to pushing any data with embedded submessages. "src" must yield +// exactly the same data as what will next be encoded, but in reverse order. +// The encoder iterates over this data in order to determine the sizes of the +// submessages. If any errors are returned by the upb_src, the status will +// be saved in *status. If the client is sure that the upb_src will not throw +// any errors, "status" may be NULL. +void upb_encoder_buildsizes(upb_encoder *e, upb_src *src, upb_status *status); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* UPB_ENCODER_H_ */ diff --git a/stream/upb_text.c b/stream/upb_text.c new file mode 100644 index 0000000..8662269 --- /dev/null +++ b/stream/upb_text.c @@ -0,0 +1,121 @@ +/* + * upb - a minimalist implementation of protocol buffers. + * + * Copyright (c) 2009 Joshua Haberman. See LICENSE for details.
+ */ + +#include <inttypes.h> +#include "descriptor.h" +#include "upb_text.h" +#include "upb_data.h" + +void upb_text_printval(upb_field_type_t type, upb_value val, FILE *file) +{ +#define CASE(fmtstr, member) fprintf(file, fmtstr, val.member); break; + switch(type) { + case UPB_TYPE(DOUBLE): + CASE("%0.f", _double); + case UPB_TYPE(FLOAT): + CASE("%0.f", _float) + case UPB_TYPE(INT64): + case UPB_TYPE(SFIXED64): + case UPB_TYPE(SINT64): + CASE("%" PRId64, int64) + case UPB_TYPE(UINT64): + case UPB_TYPE(FIXED64): + CASE("%" PRIu64, uint64) + case UPB_TYPE(INT32): + case UPB_TYPE(SFIXED32): + case UPB_TYPE(SINT32): + CASE("%" PRId32, int32) + case UPB_TYPE(UINT32): + case UPB_TYPE(FIXED32): + case UPB_TYPE(ENUM): + CASE("%" PRIu32, uint32); + case UPB_TYPE(BOOL): + CASE("%hhu", _bool); + case UPB_TYPE(STRING): + case UPB_TYPE(BYTES): + /* TODO: escaping. */ + fprintf(file, "\"" UPB_STRFMT "\"", UPB_STRARG(val.str)); break; + } +} + +static void print_indent(upb_text_printer *p, FILE *stream) +{ + if(!p->single_line) + for(int i = 0; i < p->indent_depth; i++) + fprintf(stream, " "); +} + +void upb_text_printfield(upb_text_printer *p, upb_strptr name, + upb_field_type_t valtype, upb_value val, + FILE *stream) +{ + print_indent(p, stream); + fprintf(stream, UPB_STRFMT ":", UPB_STRARG(name)); + upb_text_printval(valtype, val, stream); + if(p->single_line) + fputc(' ', stream); + else + fputc('\n', stream); +} + +void upb_text_push(upb_text_printer *p, upb_strptr submsg_type, FILE *stream) +{ + print_indent(p, stream); + fprintf(stream, UPB_STRFMT " {", UPB_STRARG(submsg_type)); + if(!p->single_line) fputc('\n', stream); + p->indent_depth++; +} + +void upb_text_pop(upb_text_printer *p, FILE *stream) +{ + p->indent_depth--; + print_indent(p, stream); + fprintf(stream, "}\n"); +} + +static void printval(upb_text_printer *printer, upb_value v, upb_fielddef *f, + FILE *stream); + +static void printmsg(upb_text_printer *printer, upb_msg *msg, upb_msgdef *md, + FILE *stream) 
+{ + for(upb_field_count_t i = 0; i < md->num_fields; i++) { + upb_fielddef *f = &md->fields[i]; + if(!upb_msg_has(msg, f)) continue; + upb_value v = upb_msg_get(msg, f); + if(upb_isarray(f)) { + upb_arrayptr arr = v.arr; + for(uint32_t j = 0; j < upb_array_len(arr); j++) { + upb_value elem = upb_array_get(arr, f, j); + printval(printer, elem, f, stream); + } + } else { + printval(printer, v, f, stream); + } + } +} + +static void printval(upb_text_printer *printer, upb_value v, upb_fielddef *f, + FILE *stream) +{ + if(upb_issubmsg(f)) { + upb_text_push(printer, f->name, stream); + printmsg(printer, v.msg, upb_downcast_msgdef(f->def), stream); + upb_text_pop(printer, stream); + } else { + upb_text_printfield(printer, f->name, f->type, v, stream); + } +} + + +void upb_msg_print(upb_msg *msg, upb_msgdef *md, bool single_line, + FILE *stream) +{ + upb_text_printer printer; + upb_text_printer_init(&printer, single_line); + printmsg(&printer, msg, md, stream); +} + diff --git a/stream/upb_text.h b/stream/upb_text.h new file mode 100644 index 0000000..d89c9d6 --- /dev/null +++ b/stream/upb_text.h @@ -0,0 +1,36 @@ +/* + * upb - a minimalist implementation of protocol buffers. + * + * Copyright (c) 2009 Joshua Haberman. See LICENSE for details. + */ + +#ifndef UPB_TEXT_H_ +#define UPB_TEXT_H_ + +#include "upb.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + int indent_depth; + bool single_line; +} upb_text_printer; + +INLINE void upb_text_printer_init(upb_text_printer *p, bool single_line) { + p->indent_depth = 0; + p->single_line = single_line; +} +void upb_text_printval(upb_field_type_t type, upb_value p, FILE *file); +void upb_text_printfield(upb_text_printer *p, upb_strptr name, + upb_field_type_t valtype, upb_value val, FILE *stream); +void upb_text_push(upb_text_printer *p, upb_strptr submsg_type, + FILE *stream); +void upb_text_pop(upb_text_printer *p, FILE *stream); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* UPB_TEXT_H_ */ |