From 85f6cecb80b48d13dd6c0886c07d38eda0a8f1dd Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Sat, 1 Aug 2009 18:02:57 -0700 Subject: Added calculation of sizes for serialization (untested). --- src/upb_msg.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++- src/upb_msg.h | 58 ++++++++++++++++++++++++-- src/upb_parse.h | 27 ++++++------ src/upb_serialize.h | 42 +++++++++++-------- 4 files changed, 208 insertions(+), 36 deletions(-) (limited to 'src') diff --git a/src/upb_msg.c b/src/upb_msg.c index bdebe0d..513c0c5 100644 --- a/src/upb_msg.c +++ b/src/upb_msg.c @@ -9,6 +9,7 @@ #include "descriptor.h" #include "upb_msg.h" #include "upb_parse.h" +#include "upb_serialize.h" /* Rounds p up to the next multiple of t. */ #define ALIGN_UP(p, t) ((p) % (t) == 0 ? (p) : (p) + ((t) - ((p) % (t)))) @@ -248,7 +249,7 @@ void upb_msg_reuse_submsg(void **msg, struct upb_msg *m) if(!*msg) *msg = upb_msgdata_new(m); } -/* Serialization/Deserialization. ********************************************/ +/* Parsing. ******************************************************************/ static upb_field_type_t tag_cb(void *udata, struct upb_tag *tag, void **user_field_desc) @@ -390,6 +391,120 @@ void *upb_alloc_and_parse(struct upb_msg *m, struct upb_string *str, bool byref) } } +/* Serialization. ************************************************************/ + +/* We store the message sizes linearly in post-order (size of parent after sizes + * of children) for a right-to-left traversal of the message tree. Iterating + * over this in reverse gives us a pre-order (size of parent before sizes of + * children) left-to-right traversal, which is what we want for serializing. */ +struct upb_msgsizes { + int len; + int size; + size_t *sizes; +}; + +/* Defined below -- this and get_valuesize are mutually recursive. */ +static size_t get_msgsize(struct upb_msgsizes *sizes, void *data, + struct upb_msg *m); + +/* Returns the size of a value as it will be serialized. 
Does *not* include + * the size of the tag -- that is already accounted for. */ +static size_t get_valuesize(struct upb_msgsizes *sizes, union upb_value_ptr p, + struct upb_msg_field *f, + google_protobuf_FieldDescriptorProto *fd) +{ + switch(f->type) { + default: assert(false); return 0; /* Internal corruption. */ + case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE: { + size_t submsg_size = get_msgsize(sizes, p.msg, f->ref.msg); + return upb_get_INT32_size(submsg_size) + submsg_size; + } + case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_GROUP: { + size_t endgrp_tag_size = upb_get_tag_size(fd->number); + return endgrp_tag_size + get_msgsize(sizes, p.msg, f->ref.msg); + } +#define CASE(type, member) \ + case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_ ## type: \ + return upb_get_ ## type ## _size(*p.member); + CASE(DOUBLE, _double) + CASE(FLOAT, _float) + CASE(INT32, int32) + CASE(INT64, int64) + CASE(UINT32, uint32) + CASE(UINT64, uint64) + CASE(SINT32, int32) + CASE(SINT64, int64) + CASE(FIXED32, uint32) + CASE(FIXED64, uint64) + CASE(SFIXED32, int32) + CASE(SFIXED64, int64) + CASE(BOOL, _bool) + CASE(ENUM, int32) +#undef CASE + } +} + +/* This is mostly just a pure recursive function to calculate the size of a + * message. However it also stores the results of each level of the recursion + * in sizes, because we need all of this intermediate information later. */ +static size_t get_msgsize(struct upb_msgsizes *sizes, void *data, + struct upb_msg *m) +{ + size_t size = 0; + /* We iterate over fields and arrays in reverse order. 
*/ + for(int32_t i = m->num_fields - 1; i >= 0; i--) { + struct upb_msg_field *f = &m->fields[i]; + google_protobuf_FieldDescriptorProto *fd = upb_msg_field_descriptor(f, m); + if(!upb_msg_isset(data, f)) continue; + union upb_value_ptr p = upb_msg_getptr(data, f); + if(upb_isarray(f)) { + for(int32_t j = (*p.arr)->len - 1; j >= 0; j--) { + union upb_value_ptr elem = upb_array_getelementptr((*p.arr), j, f->type); + /* TODO: for packed arrays tag size goes outside the loop. */ + size += upb_get_tag_size(fd->number); + size += get_valuesize(sizes, elem, f, fd); + } + } else { + size += upb_get_tag_size(fd->number); + size += get_valuesize(sizes, p, f, fd); + } + } + /* Resize the 'sizes' array if necessary. FIXME: upb_msgsizes_init() sets size to 0, so doubling never grows it. */ + assert(sizes->len <= sizes->size); + if(sizes->len == sizes->size) { + sizes->size *= 2; + sizes->sizes = realloc(sizes->sizes, sizes->size * sizeof(size_t)); + } + /* Add our size (already added our children, so post-order). */ + sizes->sizes[sizes->len++] = size; + return size; +} + +void upb_msgsizes_read(struct upb_msgsizes *sizes, void *data, struct upb_msg *m) +{ + get_msgsize(sizes, data, m); +} + +/* Initialize/free a upb_msgsizes for the given message. */ +void upb_msgsizes_init(struct upb_msgsizes *sizes) +{ + sizes->len = 0; + sizes->size = 0; + sizes->sizes = NULL; +} + +void upb_msgsizes_free(struct upb_msgsizes *sizes) +{ + free(sizes->sizes); +} + +size_t upb_msgsizes_totalsize(struct upb_msgsizes *sizes) +{ + return sizes->sizes[sizes->len-1]; +} + +/* Comparison. ***************************************************************/ + bool upb_value_eql(union upb_value_ptr p1, union upb_value_ptr p2, upb_field_type_t type) { diff --git a/src/upb_msg.h b/src/upb_msg.h index 4ce94a6..8699fbf 100644 --- a/src/upb_msg.h +++ b/src/upb_msg.h @@ -215,32 +215,39 @@ INLINE bool upb_isarray(struct upb_msg_field *f) { /* "Set" flag reading and writing. 
*******************************************/ +/* Please note that these functions do not perform any memory management or in + * any way ensure that the fields are valid. They *only* test/set/clear a bit + * that indicates whether the field is set or not. */ + +/* Returns the byte offset where we store whether this field is set. */ INLINE size_t upb_isset_offset(uint32_t field_index) { return field_index / 8; } +/* Returns the mask within the appropriate byte that selects the set bit. */ INLINE uint8_t upb_isset_mask(uint32_t field_index) { return 1 << (field_index % 8); } -/* Functions for reading and writing the "set" flags in the msg. Note that - * these do not perform memory management associated with any dynamic memory - * these fields may be referencing. These *only* set and test the flags. */ +/* Sets the set bit for this field in the given message. */ INLINE void upb_msg_set(void *s, struct upb_msg_field *f) { ((char*)s)[upb_isset_offset(f->field_index)] |= upb_isset_mask(f->field_index); } +/* Clears the set bit for this field in the given message. */ INLINE void upb_msg_unset(void *s, struct upb_msg_field *f) { ((char*)s)[upb_isset_offset(f->field_index)] &= ~upb_isset_mask(f->field_index); } +/* Tests whether the given field is set. */ INLINE bool upb_msg_isset(void *s, struct upb_msg_field *f) { return ((char*)s)[upb_isset_offset(f->field_index)] & upb_isset_mask(f->field_index); } +/* Returns true if *all* required fields are set, false otherwise. */ INLINE bool upb_msg_all_required_fields_set(void *s, struct upb_msg *m) { int num_fields = m->num_required_fields; @@ -253,6 +260,7 @@ INLINE bool upb_msg_all_required_fields_set(void *s, struct upb_msg *m) return true; } +/* Clears the set bit for all fields. */ INLINE void upb_msg_clear(void *s, struct upb_msg *m) { memset(s, 0, m->set_flags_bytes); @@ -304,7 +312,7 @@ void upb_msg_reuse_array(struct upb_array **arr, uint32_t size, /* Reuse a submessage of the given type. 
*/ void upb_msg_reuse_submsg(void **msg, struct upb_msg *m); -/* Serialization/Deserialization. ********************************************/ +/* Parsing. ******************************************************************/ /* This is all just a layer on top of the stream-oriented facility in * upb_parse.h. */ @@ -352,6 +360,48 @@ upb_status_t upb_msg_parse(struct upb_msg_parse_state *s, * above. "byref" works as in upb_msg_parse_init(). */ void *upb_alloc_and_parse(struct upb_msg *m, struct upb_string *s, bool byref); +/* Serialization *************************************************************/ + +/* For messages that contain any submessages, we must do a pre-pass on the + * message tree to discover the size of all submessages. This is necessary + * because when serializing, the message length has to precede the message data + * itself. + * + * We can calculate these sizes once and reuse them as long as the message is + * known not to have changed. */ +struct upb_msgsizes; + +/* Initialize/free a upb_msgsizes for the given message. */ +void upb_msgsizes_init(struct upb_msgsizes *sizes); +void upb_msgsizes_free(struct upb_msgsizes *sizes); + +/* Given a previously initialized sizes, recurse over the message and store its + * sizes in 'sizes'. */ +void upb_msgsizes_read(struct upb_msgsizes *sizes, void *data, + struct upb_msg *m); + +/* Returns the total size of the serialized message given in sizes. Must be + * preceded by a call to upb_msgsizes_read. */ +size_t upb_msgsizes_totalsize(struct upb_msgsizes *sizes); + +struct upb_msg_serialize_state; + +/* Initializes the state of serialization. The provided message must not + * change between the upb_msgsizes_read() call that was used to construct + * "sizes" and the serialization being fully completed. 
*/ +void upb_msg_serialize_alloc(struct upb_msg_serialize_state *s); +void upb_msg_serialize_free(struct upb_msg_serialize_state *s); +void upb_msg_serialize_init(struct upb_msg_serialize_state *s, void *data, + struct upb_msg *m, struct upb_msgsizes *sizes); + +/* Serializes the next set of bytes into buf (which has size len). Returns + * UPB_STATUS_OK if serialization is complete, or UPB_STATUS_NEED_MORE_DATA + * if there is more data from the message left to be serialized. + * + * The number of bytes written to buf is returned in *read. This will be + * equal to len unless we finished serializing. */ +upb_status_t upb_msg_serialize(struct upb_msg_serialize_state *s, + void *buf, size_t len, size_t *read); /* Text dump *****************************************************************/ diff --git a/src/upb_parse.h b/src/upb_parse.h index ca18937..193c307 100644 --- a/src/upb_parse.h +++ b/src/upb_parse.h @@ -164,8 +164,7 @@ INLINE upb_status_t upb_get_v_uint32_t(uint8_t *buf, uint8_t *end, { uint64_t val64; UPB_CHECK(upb_get_v_uint64_t(buf, end, &val64, outbuf)); - /* TODO: should we throw an error if any of the high bits in val64 are set? */ - *val = (uint32_t)val64; + *val = (uint32_t)val64; /* Discard the high bits. 
*/ return UPB_STATUS_OK; } @@ -245,18 +244,18 @@ INLINE int64_t upb_zzdec_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); } GET(type, v_or_f, wire_t, val_t, member_name) \ WVTOV(type, wire_t, val_t) -T(INT32, v, uint32_t, int32_t, int32) { return (int32_t)s; } -T(INT64, v, uint64_t, int64_t, int64) { return (int64_t)s; } -T(UINT32, v, uint32_t, uint32_t, uint32) { return s; } -T(UINT64, v, uint64_t, uint64_t, uint64) { return s; } -T(SINT32, v, uint32_t, int32_t, int32) { return upb_zzdec_32(s); } -T(SINT64, v, uint64_t, int64_t, int64) { return upb_zzdec_64(s); } -T(FIXED32, f, uint32_t, uint32_t, uint32) { return s; } -T(FIXED64, f, uint64_t, uint64_t, uint64) { return s; } -T(SFIXED32, f, uint32_t, int32_t, int32) { return (int32_t)s; } -T(SFIXED64, f, uint64_t, int64_t, int64) { return (int64_t)s; } -T(BOOL, v, uint32_t, bool, _bool) { return (bool)s; } -T(ENUM, v, uint32_t, int32_t, int32) { return (int32_t)s; } +T(INT32, v, uint32_t, int32_t, int32) { return (int32_t)s; } +T(INT64, v, uint64_t, int64_t, int64) { return (int64_t)s; } +T(UINT32, v, uint32_t, uint32_t, uint32) { return s; } +T(UINT64, v, uint64_t, uint64_t, uint64) { return s; } +T(SINT32, v, uint32_t, int32_t, int32) { return upb_zzdec_32(s); } +T(SINT64, v, uint64_t, int64_t, int64) { return upb_zzdec_64(s); } +T(FIXED32, f, uint32_t, uint32_t, uint32) { return s; } +T(FIXED64, f, uint64_t, uint64_t, uint64) { return s; } +T(SFIXED32, f, uint32_t, int32_t, int32) { return (int32_t)s; } +T(SFIXED64, f, uint64_t, int64_t, int64) { return (int64_t)s; } +T(BOOL, v, uint32_t, bool, _bool) { return (bool)s; } +T(ENUM, v, uint32_t, int32_t, int32) { return (int32_t)s; } T(DOUBLE, f, uint64_t, double, _double) { union upb_value v; v.uint64 = s; diff --git a/src/upb_serialize.h b/src/upb_serialize.h index b785e0e..ec735c2 100644 --- a/src/upb_serialize.h +++ b/src/upb_serialize.h @@ -44,11 +44,19 @@ INLINE upb_status_t upb_put_v_uint64_t(uint8_t *buf, uint8_t *end, uint64_t val, return 
UPB_STATUS_OK; } -/* Puts a varint -- called when we only have 32 bits of data. */ +/* Puts an unsigned 32-bit varint, verbatim. Never uses the high 32 bits. */ INLINE upb_status_t upb_put_v_uint32_t(uint8_t *buf, uint8_t *end, uint32_t val, uint8_t **outbuf) { - return UPB_STATUS_OK; + return upb_put_v_uint64_t(buf, end, val, outbuf); +} + +/* Puts a signed 32-bit varint, first sign-extending to 64-bits. We do this to + * maintain wire-compatibility with 64-bit signed integers. */ +INLINE upb_status_t upb_put_v_int32_t(uint8_t *buf, uint8_t *end, + int32_t val, uint8_t **outbuf) +{ + return upb_put_v_uint64_t(buf, end, (int64_t)val, outbuf); } INLINE void upb_put32(uint8_t *buf, uint32_t val) { @@ -157,18 +165,18 @@ INLINE uint64_t upb_zzenc_64(int64_t n) { return (n << 1) ^ (n >> 63); } PUT(type, v_or_f, wire_t, val_t, member_name) \ VTOWV(type, wire_t, val_t) -T(INT32, v, uint32_t, int32_t, int32) { return (uint32_t)s; } -T(INT64, v, uint64_t, int64_t, int64) { return (uint64_t)s; } -T(UINT32, v, uint32_t, uint32_t, uint32) { return s; } -T(UINT64, v, uint64_t, uint64_t, uint64) { return s; } -T(SINT32, v, uint32_t, int32_t, int32) { return upb_zzenc_32(s); } -T(SINT64, v, uint64_t, int64_t, int64) { return upb_zzdec_64(s); } -T(FIXED32, f, uint32_t, uint32_t, uint32) { return s; } -T(FIXED64, f, uint64_t, uint64_t, uint64) { return s; } -T(SFIXED32, f, uint32_t, int32_t, int32) { return (uint32_t)s; } -T(SFIXED64, f, uint64_t, int64_t, int64) { return (uint64_t)s; } -T(BOOL, v, uint32_t, bool, _bool) { return (uint32_t)s; } -T(ENUM, v, uint32_t, int32_t, int32) { return (uint32_t)s; } +T(INT32, v, uint32_t, int32_t, int32) { return (uint32_t)s; } +T(INT64, v, uint64_t, int64_t, int64) { return (uint64_t)s; } +T(UINT32, v, uint32_t, uint32_t, uint32) { return s; } +T(UINT64, v, uint64_t, uint64_t, uint64) { return s; } +T(SINT32, v, uint32_t, int32_t, int32) { return upb_zzenc_32(s); } +T(SINT64, v, uint64_t, int64_t, int64) { return upb_zzdec_64(s); } /* FIXME: serializing must zig-zag *encode*; this should be upb_zzenc_64(s). */ 
+T(FIXED32, f, uint32_t, uint32_t, uint32) { return s; } +T(FIXED64, f, uint64_t, uint64_t, uint64) { return s; } +T(SFIXED32, f, uint32_t, int32_t, int32) { return (uint32_t)s; } +T(SFIXED64, f, uint64_t, int64_t, int64) { return (uint64_t)s; } +T(BOOL, v, uint32_t, bool, _bool) { return (uint32_t)s; } +T(ENUM, v, uint32_t, int32_t, int32) { return (uint32_t)s; } T(DOUBLE, f, uint64_t, double, _double) { union upb_value v; v._double = s; @@ -183,9 +191,9 @@ T(FLOAT, f, uint32_t, float, _float) { #undef PUT #undef T -/* Functions to get sizes of serialized values without serializing. ***********/ - - +size_t upb_get_tag_size(uint32_t fieldnum) { + return upb_v_uint64_t_size((uint64_t)fieldnum << 3); +} #ifdef __cplusplus } /* extern "C" */ -- cgit v1.2.3