From c8d67b2686796b70c946fcd98d72d4c2828b51e8 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Mon, 2 Mar 2009 00:28:44 -0800 Subject: More refactoring of structures. pbstream_internal.h is now where even lower-level parsing functions go. --- Makefile | 7 ++-- pbstream.c | 109 +++++++++++++++++++++++++++++++++++++++++-------------------- pbstream.h | 44 ++++++++++++++++--------- pbstruct.c | 1 + tests.c | 4 +-- 5 files changed, 109 insertions(+), 56 deletions(-) diff --git a/Makefile b/Makefile index b22196c..98bbb06 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,14 @@ .PHONY: all clean -all: pbstream.o tests +all: pbstream.o pbstruct.o tests clean: - rm -f pbstream.o tests + rm -f pbstream.o pbstruct.o tests pbstream.o: pbstream.c pbstream.h gcc -std=c99 -O3 -Wall -o pbstream.o -c pbstream.c +pbstruct.o: pbstruct.c pbstruct.h + gcc -std=c99 -O3 -Wall -o pbstruct.o -c pbstruct.c + tests: tests.c pbstream.c pbstream.h gcc -std=c99 -O3 -Wall -o tests tests.c diff --git a/pbstream.c b/pbstream.c index a181128..46a361a 100644 --- a/pbstream.c +++ b/pbstream.c @@ -8,6 +8,7 @@ #include #include #include "pbstream.h" +#include "pbstream_lowlevel.h" /* Branch prediction hints for GCC. */ #ifdef __GNUC__ @@ -107,26 +108,35 @@ done: static pbstream_status_t get_f_uint32_t(char **buf, uint32_t *val) { - uint8_t *b = (uint8_t*)*buf; -#if __BYTE_ORDER == __LITTLE_ENDIAN - *val = *(uint32_t*)b; /* likely unaligned, TODO: verify performance. */ -#else - *val = b[0] | (b[1] << 8) | (b[2] << 16) | (b[3] << 24); -#endif - *buf = (char*)b + sizeof(uint32_t); + char *b = *buf; +#define SHL(val, bits) ((uint32_t)val << bits) + *val = SHL(b[0], 0) | SHL(b[1], 8) | SHL(b[2], 16) | SHL(b[3], 24); +#undef SHL + *buf += sizeof(uint32_t); + return PBSTREAM_STATUS_OK; +} + +static pbstream_status_t skip_f_uint32_t(char **buf) +{ + *buf += sizeof(uint32_t); return PBSTREAM_STATUS_OK; } static pbstream_status_t get_f_uint64_t(char **buf, uint64_t *val) { - uint8_t *b = (uint8_t*)*buf; -#if __BYTE_ORDER == __LITTLE_ENDIAN - *val = *(uint64_t*)buf; /* likely unaligned, TODO: verify performance. */ -#else - *val = (b[0]) | (b[1] << 8 ) | (b[2] << 16) | (b[3] << 24) | - (b[4] << 32) | (b[5] << 40) | (b[6] << 48) | (b[7] << 56); -#endif - *buf = (char*)b + sizeof(uint64_t); + char *b = *buf; + /* TODO: is this worth 32/64 specializing? */ +#define SHL(val, bits) ((uint64_t)val << bits) + *val = SHL(b[0], 0) | SHL(b[1], 8) | SHL(b[2], 16) | SHL(b[3], 24) | + SHL(b[4], 32) | SHL(b[5], 40) | SHL(b[6], 48) | SHL(b[7], 56); +#undef SHL + *buf += sizeof(uint64_t); + return PBSTREAM_STATUS_OK; +} + +static pbstream_status_t skip_f_uint64_t(char **buf) +{ + *buf += sizeof(uint64_t); return PBSTREAM_STATUS_OK; } @@ -148,7 +158,7 @@ static int64_t zz_decode_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); } #define GET(type, v_or_f, wire_t, val_t, member_name) \ static pbstream_status_t get_ ## type(struct pbstream_parse_state *s, \ char *buf, \ - struct pbstream_value *d) { \ + struct pbstream_tagged_value *d) { \ wire_t tmp; \ char *b = buf; \ CHECK(get_ ## v_or_f ## _ ## wire_t(&b, &tmp)); \ @@ -175,7 +185,7 @@ T(FIXED64, f, uint64_t, uint64_t, uint64) { *d = s; } T(SFIXED32, f, uint32_t, int32_t, int32) { *d = (int32_t)s; } T(SFIXED64, f, uint64_t, int64_t, int64) { *d = (int64_t)s; } T(BOOL, v, uint32_t, bool, _bool) { *d = (bool)s; } -T(ENUM, v, uint32_t, int32_t, _enum) { *d = (int32_t)s; } +T(ENUM, v, uint32_t, int32_t, int32) { *d = (int32_t)s; } #undef WVTOV #undef GET #undef T @@ -188,7 +198,7 @@ static void wvtov_delimited(uint32_t s, struct pbstream_delimited *d, size_t o) /* Use BYTES version for both STRING and BYTES, leave UTF-8 checks to client. */ static pbstream_status_t get_BYTES(struct pbstream_parse_state *s, char *buf, - struct pbstream_value *d) { + struct pbstream_tagged_value *d) { uint32_t tmp; char *b = buf; CHECK(get_v_uint32_t(&b, &tmp)); @@ -199,7 +209,7 @@ static pbstream_status_t get_BYTES(struct pbstream_parse_state *s, char *buf, } static pbstream_status_t get_MESSAGE(struct pbstream_parse_state *s, char *buf, - struct pbstream_value *d) { + struct pbstream_tagged_value *d) { /* We're entering a sub-message. */ uint32_t tmp; char *b = buf; @@ -216,7 +226,7 @@ static pbstream_status_t get_MESSAGE(struct pbstream_parse_state *s, char *buf, struct pbstream_type_info { pbstream_wire_type_t expected_wire_type; pbstream_status_t (*get)(struct pbstream_parse_state *s, char *buf, - struct pbstream_value *d); + struct pbstream_tagged_value *d); }; static struct pbstream_type_info type_info[] = { {PBSTREAM_WIRE_TYPE_64BIT, get_DOUBLE}, @@ -238,7 +248,7 @@ static struct pbstream_type_info type_info[] = { {PBSTREAM_WIRE_TYPE_DELIMITED, get_MESSAGE} }; -static pbstream_status_t parse_tag(char **buf, struct pbstream_tag *tag) +pbstream_status_t parse_tag(char **buf, struct pbstream_tag *tag) { uint32_t tag_int; CHECK(get_v_uint32_t(buf, &tag_int)); @@ -247,20 +257,45 @@ static pbstream_status_t parse_tag(char **buf, struct pbstream_tag *tag) return PBSTREAM_STATUS_OK; } -static pbstream_status_t parse_unknown_value( - char **buf, int buf_offset, struct pbstream_wire_value *wv) +pbstream_status_t parse_wire_value(char **buf, size_t offset, + pbstream_wire_type_t wt, + union pbstream_wire_value *wv) { - switch(wv->type) { + switch(wt) { case PBSTREAM_WIRE_TYPE_VARINT: - CHECK(get_v_uint64_t(buf, &wv->v.varint)); break; + CHECK(get_v_uint64_t(buf, &wv->varint)); break; case PBSTREAM_WIRE_TYPE_64BIT: - CHECK(get_f_uint64_t(buf, &wv->v._64bit)); break; + CHECK(get_f_uint64_t(buf, &wv->_64bit)); break; case PBSTREAM_WIRE_TYPE_32BIT: - CHECK(get_f_uint32_t(buf, &wv->v._32bit)); break; + CHECK(get_f_uint32_t(buf, &wv->_32bit)); break; case PBSTREAM_WIRE_TYPE_DELIMITED: - wv->v.delimited.offset = buf_offset; - CHECK(get_v_uint32_t(buf, &wv->v.delimited.len)); + wv->delimited.offset = offset; + CHECK(get_v_uint32_t(buf, &wv->delimited.len)); + *buf += wv->delimited.len; + break; + case PBSTREAM_WIRE_TYPE_START_GROUP: + case PBSTREAM_WIRE_TYPE_END_GROUP: + return PBSTREAM_ERROR_GROUP; /* deprecated, no plans to support. */ + } + return PBSTREAM_STATUS_OK; +} + +pbstream_status_t skip_wire_value(char **buf, pbstream_wire_type_t wt) +{ + switch(wt) { + case PBSTREAM_WIRE_TYPE_VARINT: + CHECK(skip_v_uint64_t(buf)); break; + case PBSTREAM_WIRE_TYPE_64BIT: + CHECK(skip_f_uint64_t(buf)); break; + case PBSTREAM_WIRE_TYPE_32BIT: + CHECK(skip_f_uint32_t(buf)); break; + case PBSTREAM_WIRE_TYPE_DELIMITED: { + /* Have to get (not skip) the length to skip the bytes. */ + uint32_t len; + CHECK(get_v_uint32_t(buf, &len)); + *buf += len; break; + } case PBSTREAM_WIRE_TYPE_START_GROUP: case PBSTREAM_WIRE_TYPE_END_GROUP: return PBSTREAM_ERROR_GROUP; /* deprecated, no plans to support. */ @@ -268,8 +303,8 @@ static pbstream_status_t parse_unknown_value( return PBSTREAM_STATUS_OK; } -static struct pbstream_field *find_field(struct pbstream_fieldset* fs, - pbstream_field_number_t num) +struct pbstream_field *pbstream_find_field(struct pbstream_fieldset* fs, + pbstream_field_number_t num) { /* TODO: the hashtable part. */ return fs->array[num-1]; @@ -279,10 +314,9 @@ static struct pbstream_field *find_field(struct pbstream_fieldset* fs, pbstream_status_t pbstream_parse_field(struct pbstream_parse_state *s, char *buf, pbstream_field_number_t *fieldnum, - struct pbstream_value *val, - struct pbstream_wire_value *wv) + struct pbstream_tagged_value *val, + struct pbstream_tagged_wire_value *wv) { - char *b = buf; /* Check for end-of-message at the current stack depth. */ if(unlikely(s->offset >= s->top->end_offset)) { /* If the end offset isn't an exact field boundary, the pb is corrupt. */ @@ -293,9 +327,11 @@ pbstream_status_t pbstream_parse_field(struct pbstream_parse_state *s, } struct pbstream_tag tag; + char *b = buf; CHECK(parse_tag(&b, &tag)); s->offset += (b-buf); - struct pbstream_field *fd = find_field(s->top->fieldset, tag.field_number); + struct pbstream_field *fd = pbstream_find_field(s->top->fieldset, + tag.field_number); pbstream_status_t unknown_value_status; if(unlikely(!fd)) { unknown_value_status = PBSTREAM_ERROR_UNKNOWN_VALUE; @@ -314,7 +350,8 @@ pbstream_status_t pbstream_parse_field(struct pbstream_parse_state *s, unknown_value: wv->type = tag.wire_type; - CHECK(parse_unknown_value(&b, s->offset, wv)); + b = buf; + CHECK(parse_wire_value(&b, s->offset, tag.wire_type, &wv->v)); s->offset += (b-buf); return unknown_value_status; } diff --git a/pbstream.h b/pbstream.h index cd6fe3a..8fcca34 100644 --- a/pbstream.h +++ b/pbstream.h @@ -4,9 +4,16 @@ * Copyright (c) 2008 Joshua Haberman. See LICENSE for details. */ +#ifndef PBSTREAM_H_ +#define PBSTREAM_H_ + #include #include +#ifdef __cplusplus +extern "C" { +#endif + /* The maximum that any submessages can be nested. Matches proto2's limit. */ #define PBSTREAM_MAX_STACK 64 @@ -44,9 +51,9 @@ typedef enum pbstream_wire_type { typedef int32_t pbstream_field_number_t; /* A deserialized value as described in a .proto file. */ -struct pbstream_value { +struct pbstream_tagged_value { struct pbstream_field *field; - union { + union pbstream_value { double _double; float _float; int32_t int32; @@ -58,20 +65,14 @@ struct pbstream_value { size_t offset; /* relative to the beginning of the stream. */ uint32_t len; } delimited; - int32_t _enum; } v; }; -/* A tag occurs before each value on-the-wire. */ -struct pbstream_tag { - pbstream_field_number_t field_number; - pbstream_wire_type_t wire_type; -}; - -/* A value as it is encoded on-the-wire */ -struct pbstream_wire_value { +/* A value as it is encoded on-the-wire, before it has been interpreted as + * any particular .proto type. */ +struct pbstream_tagged_wire_value { pbstream_wire_type_t type; - union { + union pbstream_wire_value { uint64_t varint; uint64_t _64bit; struct { @@ -82,14 +83,19 @@ struct pbstream_wire_value { } v; }; -/* Definition of a single field in a message. */ +/* Definition of a single field in a message. Note that this does not include + * nearly all of the information that can be specified about a field in a + * .proto file. For example, we don't even know the field's name. We keep + * only the information necessary to parse the field. */ struct pbstream_field { pbstream_field_number_t field_number; pbstream_type_t type; struct pbstream_fieldset *fieldset; /* if type == MESSAGE */ }; -/* The set of fields corresponding to a message definition. */ +/* A fieldset is a data structure that supports fast lookup of fields by number. + * It is logically a map of {field_number -> struct pbstream_field*}. Fast + * lookup is important, because it is in the critical path of parsing. */ struct pbstream_fieldset { int num_fields; struct pbstream_field *fields; @@ -170,5 +176,11 @@ struct pbstream_parse_state; pbstream_status_t pbstream_parse_field(struct pbstream_parse_state *s, char *buf, pbstream_field_number_t *fieldnum, - struct pbstream_value *val, - struct pbstream_wire_value *wv); + struct pbstream_tagged_value *val, + struct pbstream_tagged_wire_value *wv); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* PBSTREAM_H_ */ diff --git a/pbstruct.c b/pbstruct.c index 97e0a9f..2c52930 100644 --- a/pbstruct.c +++ b/pbstruct.c @@ -8,4 +8,5 @@ #include "pbstruct.h" #define alignof(t) offsetof(struct { char c; t x; }, x) +#define ALIGN_UP(p, t) (alignof(t) + ((p - 1) & ~(alignof(t) - 1))) diff --git a/tests.c b/tests.c index cea6911..6c9d6fe 100644 --- a/tests.c +++ b/tests.c @@ -60,8 +60,8 @@ void test_simple_proto() pbstream_init_parser(&s, &fieldset1); assert(s.offset == 0); pbstream_field_number_t fieldnum; - struct pbstream_value val; - struct pbstream_wire_value wv; + struct pbstream_tagged_value val; + struct pbstream_tagged_wire_value wv; assert(pbstream_parse_field(&s, message1, &fieldnum, &val, &wv) == PBSTREAM_STATUS_OK); assert(val.field->field_number == 1); -- cgit v1.2.3