Unroll DecodeVarint to speed up int32/int64/uint64 slice decoding. There are two variations here, one is using goto (called cpp) and one using if-else (called java). The cpp is much faster than the original, and slightly faster than java: Original vs. Cpp variant: name old time/op new time/op delta Varint32ArraySmall/Len8-12 282ns ± 3% 280ns ± 1% -0.78% (p=0.005 n=17+16) Varint32ArraySmall/Len16-12 370ns ± 2% 370ns ± 5% +0.04% (p=0.038 n=18+19) Varint32ArraySmall/Len32-12 535ns ± 7% 516ns ± 5% -3.47% (p=0.000 n=20+20) Varint32ArraySmall/Len64-12 801ns ± 2% 763ns ± 3% -4.81% (p=0.000 n=16+19) Varint32ArraySmall/Len128-12 1.31µs ± 1% 1.25µs ± 2% -4.75% (p=0.000 n=16+19) Varint32ArraySmall/Len256-12 2.29µs ± 0% 2.19µs ± 4% -4.70% (p=0.000 n=16+20) Varint32ArraySmall/Len512-12 4.30µs ± 9% 3.94µs ± 3% -8.46% (p=0.000 n=20+20) Varint32ArraySmall/Len1024-12 7.99µs ± 1% 7.41µs ± 2% -7.23% (p=0.000 n=17+19) Varint32ArraySmall/Len2048-12 18.2µs ± 3% 16.9µs ± 2% -7.13% (p=0.000 n=20+19) Varint32ArraySmall/Len4096-12 35.2µs ± 4% 32.4µs ± 0% -7.88% (p=0.000 n=20+17) Varint32ArrayLarge/Len34-12 769ns ± 7% 651ns ± 1% -15.31% (p=0.000 n=20+17) Varint32ArrayLarge/Len68-12 1.23µs ± 2% 1.03µs ± 3% -16.43% (p=0.000 n=17+17) Varint32ArrayLarge/Len136-12 2.18µs ± 4% 1.76µs ± 2% -19.10% (p=0.000 n=20+19) Varint32ArrayLarge/Len272-12 4.03µs ±11% 3.19µs ± 4% -20.68% (p=0.000 n=20+20) Varint32ArrayLarge/Len544-12 7.37µs ± 5% 5.81µs ± 0% -21.22% (p=0.000 n=18+16) Varint32ArrayLarge/Len1088-12 13.5µs ± 4% 10.6µs ± 0% -21.53% (p=0.000 n=20+17) Varint32ArrayLarge/Len2176-12 26.5µs ± 1% 21.0µs ± 2% -20.74% (p=0.000 n=16+17) Varint32ArrayLarge/Len4352-12 56.7µs ± 5% 45.1µs ± 3% -20.35% (p=0.000 n=20+20) Varint32ArrayLarge/Len8704-12 109µs ± 3% 88µs ± 6% -19.75% (p=0.000 n=20+20) Varint32ArrayLarge/Len17408-12 209µs ± 0% 165µs ± 0% -21.17% (p=0.000 n=16+17) Varint64ArraySmall/Len8-12 303ns ± 1% 304ns ± 2% ~ (p=0.205 n=16+17) Varint64ArraySmall/Len16-12 412ns ± 5% 414ns ±13% ~ (p=0.236 n=20+19) Varint64ArraySmall/Len32-12 593ns ± 7% 568ns ± 3% -4.15% (p=0.000 n=19+17) Varint64ArraySmall/Len64-12 921ns ± 3% 874ns ± 3% -5.10% (p=0.000 n=20+20) Varint64ArraySmall/Len128-12 1.54µs ± 0% 1.48µs ± 2% -3.76% (p=0.000 n=20+20) Varint64ArraySmall/Len256-12 2.74µs ± 4% 2.61µs ± 3% -4.87% (p=0.000 n=17+20) Varint64ArraySmall/Len512-12 5.05µs ± 0% 4.76µs ± 2% -5.67% (p=0.000 n=18+19) Varint64ArraySmall/Len1024-12 9.59µs ± 0% 9.07µs ± 3% -5.39% (p=0.000 n=17+19) Varint64ArraySmall/Len2048-12 23.7µs ± 3% 22.3µs ± 2% -6.06% (p=0.000 n=20+18) Varint64ArraySmall/Len4096-12 46.9µs ± 1% 45.0µs ± 1% -3.96% (p=0.000 n=17+18) Varint64ArrayLarge/Len164-12 3.48µs ± 4% 2.71µs ± 3% -22.02% (p=0.000 n=19+20) Varint64ArrayLarge/Len328-12 6.59µs ± 4% 4.97µs ± 0% -24.56% (p=0.000 n=20+17) Varint64ArrayLarge/Len656-12 12.4µs ± 1% 9.4µs ± 1% -24.44% (p=0.000 n=18+17) Varint64ArrayLarge/Len1312-12 23.3µs ± 5% 17.3µs ± 3% -25.85% (p=0.000 n=19+20) Varint64ArrayLarge/Len2624-12 51.9µs ± 5% 39.3µs ± 0% -24.40% (p=0.000 n=19+16) Varint64ArrayLarge/Len5248-12 109µs ± 0% 85µs ± 2% -22.00% (p=0.000 n=20+19) Varint64ArrayLarge/Len10496-12 208µs ± 4% 157µs ± 1% -24.36% (p=0.000 n=20+18) Varint64ArrayLarge/Len20992-12 434µs ± 6% 331µs ± 1% -23.68% (p=0.000 n=20+19) Varint64ArrayLarge/Len41984-12 854µs ± 5% 665µs ± 1% -22.11% (p=0.000 n=20+19) Varint64ArrayLarge/Len83968-12 1.74ms ± 2% 1.38ms ± 2% -20.49% (p=0.000 n=19+19) DecodeEmpty-12 212ns ± 5% 202ns ± 1% -5.01% (p=0.000 n=20+16) Java vs Cpp Variant: name old time/op new time/op delta Varint32ArraySmall/Len8-12 284ns ± 3% 280ns ± 1% -1.46% (p=0.000 n=17+16) Varint32ArraySmall/Len16-12 370ns ± 0% 370ns ± 5% -0.09% (p=0.046 n=16+19) Varint32ArraySmall/Len32-12 514ns ± 0% 516ns ± 5% ~ (p=0.147 n=19+20) Varint32ArraySmall/Len64-12 782ns ± 2% 763ns ± 3% -2.46% (p=0.000 n=18+19) Varint32ArraySmall/Len128-12 1.29µs ± 5% 1.25µs ± 2% -3.56% (p=0.000 n=20+19) Varint32ArraySmall/Len256-12 2.21µs ± 1% 2.19µs ± 4% -0.87% (p=0.021 n=17+20) Varint32ArraySmall/Len512-12 4.01µs ± 2% 3.94µs ± 3% -1.79% (p=0.000 n=17+20) Varint32ArraySmall/Len1024-12 7.55µs ± 0% 7.41µs ± 2% -1.82% (p=0.000 n=16+19) Varint32ArraySmall/Len2048-12 17.0µs ± 0% 16.9µs ± 2% -0.75% (p=0.027 n=19+19) Varint32ArraySmall/Len4096-12 33.1µs ± 2% 32.4µs ± 0% -1.88% (p=0.000 n=17+17) Varint32ArrayLarge/Len34-12 673ns ± 1% 651ns ± 1% -3.21% (p=0.000 n=17+17) Varint32ArrayLarge/Len68-12 1.08µs ± 1% 1.03µs ± 3% -4.82% (p=0.000 n=19+17) Varint32ArrayLarge/Len136-12 1.83µs ± 1% 1.76µs ± 2% -3.65% (p=0.000 n=17+19) Varint32ArrayLarge/Len272-12 3.27µs ± 1% 3.19µs ± 4% -2.34% (p=0.000 n=17+20) Varint32ArrayLarge/Len544-12 6.04µs ± 1% 5.81µs ± 0% -3.86% (p=0.000 n=16+16) Varint32ArrayLarge/Len1088-12 11.0µs ± 0% 10.6µs ± 0% -3.78% (p=0.000 n=16+17) Varint32ArrayLarge/Len2176-12 21.7µs ± 1% 21.0µs ± 2% -3.28% (p=0.000 n=17+17) Varint32ArrayLarge/Len4352-12 46.2µs ± 0% 45.1µs ± 3% -2.24% (p=0.000 n=20+20) Varint32ArrayLarge/Len8704-12 90.7µs ± 8% 87.8µs ± 6% -3.16% (p=0.000 n=19+20) Varint32ArrayLarge/Len17408-12 171µs ± 1% 165µs ± 0% -3.72% (p=0.000 n=16+17) Varint64ArraySmall/Len8-12 309ns ± 5% 304ns ± 2% -1.61% (p=0.000 n=18+17) Varint64ArraySmall/Len16-12 403ns ± 1% 414ns ±13% ~ (p=0.774 n=20+19) Varint64ArraySmall/Len32-12 569ns ± 0% 568ns ± 3% -0.18% (p=0.012 n=17+17) Varint64ArraySmall/Len64-12 898ns ± 0% 874ns ± 3% -2.64% (p=0.000 n=17+20) Varint64ArraySmall/Len128-12 1.52µs ± 1% 1.48µs ± 2% -2.07% (p=0.000 n=16+20) Varint64ArraySmall/Len256-12 2.65µs ± 0% 2.61µs ± 3% -1.48% (p=0.000 n=20+20) Varint64ArraySmall/Len512-12 4.86µs ± 1% 4.76µs ± 2% -2.03% (p=0.000 n=16+19) Varint64ArraySmall/Len1024-12 9.16µs ± 2% 9.07µs ± 3% -0.97% (p=0.001 n=17+19) Varint64ArraySmall/Len2048-12 22.7µs ± 2% 22.3µs ± 2% -1.89% (p=0.000 n=20+18) Varint64ArraySmall/Len4096-12 45.4µs ± 1% 45.0µs ± 1% -0.84% (p=0.000 n=19+18) Varint64ArrayLarge/Len164-12 2.84µs ± 1% 2.71µs ± 3% -4.39% (p=0.000 n=16+20) Varint64ArrayLarge/Len328-12 5.26µs ± 1% 4.97µs ± 0% -5.36% (p=0.000 n=20+17) Varint64ArrayLarge/Len656-12 10.0µs ± 1% 9.4µs ± 1% -6.03% (p=0.000 n=17+17) Varint64ArrayLarge/Len1312-12 18.1µs ± 1% 17.3µs ± 3% -4.61% (p=0.000 n=16+20) Varint64ArrayLarge/Len2624-12 41.3µs ± 0% 39.3µs ± 0% -4.92% (p=0.000 n=17+16) Varint64ArrayLarge/Len5248-12 88.6µs ± 2% 84.8µs ± 2% -4.33% (p=0.000 n=18+19) Varint64ArrayLarge/Len10496-12 165µs ± 1% 157µs ± 1% -4.41% (p=0.000 n=20+18) Varint64ArrayLarge/Len20992-12 341µs ± 2% 331µs ± 1% -3.00% (p=0.000 n=19+19) Varint64ArrayLarge/Len41984-12 696µs ± 1% 665µs ± 1% -4.46% (p=0.000 n=19+19) Varint64ArrayLarge/Len83968-12 1.43ms ± 1% 1.38ms ± 2% -3.38% (p=0.000 n=17+19) DecodeEmpty-12 205ns ± 3% 202ns ± 1% -1.45% (p=0.000 n=17+16) Both were run with 20 iterations on the decoding benchmark, on a mostly quiet machine with cpu frequency scaling disabled. PiperOrigin-RevId: 135494672
diff --git a/proto/decode.go b/proto/decode.go index a54a8cc..aa20729 100644 --- a/proto/decode.go +++ b/proto/decode.go
@@ -61,7 +61,6 @@ // int32, int64, uint32, uint64, bool, and enum // protocol buffer types. func DecodeVarint(buf []byte) (x uint64, n int) { - // x, n already 0 for shift := uint(0); shift < 64; shift += 7 { if n >= len(buf) { return 0, 0 @@ -78,13 +77,7 @@ return 0, 0 } -// DecodeVarint reads a varint-encoded integer from the Buffer. -// This is the format for the -// int32, int64, uint32, uint64, bool, and enum -// protocol buffer types. -func (p *Buffer) DecodeVarint() (x uint64, err error) { - // x, err already 0 - +func (p *Buffer) decodeVarintSlow() (x uint64, err error) { i := p.index l := len(p.buf) @@ -107,6 +100,107 @@ return } +// DecodeVarint reads a varint-encoded integer from the Buffer. +// This is the format for the +// int32, int64, uint32, uint64, bool, and enum +// protocol buffer types. +func (p *Buffer) DecodeVarint() (x uint64, err error) { + i := p.index + buf := p.buf + + if i >= len(buf) { + return 0, io.ErrUnexpectedEOF + } else if buf[i] < 0x80 { + p.index++ + return uint64(buf[i]), nil + } else if len(buf)-i < 10 { + return p.decodeVarintSlow() + } + + var b uint64 + // we already checked the first byte + x = uint64(buf[i]) - 0x80 + i++ + + b = uint64(buf[i]) + i++ + x += b << 7 + if b&0x80 == 0 { + goto done + } + x -= 0x80 << 7 + + b = uint64(buf[i]) + i++ + x += b << 14 + if b&0x80 == 0 { + goto done + } + x -= 0x80 << 14 + + b = uint64(buf[i]) + i++ + x += b << 21 + if b&0x80 == 0 { + goto done + } + x -= 0x80 << 21 + + b = uint64(buf[i]) + i++ + x += b << 28 + if b&0x80 == 0 { + goto done + } + x -= 0x80 << 28 + + b = uint64(buf[i]) + i++ + x += b << 35 + if b&0x80 == 0 { + goto done + } + x -= 0x80 << 35 + + b = uint64(buf[i]) + i++ + x += b << 42 + if b&0x80 == 0 { + goto done + } + x -= 0x80 << 42 + + b = uint64(buf[i]) + i++ + x += b << 49 + if b&0x80 == 0 { + goto done + } + x -= 0x80 << 49 + + b = uint64(buf[i]) + i++ + x += b << 56 + if b&0x80 == 0 { + goto done + } + x -= 0x80 << 56 + + b = uint64(buf[i]) + i++ + x += b << 63 + if b&0x80 == 0 { + goto done + } + // x -= 0x80 << 63 // Always zero. + + return 0, errOverflow + +done: + p.index = i + return x, nil +} + // DecodeFixed64 reads a 64-bit integer from the Buffer. // This is the format for the // fixed64, sfixed64, and double protocol buffer types.