Unroll DecodeVarint to speed up int32/int64/uint64 slice decoding.
There are two variations here, one is using goto (called cpp) and one using if-else (called java). The cpp is much faster than the original, and slightly faster than java:
Original vs. Cpp variant:
name old time/op new time/op delta
Varint32ArraySmall/Len8-12 282ns ± 3% 280ns ± 1% -0.78% (p=0.005 n=17+16)
Varint32ArraySmall/Len16-12 370ns ± 2% 370ns ± 5% +0.04% (p=0.038 n=18+19)
Varint32ArraySmall/Len32-12 535ns ± 7% 516ns ± 5% -3.47% (p=0.000 n=20+20)
Varint32ArraySmall/Len64-12 801ns ± 2% 763ns ± 3% -4.81% (p=0.000 n=16+19)
Varint32ArraySmall/Len128-12 1.31µs ± 1% 1.25µs ± 2% -4.75% (p=0.000 n=16+19)
Varint32ArraySmall/Len256-12 2.29µs ± 0% 2.19µs ± 4% -4.70% (p=0.000 n=16+20)
Varint32ArraySmall/Len512-12 4.30µs ± 9% 3.94µs ± 3% -8.46% (p=0.000 n=20+20)
Varint32ArraySmall/Len1024-12 7.99µs ± 1% 7.41µs ± 2% -7.23% (p=0.000 n=17+19)
Varint32ArraySmall/Len2048-12 18.2µs ± 3% 16.9µs ± 2% -7.13% (p=0.000 n=20+19)
Varint32ArraySmall/Len4096-12 35.2µs ± 4% 32.4µs ± 0% -7.88% (p=0.000 n=20+17)
Varint32ArrayLarge/Len34-12 769ns ± 7% 651ns ± 1% -15.31% (p=0.000 n=20+17)
Varint32ArrayLarge/Len68-12 1.23µs ± 2% 1.03µs ± 3% -16.43% (p=0.000 n=17+17)
Varint32ArrayLarge/Len136-12 2.18µs ± 4% 1.76µs ± 2% -19.10% (p=0.000 n=20+19)
Varint32ArrayLarge/Len272-12 4.03µs ±11% 3.19µs ± 4% -20.68% (p=0.000 n=20+20)
Varint32ArrayLarge/Len544-12 7.37µs ± 5% 5.81µs ± 0% -21.22% (p=0.000 n=18+16)
Varint32ArrayLarge/Len1088-12 13.5µs ± 4% 10.6µs ± 0% -21.53% (p=0.000 n=20+17)
Varint32ArrayLarge/Len2176-12 26.5µs ± 1% 21.0µs ± 2% -20.74% (p=0.000 n=16+17)
Varint32ArrayLarge/Len4352-12 56.7µs ± 5% 45.1µs ± 3% -20.35% (p=0.000 n=20+20)
Varint32ArrayLarge/Len8704-12 109µs ± 3% 88µs ± 6% -19.75% (p=0.000 n=20+20)
Varint32ArrayLarge/Len17408-12 209µs ± 0% 165µs ± 0% -21.17% (p=0.000 n=16+17)
Varint64ArraySmall/Len8-12 303ns ± 1% 304ns ± 2% ~ (p=0.205 n=16+17)
Varint64ArraySmall/Len16-12 412ns ± 5% 414ns ±13% ~ (p=0.236 n=20+19)
Varint64ArraySmall/Len32-12 593ns ± 7% 568ns ± 3% -4.15% (p=0.000 n=19+17)
Varint64ArraySmall/Len64-12 921ns ± 3% 874ns ± 3% -5.10% (p=0.000 n=20+20)
Varint64ArraySmall/Len128-12 1.54µs ± 0% 1.48µs ± 2% -3.76% (p=0.000 n=20+20)
Varint64ArraySmall/Len256-12 2.74µs ± 4% 2.61µs ± 3% -4.87% (p=0.000 n=17+20)
Varint64ArraySmall/Len512-12 5.05µs ± 0% 4.76µs ± 2% -5.67% (p=0.000 n=18+19)
Varint64ArraySmall/Len1024-12 9.59µs ± 0% 9.07µs ± 3% -5.39% (p=0.000 n=17+19)
Varint64ArraySmall/Len2048-12 23.7µs ± 3% 22.3µs ± 2% -6.06% (p=0.000 n=20+18)
Varint64ArraySmall/Len4096-12 46.9µs ± 1% 45.0µs ± 1% -3.96% (p=0.000 n=17+18)
Varint64ArrayLarge/Len164-12 3.48µs ± 4% 2.71µs ± 3% -22.02% (p=0.000 n=19+20)
Varint64ArrayLarge/Len328-12 6.59µs ± 4% 4.97µs ± 0% -24.56% (p=0.000 n=20+17)
Varint64ArrayLarge/Len656-12 12.4µs ± 1% 9.4µs ± 1% -24.44% (p=0.000 n=18+17)
Varint64ArrayLarge/Len1312-12 23.3µs ± 5% 17.3µs ± 3% -25.85% (p=0.000 n=19+20)
Varint64ArrayLarge/Len2624-12 51.9µs ± 5% 39.3µs ± 0% -24.40% (p=0.000 n=19+16)
Varint64ArrayLarge/Len5248-12 109µs ± 0% 85µs ± 2% -22.00% (p=0.000 n=20+19)
Varint64ArrayLarge/Len10496-12 208µs ± 4% 157µs ± 1% -24.36% (p=0.000 n=20+18)
Varint64ArrayLarge/Len20992-12 434µs ± 6% 331µs ± 1% -23.68% (p=0.000 n=20+19)
Varint64ArrayLarge/Len41984-12 854µs ± 5% 665µs ± 1% -22.11% (p=0.000 n=20+19)
Varint64ArrayLarge/Len83968-12 1.74ms ± 2% 1.38ms ± 2% -20.49% (p=0.000 n=19+19)
DecodeEmpty-12 212ns ± 5% 202ns ± 1% -5.01% (p=0.000 n=20+16)
Java vs Cpp Variant:
name old time/op new time/op delta
Varint32ArraySmall/Len8-12 284ns ± 3% 280ns ± 1% -1.46% (p=0.000 n=17+16)
Varint32ArraySmall/Len16-12 370ns ± 0% 370ns ± 5% -0.09% (p=0.046 n=16+19)
Varint32ArraySmall/Len32-12 514ns ± 0% 516ns ± 5% ~ (p=0.147 n=19+20)
Varint32ArraySmall/Len64-12 782ns ± 2% 763ns ± 3% -2.46% (p=0.000 n=18+19)
Varint32ArraySmall/Len128-12 1.29µs ± 5% 1.25µs ± 2% -3.56% (p=0.000 n=20+19)
Varint32ArraySmall/Len256-12 2.21µs ± 1% 2.19µs ± 4% -0.87% (p=0.021 n=17+20)
Varint32ArraySmall/Len512-12 4.01µs ± 2% 3.94µs ± 3% -1.79% (p=0.000 n=17+20)
Varint32ArraySmall/Len1024-12 7.55µs ± 0% 7.41µs ± 2% -1.82% (p=0.000 n=16+19)
Varint32ArraySmall/Len2048-12 17.0µs ± 0% 16.9µs ± 2% -0.75% (p=0.027 n=19+19)
Varint32ArraySmall/Len4096-12 33.1µs ± 2% 32.4µs ± 0% -1.88% (p=0.000 n=17+17)
Varint32ArrayLarge/Len34-12 673ns ± 1% 651ns ± 1% -3.21% (p=0.000 n=17+17)
Varint32ArrayLarge/Len68-12 1.08µs ± 1% 1.03µs ± 3% -4.82% (p=0.000 n=19+17)
Varint32ArrayLarge/Len136-12 1.83µs ± 1% 1.76µs ± 2% -3.65% (p=0.000 n=17+19)
Varint32ArrayLarge/Len272-12 3.27µs ± 1% 3.19µs ± 4% -2.34% (p=0.000 n=17+20)
Varint32ArrayLarge/Len544-12 6.04µs ± 1% 5.81µs ± 0% -3.86% (p=0.000 n=16+16)
Varint32ArrayLarge/Len1088-12 11.0µs ± 0% 10.6µs ± 0% -3.78% (p=0.000 n=16+17)
Varint32ArrayLarge/Len2176-12 21.7µs ± 1% 21.0µs ± 2% -3.28% (p=0.000 n=17+17)
Varint32ArrayLarge/Len4352-12 46.2µs ± 0% 45.1µs ± 3% -2.24% (p=0.000 n=20+20)
Varint32ArrayLarge/Len8704-12 90.7µs ± 8% 87.8µs ± 6% -3.16% (p=0.000 n=19+20)
Varint32ArrayLarge/Len17408-12 171µs ± 1% 165µs ± 0% -3.72% (p=0.000 n=16+17)
Varint64ArraySmall/Len8-12 309ns ± 5% 304ns ± 2% -1.61% (p=0.000 n=18+17)
Varint64ArraySmall/Len16-12 403ns ± 1% 414ns ±13% ~ (p=0.774 n=20+19)
Varint64ArraySmall/Len32-12 569ns ± 0% 568ns ± 3% -0.18% (p=0.012 n=17+17)
Varint64ArraySmall/Len64-12 898ns ± 0% 874ns ± 3% -2.64% (p=0.000 n=17+20)
Varint64ArraySmall/Len128-12 1.52µs ± 1% 1.48µs ± 2% -2.07% (p=0.000 n=16+20)
Varint64ArraySmall/Len256-12 2.65µs ± 0% 2.61µs ± 3% -1.48% (p=0.000 n=20+20)
Varint64ArraySmall/Len512-12 4.86µs ± 1% 4.76µs ± 2% -2.03% (p=0.000 n=16+19)
Varint64ArraySmall/Len1024-12 9.16µs ± 2% 9.07µs ± 3% -0.97% (p=0.001 n=17+19)
Varint64ArraySmall/Len2048-12 22.7µs ± 2% 22.3µs ± 2% -1.89% (p=0.000 n=20+18)
Varint64ArraySmall/Len4096-12 45.4µs ± 1% 45.0µs ± 1% -0.84% (p=0.000 n=19+18)
Varint64ArrayLarge/Len164-12 2.84µs ± 1% 2.71µs ± 3% -4.39% (p=0.000 n=16+20)
Varint64ArrayLarge/Len328-12 5.26µs ± 1% 4.97µs ± 0% -5.36% (p=0.000 n=20+17)
Varint64ArrayLarge/Len656-12 10.0µs ± 1% 9.4µs ± 1% -6.03% (p=0.000 n=17+17)
Varint64ArrayLarge/Len1312-12 18.1µs ± 1% 17.3µs ± 3% -4.61% (p=0.000 n=16+20)
Varint64ArrayLarge/Len2624-12 41.3µs ± 0% 39.3µs ± 0% -4.92% (p=0.000 n=17+16)
Varint64ArrayLarge/Len5248-12 88.6µs ± 2% 84.8µs ± 2% -4.33% (p=0.000 n=18+19)
Varint64ArrayLarge/Len10496-12 165µs ± 1% 157µs ± 1% -4.41% (p=0.000 n=20+18)
Varint64ArrayLarge/Len20992-12 341µs ± 2% 331µs ± 1% -3.00% (p=0.000 n=19+19)
Varint64ArrayLarge/Len41984-12 696µs ± 1% 665µs ± 1% -4.46% (p=0.000 n=19+19)
Varint64ArrayLarge/Len83968-12 1.43ms ± 1% 1.38ms ± 2% -3.38% (p=0.000 n=17+19)
DecodeEmpty-12 205ns ± 3% 202ns ± 1% -1.45% (p=0.000 n=17+16)
Both were run with 20 iterations on the decoding benchmark, on a mostly quiet machine with cpu frequency scaling disabled.
PiperOrigin-RevId: 135494672
diff --git a/proto/decode.go b/proto/decode.go
index a54a8cc..aa20729 100644
--- a/proto/decode.go
+++ b/proto/decode.go
@@ -61,7 +61,6 @@
// int32, int64, uint32, uint64, bool, and enum
// protocol buffer types.
func DecodeVarint(buf []byte) (x uint64, n int) {
- // x, n already 0
for shift := uint(0); shift < 64; shift += 7 {
if n >= len(buf) {
return 0, 0
@@ -78,13 +77,7 @@
return 0, 0
}
-// DecodeVarint reads a varint-encoded integer from the Buffer.
-// This is the format for the
-// int32, int64, uint32, uint64, bool, and enum
-// protocol buffer types.
-func (p *Buffer) DecodeVarint() (x uint64, err error) {
- // x, err already 0
-
+func (p *Buffer) decodeVarintSlow() (x uint64, err error) {
i := p.index
l := len(p.buf)
@@ -107,6 +100,107 @@
return
}
+// DecodeVarint reads a varint-encoded integer from the Buffer.
+// This is the format for the
+// int32, int64, uint32, uint64, bool, and enum
+// protocol buffer types.
+func (p *Buffer) DecodeVarint() (x uint64, err error) {
+ i := p.index
+ buf := p.buf
+
+ if i >= len(buf) {
+ return 0, io.ErrUnexpectedEOF
+ } else if buf[i] < 0x80 {
+ p.index++
+ return uint64(buf[i]), nil
+ } else if len(buf)-i < 10 {
+ return p.decodeVarintSlow()
+ }
+
+ var b uint64
+ // we already checked the first byte
+ x = uint64(buf[i]) - 0x80
+ i++
+
+ b = uint64(buf[i])
+ i++
+ x += b << 7
+ if b&0x80 == 0 {
+ goto done
+ }
+ x -= 0x80 << 7
+
+ b = uint64(buf[i])
+ i++
+ x += b << 14
+ if b&0x80 == 0 {
+ goto done
+ }
+ x -= 0x80 << 14
+
+ b = uint64(buf[i])
+ i++
+ x += b << 21
+ if b&0x80 == 0 {
+ goto done
+ }
+ x -= 0x80 << 21
+
+ b = uint64(buf[i])
+ i++
+ x += b << 28
+ if b&0x80 == 0 {
+ goto done
+ }
+ x -= 0x80 << 28
+
+ b = uint64(buf[i])
+ i++
+ x += b << 35
+ if b&0x80 == 0 {
+ goto done
+ }
+ x -= 0x80 << 35
+
+ b = uint64(buf[i])
+ i++
+ x += b << 42
+ if b&0x80 == 0 {
+ goto done
+ }
+ x -= 0x80 << 42
+
+ b = uint64(buf[i])
+ i++
+ x += b << 49
+ if b&0x80 == 0 {
+ goto done
+ }
+ x -= 0x80 << 49
+
+ b = uint64(buf[i])
+ i++
+ x += b << 56
+ if b&0x80 == 0 {
+ goto done
+ }
+ x -= 0x80 << 56
+
+ b = uint64(buf[i])
+ i++
+ x += b << 63
+ if b&0x80 == 0 {
+ goto done
+ }
+ // x -= 0x80 << 63 // Always zero.
+
+ return 0, errOverflow
+
+done:
+ p.index = i
+ return x, nil
+}
+
// DecodeFixed64 reads a 64-bit integer from the Buffer.
// This is the format for the
// fixed64, sfixed64, and double protocol buffer types.