Unroll DecodeVarint to speed up int32/int64/uint64 slice decoding.

There are two variations here, one is using goto (called cpp) and one using if-else (called java).  The cpp is much faster than the original, and slightly faster than java:

Original vs. Cpp variant:
name                            old time/op  new time/op  delta
Varint32ArraySmall/Len8-12       282ns ± 3%   280ns ± 1%   -0.78%  (p=0.005 n=17+16)
Varint32ArraySmall/Len16-12      370ns ± 2%   370ns ± 5%   +0.04%  (p=0.038 n=18+19)
Varint32ArraySmall/Len32-12      535ns ± 7%   516ns ± 5%   -3.47%  (p=0.000 n=20+20)
Varint32ArraySmall/Len64-12      801ns ± 2%   763ns ± 3%   -4.81%  (p=0.000 n=16+19)
Varint32ArraySmall/Len128-12    1.31µs ± 1%  1.25µs ± 2%   -4.75%  (p=0.000 n=16+19)
Varint32ArraySmall/Len256-12    2.29µs ± 0%  2.19µs ± 4%   -4.70%  (p=0.000 n=16+20)
Varint32ArraySmall/Len512-12    4.30µs ± 9%  3.94µs ± 3%   -8.46%  (p=0.000 n=20+20)
Varint32ArraySmall/Len1024-12   7.99µs ± 1%  7.41µs ± 2%   -7.23%  (p=0.000 n=17+19)
Varint32ArraySmall/Len2048-12   18.2µs ± 3%  16.9µs ± 2%   -7.13%  (p=0.000 n=20+19)
Varint32ArraySmall/Len4096-12   35.2µs ± 4%  32.4µs ± 0%   -7.88%  (p=0.000 n=20+17)
Varint32ArrayLarge/Len34-12      769ns ± 7%   651ns ± 1%  -15.31%  (p=0.000 n=20+17)
Varint32ArrayLarge/Len68-12     1.23µs ± 2%  1.03µs ± 3%  -16.43%  (p=0.000 n=17+17)
Varint32ArrayLarge/Len136-12    2.18µs ± 4%  1.76µs ± 2%  -19.10%  (p=0.000 n=20+19)
Varint32ArrayLarge/Len272-12    4.03µs ±11%  3.19µs ± 4%  -20.68%  (p=0.000 n=20+20)
Varint32ArrayLarge/Len544-12    7.37µs ± 5%  5.81µs ± 0%  -21.22%  (p=0.000 n=18+16)
Varint32ArrayLarge/Len1088-12   13.5µs ± 4%  10.6µs ± 0%  -21.53%  (p=0.000 n=20+17)
Varint32ArrayLarge/Len2176-12   26.5µs ± 1%  21.0µs ± 2%  -20.74%  (p=0.000 n=16+17)
Varint32ArrayLarge/Len4352-12   56.7µs ± 5%  45.1µs ± 3%  -20.35%  (p=0.000 n=20+20)
Varint32ArrayLarge/Len8704-12    109µs ± 3%    88µs ± 6%  -19.75%  (p=0.000 n=20+20)
Varint32ArrayLarge/Len17408-12   209µs ± 0%   165µs ± 0%  -21.17%  (p=0.000 n=16+17)
Varint64ArraySmall/Len8-12       303ns ± 1%   304ns ± 2%     ~     (p=0.205 n=16+17)
Varint64ArraySmall/Len16-12      412ns ± 5%   414ns ±13%     ~     (p=0.236 n=20+19)
Varint64ArraySmall/Len32-12      593ns ± 7%   568ns ± 3%   -4.15%  (p=0.000 n=19+17)
Varint64ArraySmall/Len64-12      921ns ± 3%   874ns ± 3%   -5.10%  (p=0.000 n=20+20)
Varint64ArraySmall/Len128-12    1.54µs ± 0%  1.48µs ± 2%   -3.76%  (p=0.000 n=20+20)
Varint64ArraySmall/Len256-12    2.74µs ± 4%  2.61µs ± 3%   -4.87%  (p=0.000 n=17+20)
Varint64ArraySmall/Len512-12    5.05µs ± 0%  4.76µs ± 2%   -5.67%  (p=0.000 n=18+19)
Varint64ArraySmall/Len1024-12   9.59µs ± 0%  9.07µs ± 3%   -5.39%  (p=0.000 n=17+19)
Varint64ArraySmall/Len2048-12   23.7µs ± 3%  22.3µs ± 2%   -6.06%  (p=0.000 n=20+18)
Varint64ArraySmall/Len4096-12   46.9µs ± 1%  45.0µs ± 1%   -3.96%  (p=0.000 n=17+18)
Varint64ArrayLarge/Len164-12    3.48µs ± 4%  2.71µs ± 3%  -22.02%  (p=0.000 n=19+20)
Varint64ArrayLarge/Len328-12    6.59µs ± 4%  4.97µs ± 0%  -24.56%  (p=0.000 n=20+17)
Varint64ArrayLarge/Len656-12    12.4µs ± 1%   9.4µs ± 1%  -24.44%  (p=0.000 n=18+17)
Varint64ArrayLarge/Len1312-12   23.3µs ± 5%  17.3µs ± 3%  -25.85%  (p=0.000 n=19+20)
Varint64ArrayLarge/Len2624-12   51.9µs ± 5%  39.3µs ± 0%  -24.40%  (p=0.000 n=19+16)
Varint64ArrayLarge/Len5248-12    109µs ± 0%    85µs ± 2%  -22.00%  (p=0.000 n=20+19)
Varint64ArrayLarge/Len10496-12   208µs ± 4%   157µs ± 1%  -24.36%  (p=0.000 n=20+18)
Varint64ArrayLarge/Len20992-12   434µs ± 6%   331µs ± 1%  -23.68%  (p=0.000 n=20+19)
Varint64ArrayLarge/Len41984-12   854µs ± 5%   665µs ± 1%  -22.11%  (p=0.000 n=20+19)
Varint64ArrayLarge/Len83968-12  1.74ms ± 2%  1.38ms ± 2%  -20.49%  (p=0.000 n=19+19)
DecodeEmpty-12                   212ns ± 5%   202ns ± 1%   -5.01%  (p=0.000 n=20+16)

Java vs Cpp Variant:
name                            old time/op  new time/op  delta
Varint32ArraySmall/Len8-12       284ns ± 3%   280ns ± 1%  -1.46%  (p=0.000 n=17+16)
Varint32ArraySmall/Len16-12      370ns ± 0%   370ns ± 5%  -0.09%  (p=0.046 n=16+19)
Varint32ArraySmall/Len32-12      514ns ± 0%   516ns ± 5%    ~     (p=0.147 n=19+20)
Varint32ArraySmall/Len64-12      782ns ± 2%   763ns ± 3%  -2.46%  (p=0.000 n=18+19)
Varint32ArraySmall/Len128-12    1.29µs ± 5%  1.25µs ± 2%  -3.56%  (p=0.000 n=20+19)
Varint32ArraySmall/Len256-12    2.21µs ± 1%  2.19µs ± 4%  -0.87%  (p=0.021 n=17+20)
Varint32ArraySmall/Len512-12    4.01µs ± 2%  3.94µs ± 3%  -1.79%  (p=0.000 n=17+20)
Varint32ArraySmall/Len1024-12   7.55µs ± 0%  7.41µs ± 2%  -1.82%  (p=0.000 n=16+19)
Varint32ArraySmall/Len2048-12   17.0µs ± 0%  16.9µs ± 2%  -0.75%  (p=0.027 n=19+19)
Varint32ArraySmall/Len4096-12   33.1µs ± 2%  32.4µs ± 0%  -1.88%  (p=0.000 n=17+17)
Varint32ArrayLarge/Len34-12      673ns ± 1%   651ns ± 1%  -3.21%  (p=0.000 n=17+17)
Varint32ArrayLarge/Len68-12     1.08µs ± 1%  1.03µs ± 3%  -4.82%  (p=0.000 n=19+17)
Varint32ArrayLarge/Len136-12    1.83µs ± 1%  1.76µs ± 2%  -3.65%  (p=0.000 n=17+19)
Varint32ArrayLarge/Len272-12    3.27µs ± 1%  3.19µs ± 4%  -2.34%  (p=0.000 n=17+20)
Varint32ArrayLarge/Len544-12    6.04µs ± 1%  5.81µs ± 0%  -3.86%  (p=0.000 n=16+16)
Varint32ArrayLarge/Len1088-12   11.0µs ± 0%  10.6µs ± 0%  -3.78%  (p=0.000 n=16+17)
Varint32ArrayLarge/Len2176-12   21.7µs ± 1%  21.0µs ± 2%  -3.28%  (p=0.000 n=17+17)
Varint32ArrayLarge/Len4352-12   46.2µs ± 0%  45.1µs ± 3%  -2.24%  (p=0.000 n=20+20)
Varint32ArrayLarge/Len8704-12   90.7µs ± 8%  87.8µs ± 6%  -3.16%  (p=0.000 n=19+20)
Varint32ArrayLarge/Len17408-12   171µs ± 1%   165µs ± 0%  -3.72%  (p=0.000 n=16+17)
Varint64ArraySmall/Len8-12       309ns ± 5%   304ns ± 2%  -1.61%  (p=0.000 n=18+17)
Varint64ArraySmall/Len16-12      403ns ± 1%   414ns ±13%    ~     (p=0.774 n=20+19)
Varint64ArraySmall/Len32-12      569ns ± 0%   568ns ± 3%  -0.18%  (p=0.012 n=17+17)
Varint64ArraySmall/Len64-12      898ns ± 0%   874ns ± 3%  -2.64%  (p=0.000 n=17+20)
Varint64ArraySmall/Len128-12    1.52µs ± 1%  1.48µs ± 2%  -2.07%  (p=0.000 n=16+20)
Varint64ArraySmall/Len256-12    2.65µs ± 0%  2.61µs ± 3%  -1.48%  (p=0.000 n=20+20)
Varint64ArraySmall/Len512-12    4.86µs ± 1%  4.76µs ± 2%  -2.03%  (p=0.000 n=16+19)
Varint64ArraySmall/Len1024-12   9.16µs ± 2%  9.07µs ± 3%  -0.97%  (p=0.001 n=17+19)
Varint64ArraySmall/Len2048-12   22.7µs ± 2%  22.3µs ± 2%  -1.89%  (p=0.000 n=20+18)
Varint64ArraySmall/Len4096-12   45.4µs ± 1%  45.0µs ± 1%  -0.84%  (p=0.000 n=19+18)
Varint64ArrayLarge/Len164-12    2.84µs ± 1%  2.71µs ± 3%  -4.39%  (p=0.000 n=16+20)
Varint64ArrayLarge/Len328-12    5.26µs ± 1%  4.97µs ± 0%  -5.36%  (p=0.000 n=20+17)
Varint64ArrayLarge/Len656-12    10.0µs ± 1%   9.4µs ± 1%  -6.03%  (p=0.000 n=17+17)
Varint64ArrayLarge/Len1312-12   18.1µs ± 1%  17.3µs ± 3%  -4.61%  (p=0.000 n=16+20)
Varint64ArrayLarge/Len2624-12   41.3µs ± 0%  39.3µs ± 0%  -4.92%  (p=0.000 n=17+16)
Varint64ArrayLarge/Len5248-12   88.6µs ± 2%  84.8µs ± 2%  -4.33%  (p=0.000 n=18+19)
Varint64ArrayLarge/Len10496-12   165µs ± 1%   157µs ± 1%  -4.41%  (p=0.000 n=20+18)
Varint64ArrayLarge/Len20992-12   341µs ± 2%   331µs ± 1%  -3.00%  (p=0.000 n=19+19)
Varint64ArrayLarge/Len41984-12   696µs ± 1%   665µs ± 1%  -4.46%  (p=0.000 n=19+19)
Varint64ArrayLarge/Len83968-12  1.43ms ± 1%  1.38ms ± 2%  -3.38%  (p=0.000 n=17+19)
DecodeEmpty-12                   205ns ± 3%   202ns ± 1%  -1.45%  (p=0.000 n=17+16)

Both were run with 20 iterations on the decoding benchmark, on a mostly quiet machine with cpu frequency scaling disabled.
PiperOrigin-RevId: 135494672
diff --git a/proto/decode.go b/proto/decode.go
index a54a8cc..aa20729 100644
--- a/proto/decode.go
+++ b/proto/decode.go
@@ -61,7 +61,6 @@
 // int32, int64, uint32, uint64, bool, and enum
 // protocol buffer types.
 func DecodeVarint(buf []byte) (x uint64, n int) {
-	// x, n already 0
 	for shift := uint(0); shift < 64; shift += 7 {
 		if n >= len(buf) {
 			return 0, 0
@@ -78,13 +77,7 @@
 	return 0, 0
 }
 
-// DecodeVarint reads a varint-encoded integer from the Buffer.
-// This is the format for the
-// int32, int64, uint32, uint64, bool, and enum
-// protocol buffer types.
-func (p *Buffer) DecodeVarint() (x uint64, err error) {
-	// x, err already 0
-
+func (p *Buffer) decodeVarintSlow() (x uint64, err error) {
 	i := p.index
 	l := len(p.buf)
 
@@ -107,6 +100,107 @@
 	return
 }
 
+// DecodeVarint reads a varint-encoded integer from the Buffer.
+// This is the format for the
+// int32, int64, uint32, uint64, bool, and enum
+// protocol buffer types.
+func (p *Buffer) DecodeVarint() (x uint64, err error) {
+	i := p.index
+	buf := p.buf
+
+	if i >= len(buf) {
+		return 0, io.ErrUnexpectedEOF
+	} else if buf[i] < 0x80 {
+		p.index++
+		return uint64(buf[i]), nil
+	} else if len(buf)-i < 10 {
+		return p.decodeVarintSlow()
+	}
+
+	var b uint64
+	// we already checked the first byte
+	x = uint64(buf[i]) - 0x80
+	i++
+
+	b = uint64(buf[i])
+	i++
+	x += b << 7
+	if b&0x80 == 0 {
+		goto done
+	}
+	x -= 0x80 << 7
+
+	b = uint64(buf[i])
+	i++
+	x += b << 14
+	if b&0x80 == 0 {
+		goto done
+	}
+	x -= 0x80 << 14
+
+	b = uint64(buf[i])
+	i++
+	x += b << 21
+	if b&0x80 == 0 {
+		goto done
+	}
+	x -= 0x80 << 21
+
+	b = uint64(buf[i])
+	i++
+	x += b << 28
+	if b&0x80 == 0 {
+		goto done
+	}
+	x -= 0x80 << 28
+
+	b = uint64(buf[i])
+	i++
+	x += b << 35
+	if b&0x80 == 0 {
+		goto done
+	}
+	x -= 0x80 << 35
+
+	b = uint64(buf[i])
+	i++
+	x += b << 42
+	if b&0x80 == 0 {
+		goto done
+	}
+	x -= 0x80 << 42
+
+	b = uint64(buf[i])
+	i++
+	x += b << 49
+	if b&0x80 == 0 {
+		goto done
+	}
+	x -= 0x80 << 49
+
+	b = uint64(buf[i])
+	i++
+	x += b << 56
+	if b&0x80 == 0 {
+		goto done
+	}
+	x -= 0x80 << 56
+
+	b = uint64(buf[i])
+	i++
+	x += b << 63
+	if b&0x80 == 0 {
+		goto done
+	}
+	// x -= 0x80 << 63 // Always zero.
+
+	return 0, errOverflow
+
+done:
+	p.index = i
+	return x, nil
+}
+
 // DecodeFixed64 reads a 64-bit integer from the Buffer.
 // This is the format for the
 // fixed64, sfixed64, and double protocol buffer types.