Skip to content

Commit ed6e916

Browse files
author
Julien Cretel
committed
mime: speed up ParseMediaType
Eschew UTF-8 decoding and strings.IndexFunc where possible; rely on ASCII sets instead.

Some benchmark results (no changes to allocations):

goos: darwin
goarch: amd64
pkg: mime
cpu: Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz
                      │     old     │                new                 │
                      │   sec/op    │   sec/op     vs base               │
ParseMediaType-8        73.71µ ± 2%   54.18µ ± 1%  -26.49% (p=0.000 n=20)
ParseMediaTypeBogus-8   5.532µ ± 0%   3.370µ ± 0%  -39.08% (p=0.000 n=20)
geomean                 20.19µ        13.51µ       -33.08%
1 parent aec834d commit ed6e916

File tree

2 files changed

+73
-24
lines changed

2 files changed

+73
-24
lines changed

src/mime/grammar.go

+67-11
Original file line number | Diff line number | Diff line change
@@ -4,22 +4,35 @@
44

55
package mime
66

7-
import (
8-
"strings"
7+
const tspecialsString = `()<>@,;:\"/[]?=`
8+
9+
var (
10+
tspecials asciiSet
11+
tokenChars asciiSet
912
)
1013

11-
// isTSpecial reports whether rune is in 'tspecials' as defined by RFC
14+
func init() {
15+
// tspecials := "(" / ")" / "<" / ">" / "@" /
16+
// "," / ";" / ":" / "\" / <">
17+
// "/" / "[" / "]" / "?" / "="
18+
tspecials.add(tspecialsString)
19+
20+
// token := 1*<any (US-ASCII) CHAR except SPACE, CTLs,
21+
// or tspecials>
22+
tokenChars.addRange('!', 0x7f)
23+
tokenChars.remove(tspecialsString)
24+
}
25+
26+
// isTSpecial reports whether c is in 'tspecials' as defined by RFC
1227
// 1521 and RFC 2045.
13-
func isTSpecial(r rune) bool {
14-
return strings.ContainsRune(`()<>@,;:\"/[]?=`, r)
28+
func isTSpecial(c byte) bool {
29+
return tspecials.contains(c)
1530
}
1631

17-
// isTokenChar reports whether rune is in 'token' as defined by RFC
32+
// isTokenChar reports whether c is in 'token' as defined by RFC
1833
// 1521 and RFC 2045.
19-
func isTokenChar(r rune) bool {
20-
// token := 1*<any (US-ASCII) CHAR except SPACE, CTLs,
21-
// or tspecials>
22-
return r > 0x20 && r < 0x7f && !isTSpecial(r)
34+
func isTokenChar(c byte) bool {
35+
return tokenChars.contains(c)
2336
}
2437

2538
// isToken reports whether s is a 'token' as defined by RFC 1521
@@ -28,5 +41,48 @@ func isToken(s string) bool {
2841
if s == "" {
2942
return false
3043
}
31-
return strings.IndexFunc(s, isNotTokenChar) < 0
44+
for _, c := range []byte(s) {
45+
if !tokenChars.contains(c) {
46+
return false
47+
}
48+
}
49+
return true
50+
}
51+
52+
// asciiSet is a 32-byte value, where each bit represents the presence of a
53+
// given ASCII character in the set. The 128 bits of the lower 16 bytes,
54+
// starting with the least-significant bit of the lowest word to the
55+
// most-significant bit of the highest word, map to the full range of all
56+
// 128 ASCII characters. The 128 bits of the upper 16 bytes will be zeroed,
57+
// ensuring that any non-ASCII character will be reported as not in the set.
58+
// This allocates a total of 32 bytes, even though the upper half
58+
// is unused, in order to avoid bounds checks in asciiSet.contains.
60+
type asciiSet [8]uint32
61+
62+
// add adds all the characters in chars to the set.
63+
// Precondition: all the characters in chars are ASCII.
64+
func (as *asciiSet) add(chars string) {
65+
for _, c := range []byte(chars) {
66+
as[c/32] |= 1 << (c % 32)
67+
}
68+
}
69+
70+
// addRange adds all the characters between lo (inclusive) and hi (exclusive) to the set.
71+
// Precondition: hi <= utf8.RuneSelf (0x80)
72+
func (as *asciiSet) addRange(lo, hi byte) {
73+
for c := lo; c < hi; c++ {
74+
as[c/32] |= 1 << (c % 32)
75+
}
76+
}
77+
78+
// remove removes all the characters in chars from the set.
79+
func (as *asciiSet) remove(chars string) {
80+
for _, c := range []byte(chars) {
81+
as[c/32] &^= 1 << (c % 32)
82+
}
83+
}
84+
85+
// contains reports whether c is inside the set.
86+
func (as *asciiSet) contains(c byte) bool {
87+
return (as[c/32] & (1 << (c % 32))) != 0
3288
}

src/mime/mediatype.go

+6-13
Original file line number | Diff line number | Diff line change
@@ -60,7 +60,7 @@ func FormatMediaType(t string, param map[string]string) string {
6060
// attribute-char := <any (US-ASCII) CHAR except SPACE, CTLs, "*", "'", "%", or tspecials>
6161
if ch <= ' ' || ch >= 0x7F ||
6262
ch == '*' || ch == '\'' || ch == '%' ||
63-
isTSpecial(rune(ch)) {
63+
isTSpecial(ch) {
6464

6565
b.WriteString(value[offset:index])
6666
offset = index + 1
@@ -250,23 +250,16 @@ func decode2231Enc(v string) (string, bool) {
250250
return encv, true
251251
}
252252

253-
func isNotTokenChar(r rune) bool {
254-
return !isTokenChar(r)
255-
}
256-
257253
// consumeToken consumes a token from the beginning of provided
258254
// string, per RFC 2045 section 5.1 (referenced from 2183), and returns
259255
// the token consumed and the rest of the string. Returns ("", v) on
260256
// failure to consume at least one character.
261257
func consumeToken(v string) (token, rest string) {
262-
notPos := strings.IndexFunc(v, isNotTokenChar)
263-
if notPos == -1 {
264-
return v, ""
265-
}
266-
if notPos == 0 {
267-
return "", v
258+
var i int
259+
for ; i < len(v) && isTokenChar(v[i]); i++ {
260+
// deliberately empty
268261
}
269-
return v[0:notPos], v[notPos:]
262+
return v[:i], v[i:]
270263
}
271264

272265
// consumeValue consumes a "value" per RFC 2045, where a value is
@@ -299,7 +292,7 @@ func consumeValue(v string) (value, rest string) {
299292
// and intended as a literal backslash. This makes Go servers deal better
300293
// with MSIE without affecting the way they handle conforming MIME
301294
// generators.
302-
if r == '\\' && i+1 < len(v) && isTSpecial(rune(v[i+1])) {
295+
if r == '\\' && i+1 < len(v) && isTSpecial(v[i+1]) {
303296
buffer.WriteByte(v[i+1])
304297
i++
305298
continue

0 commit comments

Comments
 (0)