mime: speed up ParseMediaType

Julien Cretel · Julien Cretel · commit ed6e916d5d1f · 2025-04-18T18:15:21.000+02:00
Eschew UTF-8 decoding and strings.IndexFunc where possible;
rely on ASCII sets instead.

Some benchmark results (no changes to allocations):

goos: darwin
goarch: amd64
pkg: mime
cpu: Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz
                      │     old     │                 new                 │
                      │   sec/op    │   sec/op     vs base                │
ParseMediaType-8        73.71µ ± 2%   54.18µ ± 1%  -26.49% (p=0.000 n=20)
ParseMediaTypeBogus-8   5.532µ ± 0%   3.370µ ± 0%  -39.08% (p=0.000 n=20)
geomean                 20.19µ        13.51µ       -33.08%
diff --git a/src/mime/grammar.go b/src/mime/grammar.go
@@ -4,22 +4,35 @@
 
 package mime
 
-import (
-	"strings"
+const tspecialsString = `()<>@,;:\"/[]?=`
+
+var (
+	tspecials  asciiSet
+	tokenChars asciiSet
 )
 
-// isTSpecial reports whether rune is in 'tspecials' as defined by RFC
+func init() {
+	// tspecials :=  "(" / ")" / "<" / ">" / "@" /
+	//               "," / ";" / ":" / "\" / <">
+	//               "/" / "[" / "]" / "?" / "="
+	tspecials.add(tspecialsString)
+
+	// token := 1*<any (US-ASCII) CHAR except SPACE, CTLs,
+	//             or tspecials>
+	tokenChars.addRange('!', 0x7f)
+	tokenChars.remove(tspecialsString)
+}
+
+// isTSpecial reports whether c is in 'tspecials' as defined by RFC
 // 1521 and RFC 2045.
-func isTSpecial(r rune) bool {
-	return strings.ContainsRune(`()<>@,;:\"/[]?=`, r)
+func isTSpecial(c byte) bool {
+	return tspecials.contains(c)
 }
 
-// isTokenChar reports whether rune is in 'token' as defined by RFC
+// isTokenChar reports whether c is in 'token' as defined by RFC
 // 1521 and RFC 2045.
-func isTokenChar(r rune) bool {
-	// token := 1*<any (US-ASCII) CHAR except SPACE, CTLs,
-	//             or tspecials>
-	return r > 0x20 && r < 0x7f && !isTSpecial(r)
+func isTokenChar(c byte) bool {
+	return tokenChars.contains(c)
 }
 
 // isToken reports whether s is a 'token' as defined by RFC 1521
@@ -28,5 +41,48 @@ func isToken(s string) bool {
 	if s == "" {
 		return false
 	}
-	return strings.IndexFunc(s, isNotTokenChar) < 0
+	for _, c := range []byte(s) {
+		if !tokenChars.contains(c) {
+			return false
+		}
+	}
+	return true
+}
+
+// asciiSet is a 32-byte value, where each bit represents the presence of a
+// given ASCII character in the set. The 128 bits of the lower 16 bytes,
+// from the least-significant bit of the lowest word to the
+// most-significant bit of the highest word, map to the full range of all
+// 128 ASCII characters. The 128 bits of the upper 16 bytes are left zeroed,
+// ensuring that any non-ASCII byte is reported as not in the set.
+// The upper half is otherwise unused; the type is 32 bytes rather than 16
+// so that asciiSet.contains can index it without a bounds check.
+type asciiSet [8]uint32
+
+// add adds all the characters in chars to the set.
+// Precondition: all the characters in chars are ASCII.
+func (as *asciiSet) add(chars string) {
+	for _, c := range []byte(chars) {
+		as[c/32] |= 1 << (c % 32)
+	}
+}
+
+// addRange adds all the characters between lo (inclusive) and hi (exclusive) to the set.
+// Precondition: hi <= utf8.RuneSelf (0x80)
+func (as *asciiSet) addRange(lo, hi byte) {
+	for c := lo; c < hi; c++ {
+		as[c/32] |= 1 << (c % 32)
+	}
+}
+
+// remove removes all the characters in chars from the set.
+func (as *asciiSet) remove(chars string) {
+	for _, c := range []byte(chars) {
+		as[c/32] &^= 1 << (c % 32)
+	}
+}
+
+// contains reports whether c is inside the set.
+func (as *asciiSet) contains(c byte) bool {
+	return (as[c/32] & (1 << (c % 32))) != 0
 }
diff --git a/src/mime/mediatype.go b/src/mime/mediatype.go
@@ -60,7 +60,7 @@ func FormatMediaType(t string, param map[string]string) string {
 				// attribute-char := <any (US-ASCII) CHAR except SPACE, CTLs, "*", "'", "%", or tspecials>
 				if ch <= ' ' || ch >= 0x7F ||
 					ch == '*' || ch == '\'' || ch == '%' ||
-					isTSpecial(rune(ch)) {
+					isTSpecial(ch) {
 
 					b.WriteString(value[offset:index])
 					offset = index + 1
@@ -250,23 +250,16 @@ func decode2231Enc(v string) (string, bool) {
 	return encv, true
 }
 
-func isNotTokenChar(r rune) bool {
-	return !isTokenChar(r)
-}
-
 // consumeToken consumes a token from the beginning of provided
 // string, per RFC 2045 section 5.1 (referenced from 2183), and return
 // the token consumed and the rest of the string. Returns ("", v) on
 // failure to consume at least one character.
 func consumeToken(v string) (token, rest string) {
-	notPos := strings.IndexFunc(v, isNotTokenChar)
-	if notPos == -1 {
-		return v, ""
-	}
-	if notPos == 0 {
-		return "", v
+	var i int
+	for ; i < len(v) && isTokenChar(v[i]); i++ {
+		// deliberately empty
 	}
-	return v[0:notPos], v[notPos:]
+	return v[:i], v[i:]
 }
 
 // consumeValue consumes a "value" per RFC 2045, where a value is
@@ -299,7 +292,7 @@ func consumeValue(v string) (value, rest string) {
 		// and intended as a literal backslash. This makes Go servers deal better
 		// with MSIE without affecting the way they handle conforming MIME
 		// generators.
-		if r == '\\' && i+1 < len(v) && isTSpecial(rune(v[i+1])) {
+		if r == '\\' && i+1 < len(v) && isTSpecial(v[i+1]) {
 			buffer.WriteByte(v[i+1])
 			i++
 			continue