-
Notifications
You must be signed in to change notification settings - Fork 107
/
Copy pathtest_encoding.py
337 lines (287 loc) · 13.1 KB
/
test_encoding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
import codecs
import unittest
from typing import Optional, Union, List, Any
import pytest
from w3lib._encoding import _get_encoding, _LABEL_ENCODINGS
from w3lib.encoding import (
html_body_declared_encoding,
http_content_type_encoding,
html_to_unicode,
read_bom,
resolve_encoding,
to_unicode,
)
# Encodings from the spec that Python does not support.
# Names in this set are still valid return values of _get_encoding, but
# codecs.lookup() would raise LookupError for them, so the parametrized
# test below skips the lookup check for these.
_UNSUPPORTED_ENCODINGS = {
    "iso-8859-8-i",  # https://bugs.python.org/msg213772
    "replacement",  # Not an actual encoding
    # Not supported.
    # We could bring support to it with the webencodings package.
    "x-user-defined",
}
# `tuple((label, name) for label, name in d.items())` was an identity
# generator around d.items(); pass the items directly.
@pytest.mark.parametrize("label,name", tuple(_LABEL_ENCODINGS.items()))
def test_get_encoding_python(label: str, name: str) -> None:
    """The encodings that _get_encoding can return must work as encoding
    aliases in Python."""
    assert _get_encoding(label) == name
    if name not in _UNSUPPORTED_ENCODINGS:
        codecs.lookup(name)  # Raises LookupError if not found.
class RequestEncodingTests(unittest.TestCase):
    """Tests for read_bom and the declared-encoding extraction helpers."""

    # Fragments that all declare utf-8, covering every declaration syntax
    # the extractor is expected to support.
    utf8_fragments = [
        # Content-Type as meta http-equiv
        b"""<meta http-equiv="content-type" content="text/html;charset=UTF-8" />""",
        b"""\n<meta http-equiv="Content-Type"\ncontent="text/html; charset=utf-8">""",
        b"""<meta http-equiv="Content-Type" content="text/html" charset="utf-8">""",
        b"""<meta http-equiv=Content-Type content="text/html" charset='utf-8'>""",
        b"""<meta http-equiv="Content-Type" content\t=\n"text/html" charset\t="utf-8">""",
        b"""<meta content="text/html; charset=utf-8"\n http-equiv='Content-Type'>""",
        b""" bad html still supported < meta http-equiv='Content-Type'\n content="text/html; charset=utf-8">""",
        # html5 meta charset
        b"""<meta charset="utf-8">""",
        b"""<meta charset =\n"utf-8">""",
        # xml encoding
        b"""<?xml version="1.0" encoding="utf-8"?>""",
    ]

    def test_bom(self) -> None:
        """read_bom detects UTF-16/32 BOMs, and the detected encoding must
        decode the rest of the body correctly; bodies without a BOM yield
        (None, None)."""
        # cjk water character in unicode
        water_unicode = "\u6C34"
        # BOM + water character encoded
        utf16be = b"\xfe\xff\x6c\x34"
        utf16le = b"\xff\xfe\x34\x6c"
        utf32be = b"\x00\x00\xfe\xff\x00\x00\x6c\x34"
        utf32le = b"\xff\xfe\x00\x00\x34\x6c\x00\x00"
        for string in (utf16be, utf16le, utf32be, utf32le):
            bom_encoding, bom = read_bom(string)
            assert bom_encoding is not None
            assert bom is not None
            # Strip the BOM bytes and decode with the detected encoding.
            decoded = string[len(bom) :].decode(bom_encoding)
            self.assertEqual(water_unicode, decoded)
        # Body without BOM
        enc, bom = read_bom(b"foo")
        self.assertEqual(enc, None)
        self.assertEqual(bom, None)
        # Empty body
        enc, bom = read_bom(b"")
        self.assertEqual(enc, None)
        self.assertEqual(bom, None)

    def test_http_encoding_header(self) -> None:
        """http_content_type_encoding extracts a normalized charset from a
        header value, returning None when no charset is present."""
        header_value = "Content-Type: text/html; charset=ISO-8859-4"
        extracted = http_content_type_encoding(header_value)
        self.assertEqual(extracted, "iso8859-4")
        self.assertEqual(None, http_content_type_encoding("something else"))

    def test_html_body_declared_encoding(self) -> None:
        """Every supported declaration syntax (bytes input) yields utf-8;
        declarations after <body> or in fake header names are ignored."""
        for fragment in self.utf8_fragments:
            encoding = html_body_declared_encoding(fragment)
            self.assertEqual(encoding, "utf-8", fragment)
        self.assertEqual(None, html_body_declared_encoding(b"something else"))
        self.assertEqual(
            None,
            html_body_declared_encoding(
                b"""
            <head></head><body>
            this isn't searched
            <meta charset="utf-8">
            """
            ),
        )
        self.assertEqual(
            None,
            html_body_declared_encoding(
                b"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""
            ),
        )

    def test_html_body_declared_encoding_unicode(self) -> None:
        # html_body_declared_encoding should work when unicode body is passed
        self.assertEqual(None, html_body_declared_encoding("something else"))
        for fragment in self.utf8_fragments:
            encoding = html_body_declared_encoding(fragment.decode("utf8"))
            self.assertEqual(encoding, "utf-8", fragment)
        self.assertEqual(
            None,
            html_body_declared_encoding(
                """
            <head></head><body>
            this isn't searched
            <meta charset="utf-8">
            """
            ),
        )
        self.assertEqual(
            None,
            html_body_declared_encoding(
                """<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""
            ),
        )
class CodecsEncodingTestCase(unittest.TestCase):
    """Tests for resolve_encoding's label-to-codec mapping."""

    def test_resolve_encoding(self) -> None:
        """Known labels (including aliases and surrounding whitespace)
        resolve to Python codec names; unknown labels resolve to None."""
        cases = (
            ("latin1", "cp1252"),
            (" Latin-1", "cp1252"),
            ("gb_2312-80", "gb18030"),
            ("unknown encoding", None),
        )
        for label, expected in cases:
            self.assertEqual(resolve_encoding(label), expected)
class UnicodeDecodingTestCase(unittest.TestCase):
    """Tests for to_unicode's byte decoding behavior."""

    def test_utf8(self) -> None:
        """Valid UTF-8 bytes decode to the expected text."""
        decoded = to_unicode(b"\xc2\xa3", "utf-8")
        self.assertEqual("\xa3", decoded)

    def test_invalid_utf8(self) -> None:
        """An invalid byte sequence is replaced with U+FFFD."""
        decoded = to_unicode(b"\xc2\xc2\xa3", "utf-8")
        self.assertEqual("\ufffd\xa3", decoded)
def ct(charset: Optional[str]) -> Optional[str]:
    """Build a Content-Type header value declaring *charset*.

    Returns None for a falsy charset (None or empty string), which lets
    callers express "no Content-Type header" with the same helper.
    """
    if not charset:
        return None
    return f"Content-Type: text/html; charset={charset}"
def norm_encoding(enc: str) -> str:
    """Return the canonical codec name for *enc* so that spelling variants
    (e.g. "UTF8" vs "utf-8") compare equal.  Raises LookupError for
    unknown encodings."""
    codec_info = codecs.lookup(enc)
    return codec_info.name
class HtmlConversionTests(unittest.TestCase):
    """End-to-end tests for html_to_unicode: encoding detection precedence
    (BOM > HTTP header > body declaration > auto-detect > default) and
    decoding of the body to str."""

    def test_unicode_body(self) -> None:
        """A cp1251-encoded body with a matching Content-Type decodes back
        to the original text."""
        # "кириллический текст" ("cyrillic text") spelled with escapes.
        unicode_string = "\u043a\u0438\u0440\u0438\u043b\u043b\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u0442\u0435\u043a\u0441\u0442"
        original_string = unicode_string.encode("cp1251")
        encoding, body_unicode = html_to_unicode(ct("cp1251"), original_string)
        # check body_as_unicode
        self.assertTrue(isinstance(body_unicode, str))
        self.assertEqual(body_unicode, unicode_string)

    def _assert_encoding(
        self,
        content_type: Optional[str],
        body: bytes,
        expected_encoding: str,
        expected_unicode: Union[str, List[str]],
    ) -> None:
        """Decode *body* via html_to_unicode and assert both the detected
        encoding (compared after codec-name normalization) and the decoded
        text.  *expected_unicode* may be a list of acceptable results for
        implementation-dependent cases."""
        assert not isinstance(body, str)
        encoding, body_unicode = html_to_unicode(ct(content_type), body)
        self.assertTrue(isinstance(body_unicode, str))
        self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))
        if isinstance(expected_unicode, str):
            self.assertEqual(body_unicode, expected_unicode)
        else:
            # Any of the listed results is acceptable.
            self.assertTrue(
                body_unicode in expected_unicode,
                f"{body_unicode} is not in {expected_unicode}",
            )

    def test_content_type_and_conversion(self) -> None:
        """Test content type header is interpreted and text converted as
        expected
        """
        self._assert_encoding("utf-8", b"\xc2\xa3", "utf-8", "\xa3")
        # something like this in the scrapy tests - but that's invalid?
        # self._assert_encoding('', "\xa3", 'utf-8', "\xa3")
        # iso-8859-1 is overridden to cp1252
        self._assert_encoding("iso-8859-1", b"\xa3", "cp1252", "\xa3")
        self._assert_encoding("", b"\xc2\xa3", "utf-8", "\xa3")
        self._assert_encoding("none", b"\xc2\xa3", "utf-8", "\xa3")
        # gb2312/gbk are widened to gb18030 (their superset).
        self._assert_encoding("gb2312", b"\xa8D", "gb18030", "\u2015")
        self._assert_encoding("gbk", b"\xa8D", "gb18030", "\u2015")
        self._assert_encoding("big5", b"\xf9\xda", "big5hkscs", "\u6052")

    def test_invalid_utf8_encoded_body_with_valid_utf8_BOM(self) -> None:
        # unlike scrapy, the BOM is stripped
        self._assert_encoding(
            "utf-8", b"\xef\xbb\xbfWORD\xe3\xabWORD2", "utf-8", "WORD\ufffdWORD2"
        )
        self._assert_encoding(
            None, b"\xef\xbb\xbfWORD\xe3\xabWORD2", "utf-8", "WORD\ufffdWORD2"
        )

    def test_utf8_unexpected_end_of_data_with_valid_utf8_BOM(self) -> None:
        # Python implementations handle unexpected end of UTF8 data
        # differently (see https://bugs.pypy.org/issue1536).
        # It is hard to fix this for PyPy in w3lib, so the test
        # is permissive.
        # unlike scrapy, the BOM is stripped
        self._assert_encoding(
            "utf-8",
            b"\xef\xbb\xbfWORD\xe3\xab",
            "utf-8",
            ["WORD\ufffd\ufffd", "WORD\ufffd"],
        )
        self._assert_encoding(
            None,
            b"\xef\xbb\xbfWORD\xe3\xab",
            "utf-8",
            ["WORD\ufffd\ufffd", "WORD\ufffd"],
        )

    def test_replace_wrong_encoding(self) -> None:
        """Test invalid chars are replaced properly"""
        encoding, body_unicode = html_to_unicode(ct("utf-8"), b"PREFIX\xe3\xabSUFFIX")
        # XXX: Policy for replacing invalid chars may suffer minor variations
        # but it should always contain the unicode replacement char ('\ufffd')
        assert "\ufffd" in body_unicode, repr(body_unicode)
        assert "PREFIX" in body_unicode, repr(body_unicode)
        assert "SUFFIX" in body_unicode, repr(body_unicode)
        # Do not destroy html tags due to encoding bugs
        encoding, body_unicode = html_to_unicode(ct("utf-8"), b"\xf0<span>value</span>")
        assert "<span>value</span>" in body_unicode, repr(body_unicode)

    def _assert_encoding_detected(
        self,
        content_type: Optional[str],
        expected_encoding: str,
        body: bytes,
        **kwargs: Any,
    ) -> None:
        """Assert only the encoding html_to_unicode detects for *body*;
        extra keyword arguments (e.g. auto_detect_fun, default_encoding)
        are forwarded to html_to_unicode."""
        assert not isinstance(body, str)
        encoding, body_unicode = html_to_unicode(ct(content_type), body, **kwargs)
        self.assertTrue(isinstance(body_unicode, str))
        self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))

    def test_BOM(self) -> None:
        # utf-16 cases already tested, as is the BOM detection function
        # BOM takes precedence, ahead of the http header
        bom_be_str = codecs.BOM_UTF16_BE + "hi".encode("utf-16-be")
        expected = "hi"
        self._assert_encoding("utf-8", bom_be_str, "utf-16-be", expected)
        # BOM is stripped when present
        bom_utf8_str = codecs.BOM_UTF8 + b"hi"
        self._assert_encoding("utf-8", bom_utf8_str, "utf-8", "hi")
        self._assert_encoding(None, bom_utf8_str, "utf-8", "hi")

    def test_utf16_32(self) -> None:
        # tools.ietf.org/html/rfc2781 section 4.3
        # USE BOM and strip it
        bom_be_str = codecs.BOM_UTF16_BE + "hi".encode("utf-16-be")
        self._assert_encoding("utf-16", bom_be_str, "utf-16-be", "hi")
        self._assert_encoding(None, bom_be_str, "utf-16-be", "hi")
        bom_le_str = codecs.BOM_UTF16_LE + "hi".encode("utf-16-le")
        self._assert_encoding("utf-16", bom_le_str, "utf-16-le", "hi")
        self._assert_encoding(None, bom_le_str, "utf-16-le", "hi")
        bom_be_str = codecs.BOM_UTF32_BE + "hi".encode("utf-32-be")
        self._assert_encoding("utf-32", bom_be_str, "utf-32-be", "hi")
        self._assert_encoding(None, bom_be_str, "utf-32-be", "hi")
        bom_le_str = codecs.BOM_UTF32_LE + "hi".encode("utf-32-le")
        self._assert_encoding("utf-32", bom_le_str, "utf-32-le", "hi")
        self._assert_encoding(None, bom_le_str, "utf-32-le", "hi")
        # if there is no BOM, big endian should be chosen
        self._assert_encoding("utf-16", "hi".encode("utf-16-be"), "utf-16-be", "hi")
        self._assert_encoding("utf-32", "hi".encode("utf-32-be"), "utf-32-be", "hi")

    def test_python_crash(self) -> None:
        """Decoding a large random byte blob as UTF-16/32 must not crash."""
        import random
        from io import BytesIO

        # Seeded so the blob is deterministic across runs.
        random.seed(42)
        buf = BytesIO()
        for i in range(150000):
            buf.write(bytes([random.randint(0, 255)]))
        to_unicode(buf.getvalue(), "utf-16-le")
        to_unicode(buf.getvalue(), "utf-16-be")
        to_unicode(buf.getvalue(), "utf-32-le")
        to_unicode(buf.getvalue(), "utf-32-be")

    def test_html_encoding(self) -> None:
        # extracting the encoding from raw html is tested elsewhere
        body = b"""blah blah < meta http-equiv="Content-Type"
            content="text/html; charset=iso-8859-1"> other stuff"""
        self._assert_encoding_detected(None, "cp1252", body)
        # header encoding takes precedence
        self._assert_encoding_detected("utf-8", "utf-8", body)
        # BOM encoding takes precedence
        self._assert_encoding_detected(None, "utf-8", codecs.BOM_UTF8 + body)

    def test_autodetect(self) -> None:
        def asciif(x):
            # Auto-detect callback that always claims ascii.
            return "ascii"

        body = b"""<meta charset="utf-8">"""
        # body encoding takes precedence
        self._assert_encoding_detected(None, "utf-8", body, auto_detect_fun=asciif)
        # if no other encoding, the auto detect encoding is used.
        self._assert_encoding_detected(
            None, "ascii", b"no encoding info", auto_detect_fun=asciif
        )

    def test_default_encoding(self) -> None:
        # if no other method available, the default encoding of utf-8 is used
        self._assert_encoding_detected(None, "utf-8", b"no encoding info")
        # this can be overridden
        self._assert_encoding_detected(
            None, "ascii", b"no encoding info", default_encoding="ascii"
        )

    def test_empty_body(self) -> None:
        # if no other method available, the default encoding of utf-8 is used
        self._assert_encoding_detected(None, "utf-8", b"")