Skip to content

Commit e9131c8

Browse files
author
Kareem Zidane
authored
Merge pull request #44 from cs50/invalid-utf8
More performant utf8 checking
2 parents 69006d8 + 60536f8 commit e9131c8

File tree

1 file changed

+17
-10
lines changed

1 file changed

+17
-10
lines changed

compare50/__main__.py

+17-10
Original file line numberDiff line numberDiff line change
@@ -69,20 +69,11 @@ def _get(self, path, preprocessor, is_archive=False):
6969
else:
7070
included, excluded = lib50.files(self.patterns, require_tags=[], root=path)
7171

72-
decodable_files = []
73-
for file_path in included:
74-
try:
75-
with open(path / file_path) as f:
76-
f.read()
77-
except UnicodeDecodeError:
78-
pass
79-
else:
80-
decodable_files.append(file_path)
72+
decodable_files = sorted(file_path for file_path in included if self._is_valid_utf8(path / file_path))
8173

8274
if not decodable_files:
8375
raise _api.Error(f"Empty submission: {path}")
8476

85-
decodable_files = sorted(decodable_files)
8677
return _data.Submission(path, decodable_files, preprocessor=preprocessor, is_archive=is_archive)
8778

8879
def get_all(self, paths, preprocessor, is_archive=False):
@@ -100,6 +91,22 @@ def get_all(self, paths, preprocessor, is_archive=False):
10091
_api.get_progress_bar().update()
10192
return subs
10293

94+
@staticmethod
def _is_valid_utf8(file_path):
    """
    Check whether the file at file_path decodes as valid UTF-8.

    A single f.read() only reports a decoding error after the whole file has
    been read in. Instead, the file is read in blocks that start at 64
    characters and double in size, so the check can stop as soon as a block
    fails to decode. The block size is capped so that a large, valid file is
    never pulled into memory in ever-growing pieces.

    :param file_path: path of the file to check (anything open() accepts)
    :return: True if the entire file decodes as UTF-8, False otherwise

    Only UnicodeDecodeError is handled here; any other error (for example an
    OSError raised by open()) propagates to the caller.
    """
    # Upper bound for a single read; keeps peak memory flat for big files.
    max_blocksize = 1 << 16
    blocksize = 64
    try:
        with open(file_path, encoding="utf8") as f:
            # read() returns "" at end of file, which ends the loop.
            while f.read(blocksize):
                blocksize = min(blocksize * 2, max_blocksize)
    except UnicodeDecodeError:
        return False
    return True
109+
103110

104111
class ArgParser(argparse.ArgumentParser):
105112
def error(self, message):

0 commit comments

Comments
 (0)