@@ -69,20 +69,11 @@ def _get(self, path, preprocessor, is_archive=False):
69
69
else :
70
70
included , excluded = lib50 .files (self .patterns , require_tags = [], root = path )
71
71
72
- decodable_files = []
73
- for file_path in included :
74
- try :
75
- with open (path / file_path ) as f :
76
- f .read ()
77
- except UnicodeDecodeError :
78
- pass
79
- else :
80
- decodable_files .append (file_path )
72
+ decodable_files = sorted (file_path for file_path in included if self ._is_valid_utf8 (path / file_path ))
81
73
82
74
if not decodable_files :
83
75
raise _api .Error (f"Empty submission: { path } " )
84
76
85
- decodable_files = sorted (decodable_files )
86
77
return _data .Submission (path , decodable_files , preprocessor = preprocessor , is_archive = is_archive )
87
78
88
79
def get_all (self , paths , preprocessor , is_archive = False ):
@@ -100,6 +91,22 @@ def get_all(self, paths, preprocessor, is_archive=False):
100
91
_api .get_progress_bar ().update ()
101
92
return subs
102
93
94
@staticmethod
def _is_valid_utf8(file_path):
    """
    Check whether the file at file_path decodes as valid UTF-8.

    A single f.read() decodes the whole file before any invalid byte is
    reported. This function instead reads in blocks that start small and
    double in size, so an invalid file is rejected as early as possible.
    Block growth is capped so that a very large valid file never triggers
    one huge read.

    :param file_path: path of the file to check (anything open() accepts)
    :return: True if the whole file decodes as UTF-8, False if decoding \
        raises UnicodeDecodeError

    Only UnicodeDecodeError is handled here; any other error raised while
    opening or reading the file propagates to the caller.
    """
    # Block sizes are in characters, because the file is opened in text mode.
    blocksize = 64
    max_blocksize = 1 << 20

    try:
        with open(file_path, encoding="utf8") as f:
            # read() returns "" at EOF, which ends the loop.
            while f.read(blocksize):
                blocksize = min(blocksize * 2, max_blocksize)
    except UnicodeDecodeError:
        return False
    return True
109
+
103
110
104
111
class ArgParser (argparse .ArgumentParser ):
105
112
def error (self , message ):
0 commit comments