Skip to content
This repository was archived by the owner on Dec 24, 2019. It is now read-only.

Commit 3f0a770

Browse files
author
Paŭlo Ebermann
committed
convert to unicode and back when fixing stuff on line-level.
1 parent 1c5b8fc commit 3f0a770

File tree

1 file changed

+71
-23
lines changed

1 file changed

+71
-23
lines changed

codevalidator.py

+71-23
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import sys
3232
import tempfile
3333
import shutil
34+
import codecs
3435

3536
if sys.version_info.major == 2:
3637
# Pythontidy is only supported on Python2
@@ -46,6 +47,10 @@
4647

4748
DEFAULT_CONFIG_PATHS = ['~/.codevalidatorrc', '/etc/codevalidatorrc']
4849

50+
# The first rule name which matches a registered encoding is used
51+
# both as a check that the file can be read with that encoding,
52+
# and as an encoding filter for those rules which support fixing.
53+
4954
DEFAULT_RULES = [
5055
'utf8',
5156
'nobom',
@@ -77,7 +82,7 @@
7782
'*.php': DEFAULT_RULES + ['phpcs'],
7883
'*.phtml': DEFAULT_RULES,
7984
'*.pp': DEFAULT_RULES + ['puppet'],
80-
'*.properties': DEFAULT_RULES + ['ascii'],
85+
'*.properties': ['ascii'] + DEFAULT_RULES,
8186
'*.py': DEFAULT_RULES + ['pyflakes', 'pythontidy'],
8287
'*.rst': DEFAULT_RULES,
8388
'*.rb': DEFAULT_RULES + ['ruby', 'rubocop'],
@@ -113,6 +118,7 @@
113118

114119
STDIN_CONTENTS = None
115120

121+
ENCODING_BY_FILE = dict()
116122

117123
class BaseException(Exception):
118124

@@ -164,6 +170,35 @@ def wrap(f):
164170

165171
return wrap
166172

173+
def needs_unicode(fix_function):
    """
    Decorator for a _fix_... function to make it work with a pair of
    unicode (text) file-like objects internally, while callers keep
    passing a pair of byte-oriented file objects.

    The returned function takes ``(src, dst, encoding_or_options)``:

    * when the third argument is a string it is the encoding name and the
      wrapped function is called as ``fix_function(src, dst)``;
    * otherwise it is treated as an options mapping which must contain an
      ``'encoding'`` key, and the wrapped function is called as
      ``fix_function(src, dst, options)``.

    The returned function has an attribute ``needs_encoding`` (set to
    ``True``) which tells the calling function that it needs an encoding
    argument (the name of the encoding to use).
    """
    import functools

    # ``basestring`` only exists on Python 2; fall back to ``str`` so the
    # wrapper does not raise NameError on Python 3.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    @functools.wraps(fix_function)
    def wrapped_fix(src, dst, encoding_or_options):
        if isinstance(encoding_or_options, string_types):
            encoding = encoding_or_options
            options = None
        else:
            encoding = encoding_or_options['encoding']
            options = encoding_or_options
        # Decode on read, encode on write.  codecs.EncodedFile would only
        # transcode bytes to bytes, so the fix function would never see
        # unicode; a StreamReader/StreamWriter pair really decodes/encodes.
        text_src = codecs.getreader(encoding)(src)
        text_dst = codecs.getwriter(encoding)(dst)
        if options:
            return fix_function(text_src, text_dst, options)
        return fix_function(text_src, text_dst)

    wrapped_fix.needs_encoding = True
    return wrapped_fix
201+
167202

168203
def is_python3(fd):
169204
'''check first line of file object whether it contains "python3" (shebang)'''
@@ -183,39 +218,33 @@ def _validate_notabs(fd):
183218
return b'\t' not in fd.read()
184219

185220

221+
@needs_unicode
186222
def _fix_notabs(src, dst):
187223
original = src.read()
188224
fixed = original.replace(b'\t', b' ' * 4)
189-
dst.write(fixed.decode())
225+
dst.write(fixed)
190226

191227

192228
@message('contains carriage return (CR)')
def _validate_nocr(fd):
    """Return True when the data read from *fd* contains no CR (``\\r``) byte."""
    contents = fd.read()
    return b'\r' not in contents
195231

196232

233+
@needs_unicode
197234
def _fix_nocr(src, dst):
198235
original = src.read()
199-
fixed = original.replace(b'\r', b'')
200-
dst.write(fixed.decode())
201-
202-
203-
@message('is not UTF-8 encoded')
204-
def _validate_utf8(fd):
205-
try:
206-
fd.read().decode('utf-8')
207-
except UnicodeDecodeError:
208-
return False
209-
return True
236+
fixed = original.replace('\r', '')
237+
dst.write(fixed)
210238

211239

212-
@message('is not ASCII encoded')
213-
def _validate_ascii(fd):
214-
try:
215-
fd.read().decode('ascii')
216-
except UnicodeDecodeError:
217-
return False
218-
return True
240+
def encoding_validator(encoding):
    """Build a validator checking that a file decodes with *encoding*.

    The returned callable takes a file object, reads it completely and
    tries to decode the result with *encoding*.  It returns ``True`` on
    success and the string ``"is not <ENCODING>-encoded"`` (encoding name
    upper-cased) when decoding raises ``UnicodeDecodeError``.

    NOTE(review): the failure value is a non-empty string and therefore
    truthy -- presumably the caller compares against ``True`` or uses the
    string as the message; verify against the caller.
    """

    def validate_encoding(fd):
        try:
            fd.read().decode(encoding)
        except UnicodeDecodeError:
            outcome = "is not %s-encoded" % encoding.upper()
        else:
            outcome = True
        return outcome

    return validate_encoding
219248

220249

221250
@message('has UTF-8 byte order mark (BOM)')
@@ -245,6 +274,7 @@ def _validate_notrailingws(fd):
245274
return True
246275

247276

277+
@needs_unicode
248278
def _fix_notrailingws(src, dst):
249279
for line in src:
250280
dst.write(line.rstrip())
@@ -773,15 +803,29 @@ def notify(*args):
773803
print(*args)
774804

775805

806+
def get_encoding_rule(rules):
    """Return the first entry of *rules* that names a registered codec.

    Each rule name is probed with ``codecs.lookup``; names that raise
    ``LookupError`` are skipped.  Returns ``None`` when no rule names a
    known encoding (including when *rules* is empty).
    """
    for candidate in rules:
        try:
            codecs.lookup(candidate)
        except LookupError:
            # Not an encoding name -- an ordinary rule, keep looking.
            continue
        return candidate
    return None
813+
814+
776815
def validate_file_with_rules(fname, rules):
816+
encoding = get_encoding_rule(rules)
817+
ENCODING_BY_FILE[fname] = encoding
777818
with open_file_for_read(fname) as fd:
778819
for rule in rules:
779820
logging.debug('Validating %s with %s..', fname, rule)
780821
fd.seek(0)
781822
func = globals().get('_validate_' + rule)
782823
if not func:
783-
notify(rule, 'does not exist')
784-
continue
824+
if rule == encoding:
825+
func = encoding_validator(encoding)
826+
else:
827+
notify(rule, 'does not exist')
828+
continue
785829
options = CONFIG.get('options', {}).get(rule)
786830
try:
787831
if options:
@@ -837,6 +881,7 @@ def fix_file(fname, rules):
837881
if CONFIG.get('create_backup', True):
838882
dirname, basename = os.path.split(fname)
839883
shutil.copy2(fname, os.path.join(dirname, CONFIG['backup_filename'].format(original=basename))) # creates a backup
884+
encoding = ENCODING_BY_FILE[fname]
840885
with open_file_for_read(fname) as fd:
841886
dst = fd
842887
for rule in rules:
@@ -849,7 +894,10 @@ def fix_file(fname, rules):
849894
src.seek(0)
850895
try:
851896
if options:
897+
options['encoding'] = encoding
852898
func(src, dst, options)
899+
elif func.needs_encoding:
900+
func(src, dst, encoding)
853901
else:
854902
func(src, dst)
855903
was_fixed &= True
@@ -863,7 +911,7 @@ def fix_file(fname, rules):
863911
# b) some fix functions destroyed the code
864912
if was_fixed and len(fixed) > 0:
865913
with open_file_for_write(fname) as fd:
866-
fd.write(fixed.encode())
914+
fd.write(fixed)
867915
return True
868916
else:
869917
notify('{0}: ERROR fixing file. File remained unchanged'.format(fname))

0 commit comments

Comments
 (0)