From 48b8ffd5f48f457379b51331a6cb7bd058f63218 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pa=C5=ADlo=20Ebermann?= <paul@fencing-game.de>
Date: Sun, 2 Aug 2015 16:29:18 +0200
Subject: [PATCH 1/2] issue 37: add test file.

This file can be checked with codevalidator (complains about
trailing white space, use of tabs, use of carriage return),
but with the current code it can't be fixed (will be overwritten
with an empty file).

https://github.com/hjacobs/codevalidator/issues/37
---
 test/unicode-test.yaml | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 test/unicode-test.yaml

diff --git a/test/unicode-test.yaml b/test/unicode-test.yaml
new file mode 100644
index 0000000..bf77456
--- /dev/null
+++ b/test/unicode-test.yaml
@@ -0,0 +1,3 @@
+example: |
+      This line ends with a space. 
+      This line has non-ASCII signs in it – which breaks codevalidator 0.8.2.

From 31a2fccdc2ba681811fe07b895ed0e1426e0531c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pa=C5=ADlo=20Ebermann?= <paul@fencing-game.de>
Date: Sun, 2 Aug 2015 22:12:52 +0200
Subject: [PATCH 2/2] convert to unicode and back when fixing stuff on
 line-level.

---
 codevalidator.py | 98 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 71 insertions(+), 27 deletions(-)

diff --git a/codevalidator.py b/codevalidator.py
index 620a956..8624524 100755
--- a/codevalidator.py
+++ b/codevalidator.py
@@ -32,6 +32,7 @@
 import sys
 import tempfile
 import shutil
+import codecs
 
 if sys.version_info.major == 2:
     # Pythontidy is only supported on Python2
@@ -47,6 +48,10 @@
 
 DEFAULT_CONFIG_PATHS = ['~/.codevalidatorrc', '/etc/codevalidatorrc']
 
+# The first rule name which matches a registered encoding is used
+# both as a check that the file can be read with that encoding,
+# as well as a encoding filter for those rules which support fixing.
+
 DEFAULT_RULES = [
     'utf8',
     'nobom',
@@ -78,7 +83,7 @@
         '*.php': DEFAULT_RULES + ['phpcs'],
         '*.phtml': DEFAULT_RULES,
         '*.pp': DEFAULT_RULES + ['puppet'],
-        '*.properties': DEFAULT_RULES + ['ascii'],
+        '*.properties': ['ascii'] + DEFAULT_RULES,
         '*.py': DEFAULT_RULES + ['pyflakes', 'pythontidy'],
         '*.rst': DEFAULT_RULES,
         '*.rb': DEFAULT_RULES + ['ruby', 'rubocop'],
@@ -114,6 +119,7 @@
 
 STDIN_CONTENTS = None
 
+ENCODING_BY_FILE = dict()
 
 class BaseException(Exception):
 
@@ -165,6 +171,35 @@ def wrap(f):
 
     return wrap
 
+def needs_unicode(fix_function):
+    """
+    decorator for a _fix_... function to make it work with a pair of
+    unicode files (or file-like objects) internally instead of a pair
+    of byte-files (which are still used externally).
+    
+    The returned function has an attribute `needs_encoding` which tells
+    the calling function that it needs an encoding argument (the name of
+    the encoding to use).
+    """
+
+    def wrapped_fix(src, dst, encoding_or_options):
+         if isinstance(encoding_or_options, basestring):
+            encoding = encoding_or_options
+            options = None
+         else:
+            encoding = encoding_or_options['encoding']
+            options = encoding_or_options
+         # decode + encode
+         src = codecs.EncodedFile(src, encoding)
+         dst = codecs.EncodedFile(dst, encoding)
+         if options:
+            return fix_function(src, dst, options)
+         else:
+            return fix_function(src, dst)
+
+    wrapped_fix.needs_encoding = True
+    return wrapped_fix
+
 
 def is_python3(fd):
     '''check first line of file object whether it contains "python3" (shebang)'''
@@ -191,10 +226,11 @@ def _validate_notabs(fd):
     return b'\t' not in fd.read()
 
 
+@needs_unicode
 def _fix_notabs(src, dst):
     original = src.read()
     fixed = original.replace(b'\t', b' ' * 4)
-    dst.write(fixed.decode())
+    dst.write(fixed)
 
 
 @message('contains carriage return (CR)')
@@ -202,32 +238,21 @@ def _validate_nocr(fd):
     return b'\r' not in fd.read()
 
 
+@needs_unicode
 def _fix_nocr(src, dst):
     original = src.read()
-    fixed = original.replace(b'\r', b'')
-    dst.write(fixed.decode())
-
-
-@message('is not UTF-8 encoded')
-def _validate_utf8(fd):
-    '''
-    >>> _validate_utf8(BytesIO(b'foo'))
-    True
-    '''
-    try:
-        fd.read().decode('utf-8')
-    except UnicodeDecodeError:
-        return False
-    return True
+    fixed = original.replace('\r', '')
+    dst.write(fixed)
 
 
-@message('is not ASCII encoded')
-def _validate_ascii(fd):
-    try:
-        fd.read().decode('ascii')
-    except UnicodeDecodeError:
-        return False
-    return True
+def encoding_validator(encoding):
+    def validate_encoding(fd):
+        try:
+            fd.read().decode(encoding)
+        except UnicodeDecodeError:
+            return "is not %s-encoded" % encoding.upper()
+        return True
+    return validate_encoding
 
 
 @message('has UTF-8 byte order mark (BOM)')
@@ -263,6 +288,7 @@ def _validate_notrailingws(fd):
     return True
 
 
+@needs_unicode
 def _fix_notrailingws(src, dst):
     for line in src:
         dst.write(line.rstrip())
@@ -806,15 +832,29 @@ def notify(*args):
         print(*args)
 
 
+def get_encoding_rule(rules):
+    for rule in rules:
+        try:
+            codecs.lookup(rule)
+            return rule
+        except LookupError:
+            continue
+
+
 def validate_file_with_rules(fname, rules):
+    encoding = get_encoding_rule(rules)
+    ENCODING_BY_FILE[fname] = encoding
     with open_file_for_read(fname) as fd:
         for rule in rules:
             logging.debug('Validating %s with %s..', fname, rule)
             fd.seek(0)
             func = globals().get('_validate_' + rule)
             if not func:
-                notify(rule, 'does not exist')
-                continue
+                if rule == encoding:
+                    func = encoding_validator(encoding)
+                else:
+                    notify(rule, 'does not exist')
+                    continue
             options = CONFIG.get('options', {}).get(rule)
             try:
                 if options:
@@ -870,6 +910,7 @@ def fix_file(fname, rules):
     if CONFIG.get('create_backup', True):
         dirname, basename = os.path.split(fname)
         shutil.copy2(fname, os.path.join(dirname, CONFIG['backup_filename'].format(original=basename)))  # creates a backup
+    encoding = ENCODING_BY_FILE[fname]
     with open_file_for_read(fname) as fd:
         dst = fd
         for rule in rules:
@@ -882,7 +923,10 @@ def fix_file(fname, rules):
                 src.seek(0)
                 try:
                     if options:
+                        options['encoding'] = encoding
                         func(src, dst, options)
+                    elif func.needs_encoding:
+                        func(src, dst, encoding)
                     else:
                         func(src, dst)
                     was_fixed &= True
@@ -896,7 +940,7 @@ def fix_file(fname, rules):
     # b) some fix functions destroyed the code
     if was_fixed and len(fixed) > 0:
         with open_file_for_write(fname) as fd:
-            fd.write(fixed.encode())
+            fd.write(fixed)
         return True
     else:
         notify('{0}: ERROR fixing file. File remained unchanged'.format(fname))