-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcleanWordHtml.py
53 lines (49 loc) · 1.81 KB
/
cleanWordHtml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
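'''
Input: an HTML file generated by MS Word.
What it does: re-encodes the file as UTF-8 (it is read as GB18030, and any "gb2312" label in the text is rewritten to "utf-8"), removes the conditional-comment blocks (<!--[if ...]> ... <![endif]-->) that carry style info specific to Microsoft Outlook, and replaces curly double quotes (“ and ”) with straight ones (").
The input file is overwritten in place.
'''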
from tempfile import TemporaryFile
from os import path
import shutil
from myfile import sysArgvOrInput
# Text that opens a conditional-comment block; main() drops everything from
# this marker up to and including the matching MSO_9_END.
MSO_9_START = "<!--[if "
# Text that closes a conditional-comment block.
MSO_9_END = "<![endif]-->"
# Maps both curly double quotes to the ASCII straight quote in a single pass.
_STRAIGHT_QUOTES = str.maketrans('“”', '""')


def _strip_conditional_comments(lines):
    """Yield the cleaned text of *lines* with conditional-comment blocks removed.

    Everything from MSO_9_START up to and including the next MSO_9_END is
    dropped; a block may start and end on the same line or span many lines.
    Each surviving piece of text has 'gb2312' rewritten to 'utf-8' and curly
    double quotes replaced by straight ones before it is yielded.

    If the input ends while a block is still open, the text after its
    opening marker is dropped without any error.

    Raises:
        Exception: if MSO_9_END appears outside a block (the offending text
            is printed first).
    """
    inside_block = False
    for raw in lines:
        pending = raw
        # One physical line may hold several blocks, so keep consuming it
        # until nothing is left.
        while pending:
            if inside_block:
                # Discard text up to the closing marker; if the marker is not
                # on this line, partition() leaves `pending` empty and the
                # rest of the line is skipped.
                _, end_marker, pending = pending.partition(MSO_9_END)
                if end_marker:
                    inside_block = False
                continue

            kept, start_marker, pending = pending.partition(MSO_9_START)
            if start_marker:
                inside_block = True
            # `kept` precedes the first MSO_9_START, so it can never contain
            # one; only a stray closing marker needs to be detected here.
            if MSO_9_END in kept:
                print(kept)
                raise Exception('Error q3490626')
            yield kept.replace('gb2312', 'utf-8').translate(_STRAIGHT_QUOTES)


def main():
    """Clean a Word-generated HTML file in place.

    The file name comes from sysArgvOrInput(). The file is decoded as
    GB18030, cleaned, and staged as UTF-8 in a temporary file; the original
    is only overwritten after the whole input was processed, so an error
    during cleaning leaves it untouched.
    """
    filename = sysArgvOrInput()
    with TemporaryFile() as tmp:
        with open(filename, 'r', encoding='gb18030') as f:
            for piece in _strip_conditional_comments(f):
                tmp.write(piece.encode('utf-8'))
        # Rewind the staged output and copy it over the original file.
        tmp.seek(0)
        with open(filename, 'wb') as f:
            shutil.copyfileobj(tmp, f)
    print('Done. ')
# Run the cleaner only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()