#!/usr/bin/env python
# coding: utf-8
import sys
# Exactly two arguments are expected: the input path and the output path.
if len(sys.argv) != 3:
    print("Usage:\npython {} input.file output.file".format(sys.argv[0]))
    # Exit with a non-zero status so a calling script can tell that the
    # invocation was wrong and nothing was converted.
    sys.exit(1)

# Both files are opened in binary mode: the rest of the script decodes the
# bytes it reads as UTF-8 and writes Latin-1 encoded bytes itself, so no
# implicit text-layer decoding or newline translation must get in the way.
entree = open(sys.argv[1], 'rb')
sortie = open(sys.argv[2], 'wb')
# Replacement table: every key found in the text is swapped for its value.
# The first five keys are characters that have no Latin-1 code point; the
# last two are HTML italic tags.
special = {
    u'\u0153': 'oe',    # U+0153, "oe" ligature
    u'\u266a': '#',     # U+266A, musical note
    u'\u2019': "'",     # U+2019, typographic single quote / apostrophe
    u'\u2026': '...',   # U+2026, horizontal ellipsis
    u'\ufeff': '',      # U+FEFF, zero-width no-break space: dropped
    '<i>': '',          # opening italic tag: stripped
    '</i>': '',         # closing italic tag: stripped
}
def dic_replace(text, dic):
    """Return *text* with every key of *dic* replaced by its mapped value.

    The substitutions are applied one after another with ``text.replace``,
    in the iteration order of *dic*, so each replacement operates on the
    result of the previous ones.

    Args:
        text: String to clean up.
        dic: Mapping from the substring to look for to its replacement.

    Returns:
        The string obtained once all replacements have been applied. An
        empty mapping returns *text* unchanged.
    """
    for old, new in dic.items():
        text = text.replace(old, new)
    return text
# Convert the input line by line. The try/finally guarantees that both file
# handles are closed, even when the conversion stops on an unexpected error.
try:
    for line in entree:
        # Decode once; the decoded text is reused for the diagnostic below.
        text = line.decode('utf-8')
        try:
            # 'strict' makes any character without a Latin-1 code point
            # raise instead of being silently dropped or replaced.
            decoded = dic_replace(text, special).encode('latin1', 'strict')
        except UnicodeEncodeError:
            # The line is NOT written to the output; it is echoed so the
            # offending character can be spotted.
            print(line)
            # NOTE(review): with a Python 2 print statement the trailing
            # comma makes this a one-element tuple, so the escaped repr of
            # the text is shown; under Python 3 the text itself is printed.
            # Confirm which output is wanted.
            print(text,)
        else:
            sortie.write(decoded)
finally:
    entree.close()
    sortie.close()