devtools/fix_po_encoding
author Sylvain Thénault <sylvain.thenault@logilab.fr>
Tue, 30 Jun 2015 10:06:00 +0200
changeset 10914 fed8bd56f223
parent 0 b97547f5f1fa
permissions -rwxr-xr-x
[repository] deprecate the extid2eid based multi-sources API This API is cumbersome and lead to obfuscated code because of the callback mecanism implied when some entity has to be created. Since we dropped the "true" multi-source, this mecanism is not needed anymore and one should prefer to use the dataimport API inside its parser instead. Notice the cwxml parser will trigger the deprecation warning, it should not be used anymore in favor of ad-hoc parsers.

#!/usr/bin/python

"""usage: fix-po-encodings [filename...]
change the encoding of the po files passed as arguments to utf-8
"""
import sys
import re
import codecs

def change_encoding(filename, target='UTF-8'):
    fdesc = open(filename)
    data = fdesc.read()
    fdesc.close()
    encoding = find_encoding(data)
    if encoding == target:
        return
    data = fix_encoding(data, target)
    data = unicode(data, encoding)
    fdesc = codecs.open(filename, 'wb', encoding=target)
    fdesc.write(data)
    fdesc.close()

def find_encoding(data):
    regexp = re.compile(r'"Content-Type:.* charset=([a-zA-Z0-9-]+)\\n"', re.M)
    mo = regexp.search(data)
    if mo is None:
        raise ValueError('No encoding declaration')
    return mo.group(1)

def fix_encoding(data, target_encoding):
    regexp = re.compile(r'("Content-Type:.* charset=)(.*)(\\n")', re.M)
    return regexp.sub(r'\1%s\3' % target_encoding, data)
    


for filename in sys.argv[1:]:
    print filename
    change_encoding(filename)