devtools/fix_po_encoding
author Pierre-Yves David <pierre-yves.david@logilab.fr>
Fri, 26 Apr 2013 11:52:35 +0200
changeset 8939 30ce8eccfe3f
parent 0 b97547f5f1fa
permissions -rwxr-xr-x
[htmlparser] store unaltered source in pageinfo.source. Do not store the parser, to preserve the source aspect; store the initial input in the source argument instead. This is very useful when parsing HTML: in such cases we need to drop the ``<script>`` tag.

#!/usr/bin/python

"""usage: fix-po-encodings [filename...]
change the encoding of the po files passed as arguments to utf-8
"""
import sys
import re
import codecs

def change_encoding(filename, target='UTF-8'):
    """Rewrite the po file `filename` in place using the `target` encoding.

    The current encoding is taken from the charset declared in the file's
    ``Content-Type`` header. When it already equals `target` the file is
    left untouched; otherwise the header is updated to announce `target`
    and the whole content is re-encoded.

    :param filename: path of the po file to convert
    :param target: name of the encoding to convert to (default ``'UTF-8'``)
    :raises ValueError: propagated from `find_encoding` when the file has
      no charset declaration
    """
    # Read raw bytes; decoding is done explicitly below. The context
    # manager guarantees the descriptor is closed even if read() fails.
    with open(filename, 'rb') as fdesc:
        data = fdesc.read()
    encoding = find_encoding(data)
    if encoding == target:
        return
    data = fix_encoding(data, target)
    # Transcode entirely in memory *before* reopening the file for writing:
    # opening with 'wb' truncates it, so a decode/encode error raised after
    # that point would destroy the original content.
    data = unicode(data, encoding).encode(target)
    with open(filename, 'wb') as fdesc:
        fdesc.write(data)

def find_encoding(data):
    """Return the charset declared in the ``Content-Type`` header of a po file.

    :param data: content of the po file
    :returns: the charset name, exactly as written in the header
    :raises ValueError: if no charset declaration is found in `data`
    """
    # The header line ends with a literal backslash followed by "n" inside
    # the quotes, hence the escaped ``\\n``. Underscores are accepted so
    # that charsets such as ``Shift_JIS`` or ``ISO_8859-1`` are recognised.
    regexp = re.compile(r'"Content-Type:.* charset=([a-zA-Z0-9_-]+)\\n"', re.M)
    mo = regexp.search(data)
    if mo is None:
        raise ValueError('No encoding declaration')
    return mo.group(1)

def fix_encoding(data, target_encoding):
    """Return `data` with its declared charset replaced by `target_encoding`.

    Only the charset value of the ``Content-Type`` header line(s) is
    changed; the content itself is not re-encoded. `data` is returned
    unchanged when it contains no such header.

    :param data: content of the po file
    :param target_encoding: charset name to write in the header
    :returns: the content with the updated header
    """
    regexp = re.compile(r'("Content-Type:.* charset=)(.*)(\\n")', re.M)

    def announce_target(mo):
        # Built by concatenation rather than a ``\1...\3`` template: in a
        # template, a target starting with a digit (e.g. '8859') would be
        # read as part of the group number, and any backslash as an escape.
        return mo.group(1) + target_encoding + mo.group(3)

    return regexp.sub(announce_target, data)
    


# Script body: convert every po file named on the command line, echoing
# each path before it is processed.
for filename in sys.argv[1:]:
    print(filename)
    change_encoding(filename)