devtools/fix_po_encoding
author Sylvain Thénault <sylvain.thenault@logilab.fr>
Tue, 11 Oct 2011 13:50:32 +0200
branchstable
changeset 7932 2ad26cc3b5c6
parent 0 b97547f5f1fa
permissions -rwxr-xr-x
[datafeed parser] only update an entity attributes when needed. Closes #1989142 This may avoid error with some entities which have immutable attributes and don't check something actually changed. Also, avoid modifying the cached item representation, this may cause pb when an item is seen several times during an import.

#!/usr/bin/python

"""usage: fix-po-encodings [filename...]
change the encoding of the po files passed as arguments to utf-8
"""
import sys
import re
import codecs

def change_encoding(filename, target='UTF-8'):
    fdesc = open(filename)
    data = fdesc.read()
    fdesc.close()
    encoding = find_encoding(data)
    if encoding == target:
        return
    data = fix_encoding(data, target)
    data = unicode(data, encoding)
    fdesc = codecs.open(filename, 'wb', encoding=target)
    fdesc.write(data)
    fdesc.close()

def find_encoding(data):
    regexp = re.compile(r'"Content-Type:.* charset=([a-zA-Z0-9-]+)\\n"', re.M)
    mo = regexp.search(data)
    if mo is None:
        raise ValueError('No encoding declaration')
    return mo.group(1)

def fix_encoding(data, target_encoding):
    regexp = re.compile(r'("Content-Type:.* charset=)(.*)(\\n")', re.M)
    return regexp.sub(r'\1%s\3' % target_encoding, data)
    


for filename in sys.argv[1:]:
    print filename
    change_encoding(filename)