devtools/fix_po_encoding
author Julien Cristau <julien.cristau@logilab.fr>
Mon, 04 Jan 2016 10:36:57 +0100
changeset 11045 615163b17558
parent 0 b97547f5f1fa
permissions -rwxr-xr-x
[ldapfeed] extid and cwuri aren't the same thing Historically, we used a ldap://... URI as cwuri attribute, not just the dn. The extid field, OTOH, was always the dn. So when initializing our dict of known entities from an ldap source, look at existing extids, not cwuris, to avoid importing already-existing users (and violated unicity constraints).

#!/usr/bin/python

"""usage: fix-po-encodings [filename...]
change the encoding of the po files passed as arguments to utf-8
"""
import sys
import re
import codecs

def change_encoding(filename, target='UTF-8'):
    fdesc = open(filename)
    data = fdesc.read()
    fdesc.close()
    encoding = find_encoding(data)
    if encoding == target:
        return
    data = fix_encoding(data, target)
    data = unicode(data, encoding)
    fdesc = codecs.open(filename, 'wb', encoding=target)
    fdesc.write(data)
    fdesc.close()

def find_encoding(data):
    regexp = re.compile(r'"Content-Type:.* charset=([a-zA-Z0-9-]+)\\n"', re.M)
    mo = regexp.search(data)
    if mo is None:
        raise ValueError('No encoding declaration')
    return mo.group(1)

def fix_encoding(data, target_encoding):
    regexp = re.compile(r'("Content-Type:.* charset=)(.*)(\\n")', re.M)
    return regexp.sub(r'\1%s\3' % target_encoding, data)
    


for filename in sys.argv[1:]:
    print filename
    change_encoding(filename)