Source code for bag.more_codecs

#!/usr/bin/env python3

"""What do you do when Python does not know about some exotic encoder or
decoder that exists out there? Suppose you have some text,
perhaps an e-mail message, that Python won't decode, saying::

    LookupError: unknown encoding: ansi_x3.110-1983

What you do is tell Python to call some UNIX command that does the
encoding/decoding for you. This module sets that up using the iconv program.

Usage::

    import bag.more_codecs

That is it. Importing the module registers a codec. It will convert
to and from anything in codecs_dict (if iconv supports it).

However, the only possible error modes are 'strict' and 'ignore'. Therefore,
this raises an exception::

    u'hi'.encode('utf32', errors='replace')

The module will look for iconv in the path. You may set a specific location::

    bag.more_codecs.COMMAND = '/usr/bin/iconv'

Unfortunately performance suffers:
A process is started for every iconv call.
You can help us by writing code that calls iconv by using ctypes
or something like that.

Both Python and iconv change over time, so the list of codecs that each
supports is prolly going to change. The registered codecs are in a dictionary::

    from bag.more_codecs import codecs_dict
    print(list(codecs_dict.keys()))

You may add and remove codecs from the codecs_dict.

A reasonable, but perhaps not bug-free, method has been used to try to
determine which codecs iconv has that Python 2.5 does not, and this has
resulted in the default value of codecs_dict, with 894 codecs.
"""

import codecs
import subprocess
import os

COMMAND = 'iconv'
codecs_dict = {c: c for c in [
    '500v1', '851', '856', '866nav', '874', '904', '1046', '1047', '8859_1',
    '8859_2', '8859_3', '8859_4', '8859_5', '8859_6', '8859_7', '8859_8',
    '8859_9', '10646-1:1993', '10646-1:1993/ucs4', 'ansi_x3.4',
    'ansi_x3.110-1983', 'ansi_x3.110', 'arabic7', 'armscii-8', 'asmo_449',
    'baltic', 'big-5', 'big-five', 'bigfive', 'brf', 'bs_4730', 'ca',
    'cn-big5', 'cn-gb', 'cn', 'cp-ar', 'cp-hu', 'cp038', 'cp273', 'cp274',
    'cp275', 'cp278', 'cp280', 'cp281', 'cp282', 'cp284', 'cp285', 'cp290',
    'cp297', 'cp420', 'cp423', 'cp803', 'cp813', 'cp851', 'cp866nav',
    'cp868', 'cp870', 'cp871', 'cp880', 'cp891', 'cp901', 'cp902', 'cp903',
    'cp904', 'cp905', 'cp912', 'cp915', 'cp916', 'cp918', 'cp920', 'cp921',
    'cp922', 'cp930', 'cp933', 'cp935', 'cp937', 'cp939', 'cp1004', 'cp1008',
    'cp1025', 'cp1046', 'cp1047', 'cp1070', 'cp1079', 'cp1081', 'cp1084',
    'cp1089', 'cp1097', 'cp1112', 'cp1122', 'cp1123', 'cp1124', 'cp1125',
    'cp1129', 'cp1130', 'cp1132', 'cp1133', 'cp1137', 'cp1141', 'cp1142',
    'cp1143', 'cp1144', 'cp1145', 'cp1146', 'cp1147', 'cp1148', 'cp1149',
    'cp1153', 'cp1154', 'cp1155', 'cp1156', 'cp1157', 'cp1158', 'cp1160',
    'cp1161', 'cp1162', 'cp1163', 'cp1164', 'cp1166', 'cp1167', 'cp1282',
    'cp1364', 'cp1371', 'cp1388', 'cp1390', 'cp1399', 'cp4517', 'cp4899',
    'cp4909', 'cp4971', 'cp5347', 'cp9030', 'cp9066', 'cp9448', 'cp10007',
    'cp12712', 'cp16804', 'cpibm861', 'csa7-1', 'csa7-2', 'csa_t500-1983',
    'csa_t500', 'csa_z243.4-1985-1', 'csa_z243.4-1985-2', 'csa_z243.419851',
    'csa_z243.419852', 'csdecmcs', 'csebcdicatde', 'csebcdicatdea',
    'csebcdiccafr', 'csebcdicdkno', 'csebcdicdknoa', 'csebcdices',
    'csebcdicesa', 'csebcdicess', 'csebcdicfise', 'csebcdicfisea',
    'csebcdicfr', 'csebcdicit', 'csebcdicpt', 'csebcdicuk', 'csebcdicus',
    'cseuckr', 'cseucpkdfmtjapanese', 'csgb2312', 'cshproman8', 'csibm038',
    'csibm273', 'csibm274', 'csibm275', 'csibm277', 'csibm278', 'csibm280',
    'csibm281', 'csibm284', 'csibm285', 'csibm290', 'csibm297', 'csibm420',
    'csibm423', 'csibm803', 'csibm851', 'csibm856', 'csibm868', 'csibm870',
    'csibm871', 'csibm880', 'csibm891', 'csibm901', 'csibm902', 'csibm903',
    'csibm904', 'csibm905', 'csibm918', 'csibm921', 'csibm922', 'csibm930',
    'csibm932', 'csibm933', 'csibm935', 'csibm937', 'csibm939', 'csibm943',
    'csibm1008', 'csibm1025', 'csibm1097', 'csibm1112', 'csibm1122',
    'csibm1123', 'csibm1124', 'csibm1129', 'csibm1130', 'csibm1132',
    'csibm1133', 'csibm1137', 'csibm1140', 'csibm1141', 'csibm1142',
    'csibm1143', 'csibm1144', 'csibm1145', 'csibm1146', 'csibm1147',
    'csibm1148', 'csibm1149', 'csibm1153', 'csibm1154', 'csibm1155',
    'csibm1156', 'csibm1157', 'csibm1158', 'csibm1160', 'csibm1161',
    'csibm1163', 'csibm1164', 'csibm1166', 'csibm1167', 'csibm1364',
    'csibm1371', 'csibm1388', 'csibm1390', 'csibm1399', 'csibm4517',
    'csibm4899', 'csibm4909', 'csibm4971', 'csibm5347', 'csibm9030',
    'csibm9066', 'csibm9448', 'csibm12712', 'csibm16804', 'csibm11621162',
    'csiso4unitedkingdom', 'csiso10swedish', 'csiso11swedishfornames',
    'csiso14jisc6220ro', 'csiso15italian', 'csiso16portugese',
    'csiso17spanish', 'csiso18greek7old', 'csiso19latingreek',
    'csiso21german', 'csiso25french', 'csiso27latingreek1', 'csiso49inis',
    'csiso50inis8', 'csiso51iniscyrillic', 'csiso58gb1988',
    'csiso60danishnorwegian', 'csiso60norwegian1', 'csiso61norwegian2',
    'csiso69french', 'csiso84portuguese2', 'csiso85spanish2',
    'csiso86hungarian', 'csiso88greek7', 'csiso89asmo449', 'csiso90',
    'csiso92jisc62991984b', 'csiso99naplps', 'csiso103t618bit',
    'csiso111ecmacyrillic', 'csiso121canadian1', 'csiso122canadian2',
    'csiso139csn369103', 'csiso141jusib1002', 'csiso143iecp271', 'csiso150',
    'csiso150greekccitt', 'csiso151cuba', 'csiso153gost1976874',
    'csiso646danish', 'csiso2022cn', 'csiso2022jp2', 'csiso2033',
    'csiso5427cyrillic', 'csiso5427cyrillic1981', 'csiso5428greek',
    'csiso10367box', 'csksc5636', 'csmacintosh', 'csnatsdano', 'csnatssefi',
    'csn_369103', 'csucs4', 'csunicode', 'cswindows31j', 'cuba', 'cwi-2',
    'cwi', 'de', 'dec-mcs', 'dec', 'decmcs', 'din_66003', 'dk', 'ds2089',
    'ds_2089', 'e13b', 'ebcdic-at-de-a', 'ebcdic-at-de', 'ebcdic-be',
    'ebcdic-br', 'ebcdic-ca-fr', 'ebcdic-cp-ar1', 'ebcdic-cp-ar2',
    'ebcdic-cp-dk', 'ebcdic-cp-es', 'ebcdic-cp-fi', 'ebcdic-cp-fr',
    'ebcdic-cp-gb', 'ebcdic-cp-gr', 'ebcdic-cp-is', 'ebcdic-cp-it',
    'ebcdic-cp-no', 'ebcdic-cp-roece', 'ebcdic-cp-se', 'ebcdic-cp-tr',
    'ebcdic-cp-yu', 'ebcdic-cyrillic', 'ebcdic-dk-no-a', 'ebcdic-dk-no',
    'ebcdic-es-a', 'ebcdic-es-s', 'ebcdic-es', 'ebcdic-fi-se-a',
    'ebcdic-fi-se', 'ebcdic-fr', 'ebcdic-greek', 'ebcdic-int', 'ebcdic-int1',
    'ebcdic-is-friss', 'ebcdic-it', 'ebcdic-jp-e', 'ebcdic-jp-kana',
    'ebcdic-pt', 'ebcdic-uk', 'ebcdic-us', 'ebcdicatde', 'ebcdicatdea',
    'ebcdiccafr', 'ebcdicdkno', 'ebcdicdknoa', 'ebcdices', 'ebcdicesa',
    'ebcdicess', 'ebcdicfise', 'ebcdicfisea', 'ebcdicfr', 'ebcdicisfriss',
    'ebcdicit', 'ebcdicpt', 'ebcdicuk', 'ebcdicus', 'ecma-128',
    'ecma-cyrillic', 'ecmacyrillic', 'es', 'es2', 'euc-jp-ms', 'euc-tw',
    'eucjp-ms', 'eucjp-open', 'eucjp-win', 'euctw', 'fi', 'fr', 'gb',
    'gb13000', 'gb_1988-80', 'gb_198880', 'georgian-academy', 'georgian-ps',
    'gost_19768-74', 'gost_19768', 'gost_1976874', 'greek-ccitt',
    'greek7-old', 'greek7', 'greek7old', 'greekccitt', 'hp-greek8',
    'hp-roman9', 'hp-thai8', 'hp-turkish8', 'hpgreek8', 'hproman8',
    'hproman9', 'hpthai8', 'hpturkish8', 'hu', 'ibm-803', 'ibm-856',
    'ibm-901', 'ibm-902', 'ibm-921', 'ibm-922', 'ibm-930', 'ibm-932',
    'ibm-933', 'ibm-935', 'ibm-937', 'ibm-939', 'ibm-943', 'ibm-1008',
    'ibm-1025', 'ibm-1046', 'ibm-1047', 'ibm-1097', 'ibm-1112', 'ibm-1122',
    'ibm-1123', 'ibm-1124', 'ibm-1129', 'ibm-1130', 'ibm-1132', 'ibm-1133',
    'ibm-1137', 'ibm-1140', 'ibm-1141', 'ibm-1142', 'ibm-1143', 'ibm-1144',
    'ibm-1145', 'ibm-1146', 'ibm-1147', 'ibm-1148', 'ibm-1149', 'ibm-1153',
    'ibm-1154', 'ibm-1155', 'ibm-1156', 'ibm-1157', 'ibm-1158', 'ibm-1160',
    'ibm-1161', 'ibm-1162', 'ibm-1163', 'ibm-1164', 'ibm-1166', 'ibm-1167',
    'ibm-1364', 'ibm-1371', 'ibm-1388', 'ibm-1390', 'ibm-1399', 'ibm-4517',
    'ibm-4899', 'ibm-4909', 'ibm-4971', 'ibm-5347', 'ibm-9030', 'ibm-9066',
    'ibm-9448', 'ibm-12712', 'ibm-16804', 'ibm038', 'ibm256', 'ibm273',
    'ibm274', 'ibm275', 'ibm277', 'ibm278', 'ibm280', 'ibm281', 'ibm284',
    'ibm285', 'ibm290', 'ibm297', 'ibm420', 'ibm423', 'ibm803', 'ibm813',
    'ibm848', 'ibm851', 'ibm856', 'ibm866nav', 'ibm868', 'ibm870', 'ibm871',
    'ibm874', 'ibm875', 'ibm880', 'ibm891', 'ibm901', 'ibm902', 'ibm903',
    'ibm904', 'ibm905', 'ibm912', 'ibm915', 'ibm916', 'ibm918', 'ibm920',
    'ibm921', 'ibm922', 'ibm930', 'ibm932', 'ibm933', 'ibm935', 'ibm937',
    'ibm939', 'ibm943', 'ibm1004', 'ibm1008', 'ibm1025', 'ibm1046', 'ibm1047',
    'ibm1089', 'ibm1097', 'ibm1112', 'ibm1122', 'ibm1123', 'ibm1124',
    'ibm1129', 'ibm1130', 'ibm1132', 'ibm1133', 'ibm1137', 'ibm1141',
    'ibm1142', 'ibm1143', 'ibm1144', 'ibm1145', 'ibm1146', 'ibm1147',
    'ibm1148', 'ibm1149', 'ibm1153', 'ibm1154', 'ibm1155', 'ibm1156',
    'ibm1157', 'ibm1158', 'ibm1160', 'ibm1161', 'ibm1162', 'ibm1163',
    'ibm1164', 'ibm1166', 'ibm1167', 'ibm1364', 'ibm1371', 'ibm1388',
    'ibm1390', 'ibm1399', 'ibm4517', 'ibm4899', 'ibm4909', 'ibm4971',
    'ibm5347', 'ibm9030', 'ibm9066', 'ibm9448', 'ibm12712', 'ibm16804',
    'iec_p27-1', 'iec_p271', 'inis-8', 'inis-cyrillic', 'inis', 'inis8',
    'iniscyrillic', 'isiri-3342', 'isiri3342', 'iso-2022-cn-ext',
    'iso-2022-cn', 'iso-8859-9e', 'iso-10646', 'iso-10646/ucs2',
    'iso-10646/ucs4', 'iso-10646/utf-8', 'iso-10646/utf8', 'iso-ir-4',
    'iso-ir-8-1', 'iso-ir-9-1', 'iso-ir-10', 'iso-ir-11', 'iso-ir-14',
    'iso-ir-15', 'iso-ir-16', 'iso-ir-17', 'iso-ir-18', 'iso-ir-19',
    'iso-ir-21', 'iso-ir-25', 'iso-ir-27', 'iso-ir-37', 'iso-ir-49',
    'iso-ir-50', 'iso-ir-51', 'iso-ir-54', 'iso-ir-55', 'iso-ir-57',
    'iso-ir-60', 'iso-ir-61', 'iso-ir-69', 'iso-ir-84', 'iso-ir-85',
    'iso-ir-86', 'iso-ir-88', 'iso-ir-89', 'iso-ir-90', 'iso-ir-92',
    'iso-ir-98', 'iso-ir-99', 'iso-ir-103', 'iso-ir-111', 'iso-ir-121',
    'iso-ir-122', 'iso-ir-139', 'iso-ir-141', 'iso-ir-143', 'iso-ir-150',
    'iso-ir-151', 'iso-ir-153', 'iso-ir-155', 'iso-ir-156', 'iso-ir-179',
    'iso-ir-193', 'iso-ir-197', 'iso-ir-203', 'iso-ir-209', 'iso/tr_11548-1',
    'iso646-ca', 'iso646-ca2', 'iso646-cn', 'iso646-cu', 'iso646-de',
    'iso646-dk', 'iso646-es', 'iso646-es2', 'iso646-fi', 'iso646-fr',
    'iso646-fr1', 'iso646-gb', 'iso646-hu', 'iso646-it', 'iso646-jp-ocr-b',
    'iso646-jp', 'iso646-kr', 'iso646-no', 'iso646-no2', 'iso646-pt',
    'iso646-pt2', 'iso646-se', 'iso646-se2', 'iso646-yu', 'iso2022cn',
    'iso2022cnext', 'iso2022jp2', 'iso6937', 'iso8859-9e', 'iso11548-1',
    'iso88591', 'iso88592', 'iso88593', 'iso88594', 'iso88595', 'iso88596',
    'iso88597', 'iso88598', 'iso88599', 'iso88599e', 'iso885910', 'iso885911',
    'iso885913', 'iso885914', 'iso885915', 'iso885916', 'iso_2033-1983',
    'iso_2033', 'iso_5427-ext', 'iso_5427', 'iso_5427:1981', 'iso_5427ext',
    'iso_5428', 'iso_5428:1980', 'iso_6937-2', 'iso_6937-2:1983', 'iso_6937',
    'iso_6937:1992', 'iso_8859-7:2003', 'iso_8859-9e', 'iso_8859-15:1998',
    'iso_9036', 'iso_10367-box', 'iso_10367box', 'iso_11548-1', 'iso_69372',
    'it', 'jis_c6220-1969-ro', 'jis_c6229-1984-b', 'jis_c62201969ro',
    'jis_c62291984b', 'jp-ocr-b', 'jp', 'js', 'jus_i.b1.002', 'koi-7',
    'koi-8', 'koi8-ru', 'koi8-t', 'koi8', 'koi8r', 'koi8u', 'ksc5636', 'l7',
    'latin-9', 'latin-greek-1', 'latin-greek', 'latin7', 'latingreek',
    'latingreek1', 'mac-centraleurope', 'mac-is', 'mac-sami', 'mac-uk', 'mac',
    'macintosh', 'macis', 'macuk', 'macukrainian', 'mik', 'ms-ansi',
    'ms-arab', 'ms-cyrl', 'ms-ee', 'ms-greek', 'ms-hebr', 'ms-mac-cyrillic',
    'ms-turk', 'mscp949', 'mscp1361', 'msmaccyrillic', 'msz_7795.3', 'naplps',
    'nats-dano', 'nats-sefi', 'natsdano', 'natssefi', 'nc_nc0010',
    'nc_nc00-10', 'nc_nc00-10:81', 'nf_z_62-010', 'nf_z_62-010_(1973)',
    'nf_z_62-010_1973', 'nf_z_62010', 'nf_z_62010_1973', 'no', 'no2',
    'ns_4551-1', 'ns_4551-2', 'ns_45511', 'ns_45512', 'os2latin1',
    'osf00010001', 'osf00010002', 'osf00010003', 'osf00010004', 'osf00010005',
    'osf00010006', 'osf00010007', 'osf00010008', 'osf00010009', 'osf0001000a',
    'osf00010020', 'osf00010100', 'osf00010101', 'osf00010102', 'osf00010104',
    'osf00010105', 'osf00010106', 'osf00030010', 'osf0004000a', 'osf0005000a',
    'osf05010001', 'osf100201a4', 'osf100201a8', 'osf100201b5', 'osf100201f4',
    'osf100203b5', 'osf1002011c', 'osf1002011d', 'osf1002035d', 'osf1002035e',
    'osf1002035f', 'osf1002036b', 'osf1002037b', 'osf10010001', 'osf10010004',
    'osf10010006', 'osf10020025', 'osf10020111', 'osf10020115', 'osf10020116',
    'osf10020118', 'osf10020122', 'osf10020129', 'osf10020352', 'osf10020354',
    'osf10020357', 'osf10020359', 'osf10020360', 'osf10020364', 'osf10020365',
    'osf10020366', 'osf10020367', 'osf10020370', 'osf10020387', 'osf10020388',
    'osf10020396', 'osf10020402', 'osf10020417', 'pt', 'pt2', 'r9', 'rk1048',
    'roman9', 'ruscii', 'se', 'se2', 'sen_850200_b', 'sen_850200_c',
    'sjis-open', 'sjis-win', 'ss636127', 'strk1048-2002', 'st_sev_358-88',
    't.61-8bit', 't.61', 't.618bit', 'tcvn-5712', 'tcvn', 'tcvn5712-1',
    'tcvn5712-1:1993', 'thai8', 'tis620-0', 'tis620.2529-1', 'tis620.2533-0',
    'ts-5881', 'tscii', 'turkish8', 'ucs-2', 'ucs-2be', 'ucs-2le', 'ucs-4',
    'ucs-4be', 'ucs-4le', 'ucs2', 'ucs4', 'uk', 'unicode', 'unicodebig',
    'unicodelittle', 'utf-32', 'utf-32be', 'utf-32le', 'utf16be', 'utf16le',
    'utf32', 'utf32be', 'utf32le', 'viscii', 'wchar_t', 'win-sami-2',
    'winbaltrim', 'windows-31j', 'windows-874', 'windows-936', 'winsami2',
    'ws2', 'yu']
}


[docs]def get_supported_codecs(): """Return a list of the codec names that iconv supports.""" cmd = [COMMAND, '--list'] iconv = subprocess.Popen(cmd, env={'LANG': 'C'}, stdout=subprocess.PIPE, stdin=open(os.devnull, 'w+'), stderr=open(os.devnull, 'w+')) return [line.strip('/').lower() for line in iconv.communicate()[0].splitlines()]
[docs]def discover_interesting_codecs(): global codecs_dict temp = codecs_dict codecs_dict = {} # deactivate iconv for now supported_codecs = get_supported_codecs() interesting_codecs = [] for c in supported_codecs: try: u'a'.encode(c) except UnicodeEncodeError: pass except LookupError: interesting_codecs.append(c) codecs_dict = temp # reactivate iconv return interesting_codecs
''' From Python documentation: #Codec Aliases _python_codecs = """ ascii 646, us-ascii big5 big5-tw, csbig5 big5hkscs big5-hkscs, hkscs cp037 IBM037, IBM039 cp424 EBCDIC-CP-HE, IBM424 cp437 437, IBM437 cp500 EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500 cp737   cp775 IBM775 cp850 850, IBM850 cp852 852, IBM852 cp855 855, IBM855 cp856   cp857 857, IBM857 cp860 860, IBM860 cp861 861, CP-IS, IBM861 cp862 862, IBM862 cp863 863, IBM863 cp864 IBM864 cp865 865, IBM865 cp866 866, IBM866 cp869 869, CP-GR, IBM869 cp874   cp875   cp932 932, ms932, mskanji, ms-kanji cp949 949, ms949, uhc cp950 950, ms950 cp1006   cp1026 ibm1026 cp1140 ibm1140 cp1250 windows-1250 cp1251 windows-1251 cp1252 windows-1252 cp1253 windows-1253 cp1254 windows-1254 cp1255 windows-1255 cp1256 windows1256 cp1257 windows-1257 cp1258 windows-1258 euc_jp eucjp, ujis, u-jis euc_jis_2004 jisx0213, eucjis2004 euc_jisx0213 eucjisx0213 euc_kr euckr, korean, ksc5601, ks_c-5601, ks_c-5601-1987, ksx1001, ks_x-1001 gb2312 chinese, csiso58gb231280, euc- cn, euccn, eucgb2312-cn, gb2312-1980, gb2312-80, iso- ir-58 gbk 936, cp936, ms936 gb18030 gb18030-2000 hz hzgb, hz-gb, hz-gb-2312 iso2022_jp csiso2022jp, iso2022jp, iso-2022-jp iso2022_jp_1 iso2022jp-1, iso-2022-jp-1 iso2022_jp_2 iso2022jp-2, iso-2022-jp-2 iso2022_jp_2004 iso2022jp-2004, iso-2022-jp-2004 iso2022_jp_3 iso2022jp-3, iso-2022-jp-3 iso2022_jp_ext iso2022jp-ext, iso-2022-jp-ext iso2022_kr csiso2022kr, iso2022kr, iso-2022-kr latin_1 iso-8859-1, iso8859-1, 8859, cp819, latin, latin1, L1 iso8859_2 iso-8859-2, latin2, L2 iso8859_3 iso-8859-3, latin3, L3 iso8859_4 iso-8859-4, latin4, L4 iso8859_5 iso-8859-5, cyrillic iso8859_6 iso-8859-6, arabic iso8859_7 iso-8859-7, greek, greek8 iso8859_8 iso-8859-8, hebrew iso8859_9 iso-8859-9, latin5, L5 iso8859_10 iso-8859-10, latin6, L6 iso8859_13 iso-8859-13 iso8859_14 iso-8859-14, latin8, L8 iso8859_15 iso-8859-15 johab cp1361, ms1361 koi8_r   koi8_u   mac_cyrillic maccyrillic mac_greek macgreek mac_iceland maciceland mac_latin2 maclatin2, maccentraleurope mac_roman macroman mac_turkish macturkish ptcp154 csptcp154, pt154, cp154, cyrillic-asian shift_jis csshiftjis, shiftjis, sjis, s_jis shift_jis_2004 shiftjis2004, sjis_2004, sjis2004 shift_jisx0213 shiftjisx0213, sjisx0213, s_jisx0213 utf_32 U32, utf32 utf_32_be UTF-32BE utf_32_le UTF-32LE utf_16 U16, utf16 utf_16_be UTF-16BE utf_16_le UTF-16LE utf_7 U7, unicode-1-1-utf-7 utf_8 U8, UTF, utf8 utf_8_sig   """ _python_codecs = re.findall(r"([^\r\n\t ,]+)", python_codecs.lower()) additional_codecs = [c for c in supported_codecs \ if c != 'utf-8' and c not in python_codecs] ''' def _run_iconv(from_codec, to_codec, extra_params=None): cmd = [COMMAND, '-f', from_codec, '-t', to_codec, '-s'] # -s is silent if extra_params: cmd.extend(extra_params) iconv = subprocess.Popen(cmd, env={'LANG': 'C'}, stdout=subprocess.PIPE, stdin=subprocess.PIPE, stderr=subprocess.PIPE) return iconv def _iconv_factory(codec_name): codec_name = codec_name.lower() if codec_name in codecs_dict: def iconvencode(input, errors='strict', encoding=codec_name): extra = [] if errors == 'ignore': extra.append('-c') elif errors != 'strict': raise NotImplementedError("%r error handling not implemented" " for codec %r" % (errors, encoding)) _input = input.encode('utf-8') iconv = _run_iconv('utf-8', encoding, extra) output, error = iconv.communicate(_input) if error: error = error.splitlines()[0] raise UnicodeEncodeError(encoding, input, 0, len(input), error) return output, len(input) def iconvdecode(input, errors='strict', encoding=codec_name): extra = [] if errors == 'ignore': extra.append('-c') elif errors != 'strict': raise NotImplementedError('%r error handling not implemented' ' for codec %r' % (errors, encoding)) _input = str(input) iconv = _run_iconv(encoding, 'utf-8', extra) output, error = iconv.communicate(_input) if error: error = error.splitlines()[0] raise UnicodeDecodeError(encoding, input, 0, len(input), error) output = output.decode('utf-8') return output, len(input) class IncrementalEncoder(codecs.IncrementalEncoder): def encode(self, input, final=False): return iconvencode(input, self.errors)[0] class IncrementalDecoder(codecs.BufferedIncrementalDecoder): _buffer_decode = staticmethod(iconvdecode) class StreamWriter(codecs.StreamWriter): pass StreamWriter.encode = staticmethod(iconvencode) class StreamReader(codecs.StreamReader): pass StreamReader.decode = staticmethod(iconvdecode) return codecs.CodecInfo( name=codec_name, encode=iconvencode, decode=iconvdecode, incrementalencoder=IncrementalEncoder, incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, ) if __name__ == '__main__': interesting_codecs = discover_interesting_codecs() print("Here are the %i codecs that Python does not know, but iconv does:" % len(interesting_codecs)) print(interesting_codecs) # Now register the codecs and test codecs.register(_iconv_factory) x = u'áéíóúççç' assert x == x.encode('utf32').decode('utf32') else: codecs.register(_iconv_factory)