# -*- coding: utf-8 -*- import codecs, types # from utf-8 to iso6937 def iso6937_encode(input,errors,encoding_map): result = '' num_bytes = 0 for c in input: try: if c in encoding_map: num_bytes += 2 result += chr(ord(encoding_map[c][0].decode('unicode-escape'))) + encoding_map[c][1] else: code = ord(c) if code > 0xFF: num_bytes += 1 result += chr(code >> 8) num_bytes += 1 result += chr(code & 0xFF) except KeyError: raise UnicodeError, "internal conversion algoritm error" return (result, num_bytes) # from iso6937 to utf-8 def iso6937_decode(input,errors,decoding_map): result = u'' num_bytes = 0 i = 0 while i <= len(input) - 1: c = input[i] try: if c in nonspacing_diacritical_marks: i += 1 if i > len(input) - 1: raise KeyError str = c + input[i] if str in decoding_map: num_bytes += 2 result += decoding_map[str] else: raise KeyError else: num_bytes += 1 result += c except KeyError: if errors == 'strict': raise UnicodeError, "invalid iso6937 character" elif errors == 'replace': num_bytes += 1 result += chr(0x3f) #question mark elif errors == 'ignore': pass else: raise UnicodeError, "unknown error handling" i += 1 return (result, num_bytes) ### Codec APIs class Codec(codecs.Codec): def encode(self,input,errors='strict'): return iso6937_encode(input,errors,encoding_map) def decode(self,input,errors='strict'): return iso6937_decode(input,errors,decoding_map) class IncrementalEncoder(codecs.IncrementalEncoder): def encode(self, input, final=False): return iso6937_encode(input,self.errors,encoding_map)[0] class IncrementalDecoder(codecs.IncrementalDecoder): def decode(self, input, final=False): return iso6937_decode(input,self.errors,decoding_map)[0] class StreamWriter(Codec,codecs.StreamWriter): pass class StreamReader(Codec,codecs.StreamReader): pass ### encodings module API def getregentry(): return codecs.CodecInfo( name='iso6937', encode=Codec().encode, decode=Codec().decode, incrementalencoder=IncrementalEncoder, incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, ) nonspacing_diacritical_marks = ['\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7', '\xc8', '\xca', '\xcb', '\xcd', '\xce', '\xcf'] ### Decoding Map decoding_map = { '\xc1'+'A': u'À', '\xc1'+'E': u'È', '\xc1'+'I': u'Ì', '\xc1'+'O': u'Ò', '\xc1'+'U': u'Ù', '\xc1'+'a': u'à', '\xc1'+'e': u'è', '\xc1'+'i': u'ì', '\xc1'+'o': u'ò', '\xc1'+'u': u'ù', '\xc2'+'A': u'Á', '\xc2'+'C': u'Ć', '\xc2'+'E': u'É', '\xc2'+'I': u'Í', '\xc2'+'L': u'Ĺ', '\xc2'+'N': u'Ń', '\xc2'+'O': u'Ó', '\xc2'+'R': u'Ŕ', '\xc2'+'S': u'Ś', '\xc2'+'U': u'Ú', '\xc2'+'Y': u'Ý', '\xc2'+'Z': u'Ź', '\xc2'+'a': u'á', '\xc2'+'c': u'ć', '\xc2'+'e': u'é', '\xc2'+'g': u'ģ', '\xc2'+'i': u'í', '\xc2'+'l': u'ĺ', '\xc2'+'n': u'ń', '\xc2'+'o': u'ó', '\xc2'+'r': u'ŕ', '\xc2'+'s': u'ś', '\xc2'+'u': u'ú', '\xc2'+'y': u'ý', '\xc2'+'z': u'ź', '\xc3'+'A': u'Â', '\xc3'+'C': u'Ĉ', '\xc3'+'E': u'Ê', '\xc3'+'G': u'Ĝ', '\xc3'+'H': u'Ĥ', '\xc3'+'I': u'Î', '\xc3'+'J': u'Ĵ', '\xc3'+'O': u'Ô', '\xc3'+'S': u'Ŝ', '\xc3'+'U': u'Û', '\xc3'+'W': u'Ŵ', '\xc3'+'Y': u'Ŷ', '\xc3'+'a': u'â', '\xc3'+'c': u'ĉ', '\xc3'+'e': u'ê', '\xc3'+'g': u'ĝ', '\xc3'+'h': u'ĥ', '\xc3'+'i': u'î', '\xc3'+'j': u'ĵ', '\xc3'+'o': u'ô', '\xc3'+'s': u'ŝ', '\xc3'+'u': u'û', '\xc3'+'w': u'ŵ', '\xc3'+'y': u'ŷ', '\xc4'+'A': u'Ã', '\xc4'+'I': u'Ĩ', '\xc4'+'N': u'Ñ', '\xc4'+'O': u'Õ', '\xc4'+'U': u'Ũ', '\xc4'+'a': u'ã', '\xc4'+'i': u'ĩ', '\xc4'+'n': u'ñ', '\xc4'+'o': u'õ', '\xc4'+'u': u'ũ', '\xc5'+'A': u'Ā', '\xc5'+'E': u'Ē', '\xc5'+'I': u'Ī', '\xc5'+'O': u'Ō', '\xc5'+'U': u'Ū', '\xc5'+'a': u'ā', '\xc5'+'e': u'ē', '\xc5'+'i': u'ī', '\xc5'+'o': u'ō', '\xc5'+'u': u'ū', '\xc6'+'A': u'Ă', '\xc6'+'G': u'Ğ', '\xc6'+'U': u'Ŭ', '\xc6'+'a': u'ă', '\xc6'+'g': u'ğ', '\xc6'+'u': u'ŭ', '\xc7'+'C': u'Ċ', '\xc7'+'E': u'Ė', '\xc7'+'G': u'Ġ', '\xc7'+'I': u'İ', '\xc7'+'Z': u'Ż', '\xc7'+'c': u'ċ', '\xc7'+'e': u'ė', '\xc7'+'g': u'ġ', '\xc7'+'z': u'ż', '\xc8'+'A': u'Ä', '\xc8'+'E': u'Ë', '\xc8'+'I': u'Ï', '\xc8'+'O': u'Ö', '\xc8'+'U': u'Ü', '\xc8'+'Y': u'Ÿ', '\xc8'+'a': u'ä', '\xc8'+'e': u'ë', '\xc8'+'i': u'ï', '\xc8'+'o': u'ö', '\xc8'+'u': u'ü', '\xc8'+'y': u'ÿ', '\xca'+'A': u'Å', '\xca'+'U': u'Ů', '\xca'+'a': u'å', '\xca'+'u': u'ů', '\xcb'+'C': u'Ç', '\xcb'+'G': u'Ģ', '\xcb'+'K': u'Ķ', '\xcb'+'L': u'Ļ', '\xcb'+'N': u'Ņ', '\xcb'+'R': u'Ŗ', '\xcb'+'S': u'Ş', '\xcb'+'T': u'Ţ', '\xcb'+'c': u'ç', '\xcb'+'k': u'ķ', '\xcb'+'l': u'ļ', '\xcb'+'n': u'ņ', '\xcb'+'r': u'ŗ', '\xcb'+'s': u'ş', '\xcb'+'t': u'ţ', '\xcd'+'O': u'Ő', '\xcd'+'U': u'Ű', '\xcd'+'o': u'ő', '\xcd'+'u': u'ű', '\xce'+'A': u'Ą', '\xce'+'E': u'Ę', '\xce'+'I': u'Į', '\xce'+'U': u'Ų', '\xce'+'a': u'ą', '\xce'+'e': u'ę', '\xce'+'i': u'į', '\xce'+'u': u'ų', '\xcf'+'C': u'Č', '\xcf'+'D': u'Ď', '\xcf'+'E': u'Ě', '\xcf'+'L': u'Ľ', '\xcf'+'N': u'Ň', '\xcf'+'R': u'Ř', '\xcf'+'S': u'Š', '\xcf'+'T': u'Ť', '\xcf'+'Z': u'Ž', '\xcf'+'c': u'č', '\xcf'+'d': u'ď', '\xcf'+'e': u'ě', '\xcf'+'l': u'ľ', '\xcf'+'n': u'ň', '\xcf'+'r': u'ř', '\xcf'+'s': u'š', '\xcf'+'t': u'ť', '\xcf'+'z': u'ž', } ### Encoding Map encoding_map = { u'À': ['\xc1','A'], u'È': ['\xc1','E'], u'Ì': ['\xc1','I'], u'Ò': ['\xc1','O'], u'Ù': ['\xc1','U'], u'à': ['\xc1','a'], u'è': ['\xc1','e'], u'ì': ['\xc1','i'], u'ò': ['\xc1','o'], u'ù': ['\xc1','u'], u'Á': ['\xc2','A'], u'Ć': ['\xc2','C'], u'É': ['\xc2','E'], u'Í': ['\xc2','I'], u'Ĺ': ['\xc2','L'], u'Ń': ['\xc2','N'], u'Ó': ['\xc2','O'], u'Ŕ': ['\xc2','R'], u'Ś': ['\xc2','S'], u'Ú': ['\xc2','U'], u'Ý': ['\xc2','Y'], u'Ź': ['\xc2','Z'], u'á': ['\xc2','a'], u'ć': ['\xc2','c'], u'é': ['\xc2','e'], u'ģ': ['\xc2','g'], u'í': ['\xc2','i'], u'ĺ': ['\xc2','l'], u'ń': ['\xc2','n'], u'ó': ['\xc2','o'], u'ŕ': ['\xc2','r'], u'ś': ['\xc2','s'], u'ú': ['\xc2','u'], u'ý': ['\xc2','y'], u'ź': ['\xc2','z'], u'Â': ['\xc3','A'], u'Ĉ': ['\xc3','C'], u'Ê': ['\xc3','E'], u'Ĝ': ['\xc3','G'], u'Ĥ': ['\xc3','H'], u'Î': ['\xc3','I'], u'Ĵ': ['\xc3','J'], u'Ô': ['\xc3','O'], u'Ŝ': ['\xc3','S'], u'Û': ['\xc3','U'], u'Ŵ': ['\xc3','W'], u'Ŷ': ['\xc3','Y'], u'â': ['\xc3','a'], u'ĉ': ['\xc3','c'], u'ê': ['\xc3','e'], u'ĝ': ['\xc3','g'], u'ĥ': ['\xc3','h'], u'î': ['\xc3','i'], u'ĵ': ['\xc3','j'], u'ô': ['\xc3','o'], u'ŝ': ['\xc3','s'], u'û': ['\xc3','u'], u'ŵ': ['\xc3','w'], u'ŷ': ['\xc3','y'], u'Ã': ['\xc4','A'], u'Ĩ': ['\xc4','I'], u'Ñ': ['\xc4','N'], u'Õ': ['\xc4','O'], u'Ũ': ['\xc4','U'], u'ã': ['\xc4','a'], u'ĩ': ['\xc4','i'], u'ñ': ['\xc4','n'], u'õ': ['\xc4','o'], u'ũ': ['\xc4','u'], u'Ā': ['\xc5','A'], u'Ē': ['\xc5','E'], u'Ī': ['\xc5','I'], u'Ō': ['\xc5','O'], u'Ū': ['\xc5','U'], u'ā': ['\xc5','a'], u'ē': ['\xc5','e'], u'ī': ['\xc5','i'], u'ō': ['\xc5','o'], u'ū': ['\xc5','u'], u'Ă': ['\xc6','A'], u'Ğ': ['\xc6','G'], u'Ŭ': ['\xc6','U'], u'ă': ['\xc6','a'], u'ğ': ['\xc6','g'], u'ŭ': ['\xc6','u'], u'Ċ': ['\xc7','C'], u'Ė': ['\xc7','E'], u'Ġ': ['\xc7','G'], u'İ': ['\xc7','I'], u'Ż': ['\xc7','Z'], u'ċ': ['\xc7','c'], u'ė': ['\xc7','e'], u'ġ': ['\xc7','g'], u'ż': ['\xc7','z'], u'Ä': ['\xc8','A'], u'Ë': ['\xc8','E'], u'Ï': ['\xc8','I'], u'Ö': ['\xc8','O'], u'Ü': ['\xc8','U'], u'Ÿ': ['\xc8','Y'], u'ä': ['\xc8','a'], u'ë': ['\xc8','e'], u'ï': ['\xc8','i'], u'ö': ['\xc8','o'], u'ü': ['\xc8','u'], u'ÿ': ['\xc8','y'], u'Å': ['\xca','A'], u'Ů': ['\xca','U'], u'å': ['\xca','a'], u'ů': ['\xca','u'], u'Ç': ['\xcb','C'], u'Ģ': ['\xcb','G'], u'Ķ': ['\xcb','K'], u'Ļ': ['\xcb','L'], u'Ņ': ['\xcb','N'], u'Ŗ': ['\xcb','R'], u'Ş': ['\xcb','S'], u'Ţ': ['\xcb','T'], u'ç': ['\xcb','c'], u'ķ': ['\xcb','k'], u'ļ': ['\xcb','l'], u'ņ': ['\xcb','n'], u'ŗ': ['\xcb','r'], u'ş': ['\xcb','s'], u'ţ': ['\xcb','t'], u'Ő': ['\xcd','O'], u'Ű': ['\xcd','U'], u'ő': ['\xcd','o'], u'ű': ['\xcd','u'], u'Ą': ['\xce','A'], u'Ę': ['\xce','E'], u'Į': ['\xce','I'], u'Ų': ['\xce','U'], u'ą': ['\xce','a'], u'ę': ['\xce','e'], u'į': ['\xce','i'], u'ų': ['\xce','u'], u'Č': ['\xcf','C'], u'Ď': ['\xcf','D'], u'Ě': ['\xcf','E'], u'Ľ': ['\xcf','L'], u'Ň': ['\xcf','N'], u'Ř': ['\xcf','R'], u'Š': ['\xcf','S'], u'Ť': ['\xcf','T'], u'Ž': ['\xcf','Z'], u'č': ['\xcf','c'], u'ď': ['\xcf','d'], u'ě': ['\xcf','e'], u'ľ': ['\xcf','l'], u'ň': ['\xcf','n'], u'ř': ['\xcf','r'], u'š': ['\xcf','s'], u'ť': ['\xcf','t'], u'ž': ['\xcf','z'], }