Python Unicode to UTF-8 Replacement Dictionary

I recently found an increasing need to replace Unicode characters with their plain English (ASCII) equivalents. This is in response to the use of the ISO 8859-1 character set in HTML. Below is my dictionary for doing so and a code snippet for using it.

# Replacement table: each key is a regular-expression pattern (alternatives
# separated by "|") and each value is the plain-text replacement.
# Code points above U+00FF must be written with "\uXXXX": a "\x" escape only
# consumes two hex digits, so "\x2013" would actually mean " 13".
# NOTE: "\uXXXX" is only an escape in Python 3 str (or Python 2 unicode)
# literals.
{
    # Punctuation
    "\u2013": "-",
    "\u2014": "--",
    "\u2018": "'",
    "\u2019": "'",
    "\u201A": ",",
    # NOTE(review): U+201D is the right double quotation mark; "~" looks
    # unintended but is kept as published -- confirm.
    "\u201D": "~",
    "\u2022": "*",
    "\u2026": "...",
    "\u2030": "%",
    "\u2032": "'",
    "\u2033": "`",
    "\u2039": "",
    "\u203E": "--",
    "\u2044": "/",
    # Symbols and letter-like characters
    "\u20AC": " euro ",
    "\u2111": "i",
    "\u2118": "P",
    "\u2122": " TM ",
    "\u2135": " alef ",
    # Arrows
    "\u2190": "",
    "\u2193": " down-arrow ",
    "\u2194": "",
    "\u21B5": " crarr ",
    "\u21D0": "",
    "\u21D4": "",
    # Mathematical operators
    "\u2200": "ALL",
    "\u2202": " part ",
    "\u2203": "EVERY",
    "\u2205": "empty-set",
    "\u2207": "nabla",
    "\u2208": "isin",
    "\u2209": "notin",
    "\u2217": "*",
    "\u221A": "sqrt",
    "\u2329": "",
    "\u25CA": " loz ",
    # Card suits
    "\u2660": "spades",
    "\u2663": "clubs",
    "\u2665": "hearts",
    "\u2666": "diamonds",
    # Zero-width and directional marks
    "\u200C": " zwnj ",
    "\u200D": " zwj ",
    "\u200E": " lrm ",
    "\u200F": " rlm ",
    # Latin-1 range
    "\x27": "'",
    "\xc2|\xA0|\u2002|\u2003|\u2009": " ",
    "\x3E": ">",
    "\x3C": "<",
    "\xBC": "1/4",
    "\xBD": "1/2",
    "\xBE": "3/4",
    "\xBF": " iquest ",
}

Following an update to this function, support for multi-byte entities and a larger dictionary are provided below.

The code:

    def encodeHTML(
        self,
        html,
        foreignKeys=None,
        replaceNonPrintable=False,
        # The default tables are only read, never mutated, so sharing them
        # between calls is safe. Passing None disables a table entirely.
        multiEntities={
            "\xe2\x81\x91": "**",
            "\xe2\x81\x95": "*",
            "\xe2\x81\x97": '""',
            "\xe2\x81\xa0|\xe2\x80\x8b|\xe2\x80\x8c|\xe2\x80\x8d|\xe2\x80\x8e|\xe2\x80\x8f": "",
            "\xe2\x80\x86|\xe2\x80\x87": "   ",
            "\xe2\x80\x84|\xe2\x80\x85|\xe2\x80\x88": "  ",
            "\xe2\x80\x8a|\xe2\x80\x89|\xe2\x80\x80|\xe2\x80\x81|\xe2\x80\x82|\xe2\x80\x83": " ",
            "\xe2\x80\x93|\xe2\x80\x92|\xe2\x80\x91|\xe2\x80\x90": "-",
            "\xe2\x80\x96": "||",
            "\xe2\x80\x95|\xe2\x80\x94": "--",
            "\xe2\x81\x87": "??",
            "\xe2\x81\x88": "?!",
            "\xe2\x81\x89": "!?",
            "\xe2\x81\x9d|\xe2\x81\x9e": ":",
            "\xe2\x81\x92": "-",
            "\xe2\x81\x8b": " PILCROW ",
            "\xe2\x80\xbc": "!!",
            "\xe2\x80\xba": ">",
            "\xe2\x80\xb9": "<",
            "\xe2\x80\xb8": "^",
            "\xe2\x80\xb1": "%000",
            "\xe2\x80\xb0": "%0",
            "\xe2\x80\xa4|\xe2\x80\xa7": ".",
            "\xe2\x80\xa5": "..",
            "\u2013": "-",
            "\u2014": "--",
            "\u2018": "'",
            "\u2019": "'",
            "\u201A": ",",
            # NOTE(review): U+201D is the right double quotation mark; "~"
            # looks unintended but is kept as published -- confirm.
            "\u201D": "~",
            "\u2022|\xe2\x80\xa3|\xe2\x80\xa2": "*",
        },
        replaceEntities={
            "\u2026": "...",
            "\u2030": "%",
            "\u2032": "'",
            "\u2033": "`",
            "\u2039": "",
            "\u203E": "--",
            "\u2044": "/",
            "\u20AC": " euro ",
            "\u2111": "i",
            "\u2118": "P",
            "\u2122": " TM ",
            "\u2135": " alef ",
            "\u2190": "",
            "\u2193": " down-arrow ",
            "\u2194": "",
            "\u21B5": " crarr ",
            "\u21D0": "",
            "\u21D4": "",
            "\u2200": "ALL",
            "\u2202": " part ",
            "\u2203": "EVERY",
            "\u2205": "empty-set",
            "\u2207": "nabla",
            "\u2208": "isin",
            "\u2209": "notin",
            "\u2217": "*",
            "\u221A": "sqrt",
            "\u2329": "",
            "\u25CA": " loz ",
            "\u2660": "spades",
            "\u2663": "clubs",
            "\u2665": "hearts",
            "\u2666": "diamonds",
            "\u200C": " zwnj ",
            "\u200D": " zwj ",
            "\u200E": " lrm ",
            "\u200F": " rlm ",
            "\x27": "'",
            "\xc2|\xA0|\u2002|\u2003|\u2009": " ",
            "\x3E": ">",
            "\x3C": "<",
            "\xBC": "1/4",
            "\xBD": "1/2",
            "\xBE": "3/4",
            "\xBF": " iquest ",
        }
    ):
        '''
        Replace special characters in HTML with plain-text equivalents.

        The replacement tables are applied first, then the markup is
        re-parsed and prettified and its HTML entities are unescaped, then
        ``foreignKeys`` is applied, and finally (optionally) every character
        that is not in ``string.printable`` is removed.

        Every key of every table is passed to ``re.sub`` as a regular
        expression, so ``|`` separates alternatives that share a replacement.
        The ``\\uXXXX`` keys in the default tables are only interpreted as
        Unicode escapes in Python 3 ``str`` (or Python 2 ``unicode``) literals.

        *Required Parameters*

        :param html: HTML text to run the replacements on

        *Optional Parameters*

        :param foreignKeys: dictionary mapping patterns that are not in the
            default tables (such as foreign letters, e.g.
            {"\\xF8":"[oslash]"}) to their replacements; None to skip
        :param replaceNonPrintable: when True, drop non-printable characters
            after all other replacements and decoding are complete
        :param multiEntities: entities represented by multiple hex escapes,
            replaced before ``replaceEntities``; None to skip
        :param replaceEntities: entities to replace, such as copyright
            symbols, micro, etc., that may be in ISO or another format
            converted to Unicode hex form (see the non-Latin characters at
            https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references);
            None to skip

        :returns: the cleaned text
        '''
        if multiEntities is not None:
            for pattern, replacement in multiEntities.items():
                html = re.sub(pattern, replacement, html)

        if replaceEntities is not None:
            for pattern, replacement in replaceEntities.items():
                html = re.sub(pattern, replacement, html)

        # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is
        # its replacement. The import system resolves the "html" module here
        # and does not touch the local ``html`` argument.
        try:
            from html import unescape as unescapeEntities
        except ImportError:
            # Python 2: fall back to the legacy HTMLParser module.
            unescapeEntities = HTMLParser.HTMLParser().unescape

        html = unescapeEntities(Soup(Soup(html).encode()).prettify())

        # Caller-supplied replacements must run before the printable filter:
        # keys such as "\xF8" are not printable ASCII and would otherwise be
        # stripped before they could be matched.
        if foreignKeys is not None:
            for pattern, replacement in foreignKeys.items():
                html = re.sub(pattern, replacement, html)

        if replaceNonPrintable is True:
            import string
            printable = set(string.printable)
            # "".join keeps the result a string; filter() returns an iterator
            # on Python 3.
            html = "".join(ch for ch in html if ch in printable)

        return html
Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s