I recently found an increasing need to replace Unicode characters with their English equivalents. This is in response to use of the ISO 8895-1 character set in html. Below is my dictionary for doing so and a code snippet for using it.
{"\x2013":"-","\x2014":"--","\x2018":"'","\x2019":"'","\x201A":",","\x201D":"~","\x2022":"*","\x2026":"...","\x2030":"%","\x2032":"'","\x2033":"`","\x2039":"","\x203E":"--","\x2044":"/","\x20AC":" euro ","\x2111":"i","\x2118":"P","\x2122":" TM ","\x2135":" alef ","\x2190":"","\x2193":" down-arrow ","\x2194":"","\x21B5":" crarr ","\x21D0":"","\x21D4":"","\x2200":"ALL","\x2202":" part ","\x2203":"EVERY","\x2205":"empty-set","\x2207":"nabla","\x2208":"isin","\x2209":"notin","\x2217":"*","\x221A":"sqrt","\x2329":"","\x25CA":" loz ","\x2660":"spades","\x2663":"clubs","\x2665":"hearts","\x2666":"diamonds","\x200C":" zwnj ","\x200D":" zwj ","\x200E":" lrm ","\x200F":" rlm ","\x27":"'","\xc2|\xA0|\x2002|\x2003|\x2009":" ","\x3E":">","\x3C":"> ","\xBC":"1/4","\xBD":"1/2","\xBE":"1/4","\xBE":"3/4","\xBF":" iquest "}
Multiple entities and a larger dictionary are provided below after an update to this function.
The code:
def def encodeHTML(self,html,foreignKeys=None,replaceNonPrintable=False,multiEntities={"\xe2\x81\x91":"**","\xe2\x81\x95":"*","\xe2\x81\x97":'""',"\xe2\x81\xa0|\xe2\x80\x8b|\xe2\x80\x8c|\xe2\x80\x8d|\xe2\x80\x8e|\xe2\x80\x8f":"","\xe2\x80\x86|\xe2\x80\x87":" ","\xe2\x80\x84|\xe2\x80\x85|\xe2\x80\x88":" ","\xe2\x80\x8a|\xe2\x80\x89|\xe2\x80\x80|\xe2\x80\x81|\xe2\x80\x82|\xe2\x80\x82|\xe2\x80\x83":" ","\xe2\x80\x93|\xe2\x80\x92|\xe2\x80\x91|\xe2\x80\x90":"-","\xe2\x80\x96":"||","\xe2\x80\x95|\xe2\x80\x94":"--","\xe2\x81\x87":"??","\xe2\x81\x88":"?!","\xe2\x81\x89":"!?","\xe2\x81\x9d|\xe2\x81\x9e":":","\xe2\x81\x92":"-","\xe2\x81\x8b":" PILCROW ","\xe2\x80\xbc":"!!","\xe2\x80\xba":">","\xe2\x80\xb9":"<","\xe2\x80\xb8":"^","\xe2\x80\xb1":"%000","\xe2\x80\xb0":"%0","\xe2\x80\xa4|\xe2\x80\xa7":".","\xe2\x80\xa5":"..","\u2013":"-","\u2014":"--","\u2018":"'","\u2019":"'","\u201A":",","\u201D":"~","\u2022|\xe2\x80\xa3|\xe2\x80\xa2":"*"},replaceEntities={"\u2026":"...","\u2030":"%","\u2032":"'","\u2033":"`","\u2039":"","\u203E":"--","\u2044":"/","\u20AC":" euro ","\u2111":"i","\u2118":"P","\u2122":" TM ","\u2135":" alef ","\u2190":"","\u2193":" down-arrow ","\u2194":"","\u21B5":" crarr ","\u21D0":"","\u21D4":"","\u2200":"ALL","\u2202":" part ","\u2203":"EVERY","\u2205":"empty-set","\u2207":"nabla","\u2208":"isin","\u2209":"notin","\u2217":"*","\u221A":"sqrt","\u2329":"","\u25CA":" loz ","\u2660":"spades","\u2663":"clubs","\u2665":"hearts","\u2666":"diamonds","\u200C":" zwnj ","\u200D":" zwj ","\u200E":" lrm ","\u200F":" rlm ","\u27":"'","\xc2|\xA0|\u2002|\u2003|\u2009":" ","\x3E":">","\x3C":"> ","\uBC":"1/4","\xBD":"1/2","\xBE":"1/4","\xBE":"3/4","\xBF":" iquest "}): ''' Encode HTML. Unfortunately *Required Parameters: :param html: html to run replacements on *Optional Parameters* :param multiEntities: entities represented my multiple unicode hex numbers :param replaceNonPrintable: replace non printable characters after all other sets and encodings complete :param foreignKeys: dictionary of mapping to keys not in replaceEntities (for which are not in the default dict such as foreign letters e.g. {"\xF8":"[oslash]"} ) :param replaceEntities: a list of entities to replace such as ,copyright symboles, micro; etc. that may be in the ISO or other format converted to unicode Hex formats (see non Latin characters at https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references) ''' if multiEntities is not None: for k in multiEntities.keys(): html=re.sub(k,multiEntities[k],html) if replaceEntities is not None: for k in replaceEntities.keys(): html=re.sub(k,replaceEntities[k],html) html=HTMLParser.HTMLParser().unescape(Soup(Soup(html).encode()).prettify()) if replaceNonPrintable is True: import string html=filter(lambda x: x in string.printable,html) if foreignKeys is not None: for k in foreignKeys.keys(): html=re.sub(k,foreignKeys[k],html) return html