2.2.8. 将字符实体替换为Unicode数字实体:replaceStrEntToNumEnt


#------------------------------------------------------------------------------
# Convert named HTML character entities (e.g. "&nbsp;") into the equivalent
# numeric character references (e.g. "&#160;").
# refer: http://www.htmlhelp.com/reference/html40/entities/latin1.html
#        http://www.htmlhelp.com/reference/html40/entities/special.html
def replaceStrEntToNumEnt(text) :
    """Replace named HTML entities in ``text`` with decimal numeric entities.

    Each ``&name;`` whose name is a known HTML 4 entity is rewritten as
    ``&#NNN;``, where NNN is the decimal Unicode code point of that entity,
    for example ``&nbsp;`` -> ``&#160;`` and ``&euro;`` -> ``&#8364;``.

    The name -> code point mapping comes from the standard library entity
    table rather than a hand-maintained dictionary.  That table contains the
    Latin-1 and Special entities plus the remaining HTML 4 entities (Greek
    letters, mathematical symbols, ...), so those are converted as well.

    Entity names are case sensitive (``&Eacute;`` and ``&eacute;`` differ).
    Unknown names and references that are already numeric are returned
    unchanged.

    Parameters:
        text: the string to process.

    Returns:
        A new string with every recognised named entity replaced.
    """
    import re
    try :
        from html.entities import name2codepoint    # Python 3
    except ImportError :
        from htmlentitydefs import name2codepoint   # Python 2

    def toNumEnt(matched) :
        # Look the entity name (without "&" and ";") up in the HTML 4 table.
        codePoint = name2codepoint.get(matched.group(1))
        if codePoint is None :
            # Not a known entity: keep the original text untouched.
            return matched.group(0)
        return "&#%d;" % codePoint

    # Single pass over the text. The emitted "&#NNN;" never matches the
    # pattern (the name must start with a letter), so a replacement can never
    # be replaced a second time.
    return re.sub(r"&([A-Za-z][A-Za-z0-9]*);", toNumEnt, text)

        

例 2.12. replaceStrEntToNumEnt的使用范例

# Rewrite the entities in the current line via replaceStrEntToNumEnt.
line = replaceStrEntToNumEnt(line)