2.2. 和字符串(str,unicode等)处理有关的函数

2.2. 和字符串(str,unicode等)处理有关的函数
	第 2 章 crifanLib.py函数及用法详解

2.2.1. 从绝对路径中提取出文件名:extractFilename

#------------------------------------------------------------------------------
# get the python script's own file name, without directory and without ".py"
# extract out xxx from any of:
#   D:\yyy\zzz\xxx.py
#   /yyy/zzz/xxx.py
#   xxx.py
def extractFilename(inputStr) :
    """Return the last path component of inputStr with a trailing ".py" removed.

    Both the backslash and the forward slash are accepted as path separators,
    so the result is the same whichever style sys.argv[0] happens to use.
    A name that does not end in ".py" is returned unchanged.
    """
    # normalise '/' to a backslash so that one split handles both separators
    scriptName = inputStr.replace("/", "\\").split("\\")[-1]
    if scriptName.endswith(".py") :
        scriptName = scriptName[:-3] # remove ".py"
    return scriptName

例 2.5. extractFilename的使用范例

if __name__=="__main__":
    # sys.argv[0] may be a bare file name or a full path, depending on how the script was started:
    # for : python xxx.py -s yyy    # -> sys.argv[0]=xxx.py
    # for : xxx.py -s yyy           # -> sys.argv[0]=D:\yyy\zzz\xxx.py
    # in both cases extractFilename drops the directory part and the ".py" suffix, leaving "xxx"
    scriptSelfName = extractFilename(sys.argv[0]);

2.2.2. 将实体定义替换为字符:repUniNumEntToChar


#------------------------------------------------------------------------------
# replace every decimal numeric entity &#N; (N is one or more digits) with the
# unicode char whose code point is N
# eg: replace "&#39;" with "'" in "Creepin&#39; up on you"
def repUniNumEntToChar(text):
    """Return text with each decimal entity "&#N;" replaced by the char it encodes.

    An entity whose number is not a valid code point for the running
    interpreter is left in the text unchanged instead of aborting the whole
    substitution. Hexadecimal entities ("&#xN;") are not handled.
    """
    try:
        toUniChr = unichr # Python 2
    except NameError:
        toUniChr = chr # Python 3: chr already returns a unicode char

    def transToUniChr(match): # translate one matched entity to its unicode char
        try:
            return toUniChr(int(match.group(1)))
        except (ValueError, OverflowError):
            # N is outside the supported code point range -> keep the entity as is
            return match.group(0)

    return re.sub(r'&#([0-9]+);', transToUniChr, text)

例 2.6. repUniNumEntToChar的使用范例

# decode the numeric entities (&#N;) found in the title into the chars they stand for
infoDict['title'] = repUniNumEntToChar(infoDict['title']);

2.2.3. 生成全路径的URL地址:genFullUrl

#------------------------------------------------------------------------------
# build the full url: the main url, a '?', then the parameters as key=value
# pairs joined by '&'
# Note:
# normally urllib.urlencode is enough.
# use this one only when the special chars ($,:,{,},...) must NOT be converted
# into %XX, because nothing is escaped here
def genFullUrl(mainUrl, paraDict) :
    """Return mainUrl + '?' + the "key=value" pairs of paraDict joined by '&'.

    Keys and values are converted with str() and written as they are, without
    any percent-encoding. The pairs follow the iteration order of paraDict.
    The '?' is appended even when paraDict is empty.
    """
    urlPrefix = mainUrl + '?'
    pairList = [str(name) + '=' + str(paraDict[name]) for name in paraDict.keys()]
    return urlPrefix + '&'.join(pairList)

例 2.7. genFullUrl的使用范例

# Note: urllib.urlencode is deliberately not used to encode the parameters here,
#       because its result would convert some special chars($,:,{,},...) into %XX
# start from the fixed parameters; the empty ones are filled in just below
paraDict = {
    'asyn'          :   '1',
    'thread_id_enc' :   '',
    'start'         :   '',
    'count'         :   '',
    'orderby_type'  :   '0',
};
paraDict['thread_id_enc'] = str(threadIdEnc);
paraDict['start'] = str(startCmtIdx);
paraDict['count'] = str(reqCmtNum);
# 't' is an extra parameter that is not part of the initial literal above
paraDict['t'] = str(cmtReqTime);

mainUrl = "http://hi.baidu.com/cmt/spcmt/get_thread";
# result: mainUrl + '?' + the key=value pairs joined by '&', nothing escaped
getCmtUrl = genFullUrl(mainUrl, paraDict);

2.2.4. 判断两个URL地址是否相似:urlIsSimilar

#------------------------------------------------------------------------------
# check whether two url is similar
# note: input two url both should be str type
def urlIsSimilar(url1, url2) :
    isSim = False;

    url1 = str(url1);
    url2 = str(url2);

    slashList1 = url1.split('/');
    slashList2 = url2.split('/');
    lenS1 = len(slashList1);
    lenS2 = len(slashList2);

    # all should have same structure
    if lenS1 != lenS2 :
        # not same sturcture -> must not similar
        isSim = False;
    else :
        sufPos1 = url1.rfind('.');
        sufPos2 = url2.rfind('.');
        suf1 = url1[(sufPos1 + 1) : ];
        suf2 = url2[(sufPos2 + 1) : ];
        # at least, suffix should same
        if (suf1 == suf2) : 
            lastSlashPos1 = url1.rfind('/');
            lastSlashPos2 = url2.rfind('/');
            exceptName1 = url1[:lastSlashPos1];
            exceptName2 = url2[:lastSlashPos2];
            # except name, all other part should same
            if (exceptName1 == exceptName2) :
                isSim = True;
            else :
                # except name, other part is not same -> not similar
                isSim = False;
        else :
            # suffix not same -> must not similar
            isSim = False;

    return isSim;

例 2.8. urlIsSimilar的使用范例

# record that url has the same depth, suffix and parent path as srcUrl
if urlIsSimilar(url, srcUrl) :
    isSimilar = True;

2.2.5. 判断一个Url地址是否和一个Url地址列表中的某个Url地址相似:findSimilarUrl

如果相似，返回True和相似的地址；

如果不相似，返回False和空字符串''。

#------------------------------------------------------------------------------
# look for an url in urlList that is similar to the given url
# if found, return (True, <the first similar url in urlList>)
# if not found, return (False, '')
def findSimilarUrl(url, urlList) :
    """Return (True, srcUrl) for the first srcUrl in urlList that urlIsSimilar
    accepts as similar to url, or (False, '') when there is none."""
    for candidateUrl in urlList :
        if urlIsSimilar(url, candidateUrl) :
            # stop at the first hit; later entries are not examined
            return (True, candidateUrl)
    return (False, '')

例 2.9. findSimilarUrl的使用范例

# run the similarity check only when it is enabled and the error url list is not empty
if ((gCfg['omitSimErrUrl'] == 'yes') and gVal['errorUrlList']):
    (isSimilar, simSrcUrl) = findSimilarUrl(curUrl, gVal['errorUrlList']);
    if isSimilar :
        # log both the url being skipped and the earlier error url it resembles
        logging.warning("  Omit process %s for similar with previous error url", curUrl);
        logging.warning("               %s", simSrcUrl);
        # NOTE: continue implies an enclosing loop, which is not shown in this snippet
        continue;

2.2.6. 去除非单词（non-word）的字符:removeNonWordChar

#------------------------------------------------------------------------------
# remove non-word chars == only retain alphanumeric chars (letter+number) and underscore
# eg:
# from againinput4@yeah to againinput4yeah
# from green-waste to greenwaste
def removeNonWordChar(inputString) :
    """Return inputString with every char that the regex class \\w does not match removed."""
    nonWordPattern = re.compile(r"[^\w]") # anything that is not a word char
    return nonWordPattern.sub("", inputString)

例 2.10. removeNonWordChar的使用范例

# keep only the word chars (letters, digits, underscore) of the blog user name
wxrValidUsername = removeNonWordChar(gVal['blogUser']);
# removeNonWordChar keeps '_', so remove it in a separate step
wxrValidUsername = wxrValidUsername.replace("_", "");
logging.info("Generated WXR safe username is %s", wxrValidUsername);

2.2.7. 去除控制字符:removeCtlChr

使得处理后的字符串在XML中是合法的。

#------------------------------------------------------------------------------
# remove control characters from the input string
# otherwise the wordpress importer fails to import the generated wxr
# eg:
# 1. http://againinput4.blog.163.com/blog/static/172799491201110111145259/
# content contains some invalid ascii control chars
# 2. http://hi.baidu.com/notebookrelated/blog/item/8bd88e351d449789a71e12c2.html
# 165th comment contains invalid control char: ETX
# 3. http://green-waste.blog.163.com/blog/static/32677678200879111913911/
# title contains control char:DC1, BS, DLE, DLE, DLE, DC1
def removeCtlChr(inputString) :
    """Return inputString without ASCII control chars.

    Removed: every char with a code below 32 except tab (9), line feed (10)
    and carriage return (13), plus DEL (0x7F). All other chars are kept.
    """
    # control codes that are kept: 9=\t=tab, 10=\n=LF=line feed, 13=\r=CR=carriage return
    keptCtlCodes = (9, 10, 13)

    def isValidChr(singleChr) :
        chrCode = ord(singleChr)
        if chrCode == 0x7F : # DEL=delete
            return False
        return (chrCode >= 32) or (chrCode in keptCtlCodes)

    # collect the kept chars and join them once, instead of growing a string char by char
    return ''.join([c for c in inputString if isValidChr(c)])

例 2.11. removeCtlChr的使用范例

# remove the control chars in the title
# eg:
# http://green-waste.blog.163.com/blog/static/32677678200879111913911/
# title contains control char:DC1, BS, DLE, DLE, DLE, DC1
infoDict['title'] = removeCtlChr(infoDict['title']);

	关于控制字符
	如果不了解什么是控制字符，请参考：ASCII字符集中的功能/控制字符

2.2.8. 将字符实体替换为Unicode数字实体:replaceStrEntToNumEnt


#------------------------------------------------------------------------------
# convert the named (string) entities to unicode number entities
# eg: "&nbsp;" -> "&#160;", "&amp;" -> "&#38;", "&euro;" -> "&#8364;"
# refer: http://www.htmlhelp.com/reference/html40/entities/latin1.html
#        http://www.htmlhelp.com/reference/html40/entities/special.html
def replaceStrEntToNumEnt(text) :
    """Return text with every known named entity "&name;" rewritten as "&#N;".

    The name -> code point table is the standard library's name2codepoint
    (HTML 4 entity set). It contains the Latin-1 and special entities and
    also the symbol entities (eg: &hellip; &trade; &bull; and the Greek
    letters). Names that are not in the table, and entities that are already
    numeric, are left unchanged.
    """
    try :
        from htmlentitydefs import name2codepoint # Python 2
    except ImportError :
        from html.entities import name2codepoint # Python 3

    def transToNumEnt(match) : # translate one "&name;" into "&#N;"
        codePoint = name2codepoint.get(match.group(1))
        if codePoint is None :
            # not a known entity name -> keep the original text
            return match.group(0)
        return "&#%d;" % codePoint

    # a single pass over the text; entity names are case sensitive (&Agrave; vs &agrave;)
    return re.sub(r"&([A-Za-z][A-Za-z0-9]*);", transToNumEnt, text)

例 2.12. replaceStrEntToNumEnt的使用范例

# rewrite the named entities (eg: &nbsp;) in this line as numeric entities (eg: &#160;)
line = replaceStrEntToNumEnt(line);

2.2.9. 将xxx=yyy转换为元组（tuple）变量:convertToTupleVal

#------------------------------------------------------------------------------
# convert the xxx=yyy into tuple('xxx', yyy), then return the tuple value
# [make sure the input string]
# (1) does not include whitespace
# (2) includes '='
# (3) has no ';' at the end
# [possible input string]
# blogUserName="againinput4"
# publisherEmail=""
# synchMiniBlog=false
# publishTime=1322129849397
# publisherName=null
# publisherNickname="\u957F\u5927\u662F\u70E6\u607C"
def convertToTupleVal(equationStr) :
    """Split "key=value" at the first '=' and return (key, converted value).

    Value conversion:
      - text wrapped in double quotes -> the text between the quotes (str)
      - null / true / false (any letter case) -> None / True / False
      - anything else -> int(value)

    When the input has no '=', or the value cannot be converted, a failure
    message is printed and ('', None) is returned; no exception is raised
    for these cases.
    """
    try :
        # Note:
        # do not use split('=') here, for the value itself may contain '=', like this:
        # http://img.bimg.126.net/photo/hmZoNQaqzZALvVp0rE7faA==/0.jpg
        # so only the first '=' is used as the separator
        firstEqualPos = equationStr.find("=")
        if firstEqualPos < 0 :
            # without this check, find()'s -1 would be used as a slice index
            raise ValueError("no '=' found")

        key = equationStr[:firstEqualPos]
        valuePart = equationStr[(firstEqualPos + 1):]
        loweredVal = valuePart.lower()

        if (len(valuePart) >= 2) and (valuePart[0] == '"') and (valuePart[-1] == '"') :
            # is string type
            value = str(valuePart[1:-1])
        elif loweredVal == 'null' :
            value = None
        elif loweredVal == 'false' :
            value = False
        elif loweredVal == 'true' :
            value = True
        else :
            # must be an int value; int() raises ValueError for anything else (also for '')
            value = int(valuePart)
    except (ValueError, TypeError, AttributeError) :
        # ValueError also covers UnicodeError; the other two cover a non-string input
        (key, value) = ('', None)
        print("Fail of convert the equal string %s to value"%(equationStr))

    return (key, value)

例 2.13. convertToTupleVal的使用范例

# (4) convert each "key=value" string into a (key, value) tuple;
#     a string that can not be converted gives ('', None)
for equation in equationList :
    (key, value) = convertToTupleVal(equation);

2.2.10. 去除列表（List）中的空值:removeEmptyInList

#------------------------------------------------------------------------------
# remove the empty ones in list
# "empty" means any value that is false in a boolean test, eg: '', u'', None, 0, []
def removeEmptyInList(list) :
    """Return a new list holding the items of the input whose truth value is True, in the same order."""
    return [item for item in list if item]

例 2.14. removeEmptyInList的使用范例

# Note: some tag lists look like [u''], which holds nothing meaningful, so remove the empty items here
# only [] counts as empty, [u''] does not -> would cause an error while exporting to WXR
infoDict['tags'] = removeEmptyInList(infoDict['tags']);

2.2.11. 列表去重（去除重复的值）:uniqueList

#------------------------------------------------------------------------------
# remove the duplicated items in the list
# the first occurrence of each value is kept, in the original order
def uniqueList(old_list):
    """Return a new list with the items of old_list, each value only once.

    Items are compared with "in" on a list, so they do not need to be hashable.
    """
    seenList = []
    for item in old_list:
        if item in seenList:
            continue # already collected
        seenList.append(item)
    return seenList

例 2.15. uniqueList的使用范例

nonOverlapList = uniqueList(matchedList); # drop the duplicated items, keep the first occurrence of each

2.2.12. 过滤列表（去除在b中出现的a中的某值）:filterList

#------------------------------------------------------------------------------
# split listToFilter into two lists:
# (1) the items which are NOT in listToCompare
# (2) the items which already exist in listToCompare
# both keep the order of listToFilter
def filterList(listToFilter, listToCompare) :
    """Return (filteredList, existedList) for the items of listToFilter.

    filteredList holds the items not found in listToCompare, existedList
    holds the items found in it.
    """
    remainingItems = []
    alreadyKnownItems = []
    for candidate in listToFilter :
        # choose the target list first, then append once
        if candidate in listToCompare :
            targetList = alreadyKnownItems
        else :
            targetList = remainingItems
        targetList.append(candidate)
    return (remainingItems, alreadyKnownItems)

例 2.16. filterList的使用范例

# filteredPicList: the urls which are not in gVal['processedUrlList']
# existedList: the urls which are already in it
(filteredPicList, existedList) = filterList(nonOverlapList, gVal['processedUrlList']);

2.2.13. 生成随机数的字符串:randDigitsStr

#------------------------------------------------------------------------------
# generate a string of random decimal digits
# max digit number is 12
def randDigitsStr(digitNum = 12) :
    """Return a string of exactly digitNum random digits ('0'-'9').

    digitNum larger than 12 is reduced to 12; digitNum of 0 or less gives ''.
    Each digit is drawn separately, so the result always has the requested
    length and never contains non-digit chars. Slicing str(random.random())
    gave no such guarantee: a float may print with fewer digits, or in
    exponent notation such as 1e-05.

    The random module is used, so the result is not suitable for
    security-sensitive values.
    """
    if digitNum > 12 :
        digitNum = 12

    return ''.join([random.choice("0123456789") for _ in range(digitNum)])

例 2.17. randDigitsStr 的使用范例

# append a random digit string (6 digits requested) to the captcha url
captchaUrl += str(randDigitsStr(6));

2.2.14. 将元组列表转换为字典变量:tupleListToDict

#------------------------------------------------------------------------------
# convert a list of (key, value) tuples to a dict
# from: [(u'type', u'text/javascript'), (u'src', u'http://partner.googleadservices.com/gampad/google_service.js')]
# to:   { u'type':u'text/javascript', u'src':u'http://partner.googleadservices.com/gampad/google_service.js' }
def tupleListToDict(tupleList):
    """Return a dict built from the (key, value) pairs in tupleList.

    Every item must unpack into exactly two values. When a key occurs more
    than once, the value of its last occurrence is the one kept.
    """
    # unpack each item explicitly, so an item that is not a pair is still rejected
    return dict((pairKey, pairValue) for (pairKey, pairValue) in tupleList)

例 2.18. tupleListToDict 的使用范例

# example value of singleContent:
# name=script, attrMap=None, attrs=[(u'type', u'text/javascript'), (u'src', u'http://partner.googleadservices.com/gampad/google_service.js')]
# -> attrsDict maps each attribute name to its value
attrsDict = tupleListToDict(singleContent.attrs);


第 2 章 crifanLib.py函数及用法详解		2.3. 文件(file等)方面的函数