#------------------------------------------------------------------------------
# get the python script's own file name
# extract out xxx from:
# D:\yyy\zzz\xxx.py
# xxx.py
def extractFilename(inputStr):
    """Return the bare script name contained in a path.

    The directory part and a trailing ".py" suffix are removed, so
    "D:/yyy/zzz/xxx.py" and "xxx.py" both yield "xxx".  Both the Windows
    (backslash) and the POSIX (forward slash) separator are recognised.

    Args:
        inputStr: path of the script, typically sys.argv[0].

    Returns:
        The last path component without a trailing ".py".
    """
    # Normalise separators first so one split handles both path styles.
    scriptName = inputStr.replace("\\", "/").split("/")[-1]
    if scriptName.endswith(".py"):
        scriptName = scriptName[:-3]  # remove ".py"
    return scriptName
例 2.5. extractFilename的使用范例
if __name__ == "__main__":
    # sys.argv[0] holds whatever path was used to launch the script:
    #   python xxx.py -s yyy   ->  sys.argv[0] is xxx.py
    #   xxx.py -s yyy          ->  sys.argv[0] is D:\yyy\zzz\xxx.py
    # Either way only the bare name "xxx" is kept.
    scriptSelfName = extractFilename(sys.argv[0])
#------------------------------------------------------------------------------
# replace the &#N; (N is digit number, N > 1) to unicode char
# eg: replace "&#39;" with "'" in "Creepin&#39; up on you"
def repUniNumEntToChar(text):
    """Replace decimal numeric entities (&#N;) with the character they encode.

    Works on both Python 2 (unichr) and Python 3 (chr).  An entity whose
    number is not a valid code point is left in the text unchanged instead
    of aborting the whole substitution.

    Args:
        text: string that may contain entities such as "&#39;".

    Returns:
        The text with every convertible "&#N;" replaced by its character.
    """
    try:
        toChar = unichr  # Python 2: chr() there only covers 0..255
    except NameError:
        toChar = chr  # Python 3: chr() already returns a unicode character

    def transToUniChr(match):
        # group(1) is the digit run between '&#' and ';'
        try:
            return toChar(int(match.group(1)))
        except (ValueError, OverflowError):
            # number outside the valid code point range: keep the entity
            return match.group(0)

    return re.sub(r'&#([0-9]+);', transToUniChr, text)
#------------------------------------------------------------------------------
# generate the full url, which include the main url plus the parameter list
# Note:
# normally just use urllib.urlencode is OK.
# only use this if you do NOT want urllib.urlencode convert some special chars($,:,{,},...) into %XX
def genFullUrl(mainUrl, paraDict):
    """Build "mainUrl?k1=v1&k2=v2..." without percent-encoding anything.

    Keys and values are converted with str() and written verbatim, so
    characters such as $ : { } stay as they are.  The '?' is always
    appended, even when paraDict is empty.

    Args:
        mainUrl: the url part before the query string.
        paraDict: mapping of parameter name to parameter value.

    Returns:
        The main url followed by '?' and the '&'-joined "name=value" pairs.
    """
    pairs = [str(name) + '=' + str(paraDict[name]) for name in paraDict.keys()]
    return mainUrl + '?' + '&'.join(pairs)
例 2.7. genFullUrl的使用范例
# Usage example for genFullUrl.
# Note: urllib.urlencode is deliberately not used to encode the parameters,
# because its result converts some special chars ($,:,{,},...) into %XX.
# Parameter template; the empty values are filled in below.
paraDict = {
    'asyn' : '1',
    'thread_id_enc' : '',
    'start' : '',
    'count' : '',
    'orderby_type' : '0',
};
# threadIdEnc, startCmtIdx, reqCmtNum and cmtReqTime are defined by the
# surrounding code, which is not part of this excerpt.
paraDict['thread_id_enc'] = str(threadIdEnc);
paraDict['start'] = str(startCmtIdx);
paraDict['count'] = str(reqCmtNum);
paraDict['t'] = str(cmtReqTime);
mainUrl = "http://hi.baidu.com/cmt/spcmt/get_thread";
# final url = mainUrl + '?' + the parameters joined with '&'
getCmtUrl = genFullUrl(mainUrl, paraDict);
#------------------------------------------------------------------------------
# check whether two url is similar
# note: input two url both should be str type
def urlIsSimilar(url1, url2):
    """Tell whether two urls differ only in their last path component.

    Two urls count as similar when all three checks pass:
      1. they contain the same number of '/'-separated parts,
      2. the text after the last '.' is equal (same suffix),
      3. the text before the last '/' is equal.

    Args:
        url1: first url; converted with str() before comparing.
        url2: second url; converted with str() before comparing.

    Returns:
        True if the urls are similar, otherwise False.
    """
    first = str(url1)
    second = str(url2)

    # different number of path parts -> different structure -> not similar
    if len(first.split('/')) != len(second.split('/')):
        return False

    # the suffix (everything after the last '.') has to match
    if first[first.rfind('.') + 1:] != second[second.rfind('.') + 1:]:
        return False

    # apart from the final name, the rest of the url has to match
    return first[:first.rfind('/')] == second[:second.rfind('/')]
如果相似,返回 (True, 相似的地址);
如果不相似,返回 (False, '')。
#------------------------------------------------------------------------------
# find whether a url similar to the given url exists in urlList
# if found, return True, similarSrcUrl
# if not found, return False, ''
def findSimilarUrl(url, urlList):
    """Look for the first entry of urlList that urlIsSimilar() accepts.

    Args:
        url: the url to compare.
        urlList: iterable of candidate urls, checked in order.

    Returns:
        (True, matchingUrl) for the first similar entry, or (False, '')
        when no entry is similar.
    """
    for candidate in urlList:
        if urlIsSimilar(url, candidate):
            # stop at the first hit, later entries are not examined
            return (True, candidate)
    return (False, '')
例 2.9. findSimilarUrl的使用范例
# to check is similar, only when need check and the list it not empty
if ((gCfg['omitSimErrUrl'] == 'yes') and gVal['errorUrlList']):
(isSimilar, simSrcUrl) = findSimilarUrl(curUrl, gVal['errorUrlList']);
if isSimilar :
logging.warning(" Omit process %s for similar with previous error url", curUrl);
logging.warning(" %s", simSrcUrl);
continue;
#------------------------------------------------------------------------------
# remove non-word chars == only retain alphanumeric characters (letters + digits) and underscore
# eg:
# from againinput4@yeah to againinput4yeah
# from green-waste to greenwaste
def removeNonWordChar(inputString):
    r"""Delete every character that the regex class \w does not match.

    Examples: "againinput4@yeah" -> "againinput4yeah",
              "green-waste"      -> "greenwaste".
    The underscore is a word character and is therefore kept.
    """
    nonWordPattern = re.compile(r"\W")  # \W is the complement of \w
    return nonWordPattern.sub("", inputString)
例 2.10. removeNonWordChar的使用范例
# Usage example for removeNonWordChar; gVal is defined outside this excerpt.
wxrValidUsername = removeNonWordChar(gVal['blogUser']);
# removeNonWordChar keeps '_', so underscores are stripped in a second step
wxrValidUsername = wxrValidUsername.replace("_", "");
logging.info("Generated WXR safe username is %s", wxrValidUsername);
使得处理后的字符串,在XML都是合法的了。
#------------------------------------------------------------------------------
# remove control character from input string
# otherwise will cause wordpress importer import failed
# for the wordpress importer: if the content contains control chars, importing the wxr fails
# eg:
# 1. http://againinput4.blog.163.com/blog/static/172799491201110111145259/
# content contains some invalid ascii control chars
# 2. http://hi.baidu.com/notebookrelated/blog/item/8bd88e351d449789a71e12c2.html
# 165th comment contains invalid control char: ETX
# 3. http://green-waste.blog.163.com/blog/static/32677678200879111913911/
# title contains control char:DC1, BS, DLE, DLE, DLE, DC1
def removeCtlChr(inputString):
    """Return inputString without ASCII control characters.

    Removed: every character whose code is below 32, except tab (9),
    line feed (10) and carriage return (13), plus DEL (0x7F).
    All other characters are kept in their original order.
    """
    keptControlCodes = (
        9,   # \t = tab
        10,  # \n = LF = line feed
        13,  # \r = CR = carriage return
    )

    def isAllowed(ch):
        code = ord(ch)
        if code == 0x7F:
            # DEL is a control character although it is not below 32
            return False
        return code >= 32 or code in keptControlCodes

    return ''.join(ch for ch in inputString if isAllowed(ch))
例 2.11. removeCtlChr的使用范例
# Usage example for removeCtlChr: remove the control chars in the title.
# e.g.:
# http://green-waste.blog.163.com/blog/static/32677678200879111913911/
# title contains control chars: DC1, BS, DLE, DLE, DLE, DC1
# infoDict is defined outside this excerpt.
infoDict['title'] = removeCtlChr(infoDict['title']);
注意:关于控制字符
如果不了解什么是控制字符,请参考:ASCII字符集中的功能/控制字符
#------------------------------------------------------------------------------
# convert the string (named) entity to the unicode number entity
# refer: http://www.htmlhelp.com/reference/html40/entities/latin1.html
# TODO: need later use this htmlentitydefs instead following
def replaceStrEntToNumEnt(text):
    """Convert named HTML entities into decimal numeric entities.

    Examples: "&nbsp;" -> "&#160;", "&amp;" -> "&#38;", "&euro;" -> "&#8364;".

    The name-to-number mapping comes from the standard library entity table
    (htmlentitydefs on Python 2, html.entities on Python 3), which contains
    the Latin-1 and special entities and the remaining HTML 4 entities.
    Entity names are case sensitive ("&Aacute;" and "&aacute;" differ).
    Anything that looks like an entity but is not in the table is left
    unchanged.

    Args:
        text: string that may contain named entities.

    Returns:
        The text with every known "&name;" replaced by "&#N;".
    """
    # imported locally so the block does not rely on a file-level import
    try:
        from htmlentitydefs import name2codepoint  # Python 2
    except ImportError:
        from html.entities import name2codepoint  # Python 3

    def toNumEnt(match):
        codePoint = name2codepoint.get(match.group(1))
        if codePoint is None:
            # unknown name: keep the original text untouched
            return match.group(0)
        return "&#%d;" % codePoint

    # One pass over the text; the name is looked up as data, so no entity
    # name is ever interpreted as a regular expression.
    return re.sub(r"&([A-Za-z][A-Za-z0-9]*);", toNumEnt, text)
#------------------------------------------------------------------------------
# convert the xxx=yyy into tuple('xxx', yyy), then return the tuple value
# [makesure input string]
# (1) is not include whitespace
# (2) include '='
# (3) last is no ';'
# [possible input string]
# blogUserName="againinput4"
# publisherEmail=""
# synchMiniBlog=false
# publishTime=1322129849397
# publisherName=null
# publisherNickname="\u957F\u5927\u662F\u70E6\u607C"
def convertToTupleVal(equationStr):
    """Convert a "name=value" string into a (name, value) tuple.

    The string is split at the FIRST '=' only, because the value itself may
    contain '=' (for example a url ending in "==/0.jpg").

    Value conversion:
      - text wrapped in double quotes  -> the text without the quotes
      - null / false / true (any case) -> None / False / True
      - anything else                  -> int(value)

    Args:
        equationStr: string such as 'publishTime=1322129849397'; expected
            to contain '=', no whitespace and no trailing ';'.

    Returns:
        (key, value) on success.  ('', None) when the input has no '=' or
        the value cannot be converted; a message is printed in that case.
    """
    try:
        firstEqualPos = equationStr.find("=")
        if firstEqualPos < 0:
            # without this check find() == -1 would silently mis-split the
            # input (key loses its last char, value becomes the whole string)
            raise ValueError("no '=' found")
        key = equationStr[:firstEqualPos]
        valuePart = equationStr[(firstEqualPos + 1):]

        isQuoted = (len(valuePart) >= 2
                    and valuePart[0] == '"' and valuePart[-1] == '"')
        if isQuoted:
            value = str(valuePart[1:-1])
        else:
            keywordValues = {'null': None, 'false': False, 'true': True}
            lowered = valuePart.lower()
            if lowered in keywordValues:
                value = keywordValues[lowered]
            else:
                # not a string and not a keyword -> must be an int value
                value = int(valuePart)
    except (AttributeError, TypeError, ValueError):
        # parenthesised form prints the same text on Python 2 and Python 3
        print("Fail of convert the equal string %s to value" % (equationStr,))
        return ('', None)
    return (key, value)
例 2.13. convertToTupleVal的使用范例
# (4) turn every "name=value" string into its (key, value) pair;
# equationList is defined outside this excerpt
for equation in equationList:
    key, value = convertToTupleVal(equation)
#------------------------------------------------------------------------------
# remove the empty ones in list
def removeEmptyInList(list):
    """Return a new list holding only the truthy items of the input.

    Items that evaluate as false (such as '', u'', None, 0 or an empty
    container) are dropped; the order of the remaining items is preserved
    and the input itself is not modified.
    """
    # the parameter name shadows the builtin but is part of the interface
    return [item for item in list if item]
例 2.14. removeEmptyInList的使用范例
# Usage example for removeEmptyInList; infoDict is defined outside this excerpt.
# Note: some lists contain only [u''], which carries no meaningful value.
# Only [] counts as empty while [u''] does not, and that leads to an error
# while exporting to WXR -> so remove the empty items here.
infoDict['tags'] = removeEmptyInList(infoDict['tags']);
#------------------------------------------------------------------------------
# remove overlapped item in the list
def uniqueList(old_list):
    """Return a new list without duplicates, keeping first occurrences.

    Items are compared with the "in" operator on the result list, so they
    do not have to be hashable.  The input list is not modified.
    """
    deduped = []
    for item in old_list:
        if item in deduped:
            # already collected earlier -> skip the repeated item
            continue
        deduped.append(item)
    return deduped
#------------------------------------------------------------------------------
# for listToFilter, remove the ones which is in listToCompare
# also return the ones which is already exist in listToCompare
def filterList(listToFilter, listToCompare):
    """Split listToFilter by membership in listToCompare.

    Args:
        listToFilter: items to examine, processed in order.
        listToCompare: container tested with the "in" operator.

    Returns:
        (filteredList, existedList): the items NOT found in listToCompare,
        and the items that are found in it, each in their original order.
    """
    notPresent, alreadyPresent = [], []
    for item in listToFilter:
        # route each item to the list matching its membership
        target = alreadyPresent if item in listToCompare else notPresent
        target.append(item)
    return (notPresent, alreadyPresent)
例 2.16. filterList的使用范例
# Usage example for filterList: drop the urls that were already processed
# and also get the ones that had been processed before.
# nonOverlapList and gVal are defined outside this excerpt.
(filteredPicList, existedList) = filterList(nonOverlapList, gVal['processedUrlList']);
#------------------------------------------------------------------------------
# generated the random digits number string
# max digit number is 12
def randDigitsStr(digitNum=12):
    """Generate a string of random decimal digits.

    Each digit is drawn independently, so the result always has exactly the
    requested length and never contains anything but the characters 0-9.

    Args:
        digitNum: number of digits wanted.  Values above 12 are reduced to
            12 (the documented maximum); values below 0 are treated as 0.

    Returns:
        A string of min(digitNum, 12) random digits ('' for digitNum <= 0).
    """
    digitNum = max(0, min(digitNum, 12))
    return ''.join(random.choice('0123456789') for _ in range(digitNum))
#------------------------------------------------------------------------------
# convert tuple list to dict value
# [(u'type', u'text/javascript'), (u'src', u'http://partner.googleadservices.com/gampad/google_service.js')]
# { u'type':u'text/javascript', u'src':u'http://partner.googleadservices.com/gampad/google_service.js' }
def tupleListToDict(tupleList):
    """Build a dict from an iterable of (key, value) pairs.

    Example:
        [(u'type', u'text/javascript'), (u'src', u'http://x/y.js')]
        -> {u'type': u'text/javascript', u'src': u'http://x/y.js'}
    When a key occurs more than once, the value of the last pair wins.
    """
    # unpacking in the generator keeps the "exactly two items" requirement
    return dict((key, value) for (key, value) in tupleList)
例 2.18. tupleListToDict 的使用范例
# Usage example for tupleListToDict; singleContent is defined outside this excerpt.
# singleContent: name=script, attrMap=None, attrs=[(u'type', u'text/javascript'), (u'src', u'http://partner.googleadservices.com/gampad/google_service.js')]
attrsDict = tupleListToDict(singleContent.attrs);