#------------------------------------------------------------------------------
# depends on chardet
# check whether strToDect is an ASCII string
def strIsAscii(strToDect):
    """Tell whether chardet classifies strToDect as ASCII.

    strToDect is passed unchanged to chardet.detect(). The result is True
    only when the reported confidence is above 0.9 and the reported
    encoding is 'ascii'; in every other case the result is False.
    """
    detected = chardet.detect(strToDect)
    # Require both a high-confidence verdict and the 'ascii' label.
    return (detected['confidence'] > 0.9) and (detected['encoding'] == 'ascii')
例 2.27. strIsAscii的使用范例
if not strIsAscii(extractedBlogUser):
    # A non-ASCII blog user (e.g. http://hi.baidu.com/资料收集) has to be
    # percent-quoted here, otherwise the later output to WXR will fail.
    extractedBlogUser = urllib.quote(extractedBlogUser)
下面的代码通过判断chardet检测结果的可信度(confidence)是否大于0.5,来决定是否采用检测到的字符编码;否则默认返回ascii。使用者可根据自己的需要,把0.5改为自己想要的阈值,比如0.8等。
#------------------------------------------------------------------------------
# get the possible (possibility > 0.5) charset of input string
def getStrPossibleCharset(inputStr, minConfidence=0.5):
    """Return the charset chardet detects for inputStr, or "ascii".

    inputStr is passed unchanged to chardet.detect(). The detected
    encoding is returned only when its confidence is strictly greater
    than minConfidence; otherwise the fallback "ascii" is returned.

    minConfidence defaults to 0.5, the threshold this function has always
    used. Callers who want a stricter decision can pass a higher value,
    for example 0.8.
    """
    encInfo = chardet.detect(inputStr)
    detectedCharset = encInfo['encoding']
    # Also require a non-empty encoding, so that a string is always
    # returned even if the caller supplies a very low threshold.
    if detectedCharset and encInfo['confidence'] > minConfidence:
        return detectedCharset
    return "ascii"
例 2.28. getStrPossibleCharset的使用范例
# Detect the most likely charset of the JSON text and log it before the
# next decode attempt.
validCharset = getStrPossibleCharset(dataJsonStr)
logging.debug(
    "Now try use the detected charset %s to decode it again",
    validCharset,
)