2.4. 网络方面的函数

2.4.1. 检查/判断/校验网络上某个文件是否有效:isFileValid

#------------------------------------------------------------------------------
# check file validation:
# open file url to check return info is match or not
# with exception support
# note: should handle while the file url is redirect
# eg :
# http://publish.it168.com/2007/0627/images/500754.jpg ->
# http://img.publish.it168.com/2007/0627/images/500754.jpg
# other special one:
# sina pic url: 
# http://s14.sinaimg.cn/middle/3d55a9b7g9522d474a84d&690
# http://s14.sinaimg.cn/orignal/3d55a9b7g9522d474a84d
# the real url is same with above url
def isFileValid(fileUrl) :
    fileIsValid = False;
    errReason = "Unknown error";

    try :
        #print "original fileUrl=",fileUrl;
        origFileName = fileUrl.split('/')[-1];
        #print "origFileName=",origFileName;
        
        #old: https://ie2zeq.bay.livefilestore.com/y1mo7UWr-TrmqbBhkw52I0ii__WE6l2UtMRSTZHSky66-uDxnCdKPr3bdqVrpUcQHcoJLedlFXa43bvCp_O0zEGF3JdG_yZ4wRT-c2AQmJ_TNcWvVZIXfBDgGerouWyx19WpA4I0XQR1syRJXjDNpwAbQ/IMG_5214_thumb[1].jpg
        #new: https://kxoqva.bay.livefilestore.com/y1mQlGjwNAYiHKoH5Aw6TMNhsCmX2YDR3vPKnP86snuqQEtnZgy3dHkwUvZ61Ah8zU3AGiS4whmm_ADrvxdufEAfMGo56KjLdhIbosn9F34olQ/IMG_5214_thumb%5b1%5d.jpg
        unquotedOrigFilenname = urllib.unquote(origFileName);
        #print "unquotedOrigFilenname=",unquotedOrigFilenname
        lowUnquotedOrigFilename = unquotedOrigFilenname.lower();
        #print "lowUnquotedOrigFilename=",lowUnquotedOrigFilename;
        
        resp = urllib2.urlopen(fileUrl, timeout=gConst['defaultTimeout']); # note: Python 2.6 has added timeout support.
        #print "resp=",resp;
        realUrl = resp.geturl();
        #print "realUrl=",realUrl;
        newFilename = realUrl.split('/')[-1];
        #print "newFilename=",newFilename;
        
        #http://blog.sina.com.cn/s/blog_696e50390100ntxs.html
        unquotedNewFilename = urllib.unquote(newFilename);
        #print "unquotedNewFilename=",unquotedNewFilename;
        unquotedLowNewFilename = unquotedNewFilename.lower();
        #print "unquotedLowNewFilename=",unquotedLowNewFilename;
        
        respInfo = resp.info();
        #print "respInfo=",respInfo;
        respCode = resp.getcode();
        #print "respCode=",respCode;

        # special:
        # http://116.img.pp.sohu.com/images/blog/2007/5/24/17/24/11355bf42a9.jpg
        # return no content-length
        #contentLen = respInfo['Content-Length'];
        
        # for redirect, if returned size>0 and filename is same, also should be considered valid
        #if (origFileName == newFilename) and (contentLen > 0):
        # for redirect, if returned response code is 200(OK) and filename is same, also should be considered valid
        #if (origFileName == newFilename) and (respCode == 200):
        if (lowUnquotedOrigFilename == unquotedLowNewFilename) and (respCode == 200):
            fileIsValid = True;
        else :
            fileIsValid = False;
            
            # eg: Content-Type= image/gif, ContentTypes : audio/mpeg
            # more ContentTypes can refer: http://kenya.bokee.com/3200033.html
            contentType = respInfo['Content-Type'];
        
            errReason = "file url returned info: type=%s, len=%d, realUrl=%s"%(contentType, contentLen, realUrl);
    except urllib2.URLError,reason :
        fileIsValid = False;
        errReason = reason;
    except urllib2.HTTPError,code :
        fileIsValid = False;
        errReason = code;
    except :
        fileIsValid = False;
        errReason = "Unknown error";

    # here type(errReason)= <class 'urllib2.HTTPError'>, so just convert it to str
    errReason = str(errReason);
    return (fileIsValid, errReason);
        

例 2.20. isFileValid的使用范例

# indeed is pic, process it
(picIsValid, errReason) = isFileValid(curUrl);
            

2.4.2. 下载网络上某个文件:downloadFile

#------------------------------------------------------------------------------
# download from fileUrl then save to fileToSave
# with exception support
# note: the caller should make sure the fileUrl is a valid internet resource/file
def downloadFile(fileUrl, fileToSave, needReport = False) :
    isDownOK = False;
    downloadingFile = '';

    #---------------------------------------------------------------------------
    # note: totalFileSize -> may be -1 on older FTP servers which do not return a file size in response to a retrieval request
    def reportHook(copiedBlocks, blockSize, totalFileSize) :
        #global downloadingFile
        if copiedBlocks == 0 : # 1st call : once on establishment of the network connection
            print 'Begin to download %s, total size=%d'%(downloadingFile, totalFileSize);
        else : # rest call : once after each block read thereafter
            print 'Downloaded bytes: %d' % ( blockSize * copiedBlocks);
        return;
    #---------------------------------------------------------------------------

    try :
        if fileUrl :
            downloadingFile = fileUrl;
            if needReport :
                urllib.urlretrieve(fileUrl, fileToSave, reportHook);
            else :
                urllib.urlretrieve(fileUrl, fileToSave);
            isDownOK = True;
        else :
            print "Input download file url is NULL";
    except urllib.ContentTooShortError(msg) :
        isDownOK = False;
    except :
        isDownOK = False;

    return isDownOK;
        

例 2.21. downloadFile的使用范例

if dstPicFile and downloadFile(curUrl, dstPicFile) : 
    # replace old url with new url
            

2.4.3. (不用urlretrieve)手动从网络上下载单个文件:manuallyDownloadFile

#------------------------------------------------------------------------------
# manually download fileUrl then save to fileToSave
def manuallyDownloadFile(fileUrl, fileToSave) :
    isDownOK = False;
    downloadingFile = '';

    try :
        if fileUrl :
            # 1. find real address
            #print "fileUrl=",fileUrl;
            resp = urllib2.urlopen(fileUrl, timeout=gConst['defaultTimeout']);
            #print "resp=",resp;
            realUrl = resp.geturl(); # not same with original file url if redirect
            
            # if url is invalid, then add timeout can avoid dead
            respHtml = getUrlRespHtml(realUrl, useGzip=False, timeout=gConst['defaultTimeout']);
            
            isDownOK = saveBinDataToFile(respHtml, fileToSave);
        else :
            print "Input download file url is NULL";
    except urllib.ContentTooShortError(msg) :
        isDownOK = False;
    except :
        isDownOK = False;

    return isDownOK;
        

例 2.22. manuallyDownloadFile的使用范例

#if dstPicFile and downloadFile(curUrl, dstPicFile) : 
# urlretrieve in downloadFile is too slow while download QQ Space Picture
# so here use manuallyDownloadFile instead
if dstPicFile and manuallyDownloadFile(curUrl, dstPicFile) :
    # replace old url with new url
            

2.4.4. 获得Url地址的响应:getUrlResponse

#------------------------------------------------------------------------------
# get response from url
# note: if you have already used cookiejar, then here will automatically use it
# while using rllib2.Request
def getUrlResponse(url, postDict={}, headerDict={}, timeout=0, useGzip=False) :
    # makesure url is string, not unicode, otherwise urllib2.urlopen will error
    url = str(url);

    if (postDict) :
        postData = urllib.urlencode(postDict);
        req = urllib2.Request(url, postData);
        req.add_header('Content-Type', "application/x-www-form-urlencoded");
    else :
        req = urllib2.Request(url);

    if(headerDict) :
        #print "added header:",headerDict;
        for key in headerDict.keys() :
            req.add_header(key, headerDict[key]);

    defHeaderDict = {
        'User-Agent'    : gConst['userAgentIE9'],
        'Cache-Control' : 'no-cache',
        'Accept'        : '*/*',
        'Connection'    : 'Keep-Alive',
    };

    # add default headers firstly
    for eachDefHd in defHeaderDict.keys() :
        #print "add default header: %s=%s"%(eachDefHd,defHeaderDict[eachDefHd]);
        req.add_header(eachDefHd, defHeaderDict[eachDefHd]);

    if(useGzip) :
        #print "use gzip for",url;
        req.add_header('Accept-Encoding', 'gzip, deflate');

    # add customized header later -> allow overwrite default header 
    if(headerDict) :
        #print "added header:",headerDict;
        for key in headerDict.keys() :
            req.add_header(key, headerDict[key]);

    if(timeout > 0) :
        # set timeout value if necessary
        resp = urllib2.urlopen(req, timeout=timeout);
    else :
        resp = urllib2.urlopen(req);
    
    return resp;
        

例 2.23. getUrlResponse的使用范例

resp = getUrlResponse(url, postDict, headerDict, timeout, useGzip);
respHtml = resp.read();
            

2.4.5. 获得Url返回的HTML网页(源码)内容:getUrlRespHtml

#------------------------------------------------------------------------------
# get response html==body from url
#def getUrlRespHtml(url, postDict={}, headerDict={}, timeout=0, useGzip=False) :
def getUrlRespHtml(url, postDict={}, headerDict={}, timeout=0, useGzip=True) :
    resp = getUrlResponse(url, postDict, headerDict, timeout, useGzip);
    respHtml = resp.read();
    if(useGzip) :
        #print "---before unzip, len(respHtml)=",len(respHtml);
        respInfo = resp.info();
        
        # Server: nginx/1.0.8
        # Date: Sun, 08 Apr 2012 12:30:35 GMT
        # Content-Type: text/html
        # Transfer-Encoding: chunked
        # Connection: close
        # Vary: Accept-Encoding
        # ...
        # Content-Encoding: gzip
        
        # sometime, the request use gzip,deflate, but actually returned is un-gzip html
        # -> response info not include above "Content-Encoding: gzip"
        # eg: http://blog.sina.com.cn/s/comment_730793bf010144j7_3.html
        # -> so here only decode when it is indeed is gziped data
        if( ("Content-Encoding" in respInfo) and (respInfo['Content-Encoding'] == "gzip")) :
            respHtml = zlib.decompress(respHtml, 16+zlib.MAX_WBITS);
            #print "+++ after unzip, len(respHtml)=",len(respHtml);

    return respHtml;
        

例 2.24. getUrlRespHtml的使用范例:不带额外参数

respHtml = getUrlRespHtml(url);
            

例 2.25. getUrlRespHtml的使用范例:带额外参数

modifyUrl = gVal['blogEntryUrl'] + "/blog/submit/modifyblog";
#logging.debug("Modify Url is %s", modifyUrl);

#http://hi.baidu.com/wwwhaseecom/blog/item/79188d1b4fa36f068718bf79.html
foundSpBlogID = re.search(r"blog/item/(?P<spBlogID>\w+?).html", url);
if(foundSpBlogID) :
    spBlogID = foundSpBlogID.group("spBlogID");
    logging.debug("Extracted spBlogID=%s", spBlogID);
else :
    modifyOk = False;
    errInfo = "Can't extract post spBlogID !";
    return (modifyOk, errInfo);

newPostContentGb18030 = newPostContentUni.encode("GB18030");
categoryGb18030 = infoDict['category'].encode("GB18030");
titleGb18030 = infoDict['title'].encode("GB18030");

postDict = {
    "bdstoken"      : gVal['spToken'],
    "ct"            : "1",
    "mms_flag"      : "0",
    "cm"            : "2",
    "spBlogID"      : spBlogID,
    "spBlogCatName_o": categoryGb18030, # old catagory
    "edithid"       : "",
    "previewImg"    : "",
    "spBlogTitle"   : titleGb18030,
    "spBlogText"    : newPostContentGb18030,
    "spBlogCatName" : categoryGb18030, # new catagory
    "spBlogPower"   : "0",
    "spIsCmtAllow"  : "1",
    "spShareNotAllow":"0",
    "spVcode"       : "",
    "spVerifyKey"   : "",
}
        
headerDict = {
    # 如果不添加Referer,则返回的html则会出现错误:"数据添加的一般错误"
    "Referer" : gVal['blogEntryUrl'] + "/blog/modify/" + spBlogID,
    }
respHtml = getUrlRespHtml(modifyUrl, postDict, headerDict);
            

2.4.6. 检查(所返回的)cookieJar中,是否所有的cookie都存在:checkAllCookiesExist

因为成功登录某网页后,一般都会有对应的cookie返回,所以常用此函数去判断是否成功登录某网页。

#------------------------------------------------------------------------------
# check all cookies in cookiesDict is exist in cookieJar or not
def checkAllCookiesExist(cookieNameList, cookieJar) :
    cookiesDict = {};
    for eachCookieName in cookieNameList :
        cookiesDict[eachCookieName] = False;
    
    allCookieFound = True;
    for cookie in cookieJar :
        if(cookie.name in cookiesDict) :
            cookiesDict[cookie.name] = True;
    
    for eachCookie in cookiesDict.keys() :
        if(not cookiesDict[eachCookie]) :
            allCookieFound = False;
            break;

    return allCookieFound;
        

例 2.26. checkAllCookiesExist的使用范例

#http://www.darlingtree.com/wordpress/archives/242
gVal['cj'] = cookielib.CookieJar();

opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(gVal['cj']));
urllib2.install_opener(opener);
resp = urllib2.urlopen(baiduSpaceEntryUrl);

loginBaiduUrl = "https://passport.baidu.com/?login";
#username=%D0%C4%C7%E9%C6%DC%CF%A2%B5%D8&password=xxx&mem_pass=on
postDict = {
    'username'  : username,
    'password'  : password,
    'mem_pass'  : 'on',
    };
resp = getUrlResponse(loginBaiduUrl, postDict);

# check whether the cookie is OK
cookieNameList = ["USERID", "PTOKEN", "STOKEN"];
loginOk = checkAllCookiesExist(cookieNameList, gVal['cj']);
if (not loginOk) :
    logging.error("Login fail for not all expected cookies exist !");
    return loginOk;