2.4.5. 获取URL返回的HTML网页(源码)内容: getUrlRespHtml

#------------------------------------------------------------------------------
# get response html==body from url
#def getUrlRespHtml(url, postDict={}, headerDict={}, timeout=0, useGzip=False) :
def getUrlRespHtml(url, postDict=None, headerDict=None, timeout=0, useGzip=True) :
    """Return the response body for url, gunzipped when the server gzip-encoded it.

    The request itself is performed by getUrlResponse(), which receives all
    arguments unchanged (a missing postDict/headerDict is passed as a new
    empty dict).

    Parameters:
        url        -- address to request.
        postDict   -- data dict handed to getUrlResponse(); None means empty.
        headerDict -- header dict handed to getUrlResponse(); None means empty.
        timeout    -- handed to getUrlResponse() as-is.
        useGzip    -- handed to getUrlResponse(); when true, the body is also
                      decompressed here if the response declares gzip encoding.

    Returns:
        The body as returned by resp.read(), decompressed if it was gzipped.

    Raises:
        zlib.error if the response declares gzip but the body is not valid
        gzip data; anything raised by getUrlResponse()/read() propagates.
    """
    # Use a fresh dict per call so nothing can leak between calls through a
    # shared mutable default argument.
    if(postDict is None) :
        postDict = {};
    if(headerDict is None) :
        headerDict = {};

    resp = getUrlResponse(url, postDict, headerDict, timeout, useGzip);
    try :
        respHtml = resp.read();
        # fetch the headers before closing the response
        respInfo = resp.info();
    finally :
        # always release the connection, even if read() failed
        resp.close();

    if(useGzip) :
        # Example response headers:
        # Server: nginx/1.0.8
        # Date: Sun, 08 Apr 2012 12:30:35 GMT
        # Content-Type: text/html
        # Transfer-Encoding: chunked
        # Connection: close
        # Vary: Accept-Encoding
        # ...
        # Content-Encoding: gzip

        # Sometimes the request asks for gzip,deflate but the server still
        # returns un-gzipped html
        # -> the response then has no "Content-Encoding: gzip" header
        # eg: http://blog.sina.com.cn/s/comment_730793bf010144j7_3.html
        # -> so only decompress when the data really is declared as gzip
        contentEncoding = respInfo.get("Content-Encoding", "") or "";
        # content-coding names are case-insensitive; "x-gzip" is an alias of "gzip"
        if(contentEncoding.strip().lower() in ("gzip", "x-gzip")) :
            # 16+MAX_WBITS makes zlib expect a gzip header and trailer
            respHtml = zlib.decompress(respHtml, 16+zlib.MAX_WBITS);

    return respHtml;
        

例 2.24. getUrlRespHtml的使用范例:不带额外参数

respHtml = getUrlRespHtml(url);
            


例 2.25. getUrlRespHtml的使用范例:带额外参数

modifyUrl = gVal['blogEntryUrl'] + "/blog/submit/modifyblog";
#logging.debug("Modify Url is %s", modifyUrl);

#http://hi.baidu.com/wwwhaseecom/blog/item/79188d1b4fa36f068718bf79.html
foundSpBlogID = re.search(r"blog/item/(?P<spBlogID>\w+?).html", url);
if(foundSpBlogID) :
    spBlogID = foundSpBlogID.group("spBlogID");
    logging.debug("Extracted spBlogID=%s", spBlogID);
else :
    modifyOk = False;
    errInfo = "Can't extract post spBlogID !";
    return (modifyOk, errInfo);

newPostContentGb18030 = newPostContentUni.encode("GB18030");
categoryGb18030 = infoDict['category'].encode("GB18030");
titleGb18030 = infoDict['title'].encode("GB18030");

postDict = {
    "bdstoken"      : gVal['spToken'],
    "ct"            : "1",
    "mms_flag"      : "0",
    "cm"            : "2",
    "spBlogID"      : spBlogID,
    "spBlogCatName_o": categoryGb18030, # old catagory
    "edithid"       : "",
    "previewImg"    : "",
    "spBlogTitle"   : titleGb18030,
    "spBlogText"    : newPostContentGb18030,
    "spBlogCatName" : categoryGb18030, # new catagory
    "spBlogPower"   : "0",
    "spIsCmtAllow"  : "1",
    "spShareNotAllow":"0",
    "spVcode"       : "",
    "spVerifyKey"   : "",
}
        
headerDict = {
    # 如果不添加Referer,则返回的html则会出现错误:"数据添加的一般错误"
    "Referer" : gVal['blogEntryUrl'] + "/blog/modify/" + spBlogID,
    }
respHtml = getUrlRespHtml(modifyUrl, postDict, headerDict);