【代码分享】Python代码:download_yupoo_pic – 下载www.yupoo.com图片并保存信息到csv

【背景】

之前写的,去抓取:

http://www.yupoo.com

中的图片,然后保存相关信息为csv。

 

【download_yupoo_pic代码分享】

1.截图:

(1)运行效果:

download_yupoo_pic.py run ui

(2)保存信息为csv文件:

download_yupoo_pic.py save csv file

(3)下载的图片:

download_yupoo_pic.py downloaded pic

 

2.Python项目代码下载:

download_yupoo_pic_2013-01-11.7z

 

3.代码分享:

(1)download_yupoo_pic.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------------------------------------
Function:
Python code for Downloading Pictures from Yupoo
https://www.elance.com/j/python-code-downloading-pictures-from-yupoo/36479285/

Python crawler for photos
https://www.elance.com/j/python-crawler-photos/36700192/

Version:    2013-01-11
Author:     Crifan Li
Contact:    http://www.crifan.com/about/me/

-------------------------------------------------------------------------------
"""
#--------------------------------const values-----------------------------------
# script version string, kept for reference only
__VERSION__ = "v16.8";

# fixed constants, never changed at runtime
gConst = {
    "yupooMainUrl"      : "http://www.yupoo.com",   # site root, prepended to relative pic page hrefs
    "picStorePath"      : "downloadedPictures",     # local dir where downloaded pictures are saved
    "fullOutputFilename": "csvoutput_template.csv", # csv file collecting extracted pic info
};

# runtime configuration, overwritten from command line arguments in main()
gCfg = {
    "maxPicUrlNumOnce"      : 0,    # max pic urls to gather per batch; 0 means unlimited
    "startPageNum"          : 1,    # first listing page to fetch for each tag
};

# mutable cross-call state
gVal = {
    "processedPicId"    : [],   # store processed picture id
                                # each one is userId-picId, eg: 
                                # 379879-87329678
                                # for http://www.yupoo.com/photos/shanshu/87329678/
    "outputFp"          : None,
};

#---------------------------------import---------------------------------------
import re;
import sys;
sys.path.append("libs");
from BeautifulSoup import BeautifulSoup,Tag,CData;
import crifanLib;
import logging;
import urllib;
import json;
import os;
import csv;
import argparse;
import codecs;

def findAllPicUrlFromTag(inputTagUtf8, startPageNum = 1):
    """
    Page through yupoo's ajax category listing for one tag, gathering the
    full url of every picture page found.

    Arguments:
        inputTagUtf8 -- tag name, UTF-8 encoded bytes
        startPageNum -- listing page to start from (default 1)

    Returns (allPicUrlList, hasMoreUrl, nextStartPageNum):
        allPicUrlList    -- accumulated full picture page urls
        hasMoreUrl       -- True when stopped early because the batch limit
                            gCfg['maxPicUrlNumOnce'] was exceeded
        nextStartPageNum -- page number to resume from when hasMoreUrl
    """
    allPicUrlList = [];
    hasMoreUrl = False;
    nextStartPageNum = 1;

    #eg: http://www.yupoo.com/ajax/explore/category_photos/?tag_name=%E5%8A%A8%E7%89%A9&page=2
    categoryMainUrl = "http://www.yupoo.com/ajax/explore/category_photos/";

    pageNum = startPageNum;
    while True:
        paraDict = {
            "tag_name": inputTagUtf8,
            "page":     pageNum,
        };
        logging.debug("paraDict=%s", paraDict);

        encodedQueryPara = urllib.urlencode(paraDict);
        logging.debug("encodedQueryPara=%s", encodedQueryPara);

        categoryFullUrl = categoryMainUrl + "?" + encodedQueryPara
        logging.debug("categoryFullUrl=%s", categoryFullUrl);

        categoryRespHtml = crifanLib.getUrlRespHtml(categoryFullUrl);

        if(not categoryRespHtml):
            logging.debug("resp html is empty, now break");
            break;

        #each picture page shows up as an anchor like:
        #<a class="large-box" href="/photos/janshiu/87584265/" target="_blank" ...>
        soup = BeautifulSoup(categoryRespHtml);
        foundAllLargebox = soup.findAll(name="a", attrs={"class":"large-box"});
        logging.debug("foundAllLargebox=%s", foundAllLargebox);
        if(not foundAllLargebox):
            logging.debug("can not found further pic, now break");
            break;

        for eachLargebox in foundAllLargebox:
            href = eachLargebox['href']; #eg: /photos/janshiu/87584265/
            logging.debug("href=%s", href);
            fullPicUrl = gConst['yupooMainUrl'] + href; #eg: http://www.yupoo.com/photos/janshiu/87584265/
            logging.debug("fullPicUrl=%s", fullPicUrl);
            allPicUrlList.append(fullPicUrl);

        hasGotPicUrlLen = len(allPicUrlList);
        logging.info("  page %d found %d pic url, total found %d", pageNum, len(foundAllLargebox), hasGotPicUrlLen);

        pageNum += 1;

        #honor the one-batch limit; 0 means no limit
        if((gCfg['maxPicUrlNumOnce'] != 0) and (hasGotPicUrlLen > gCfg['maxPicUrlNumOnce'])):
            nextStartPageNum = pageNum;
            hasMoreUrl = True;
            logging.info("  Has got %d pic url", hasGotPicUrlLen);
            break;

    return (allPicUrlList, hasMoreUrl, nextStartPageNum);
    
def processSinglePicUrl(singlePicUrl):
    """
    Process one yupoo picture page url:
        extract info: id, tags, picture url, date taken, date uploaded
        download the picture into gConst['picStorePath']

    Arguments:
        singlePicUrl -- full picture page url,
                        eg http://www.yupoo.com/photos/shanshu/87329678/

    Returns a dict with keys:
        URL, ID, Tags, DateTaken, DateUploaded, PictureFilename -- strings,
            left "" when the corresponding info could not be extracted
        omited -- True when this pic id was processed before (duplicate);
                  the caller should then skip this entry
    """
    outputInfoDict = {
        "URL"           : "",
        "ID"            : "",
        "Tags"          : "",
        "DateTaken"     : "",
        "DateUploaded"  : "",
        "PictureFilename": "",
        "omited"        : False,
    };

    logging.info("  process pic url %s", singlePicUrl);
    picRespHtml = crifanLib.getUrlRespHtml(singlePicUrl);

    #the page embeds its meta info as a JS object literal, eg:
    # uPai.page.photo ={id:'379879-87329678',owner:'379879',ownername:'shanshu',title:'IMG_3464',...,tags:[{name:'20121202', author: '379879'},...],owner:{id: 379879,username: 'shanshu',nickname: 'shanshu'}}
    # uPai.page.share = {};
    #NOTE: it is NOT valid JSON (unquoted keys, trailing commas, \xNN escapes),
    #so the fields are mined with regexes instead of json.loads
    foundPhotoJson = re.search(r"uPai\.page\.photo\s+=(?P<photoInfoJson>{id:.+,nickname:.+})\s+uPai\.page\.share", picRespHtml, re.S);
    logging.debug("foundPhotoJson=%s", foundPhotoJson);
    if(foundPhotoJson):
        photoInfoJson = foundPhotoJson.group("photoInfoJson");
        logging.debug("photoInfoJson=%s", photoInfoJson);

        #extract id, format is userId-picId, eg 379879-87329678
        foundPicId = re.search(r"id:'(?P<id>\d+-\d+)'", photoInfoJson);
        logging.debug("foundPicId=%s", foundPicId);
        if(foundPicId):
            picId = foundPicId.group("id");
            logging.debug("picId=%s", picId);

            outputInfoDict['ID'] = picId;

            #check duplicated or not
            if(outputInfoDict['ID'] in gVal['processedPicId']):
                outputInfoDict['omited'] = True;
                logging.info("    omit pic url %s, for duplicated pic id %s", singlePicUrl, outputInfoDict['ID']);
                return outputInfoDict;
            else:
                gVal['processedPicId'].append(outputInfoDict['ID']);

        #extract tags
        foundTagsList = re.findall(r"\{name\s*:\s*'(?P<tagName>[^']+?)',\s*author:\s*'[^']+?'\}", photoInfoJson);
        logging.debug("foundTagsList=%s", foundTagsList);
        if(foundTagsList):
            tagsStr = crifanLib.genListStr(foundTagsList);
            logging.debug("tagsStr=%s", tagsStr);

            outputInfoDict['Tags'] = tagsStr;
        else:
            #looks like an error: the pic was found FROM a tag search yet its
            #own page shows no tags; some pages really have none, eg:
            #http://www.yupoo.com/photos/hombbb/87596561/
            logging.error("Not found tag for pic url %s !!!", singlePicUrl);

    #find picture URL, eg:
    #<div id="photo_container" style="width: 428px;"><img id="photo_img" src="http://photo.yupoo.com/shanshu/CsFzMuHz/medish.jpg" ... class="Photo"/></div>
    soup = BeautifulSoup(picRespHtml);
    foundPhotoContainer = soup.find(name="div", attrs={"id":"photo_container"});
    logging.debug("foundPhotoContainer=%s", foundPhotoContainer);
    if(foundPhotoContainer):
        img = foundPhotoContainer.img;
        logging.debug("img=%s", img);
        if(img):
            picUrl = img['src'];
            logging.debug("picUrl=%s", picUrl);

            outputInfoDict['URL'] = picUrl;

            #generate pic filename as <bucket>_<key><suffix>,
            #eg shanshu_CsFzMuHz.jpg from .../shanshu/CsFzMuHz/medish.jpg
            slashList = picUrl.split("/");
            logging.debug("slashList=%s", slashList);
            picName = slashList[-3] + "_" + slashList[-2];
            #os.path.splitext yields "" when the url has no suffix;
            #the previous regex wrongly returned the whole url in that case
            pointSuffix = os.path.splitext(picUrl)[1];
            picName += pointSuffix;
            logging.debug("picName=%s", picName);
            picFullName = os.path.join(gConst['picStorePath'], picName);
            logging.debug("picFullName=%s", picFullName);

            logging.info("    Downloading picture %s", picUrl);
            crifanLib.downloadFile(picUrl, picFullName);

            outputInfoDict['PictureFilename'] = picName;

    #date taken, eg:
    #<a class="plain" href="/photos/shanshu/archives/date-taken/2012/12/02/" ...>2012-12-02</a>
    foundDateTaken = soup.find(name="a", attrs={"class":"plain", "href":re.compile("/photos/[\w/]+?/date-taken/[\w/]+")});
    logging.debug("foundDateTaken=%s", foundDateTaken);
    if(foundDateTaken):
        dateTakenStr = foundDateTaken.string;
        logging.debug("dateTakenStr=%s", dateTakenStr);
        outputInfoDict['DateTaken'] = dateTakenStr;
    else:
        logging.debug("Not found date taken for pic url %s", singlePicUrl);

    #date posted, eg:
    #<a class="plain" href="/photos/shanshu/archives/date-posted/2012/12/03/" ...>2012-12-03</a>
    foundDatePosted = soup.find(name="a", attrs={"class":"plain", "href":re.compile("/photos/[\w/]+?/date-posted/[\w/]+")});
    logging.debug("foundDatePosted=%s", foundDatePosted);
    if(foundDatePosted):
        datePostedStr = foundDatePosted.string;
        logging.debug("datePostedStr=%s", datePostedStr);
        outputInfoDict['DateUploaded'] = datePostedStr;
    else:
        logging.debug("Not found date uploaded for pic url %s", singlePicUrl);

    return outputInfoDict;
    
def processSingleTag(num, inputTagUni):
    """
    Process one tag:
        find all matching pic urls (in batches limited by gCfg['maxPicUrlNumOnce'])
        for each pic url: download the picture and extract its info,
            omitting duplicated pics
        append the extracted info rows to the output csv file

    Arguments:
        num         -- 1-based ordinal of this tag, only used for logging
        inputTagUni -- tag name as a unicode string
    """
    logging.info("%s", '{0:=^80}'.format(" "+str(num)+" "));
    logging.info("Processing for single tag %s", inputTagUni);

    #the ajax listing endpoint expects the tag UTF-8 encoded
    inputTagUtf8 = inputTagUni.encode("UTF-8");

    needFindMore = True;
    curPageNum = gCfg['startPageNum'];
    while(needFindMore):
        logging.info("1. Find pic url");
        (allPicUrlList, hasMoreUrl, nextStartPageNum) = findAllPicUrlFromTag(inputTagUtf8, curPageNum);
        logging.debug("allPicUrlList=%s", allPicUrlList);

        if(not allPicUrlList):
            needFindMore = False;
            logging.info("2. Not found any pic url");
            continue;

        needFindMore = hasMoreUrl;
        curPageNum = nextStartPageNum;

        logging.info("2. download picture and extract info from each pic url");
        #process each pic url, skipping duplicated ones
        picInfoDictList = [];
        for eachPicUrl in allPicUrlList:
            singlePicInfoDict = processSinglePicUrl(eachPicUrl);
            if(not singlePicInfoDict['omited']):
                picInfoDictList.append(singlePicInfoDict);

        logging.info("3. output extracted info");
        #append one csv row per pic; 'with' guarantees the file is closed
        #even if writing a row raises
        fieldKeyList = ["URL", "ID", "Tags", "DateTaken", "DateUploaded", "PictureFilename"];
        with open(gConst['fullOutputFilename'], 'a+') as outputFp:
            csvWriter = csv.writer(outputFp, dialect='excel');
            for eachInfoDict in picInfoDictList:
                fieldList = [eachInfoDict[eachKey] for eachKey in fieldKeyList];
                logging.debug("fieldList=%s", fieldList);
                csvWriter.writerow(fieldList);
    return;
    
def main():
    """
    Parse command line arguments, prepare the picture dir and output csv,
    build the tag list (from -l string or from the tag file), then process
    each tag in turn.
    """
    newParser = argparse.ArgumentParser(description="Extract yupoo picture info and download picture from input tag.");
    newParser.add_argument("-f", "--tagFile", dest="tagFile", default="tags.txt", help="File that store tags, separated by comma");
    newParser.add_argument("-l", "--tagListStr", dest="tagListStr", help="in command para, separated by comma");
    newParser.add_argument("-m", "--maxPicUrlNumOnce", dest="maxPicUrlNumOnce", type=int, default=30, help="max number of found pic url, then go to process");
    newParser.add_argument("-s", "--startPageNum", dest="startPageNum", type=int, default=1, help="start page number for found pic url of input tag, especially used for debug purpose");
    args = newParser.parse_args();

    #read the parsed args directly instead of the old
    #exec("name = args.name") trick, which is fragile and opaque
    tagFile = args.tagFile;
    tagListStr = args.tagListStr;
    gCfg['maxPicUrlNumOnce'] = args.maxPicUrlNumOnce;
    gCfg['startPageNum'] = args.startPageNum;

    #init output dir
    if(not os.path.isdir(gConst['picStorePath'])):
        os.makedirs(gConst['picStorePath']); # create dir recursively

    #init tag list
    if(tagListStr):
        inputStrEnc = crifanLib.getStrPossibleCharset(tagListStr);
        logging.info("inputStrEnc=%s", inputStrEnc);
        tagListStrUni = tagListStr.decode(inputStrEnc);
        logging.info("From input string, tags=%s", tagListStrUni);
        tagListUni = tagListStrUni.split(",");
    else:
        logging.debug("Tag from file %s", tagFile);
        if(not os.path.isfile(tagFile)):
            logging.error("File %s not exist !!!", tagFile);
            #previously only logged, then crashed on open below
            return;

        tagFp = codecs.open(tagFile, 'r', 'utf-8');
        tagListStrFromFile = tagFp.read();
        logging.debug("tagListStrFromFile=%s", tagListStrFromFile);
        tagFp.close();
        logging.info("Read from file %s, tags=%s", tagFile, tagListStrFromFile);
        tagListUni = tagListStrFromFile.split(",");

    #init output csv file
    #only write the header row when starting a new/empty file: the file is
    #opened in append mode, so an unconditional header (as before) inserted
    #a duplicated header row on every run
    outputFilename = gConst['fullOutputFilename'];
    needHeader = (not os.path.isfile(outputFilename)) or (os.path.getsize(outputFilename) == 0);
    if(needHeader):
        with open(outputFilename, 'a+') as outputFp:
            csvWriter = csv.writer(outputFp, dialect='excel');
            csvWriter.writerow(["URL", "ID", "Tags", "DateTaken", "DateUploaded", "PictureFilename"]);

    #init cookie
    crifanLib.initAutoHandleCookies();

    #go to process each tag (tagListUni entries are unicode)
    logging.debug("tagListUni=%s", tagListUni);
    for idx,eachTagUni in enumerate(tagListUni):
        num = idx + 1;
        processSingleTag(num, eachTagUni);
    
###############################################################################
if __name__=="__main__":
    # log file is named after this script, eg download_yupoo_pic.py.log
    scriptSelfName = crifanLib.extractFilename(sys.argv[0]);

    # root logger: full DEBUG detail goes to the log file, overwritten each run
    logging.basicConfig(
                    level    = logging.DEBUG,
                    format   = 'LINE %(lineno)-4d  %(levelname)-8s %(message)s',
                    datefmt  = '%m-%d %H:%M',
                    filename = scriptSelfName + ".log",
                    filemode = 'w');
    # define a Handler which writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler();
    console.setLevel(logging.INFO);
    # set a format which is simpler for console use
    formatter = logging.Formatter('LINE %(lineno)-4d : %(levelname)-8s %(message)s');
    # tell the handler to use this format
    console.setFormatter(formatter);
    logging.getLogger('').addHandler(console);
    try:
        main();
    except:
        # log the full traceback, then re-raise so the exit code is non-zero
        logging.exception("Unknown Error !");
        raise;

 

【总结】



发表评论

电子邮件地址不会被公开。 必填项已用*标注

无觅相关文章插件,快速提升流量