【代码分享】Python代码:37959390_data_scraping_from_website – 下载www.autopartoo.com中图片并保存图片信息为csv

【背景】

之前写的,去下载:

http://www.autopartoo.com

中的图片,并且保存图片信息为csv文件。

 

【37959390_data_scraping_from_website代码分享】

1.截图:

(1)运行效果:

37959390_data_scraping_from_website.py run ui

(2)下载的图片:

37959390_data_scraping_from_website downloaded pics

(3)记录图片信息保存为csv文件:

37959390_data_scraping_from_website.py save out csv file

2.Python项目代码下载:

37959390_data_scraping_from_website_2013-02-17.7z

3.代码分享:

(1)37959390_data_scraping_from_website.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------------------------------------
Function:
Data scraping from a website
https://www.elance.com/j/data-scraping-from-website/37959390/

Version:    2013-02-17
Author:     Crifan Li
Contact:    admin@crifan.com

-------------------------------------------------------------------------------
"""
#--------------------------------const values-----------------------------------
__VERSION__ = "v1.0";

gConst = {
    "csvFilename" : "outputInfo.csv",
    "xls"   : {
        'fileName'  : "outputInfo.xls",
        'sheetName' : "outputInfo",
    },
    'picStorePath'  : "downloadedPictures",
};

gCfg = {
};

gVal = {

};

#---------------------------------import---------------------------------------
import re;
import sys;
sys.path.append("libs");
from BeautifulSoup import BeautifulSoup,Tag,CData;
import crifanLib;
import logging;
import urllib;
import json;
import os;
#import argparse;
import codecs;

import csv;
import xlwt;
import xlrd;
#import xlutils;
from xlutils.copy import copy;

def scrapeVehicleInfos():
    """
    Scrape vehicle related info (part No. header text and the product
    picture) from www.autopartoo.com for brand AUDI, download each picture
    into gConst['picStorePath'], then write the scraped values into the
    existing xls file gConst['xls']['fileName'] (created by main()).

    Side effects: HTTP requests, picture files on disk, xls file rewritten.
    """
    logging.info("Here just to demo to scrape some info and download pictures");

    #search for AUDI: the brand id 504 comes from
    #"<option value='504'>AUDI</option>" in http://www.autopartoo.com/js/search.js
    searchAudiUrl = "http://www.autopartoo.com/search/vehiclesearch.aspx?braid=504&date=0&modid=0&typid=0&class1=0&class2=0&class3=0";
    searchAudiRespHtml = crifanLib.getUrlRespHtml(searchAudiUrl);
    logging.debug("searchAudiRespHtml=%s", searchAudiRespHtml);

    #each search result entry looks like:
    # <li class="SupplierList01">
    # <h2><strong><a href="/oem/magneti-marelli/944280189200.html" target="_blank">
    # MAGNETI MARELLI 944280189200 </strong></a>
    # <span>(7 manufacturers found)</span></h2>
    soup = BeautifulSoup(searchAudiRespHtml);
    foundSupplierList01 = soup.findAll("li", {"class":"SupplierList01"});
    logging.debug("foundSupplierList01=%s", foundSupplierList01);

    #bugfix: init the list unconditionally -- previously it was created only
    #inside the if branch, so the xls-saving loop below raised NameError when
    #no "SupplierList01" entry was found
    supplierDictList = [];
    if(foundSupplierList01):
        supplierList01Len = len(foundSupplierList01);
        logging.info("supplierList01Len=%s", supplierList01Len);

        for eachSupplierList in foundSupplierList01:
            supplierDict = {
                'Header'    : "",
                'DownloadedPictureFilename' : "",
            };

            #extract the header text, eg: MAGNETI MARELLI 944280189200
            h2StrongAString = eachSupplierList.h2.strong.a.string;
            logging.debug("h2StrongAString=%s", h2StrongAString);
            filteredTitle = h2StrongAString.strip();
            logging.info("filteredTitle=%s", filteredTitle);
            supplierDict['Header'] = filteredTitle;

            #extract the picture url, eg:
            #http://file.autopartoo.com/oeimage/2012/7/6/931/633242140s.JPG
            foundImgSrc = eachSupplierList.find("img");
            logging.debug("foundImgSrc=%s", foundImgSrc);
            src = foundImgSrc['src'];
            logging.info("src=%s", src);

            #bugfix: raw string for the regex, so \w is a real regex escape
            foundFilename = re.search(r"\w+\.\w{2,4}$", src);
            logging.debug("foundFilename=%s", foundFilename);
            if(foundFilename):
                imgFilename = foundFilename.group(0);
                logging.info("imgFilename=%s", imgFilename);
                supplierDict['DownloadedPictureFilename'] = imgFilename;
                crifanLib.downloadFile(src, os.path.join(gConst['picStorePath'], imgFilename), needReport=True);
            else:
                #robustness: skip the download instead of crashing with
                #AttributeError on .group(0) when src has an unexpected form
                logging.warning("can not extract filename from src=%s", src);

            supplierDictList.append(supplierDict);

    #open the existing xls file, keeping its formatting, and append the data
    logging.info("Saving scraped info ...");
    oldWb = xlrd.open_workbook(gConst['xls']['fileName'], formatting_info=True); #xlrd.book.Book
    newWb = copy(oldWb); #xlwt.Workbook.Workbook
    newWs = newWb.get_sheet(0);

    #row 0 holds the header written by main(), so data rows start at 1
    for index, supplierDict in enumerate(supplierDictList):
        rowIndex = index + 1;
        newWs.write(rowIndex, 0, supplierDict['Header']);
        newWs.write(rowIndex, 1, supplierDict['DownloadedPictureFilename']);

    #save (bugfix: removed the stray 'lllll' statement that raised NameError here)
    newWb.save(gConst['xls']['fileName']);
    
    
def main():
    """
    Prepare output locations then run the scraper:
      1. ensure the picture download folder exists
      2. create the output xls file with a styled header row
      3. enable automatic cookie handling for subsequent HTTP requests
      4. scrape the vehicle info
    """
    #create output dir if necessary
    if not os.path.isdir(gConst['picStorePath']):
        os.makedirs(gConst['picStorePath']); # create dir recursively

    #init xls file: write the two header cells in bold red
    styleBoldRed = xlwt.easyxf('font: color-index red, bold on');
    headerStyle = styleBoldRed;
    wb = xlwt.Workbook();
    ws = wb.add_sheet(gConst['xls']['sheetName']);
    ws.write(0, 0, "Header",   headerStyle);
    ws.write(0, 1, "DownloadedPictureFilename",   headerStyle);
    wb.save(gConst['xls']['fileName']);

    #init cookie handling so later requests keep session cookies
    crifanLib.initAutoHandleCookies();

    #do main job
    scrapeVehicleInfos();
    
    
###############################################################################
if __name__ == "__main__":
    scriptSelfName = crifanLib.extractFilename(sys.argv[0]);

    #full DEBUG log goes into a per-script .log file, overwritten each run
    logging.basicConfig(
        level    = logging.DEBUG,
        format   = 'LINE %(lineno)-4d  %(levelname)-8s %(message)s',
        datefmt  = '%m-%d %H:%M',
        filename = scriptSelfName + ".log",
        filemode = 'w');

    #mirror INFO and above onto the console with a simpler format
    consoleHandler = logging.StreamHandler();
    consoleHandler.setLevel(logging.INFO);
    consoleHandler.setFormatter(logging.Formatter('LINE %(lineno)-4d : %(levelname)-8s %(message)s'));
    logging.getLogger('').addHandler(consoleHandler);

    try:
        main();
    except:
        #record the full traceback, then let the script die with it
        logging.exception("Unknown Error !");
        raise;
 

【总结】



发表评论

电子邮件地址不会被公开。 必填项已用*标注

无觅相关文章插件,快速提升流量