【Code Share】Python code: scrape_fishersci_com – simulate a search on www.fishersci.com, scrape the results, and save them to csv

【Background】

Something I wrote earlier, to simulate searching on:

http://www.fishersci.com

and save the scraped information to a file. (Note: the version posted below actually writes the output to .xls; the earlier csv output code is left commented out inside main().)
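
The core of the "simulation" is just an HTTP GET against the site's Search servlet with the right query parameters. Below is a minimal sketch of that idea using only the Python 2 standard library, with the parameter values copied from a real search URL; the full script further down does the same thing through my crifanLib helpers, plus cookie handling and result parsing:

#!/usr/bin/python
# -*- coding: utf-8 -*-
# minimal sketch: simulate a www.fishersci.com search with stdlib only (Python 2)
import urllib;
import urllib2;

searchBaseUrl = "http://www.fishersci.com/ecomm/servlet/Search";
paraDict = {
    "keyWord"       : "small molecules inc",
    "store"         : "Scientific",
    "nav"           : "0",
    "offSet"        : "0",
    "storeId"       : "10652",
    "langId"        : "-1",
    "fromSearchPage": "1",
    "searchType"    : "PROD",
};
# urllib.urlencode applies the same quote_plus style encoding used in the full script
searchWholeUrl = searchBaseUrl + "?" + urllib.urlencode(paraDict);
req = urllib2.Request(searchWholeUrl);
req.add_header("User-Agent", "Mozilla/5.0"); # look like a normal browser
respHtml = urllib2.urlopen(req).read();
print("got %d bytes of search result html" % len(respHtml));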

 

【scrape_fishersci_com Code Share】

1. Screenshots:

(1) Run output:

scrape_fishersci_com run ui

(2) Saving the info to a csv file:

scrape_fishersci_com save out csv file

2. Python project code download:

scrape_fishersci_com_2013-01-16.7z

 

3. Code:

(1) scrape_fishersci_com.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------------------------------------
Function:
scrape www.fishersci.com
https://www.elance.com/j/extract-data-from-website/36621245/

Version:    2013-01-16
Author:     Crifan Li
Contact:    http://www.crifan.com/about/me/

-------------------------------------------------------------------------------
"""

#--------------------------------const values-----------------------------------

gConst = {
    "domain"    : "http://www.fishersci.com",
    
    "csvFilename" : "outputInfo.csv",
    "xls"   : {
        'fileName'  : "outputInfo.xls",
        'sheetName' : "outputInfo",
    },
};

gCfg = {

};

gVal = {

};

#---------------------------------import---------------------------------------
import os;
import re;
import sys;
sys.path.append("libs");
from BeautifulSoup import BeautifulSoup,Tag,CData;
import crifanLib;
import logging;
import urllib;
# import json;
import csv;
# import argparse;
# import codecs;
import xlwt;
import xlrd;
#import xlutils;
from xlutils.copy import copy;


def main():
    # #init csv file
    # # 'a+': read,write,append
    # # 'w' : clear before, then write
    # csvFp = open(gConst['csvFilename'], 'a+');
    # csvWriter = csv.writer(csvFp, dialect='excel');
    # # outputInfoDict = {
        # # 'Header'        : "",
        # # 'CatalogNumber' : "",
        # # 'PartNumber'    : "",
    # # };
    # csvWriter.writerow(["Header", "CatalogNumber", "PartNumber"]);
    # csvFp.close();
    

    #init xls file
    #styleBlueBkg= xlwt.easyxf('pattern: pattern solid, fore_colour sky_blue;');
    #styleBold   = xlwt.easyxf('font: bold on');
    styleBoldRed   = xlwt.easyxf('font: color-index red, bold on');
    headerStyle = styleBoldRed;
    wb = xlwt.Workbook();
    ws = wb.add_sheet(gConst['xls']['sheetName']);
    ws.write(0, 0, "Header",        headerStyle);
    ws.write(0, 1, "CatalogNumber", headerStyle);
    ws.write(0, 2, "PartNumber",    headerStyle);
    wb.save(gConst['xls']['fileName']);
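    #the header-only workbook now exists on disk; near the end of main() it is
    #re-opened with xlrd so the extracted data rows can be appended under this header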

    #init cookie
    crifanLib.initAutoHandleCookies();
    #crifanLib.initAutoHandleCookies("localCookieFile.txt");
    
    #fetch the homepage first, so that the just-initialized cookie handling picks up the session cookies
    mainUrl = "http://www.fishersci.com/";
    
    #response = crifanLib.getUrlResponse(mainUrl);
    #logging.info("response=%s", response);
    #respInfo = response.info();
    #logging.info("respInfo=%s", respInfo);
    respHtml = crifanLib.getUrlRespHtml(mainUrl);
    #logging.info("respHtml=%s", respHtml);
    
    testSearchKeyword = "small molecules inc";
    logging.info("testSearchKeyword=%s", testSearchKeyword);
    encodedKeyword = urllib.quote_plus(testSearchKeyword);
    logging.debug("encodedKeyword=%s", encodedKeyword);
    #http://www.fishersci.com/ecomm/servlet/Search?keyWord=small+molecules+inc&store=Scientific&nav=0&offSet=0&storeId=10652&langId=-1&fromSearchPage=1&searchType=PROD
    searchBaseUrl = "http://www.fishersci.com/ecomm/servlet/Search";
    paraDict = {
        "keyWord"   : encodedKeyword,
        "store"     : "Scientific",
        "nav"       : "0",
        "offSet"    : "0",
        "storeId"   : "10652",
        "langId"    : "-1",
        "fromSearchPage": "1",
        "searchType"    : "PROD",
    };
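    # genFullUrl (from crifanLib) joins the base URL and the parameter dict into
    # the complete query URL, like the sample Search URL shown above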
    
    searchWholeUrl = crifanLib.genFullUrl(searchBaseUrl, paraDict);
    logging.debug("searchWholeUrl=%s", searchWholeUrl);
    searchRespHtml = crifanLib.getUrlRespHtml(searchWholeUrl);
    #logging.debug("searchRespHtml=%s", searchRespHtml);
    
    soup = BeautifulSoup(searchRespHtml);
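    # each product in the search results is an <a class="ptitlelink"> whose
    # href points to its item detail page (a sample tag is shown in the loop below)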
    foundAllPtitleLink = soup.findAll(name="a", attrs={"class":"ptitlelink"});
    #logging.info("foundAllPtitleLink=%s", foundAllPtitleLink);
    logging.debug("len(foundAllPtitleLink)=%d", len(foundAllPtitleLink));
    #init the result list up front, so the xls-writing loop below never hits an undefined name
    outputInfoDictList = [];
    if(foundAllPtitleLink):
        for i, eachPtitleLink in enumerate(foundAllPtitleLink):
            outputInfoDict = {
                'Header'        : "",
                'CatalogNumber' : "",
                'PartNumber'    : "",
            };
            
            logging.info("%s", '{0:-^80}'.format(" "+str(i)+" "));
            # <A data-title-link = "177" class="ptitlelink" id="qa_srch_res_title_0" 
                # href="/ecomm/servlet/itemdetail?LBCID=08502235&itemdetail='item'&storeId=10652&productId=6701199&catalogId=29104&matchedCatNo=NC9670658&fromSearch=1&searchKey=small+molecules+inc&highlightProductsItemsFlag=Y&endecaSearchQuery=%23store%3DScientific%23nav%3D0%23rpp%3D25%23offSet%3D0%23keyWord%3Dsmall%2Bmolecules%2Binc&xrefPartType=From&savings=0.0&xrefEvent=1358155008902_0&searchType=PROD"
                # onClick="javascript:forceRankWebTrends('false','6701199');">
            # CBZ-3-PYRROLIDINONE 900G
            # </a>
        
            href = eachPtitleLink['href'];
            #logging.info("href=%s", href);
            singleUrl = gConst['domain'] + href;
            logging.debug("singleUrl=%s", singleUrl);
            
            respHtml = crifanLib.getUrlRespHtml(singleUrl);
            logging.debug("respHtml=%s", respHtml);
            
            #remove HTML comments (<!-- ... -->), which otherwise get mixed into the extracted header text
            filterHtml = re.sub("<!--?[\w \t&\./:\(\)\-'\\!*]+-->", "", respHtml);
            #logging.debug("filterHtml=%s", filterHtml);
            soup = BeautifulSoup(filterHtml);
            
            #found values:
            
            # <h1 id="item_default_header_text">
                # <!-- display vendor name --><span class=search_highlight>SMALL MOLECULES INC</span>&nbsp;<!-- 30-char desciption --><!-- Item Diffentiator -->CBZ-3-PYRROLIDINONE 900G
            # </h1>
            

            # <h1 id="item_default_header_text">
            #     <span class=search_highlight>SMALL MOLECULES INC</span>&nbsp;CBZ-3-PYRROLIDINONE 900G
            # </h1>

            foundHeaderText = soup.find(name="h1", attrs={"id":"item_default_header_text"});
            logging.debug("foundHeaderText=%s", foundHeaderText);
            if(foundHeaderText):
                headerTextContents = foundHeaderText.contents;
                logging.debug("headerTextContents=%s", headerTextContents); #headerTextContents=[u'\n', <span class="search_highlight">SMALL MOLECULES INC</span>, u'&nbsp;CBZ-3-PYRROLIDINONE 900G\r\n\t        ']
                removedSpanContents = crifanLib.removeSoupContentsTagAttr(headerTextContents, "span", "class", "search_highlight");
                removedSpanUni = crifanLib.soupContentsToUnicode(removedSpanContents);
                filterHtmlEntUni = crifanLib.decodeHtmlEntity(removedSpanUni);
                logging.debug("filterHtmlEntUni=%s", filterHtmlEntUni);
                strippedUni = filterHtmlEntUni.strip();
                logging.debug("strippedUni=%s", strippedUni); #CBZ-3-PYRROLIDINONE 900G
                
                outputInfoDict['Header'] = strippedUni;
            else:
                logging.error("Can not find header text !!!");
                sys.exit(-1);
            #Catalog No.:NC9670658
            # <span id="qa_sku_cat_no_label">Catalog No.:
            #     <span id="qa_sku_cat_no">NC9670658</span>
            # </span>
            foundCatalogNo = soup.find(name="span", attrs={"id":"qa_sku_cat_no"});
            logging.debug("foundCatalogNo=%s", foundCatalogNo);
            if(foundCatalogNo):
                catalogNo = foundCatalogNo.string;
                logging.info("catalogNo=%s", catalogNo); #NC9670658
                
                outputInfoDict['CatalogNumber'] = catalogNo;
            else:
                logging.error("Can not category No !!!");
                sys.exit(-2);

            #find No.
            # <span id="qa_cat_details">
            #     <p id="item_default_intropara">
            #         <i>Fisher Scientific offers many products that do not appear in our catalogs. This may be one of those products, so pictures and detailed descriptions are not available. However, you may be able to order it by adding it to your shopping cart.</i>
            #     </p>
            #     CBZ-3-PYRROLIDINONE 900G
            #     <input type="hidden" name="nonCompliance" value="false">
            #     NC9670658
            #     <br>
            #     <br>No.:11-1240/900G
            # </span>
            
            # foundCatDetail = soup.find(name="span", attrs={"id":"qa_cat_details"});
            # logging.info("foundCatDetail=%s", foundCatDetail);
            # if(foundCatDetail):
                # catalogDetailUni = crifanLib.soupContentsToUnicode(foundCatDetail.contents);
                # logging.info("catalogDetailUni=%s", catalogDetailUni);
                # foundNo = re.search("No.:(?P<no>[\w\-/]+)", catalogDetailUni);
                # logging.info("foundNo=%s", foundNo);
                # if(foundNo):
                    # noUni = foundNo.group("no");
                    # logging.info("noUni=%s", noUni);
            
            #also can extract from:
            #<input type="hidden" name="partNum" value="11-1240/900G">
            foundPartNum = soup.find(name="input", attrs={"name":"partNum"});
            logging.debug("foundPartNum=%s", foundPartNum);
            if(foundPartNum):
                partNum = foundPartNum['value'];
                logging.info("partNum=%s", partNum);
                
                outputInfoDict['PartNumber'] = partNum;
            else:
                logging.error("Can not part num !!!");
                sys.exit(-3);
            
            #store
            outputInfoDictList.append(outputInfoDict);

    #open existed xls file
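    #(xlwt can only create new workbooks, so to append rows we re-open the saved
    # file with xlrd, where formatting_info=True keeps the header styles, and
    # turn it into a writable copy via xlutils.copy)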
    oldWb = xlrd.open_workbook(gConst['xls']['fileName'], formatting_info=True);
    newWb = copy(oldWb);
    newWs = newWb.get_sheet(0);
    #write info to xls
    logging.info("Now to save all extracted info into %s", gConst['xls']['fileName']);
    for idx,eachInfoDict in enumerate(outputInfoDictList):
        num = idx + 1;
        # outputInfoDict = {
            # 'Header'        : "",
            # 'CatalogNumber' : "",
            # 'PartNumber'    : "",
        # };
        newWs.write(num, 0, eachInfoDict['Header']);
        newWs.write(num, 1, eachInfoDict['CatalogNumber']);
        newWs.write(num, 2, eachInfoDict['PartNumber']);

    newWb.save(gConst['xls']['fileName']);
    logging.info("Successfully saved all data into %s", gConst['xls']['fileName']);

###############################################################################
if __name__=="__main__":
    scriptSelfName = crifanLib.extractFilename(sys.argv[0]);

    logging.basicConfig(
                    level    = logging.DEBUG,
                    format   = 'LINE %(lineno)-4d  %(levelname)-8s %(message)s',
                    datefmt  = '%m-%d %H:%M',
                    filename = scriptSelfName + ".log",
                    filemode = 'w');
    # define a Handler which writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler();
    console.setLevel(logging.INFO);
    # set a format which is simpler for console use
    formatter = logging.Formatter('LINE %(lineno)-4d : %(levelname)-8s %(message)s');
    # tell the handler to use this format
    console.setFormatter(formatter);
    logging.getLogger('').addHandler(console);
    try:
        main();
    except:
        logging.exception("Unknown Error !");
        raise;

 

【Summary】
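
The overall flow of scrape_fishersci_com.py: build the Search servlet URL from the keyword plus the fixed parameters, fetch the result page, collect every <a class="ptitlelink"> with BeautifulSoup, visit each item detail page, extract the header text, the catalog number (span qa_sku_cat_no) and the part number (the hidden partNum input), then append all rows to outputInfo.xls via xlrd + xlutils.

If you want the csv output mentioned in the title instead of xls, the commented-out csv code in main() can be re-enabled; here is a standalone sketch of the same idea (the example row is copied from the sample product shown above):

# -*- coding: utf-8 -*-
# sketch: save the extracted info dicts to csv instead of xls (Python 2)
import csv;

# example row, copied from the sample product shown in the log output above
outputInfoDictList = [
    {
        'Header'        : "CBZ-3-PYRROLIDINONE 900G",
        'CatalogNumber' : "NC9670658",
        'PartNumber'    : "11-1240/900G",
    },
];

csvFp = open("outputInfo.csv", 'wb'); # 'wb' avoids extra blank lines on Windows
csvWriter = csv.writer(csvFp, dialect='excel');
csvWriter.writerow(["Header", "CatalogNumber", "PartNumber"]);
for infoDict in outputInfoDictList:
    csvWriter.writerow([infoDict['Header'], infoDict['CatalogNumber'], infoDict['PartNumber']]);
csvFp.close();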


