
【Code Share】Python code: scrape_fishersci_com – simulate the search on www.fishersci.com, scrape the result info, and save it as csv


【Background】

Something I wrote earlier, to simulate the search on:

http://www.fishersci.com

and save the scraped information as a csv file.
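
Note: the script depends on my crifanLib helper library (bundled in the project download below). For reference, here is a minimal sketch of what its genFullUrl and getUrlRespHtml helpers are assumed to do, using only the Python 2 standard library; the real helpers additionally handle cookies, retries, encodings, etc.:

import urllib;
import urllib2;

def genFullUrl(baseUrl, paraDict):
    # assumed behavior: url-encode the parameters and join them to the base url
    # (so unlike the real script, pass the raw keyword here, not a pre-quoted one)
    return baseUrl + "?" + urllib.urlencode(paraDict);

def getUrlRespHtml(url):
    # assumed behavior: fetch the url and return the raw response html
    return urllib2.urlopen(url).read();

#usage:
#searchUrl = genFullUrl("http://www.fishersci.com/ecomm/servlet/Search",
#    {"keyWord": "small molecules inc", "store": "Scientific"});
#searchRespHtml = getUrlRespHtml(searchUrl);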

 

【scrape_fishersci_com Code Share】

1. Screenshots:

(1) Running effect:

[screenshot: scrape_fishersci_com run UI]

(2) Information saved as a csv file:

[screenshot: scrape_fishersci_com saved output csv file]

2. Python project code download:

scrape_fishersci_com_2013-01-16.7z

 

3. Code:

(1)scrape_fishersci_com.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------------------------------------
Function:
scrape www.fishersci.com
https://www.elance.com/j/extract-data-from-website/36621245/

Version:    2013-01-16
Author:     Crifan Li
Contact:    https://www.crifan.com/about/me/

-------------------------------------------------------------------------------
"""

#--------------------------------const values-----------------------------------

gConst = {
    "domain"    : "http://www.fishersci.com",
    
    "csvFilename" : "outputInfo.csv",
    "xls"   : {
        'fileName'  : "outputInfo.xls",
        'sheetName' : "outputInfo",
    },
};

gCfg = {

};

gVal = {

};

#---------------------------------import---------------------------------------
import os;
import re;
import sys;
sys.path.append("libs");
from BeautifulSoup import BeautifulSoup,Tag,CData;
import crifanLib;
import logging;
import urllib;
# import json;
import csv;
# import argparse;
# import codecs;
import xlwt;
import xlrd;
#import xlutils;
from xlutils.copy import copy;


def main():
    # #init csv file
    # # 'a+': read,write,append
    # # 'w' : clear before, then write
    # csvFp = open(gConst['csvFilename'], 'a+');
    # csvWriter = csv.writer(csvFp, dialect='excel');
    # # outputInfoDict = {
        # # 'Header'        : "",
        # # 'CatalogNumber' : "",
        # # 'PartNumber'    : "",
    # # };
    # csvWriter.writerow(["Header", "CatalogNumber", "PartNumber"]);
    # csvFp.close();
    

    #init xls file
    #styleBlueBkg= xlwt.easyxf('pattern: pattern solid, fore_colour sky_blue;');
    #styleBold   = xlwt.easyxf('font: bold on');
    styleBoldRed   = xlwt.easyxf('font: color-index red, bold on');
    headerStyle = styleBoldRed;
    wb = xlwt.Workbook();
    ws = wb.add_sheet(gConst['xls']['sheetName']);
    ws.write(0, 0, "Header",        headerStyle);
    ws.write(0, 1, "CatalogNumber", headerStyle);
    ws.write(0, 2, "PartNumber",    headerStyle);
    wb.save(gConst['xls']['fileName']);

    #init cookie
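    # (assumption: this crifanLib helper installs a urllib2 opener backed by a
    #  cookielib CookieJar, so later getUrlRespHtml calls carry cookies automatically)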
    crifanLib.initAutoHandleCookies();
    #crifanLib.initAutoHandleCookies("localCookieFile.txt");
    
    mainUrl = "http://www.fishersci.com/";
    
    #response = crifanLib.getUrlResponse(mainUrl);
    #logging.info("response=%s", response);
    #respInfo = response.info();
    #logging.info("respInfo=%s", respInfo);
    respHtml = crifanLib.getUrlRespHtml(mainUrl);
    #logging.info("respHtml=%s", respHtml);
    
    testSearchKeyword = "small molecules inc";
    logging.info("testSearchKeyword=%s", testSearchKeyword);
    encodedKeyword = urllib.quote_plus(testSearchKeyword);
    logging.debug("encodedKeyword=%s", encodedKeyword);
    #http://www.fishersci.com/ecomm/servlet/Search?keyWord=small+molecules+inc&store=Scientific&nav=0&offSet=0&storeId=10652&langId=-1&fromSearchPage=1&searchType=PROD
    searchBaseUrl = "http://www.fishersci.com/ecomm/servlet/Search";
    paraDict = {
        "keyWord"   : encodedKeyword,
        "store"     : "Scientific",
        "nav"       : "0",
        "offSet"    : "0",
        "storeId"   : "10652",
        "langId"    : "-1",
        "fromSearchPage": "1",
        "searchType"    : "PROD",
    };
    
    searchWholeUrl = crifanLib.genFullUrl(searchBaseUrl, paraDict);
    logging.debug("searchWholeUrl=%s", searchWholeUrl);
    searchRespHtml = crifanLib.getUrlRespHtml(searchWholeUrl);
    #logging.debug("searchRespHtml=%s", searchRespHtml);
    
    soup = BeautifulSoup(searchRespHtml);
    foundAllPtitleLink = soup.findAll(name="a", attrs={"class":"ptitlelink"});
    #logging.info("foundAllPtitleLink=%s", foundAllPtitleLink);
    logging.debug("len(foundAllPtitleLink)=%d", len(foundAllPtitleLink));
    outputInfoDictList = []; #init outside the if, so the xls-saving code below also works when nothing is found
    if(foundAllPtitleLink):
        for i, eachPtitleLink in enumerate(foundAllPtitleLink):
            outputInfoDict = {
                'Header'        : "",
                'CatalogNumber' : "",
                'PartNumber'    : "",
            };
            
            logging.info("%s", '{0:-^80}'.format(" "+str(i)+" "));
            # <A data-title-link = "177" class="ptitlelink" id="qa_srch_res_title_0" 
                # href="/ecomm/servlet/itemdetail?LBCID=08502235&itemdetail='item'&storeId=10652&productId=6701199&catalogId=29104&matchedCatNo=NC9670658&fromSearch=1&searchKey=small+molecules+inc&highlightProductsItemsFlag=Y&endecaSearchQuery=%23store%3DScientific%23nav%3D0%23rpp%3D25%23offSet%3D0%23keyWord%3Dsmall%2Bmolecules%2Binc&xrefPartType=From&savings=0.0&xrefEvent=1358155008902_0&searchType=PROD"
                # onClick="javascript:forceRankWebTrends('false','6701199');">
            # CBZ-3-PYRROLIDINONE 900G
            # </a>
        
            href = eachPtitleLink['href'];
            #logging.info("href=%s", href);
            singleUrl = gConst['domain'] + href;
            logging.debug("singleUrl=%s", singleUrl);
            
            respHtml = crifanLib.getUrlRespHtml(singleUrl);
            logging.debug("respHtml=%s", respHtml);
            
            #remove HTML comments (<!-- ... -->), otherwise they break the header text extraction below
            filterHtml = re.sub("(?s)<!--.*?-->", "", respHtml);
            #logging.debug("filterHtml=%s", filterHtml);
            soup = BeautifulSoup(filterHtml);
            
            #found values:
            
            # <h1 id="item_default_header_text">
                # <!-- display vendor name --><span class=search_highlight>SMALL MOLECULES INC</span>&nbsp;<!-- 30-char desciption --><!-- Item Diffentiator -->CBZ-3-PYRROLIDINONE 900G
            # </h1>
            
            foundHeaderText = soup.find(name="h1", attrs={"id":"item_default_header_text"});
            logging.debug("foundHeaderText=%s", foundHeaderText);
            if(foundHeaderText):
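                # (assumption: the three crifanLib helpers below (1) remove the
                #  <span class="search_highlight"> tag from the contents list,
                #  (2) join the remaining contents into one unicode string, and
                #  (3) decode HTML entities such as &nbsp;)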
                headerTextContents = foundHeaderText.contents;
                logging.debug("headerTextContents=%s", headerTextContents); #headerTextContents=[u'\n', <span class="search_highlight">SMALL MOLECULES INC</span>, u'&nbsp;CBZ-3-PYRROLIDINONE 900G\r\n\t        ']
                removedSpanContents = crifanLib.removeSoupContentsTagAttr(headerTextContents, "span", "class", "search_highlight");
                removedSpanUni = crifanLib.soupContentsToUnicode(removedSpanContents);
                filterHtmlEntUni = crifanLib.decodeHtmlEntity(removedSpanUni);
                logging.debug("filterHtmlEntUni=%s", filterHtmlEntUni);
                strippedUni = filterHtmlEntUni.strip();
                logging.debug("strippedUni=%s", strippedUni); #CBZ-3-PYRROLIDINONE 900G
                
                outputInfoDict['Header'] = strippedUni;
            else:
                logging.error("Can not find header text !!!");
                sys.exit(-1);
            #Catalog No.:NC9670658
            # <span id="qa_sku_cat_no_label">Catalog No.:
            #     <span id="qa_sku_cat_no">NC9670658</span>
            # </span>
            foundCatalogNo = soup.find(name="span", attrs={"id":"qa_sku_cat_no"});
            logging.debug("foundCatalogNo=%s", foundCatalogNo);
            if(foundCatalogNo):
                catalogNo = foundCatalogNo.string;
                logging.info("catalogNo=%s", catalogNo); #NC9670658
                
                outputInfoDict['CatalogNumber'] = catalogNo;
            else:
                logging.error("Can not category No !!!");
                sys.exit(-2);

            #find No.
            # <span id="qa_cat_details">
            #     <p id="item_default_intropara">
            #         <i>Fisher Scientific offers many products that do not appear in our catalogs. This may be one of those products, so pictures and detailed descriptions are not available. However, you may be able to order it by adding it to your shopping cart.</i>
            #     </p>
            #     CBZ-3-PYRROLIDINONE 900G
            #     <input type="hidden" name="nonCompliance" value="false">
            #     NC9670658
            #     <br>
            #     <br>No.:11-1240/900G
            # </span>
            
            # foundCatDetail = soup.find(name="span", attrs={"id":"qa_cat_details"});
            # logging.info("foundCatDetail=%s", foundCatDetail);
            # if(foundCatDetail):
                # catalogDetailUni = crifanLib.soupContentsToUnicode(foundCatDetail.contents);
                # logging.info("catalogDetailUni=%s", catalogDetailUni);
                # foundNo = re.search("No.:(?P<no>[\w\-/]+)", catalogDetailUni);
                # logging.info("foundNo=%s", foundNo);
                # if(foundNo):
                    # noUni = foundNo.group("no");
                    # logging.info("noUni=%s", noUni);
            
            #also can extract from:
            #<input type="hidden" name="partNum" value="11-1240/900G">
            foundPartNum = soup.find(name="input", attrs={"name":"partNum"});
            logging.debug("foundPartNum=%s", foundPartNum);
            if(foundPartNum):
                partNum = foundPartNum['value'];
                logging.info("partNum=%s", partNum);
                
                outputInfoDict['PartNumber'] = partNum;
            else:
                logging.error("Can not part num !!!");
                sys.exit(-3);
            
            #store
            outputInfoDictList.append(outputInfoDict);

    #open the existing xls file
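    # (xlwt can only create new .xls files; to append data rows under the header
    #  written earlier, re-open the file with xlrd, where formatting_info=True
    #  preserves the cell styles, then convert it to a writable workbook via xlutils.copy)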
    oldWb = xlrd.open_workbook(gConst['xls']['fileName'], formatting_info=True);
    newWb = copy(oldWb);
    newWs = newWb.get_sheet(0);
    #write info to xls
    logging.info("Now to save all extracted info into %s", gConst['xls']['fileName']);
    for idx,eachInfoDict in enumerate(outputInfoDictList):
        num = idx + 1;
        # outputInfoDict = {
            # 'Header'        : "",
            # 'CatalogNumber' : "",
            # 'PartNumber'    : "",
        # };
        newWs.write(num, 0, eachInfoDict['Header']);
        newWs.write(num, 1, eachInfoDict['CatalogNumber']);
        newWs.write(num, 2, eachInfoDict['PartNumber']);

    newWb.save(gConst['xls']['fileName']);
    logging.info("Successfully saved all data into %s", gConst['xls']['fileName']);

###############################################################################
if __name__=="__main__":
    scriptSelfName = crifanLib.extractFilename(sys.argv[0]);

    logging.basicConfig(
                    level    = logging.DEBUG,
                    format   = 'LINE %(lineno)-4d  %(levelname)-8s %(message)s',
                    datefmt  = '%m-%d %H:%M',
                    filename = scriptSelfName + ".log",
                    filemode = 'w');
    # define a Handler which writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler();
    console.setLevel(logging.INFO);
    # set a format which is simpler for console use
    formatter = logging.Formatter('LINE %(lineno)-4d : %(levelname)-8s %(message)s');
    # tell the handler to use this format
    console.setFormatter(formatter);
    logging.getLogger('').addHandler(console);
    try:
        main();
    except:
        logging.exception("Unknown Error !");
        raise;
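
To run it, just execute python scrape_fishersci_com.py. Full DEBUG output goes to a .log file named after the script, INFO-level messages are also echoed to the console, and the extracted Header / CatalogNumber / PartNumber values end up in outputInfo.xls.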

 

【Summary】

The script simulates a keyword search on www.fishersci.com, follows each result link (the a.ptitlelink anchors) to its item detail page, extracts the header text, the catalog number, and the part number there, and writes everything into outputInfo.xls. Note that although the title says csv, the csv-writing code in main() is commented out; the actual output file is the xls one.
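If you do want csv output, the commented-out block at the top of main() already sketches the header row; a minimal complete version, using only the standard csv module and a hypothetical saveToCsv helper, could look like this (on Python 2 the csv module needs a binary-mode file and byte strings):

import csv;

def saveToCsv(csvFilename, outputInfoDictList):
    # 'wb': the csv module on Python 2 requires a binary-mode file
    csvFp = open(csvFilename, 'wb');
    csvWriter = csv.writer(csvFp, dialect='excel');
    csvWriter.writerow(["Header", "CatalogNumber", "PartNumber"]);
    for eachInfoDict in outputInfoDictList:
        row = [eachInfoDict['Header'], eachInfoDict['CatalogNumber'], eachInfoDict['PartNumber']];
        # encode unicode values to utf-8 bytes before writing
        csvWriter.writerow([unicode(each).encode("utf-8") for each in row]);
    csvFp.close();

#usage: saveToCsv(gConst['csvFilename'], outputInfoDictList);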
