【Background】
A script I wrote earlier to simulate the search on www.fishersci.com and save the scraped information as a csv file.
【scrape_fishersci_com Code Share】
1. Screenshots:
(1) Running output:
(2) Scraped information saved as a csv file:
2. Python project code download:
scrape_fishersci_com_2013-01-16.7z
3. Code:
(1) scrape_fishersci_com.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------------------------------------
Function:   scrape www.fishersci.com
            https://www.elance.com/j/extract-data-from-website/36621245/
Version:    2013-01-16
Author:     Crifan Li
Contact:    https://www.crifan.com/about/me/
-------------------------------------------------------------------------------
"""

#--------------------------------const values-----------------------------------
gConst = {
    "domain"      : "http://www.fishersci.com",
    "csvFilename" : "outputInfo.csv",
    "xls" : {
        'fileName'  : "outputInfo.xls",
        'sheetName' : "outputInfo",
    },
};

gCfg = {
};

gVal = {
};

#---------------------------------import---------------------------------------
import os;
import re;
import sys;
sys.path.append("libs");
from BeautifulSoup import BeautifulSoup,Tag,CData;
import crifanLib;
import logging;
import urllib;
# import json;
import csv;
# import argparse;
# import codecs;
import xlwt;
import xlrd;
#import xlutils;
from xlutils.copy import copy;

def main():
    # #init csv file
    # # 'a+': read,write,append
    # # 'w' : clear before, then write
    # csvFp = open(gConst['csvFilename'], 'a+');
    # csvWriter = csv.writer(csvFp, dialect='excel');
    # # outputInfoDict = {
    # #     'Header'        : "",
    # #     'CatalogNumber' : "",
    # #     'PartNumber'    : "",
    # # };
    # csvWriter.writerow(["Header", "CatalogNumber", "PartNumber"]);
    # csvFp.close();

    #init xls file
    #styleBlueBkg = xlwt.easyxf('pattern: pattern solid, fore_colour sky_blue;');
    #styleBold    = xlwt.easyxf('font: bold on');
    styleBoldRed = xlwt.easyxf('font: color-index red, bold on');
    headerStyle = styleBoldRed;
    wb = xlwt.Workbook();
    ws = wb.add_sheet(gConst['xls']['sheetName']);
    ws.write(0, 0, "Header",        headerStyle);
    ws.write(0, 1, "CatalogNumber", headerStyle);
    ws.write(0, 2, "PartNumber",    headerStyle);
    wb.save(gConst['xls']['fileName']);

    #init cookie
    crifanLib.initAutoHandleCookies();
    #crifanLib.initAutoHandleCookies("localCookieFile.txt");

    mainUrl = "http://www.fishersci.com/";
    #response = crifanLib.getUrlResponse(mainUrl);
    #logging.info("response=%s", response);
    #respInfo = response.info();
    #logging.info("respInfo=%s", respInfo);
    respHtml = crifanLib.getUrlRespHtml(mainUrl);
    #logging.info("respHtml=%s", respHtml);

    testSearchKeyword = "small molecules inc";
    logging.info("testSearchKeyword=%s", testSearchKeyword);
    encodedKeyword = urllib.quote_plus(testSearchKeyword);
    logging.debug("encodedKeyword=%s", encodedKeyword);
    #http://www.fishersci.com/ecomm/servlet/Search?keyWord=small+molecules+inc&store=Scientific&nav=0&offSet=0&storeId=10652&langId=-1&fromSearchPage=1&searchType=PROD
    searchBaseUrl = "http://www.fishersci.com/ecomm/servlet/Search";
    paraDict = {
        "keyWord"       : encodedKeyword,
        "store"         : "Scientific",
        "nav"           : "0",
        "offSet"        : "0",
        "storeId"       : "10652",
        "langId"        : "-1",
        "fromSearchPage": "1",
        "searchType"    : "PROD",
    };
    searchWholeUrl = crifanLib.genFullUrl(searchBaseUrl, paraDict);
    logging.debug("searchWholeUrl=%s", searchWholeUrl);
    searchRespHtml = crifanLib.getUrlRespHtml(searchWholeUrl);
    #logging.debug("searchRespHtml=%s", searchRespHtml);
    soup = BeautifulSoup(searchRespHtml);
    foundAllPtitleLink = soup.findAll(name="a", attrs={"class":"ptitlelink"});
    #logging.info("foundAllPtitleLink=%s", foundAllPtitleLink);
    logging.debug("len(foundAllPtitleLink)=%d", len(foundAllPtitleLink));
    if(foundAllPtitleLink):
        outputInfoDictList = [];
        for i, eachPtitleLink in enumerate(foundAllPtitleLink):
            outputInfoDict = {
                'Header'        : "",
                'CatalogNumber' : "",
                'PartNumber'    : "",
            };
            logging.info("%s", '{0:-^80}'.format(" " + str(i) + " "));

            # <A data-title-link = "177" class="ptitlelink" id="qa_srch_res_title_0"
            #     href="/ecomm/servlet/itemdetail?LBCID=08502235&itemdetail='item'&storeId=10652&productId=6701199&catalogId=29104&matchedCatNo=NC9670658&fromSearch=1&searchKey=small+molecules+inc&highlightProductsItemsFlag=Y&endecaSearchQuery=%23store%3DScientific%23nav%3D0%23rpp%3D25%23offSet%3D0%23keyWord%3Dsmall%2Bmolecules%2Binc&xrefPartType=From&savings=0.0&xrefEvent=1358155008902_0&searchType=PROD"
            #     onClick="javascript:forceRankWebTrends('false','6701199');">
            #     CBZ-3-PYRROLIDINONE 900G
            # </a>
            href = eachPtitleLink['href'];
            #logging.info("href=%s", href);
            singleUrl = gConst['domain'] + href;
            logging.debug("singleUrl=%s", singleUrl);
            respHtml = crifanLib.getUrlRespHtml(singleUrl);
            logging.debug("respHtml=%s", respHtml);
            #remove <!-- ... -->
            filterHtml = re.sub("<!--?[\w \t&\./:\(\)\-'\\!*]+-->", "", respHtml);
            #logging.debug("filterHtml=%s", filterHtml);
            soup = BeautifulSoup(filterHtml);

            #found values:
            # <h1 id="item_default_header_text">
            #     <!-- display vendor name --><span class=search_highlight>SMALL MOLECULES INC</span> <!-- 30-char desciption --><!-- Item Diffentiator -->CBZ-3-PYRROLIDINONE 900G
            # </h1>
            # <h1 id="item_default_header_text">
            #     <span class=search_highlight>SMALL MOLECULES INC</span> CBZ-3-PYRROLIDINONE 900G
            # </h1>
            foundHeaderText = soup.find(name="h1", attrs={"id":"item_default_header_text"});
            logging.debug("foundHeaderText=%s", foundHeaderText);
            if(foundHeaderText):
                headerTextContents = foundHeaderText.contents;
                logging.debug("headerTextContents=%s", headerTextContents);
                #headerTextContents=[u'\n', <span class="search_highlight">SMALL MOLECULES INC</span>, u' CBZ-3-PYRROLIDINONE 900G\r\n\t ']
                removedSpanContents = crifanLib.removeSoupContentsTagAttr(headerTextContents, "span", "class", "search_highlight");
                removedSpanUni = crifanLib.soupContentsToUnicode(removedSpanContents);
                filterHtmlEntUni = crifanLib.decodeHtmlEntity(removedSpanUni);
                logging.debug("filterHtmlEntUni=%s", filterHtmlEntUni);
                strippedUni = filterHtmlEntUni.strip();
                logging.debug("strippedUni=%s", strippedUni);
                #CBZ-3-PYRROLIDINONE 900G
                outputInfoDict['Header'] = strippedUni;
            else:
                logging.error("Can not find header text !!!");
                sys.exit(-1);

            #Catalog No.:NC9670658
            # <span id="qa_sku_cat_no_label">Catalog No.:
            #     <span id="qa_sku_cat_no">NC9670658</span>
            # </span>
            foundCatalogNo = soup.find(name="span", attrs={"id":"qa_sku_cat_no"});
            logging.debug("foundCatalogNo=%s", foundCatalogNo);
            if(foundCatalogNo):
                catalogNo = foundCatalogNo.string;
                logging.info("catalogNo=%s", catalogNo);
                #NC9670658
                outputInfoDict['CatalogNumber'] = catalogNo;
            else:
                logging.error("Can not find catalog No !!!");
                sys.exit(-2);

            #find No.
            # <span id="qa_cat_details">
            #     <p id="item_default_intropara">
            #         <i>Fisher Scientific offers many products that do not appear in our catalogs. This may be one of those products, so pictures and detailed descriptions are not available. However, you may be able to order it by adding it to your shopping cart.</i>
            #     </p>
            #     CBZ-3-PYRROLIDINONE 900G
            #     <input type="hidden" name="nonCompliance" value="false">
            #     NC9670658
            #     <br>
            #     <br>No.:11-1240/900G
            # </span>
            # foundCatDetail = soup.find(name="span", attrs={"id":"qa_cat_details"});
            # logging.info("foundCatDetail=%s", foundCatDetail);
            # if(foundCatDetail):
            #     catalogDetailUni = crifanLib.soupContentsToUnicode(foundCatDetail.contents);
            #     logging.info("catalogDetailUni=%s", catalogDetailUni);
            #     foundNo = re.search("No.:(?P<no>[\w\-/]+)", catalogDetailUni);
            #     logging.info("foundNo=%s", foundNo);
            #     if(foundNo):
            #         noUni = foundNo.group("no");
            #         logging.info("noUni=%s", noUni);

            #also can extract from:
            #<input type="hidden" name="partNum" value="11-1240/900G">
            foundPartNum = soup.find(name="input", attrs={"name":"partNum"});
            logging.debug("foundPartNum=%s", foundPartNum);
            if(foundPartNum):
                partNum = foundPartNum['value'];
                logging.info("partNum=%s", partNum);
                outputInfoDict['PartNumber'] = partNum;
            else:
                logging.error("Can not find part num !!!");
                sys.exit(-3);

            #store
            outputInfoDictList.append(outputInfoDict);

        #open existing xls file
        oldWb = xlrd.open_workbook(gConst['xls']['fileName'], formatting_info=True);
        newWb = copy(oldWb);
        newWs = newWb.get_sheet(0);

        #write info to xls
        logging.info("Now to save all extracted info into %s", gConst['xls']['fileName']);
        for idx, eachInfoDict in enumerate(outputInfoDictList):
            num = idx + 1;
            # outputInfoDict = {
            #     'Header'        : "",
            #     'CatalogNumber' : "",
            #     'PartNumber'    : "",
            # };
            newWs.write(num, 0, eachInfoDict['Header']);
            newWs.write(num, 1, eachInfoDict['CatalogNumber']);
            newWs.write(num, 2, eachInfoDict['PartNumber']);
        newWb.save(gConst['xls']['fileName']);
        logging.info("Successfully saved all data into %s", gConst['xls']['fileName']);

###############################################################################
if __name__ == "__main__":
    scriptSelfName = crifanLib.extractFilename(sys.argv[0]);

    logging.basicConfig(
        level    = logging.DEBUG,
        format   = 'LINE %(lineno)-4d  %(levelname)-8s %(message)s',
        datefmt  = '%m-%d %H:%M',
        filename = scriptSelfName + ".log",
        filemode = 'w');
    # define a Handler which writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler();
    console.setLevel(logging.INFO);
    # set a format which is simpler for console use
    formatter = logging.Formatter('LINE %(lineno)-4d : %(levelname)-8s %(message)s');
    # tell the handler to use this format
    console.setFormatter(formatter);
    logging.getLogger('').addHandler(console);

    try:
        main();
    except:
        logging.exception("Unknown Error !");
        raise;
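Note: the script depends on my crifanLib helper module (bundled in the .7z download above, under libs/). For readers who only have this listing, here is a minimal, hypothetical sketch of the four network/path helpers the script calls; it is NOT the real crifanLib code. The real versions also handle gzip-compressed responses, retries, and charset detection, and initAutoHandleCookies can optionally persist cookies to a local file, which this sketch ignores. The BeautifulSoup cleanup helpers (removeSoupContentsTagAttr, soupContentsToUnicode, decodeHtmlEntity) are omitted here for brevity.

# Hypothetical minimal stand-ins for the crifanLib helpers used above
# (Python 2; treat as a readability sketch, not the real implementations)
import os
import urllib2
import cookielib

def initAutoHandleCookies(localCookieFile=None):
    # install a global urllib2 opener that keeps cookies across requests;
    # the localCookieFile argument is accepted but ignored in this sketch
    cookieJar = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
    urllib2.install_opener(opener)

def getUrlRespHtml(url):
    # fetch url via the installed opener and return the raw response body
    resp = urllib2.urlopen(url)
    respHtml = resp.read()
    resp.close()
    return respHtml

def genFullUrl(baseUrl, paraDict):
    # join baseUrl with key=value pairs; values are assumed to be
    # URL-encoded already (the main script passes a quote_plus-ed keyword)
    paraPairs = ["%s=%s" % (key, value) for (key, value) in paraDict.items()]
    return baseUrl + "?" + "&".join(paraPairs)

def extractFilename(path):
    # "some/dir/scrape_fishersci_com.py" -> "scrape_fishersci_com"
    return os.path.splitext(os.path.basename(path))[0]

When run, the script prints INFO-level progress to the console, writes the full DEBUG log to a .log file named after the script, and leaves outputInfo.xls in the working directory.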
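You may notice that although the post title says csv, the active code path writes outputInfo.xls via xlwt/xlrd; the csv branch at the top of main() is commented out. If you prefer csv output, a short sketch of how that commented-out route could be completed, assuming the same outputInfoDictList built by the main loop, is:

# Sketch: save the scraped rows to csv instead of xls (Python 2)
import csv

def saveToCsv(outputInfoDictList, csvFilename="outputInfo.csv"):
    # 'wb' avoids the blank rows the csv module produces on Windows in Python 2
    csvFp = open(csvFilename, 'wb')
    csvWriter = csv.writer(csvFp, dialect='excel')
    csvWriter.writerow(["Header", "CatalogNumber", "PartNumber"])
    for infoDict in outputInfoDictList:
        # the csv module expects byte strings, so encode the unicode fields
        csvWriter.writerow([
            infoDict['Header'].encode("utf-8"),
            infoDict['CatalogNumber'].encode("utf-8"),
            infoDict['PartNumber'].encode("utf-8"),
        ])
    csvFp.close()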
【Summary】
The script simulates the keyword search on www.fishersci.com, follows each result's ptitlelink to its item detail page, extracts the Header, CatalogNumber, and PartNumber, and saves all rows into outputInfo.xls.
Please credit when reposting: 在路上 » 【Code Share】Python code: scrape_fishersci_com – simulate the www.fishersci.com search, scrape the info, then save it as csv