【背景】
之前写了个Python脚本,去抓取:
http://www.wheelbynet.com
中,符合特定规则(价格在指定范围内、卖家为个人、邮编不在排除列表中)的产品(auto、rv、moto、boat)的信息
然后导出为csv文件。
【scrape_wheelbynet_com代码分享】
1.截图:
(1)程序运行效果:
(2)抓取出来的数据保存到csv文件:
2.Python项目代码下载:
scrape_wheelbynet_com_2013-07-05.7z
3.代码分享:
(1)scrape_wheelbynet_com.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------------------------------------
Function:
scrape wheelbynet.com

Version:    2013-07-05
Author:     Crifan Li
Contact:    [email protected]

Usage:
scrape_wheelbynet_com.py
(1) use all defaults
scrape_wheelbynet_com.py
(2) custom price range
scrape_wheelbynet_com.py -i 5000 -a 5000000
(3) custom csv file holding the excluded zip code prefixes
scrape_wheelbynet_com.py -e material/Excluded_area_codes.csv

Note:
This is a Python 2 script: it relies on the Python 2 BeautifulSoup package,
on str.decode() and on opening csv files in binary mode.

TODO:
1.
-------------------------------------------------------------------------------
"""

#---------------------------------import---------------------------------------
import argparse
import codecs
import csv
import json
import logging
import os
import re
import sys
import urllib
from datetime import datetime

# project-local libs (crifanLib, BeautifulSoup) live in the "libs" sub folder
sys.path.append("libs")
from BeautifulSoup import BeautifulSoup, Tag, CData
import crifanLib
import xlwt
import xlrd
from xlutils.copy import copy

#--------------------------------const values-----------------------------------
gConst = {
    "xls" : {
        'fileName'  : "outputInfo.xls",
        'sheetName' : "outputInfo",
    },

    'domain'    : "http://www.wheelbynet.com",

    # default price range, overridable from the command line
    'minPrice'  : 5000,
    'maxPrice'  : 5000000,
}

gCfg = {
}

gVal = {
    # base name of the output file; main() appends ".csv"
    "csvFilename"   : "wheelbynetItemsInfo",

    'minPrice'      : 0,
    'maxPrice'      : 0,

    # NOTE: the misspelt key 'exclueZipcodeFile' is kept on purpose, it is
    # also the name of the command line option
    'exclueZipcodeFile'     : "",
    'excludeZipCodeList'    : [],

    #http://www.wheelbynet.com/docs/auto/index.html
    #http://www.wheelbynet.com/docs/rv/index.html
    #http://www.wheelbynet.com/docs/moto/index.html
    #http://www.wheelbynet.com/docs/boat/index.html
    'allTypeInfoDict' : {
        "auto"  : None, # each value becomes a singleTypeInfoDict in main()
        "rv"    : None,
        "moto"  : None,
        "boat"  : None,
    },

    #for show info
    'curItemNum'    : 0,
    'curTotalNum'   : 0,
}

# column order of the output csv; shared by the header and every data row
_CSV_FIELD_NAMES = (
    "Lead Source",
    "Ad Id",
    "Batch Date",
    "Phone",
    "Price",
    "Zip code",
    "Year",
    "Title",
    "Description",
    "Email",
    "URL",
    "Mileage",
    "City",
)

# all four vehicle types are searched through the same (moto) search script
_SEARCH_BASE_URL = "http://www.wheelbynet.com/docs/moto/search_moto275.php3"

# text found in the page the site returns when its backend database is down
_MYSQL_ERROR_MARK = "REDIRECT if can not connect to mysql database"


def _abortOnMissingField(errFormat, itemLink, respHtml):
    """Log errFormat (one %s, filled with itemLink), dump the html, then exit(-1)."""
    logging.error(errFormat, itemLink)
    logging.debug("itemRespHtml=%s", respHtml)
    sys.exit(-1)


def getTypeAndIdFromUrl(itemLink):
    """
        get the main type and the ad id from an item url
        input:
            http://www.wheelbynet.com/docs/auto/view_ad2.php3?ad_ref=autoG1RLDLQ8AQEM
            http://www.wheelbynet.com/docs/moto/view_ad2.php3?ad_ref=moto8EJ0VP28A9XK&motorcycle_make=Honda&motorcycle_model=Shadow+VT700
        output:
            auto,G1RLDLQ8AQEM
            moto,8EJ0VP28A9XK
        exits the script with -1 when the url does not have the expected form
    """
    logging.debug("input itemLink=%s", itemLink)

    # the ad_ref value is "<mainType><adId>", hence the (?P=mainType) back reference
    foundMainType = re.search(r"http://(www\.)?wheelbynet\.com/docs/(?P<mainType>[a-zA-Z]+)/view_ad2\.php3\?ad_ref=(?P=mainType)(?P<adId>\w+)(&.+?)?", itemLink)
    logging.debug("foundMainType=%s", foundMainType)
    if not foundMainType:
        logging.error("Fail to find mainType,adId from %s", itemLink)
        sys.exit(-1)

    mainType = foundMainType.group("mainType")
    adId = foundMainType.group("adId")
    logging.debug("mainType=%s, adId=%s", mainType, adId)

    return (mainType, adId)


def processEachItem(itemLink):
    """
        fetch a single ad page and extract all wanted fields from it
        input example:
            http://www.wheelbynet.com/docs/auto/view_ad2.php3?ad_ref=autoG1RLDLQ8AQEM
            http://www.wheelbynet.com/docs/moto/view_ad2.php3?ad_ref=motoK3V853TOC818&motorcycle_make=Harley-Davidson&motorcycle_model=FLSTC+Heritage+Softail+Classic
        output:
            itemInfoDict, one key per csv column plus:
                'omitted'    : True when the ad zip code prefix is in the exclude list
                'omitReason' : human readable reason when omitted
            an omitted item is returned early, with only 'URL' and 'Zip code' filled
        exits the script with -1 when a mandatory field (zip code, phone, price,
        year, title, description, city) can not be found in the page
    """
    logging.info("%s", crifanLib.formatString("[%d/%d] %s"%(gVal['curItemNum'],gVal['curTotalNum'],itemLink), paddingChar="-"))

    itemRespHtml = crifanLib.getUrlRespHtml_multiTry(itemLink, maxTryNum=50)

    itemInfoDict = {
        'omitted'       : False,
        'omitReason'    : "",

        'Lead Source'   : "",
        'Ad Id'         : "",
        'Batch Date'    : "",
        'Phone'         : "",
        'Price'         : "",
        'Zip code'      : "",
        'Year'          : "",
        'Title'         : "",
        'Description'   : "",
        'Email'         : "",
        'URL'           : "",
        'Mileage'       : "",
        'City'          : "",
    }

    (mainType, adId) = getTypeAndIdFromUrl(itemLink)
    logging.debug("mainType=%s, adId=%s", mainType, adId)

    itemInfoDict['URL'] = itemLink

    #1. check whether the item should be omitted: zip code prefix in exclude list
    # some pages have no <meta property="og:postal-code" ...>, so the seller
    # contact block is parsed instead:
    #<a href="../moto/contact_seller275.php3?ad_ref=autoG1RLDLQ8AQEM&sec_id=1">Contact Seller</a>)<br />bowling green, Indiana 47833<br />812-236-7971<br />
    #(<a href="/docs/moto/contact_seller275.php3?ad_ref=motoK3V853TOC818&sec_id=3">Contact Seller</a>)<br>Oak Ridge, New Jersey 07438<br>973-452-4859<br>
    # the zip code is not always 5 digits, so \d+ is used:
    #...>Contact Seller</a>)<br />Edmond, Oklahoma 7313<br />405-285-2472<br />
    foundPostalCode = re.search(r'>Contact\s+Seller</a>\)<br\s*/?>[\w ]+, [\w ]+ (?P<postalCode>\d+)<br\s*/?>', itemRespHtml)
    logging.debug("foundPostalCode=%s", foundPostalCode)
    if not foundPostalCode:
        _abortOnMissingField("not found location for %s", itemLink, itemRespHtml)

    postalCode = foundPostalCode.group("postalCode") #47833
    itemInfoDict['Zip code'] = postalCode
    logging.info("itemInfoDict['Zip code']\t=%s", itemInfoDict['Zip code'])

    # the exclude list holds the first 3 digits of the zip codes
    mainZipCode = postalCode[0:3] #478
    logging.debug("mainZipCode=%s", mainZipCode)
    if mainZipCode in gVal['excludeZipCodeList']:
        logging.debug("mainZipCode=%s is in excludeZipCodeList, so omit this", mainZipCode)
        itemInfoDict['omitted'] = True
        itemInfoDict['omitReason'] = "mainZipCode=%s in exclude list"%(mainZipCode)
        return itemInfoDict

    #2. extract remaining infos
    #(1) Lead Source
    itemInfoDict['Lead Source'] = "wheelbynet-" + mainType
    logging.info("itemInfoDict['Lead Source']=%s", itemInfoDict['Lead Source'])

    #(2) Ad Id
    itemInfoDict['Ad Id'] = adId
    logging.info("itemInfoDict['Ad Id']\t=%s", itemInfoDict['Ad Id'])

    #(3) Batch Date
    itemInfoDict['Batch Date'] = datetime.now().strftime("%m/%d/%Y")
    logging.info("itemInfoDict['Batch Date']\t=%s", itemInfoDict['Batch Date'])

    #(4) Phone
    # phone formats seen after the "<city>, <state> <zip><br />" part:
    #   812-236-7971, (360) 414-1382, 248/324-4566, 217-67;2-6906, 601.953.6681
    foundPhone = re.search(r'\d+<br\s*/?>(?P<phoneStr>[\./;\(\)\d\- ]+)<br\s*/?>', itemRespHtml)
    logging.debug("foundPhone=%s", foundPhone)
    if not foundPhone:
        _abortOnMissingField("not found phone for %s", itemLink, itemRespHtml)

    phoneStr = foundPhone.group("phoneStr")
    logging.debug("phoneStr=%s", phoneStr)
    # keep digits only: 8122367971, 3604141382
    itemInfoDict['Phone'] = re.sub(r"[^\d]", "", phoneStr.strip())
    logging.info("itemInfoDict['Phone']\t=%s", itemInfoDict['Phone'])

    #(5) Price
    #<strong><font size="3" face="Verdana,arial">2007 Dodge Charger<br />$19,500.00</font></strong>
    foundPrice = re.search(r'<br\s*/?>(?P<priceStr>\$[\d,\.]+)</font></strong>', itemRespHtml)
    logging.debug("foundPrice=%s", foundPrice)
    if not foundPrice:
        _abortOnMissingField("not found price for %s", itemLink, itemRespHtml)

    priceStr = foundPrice.group("priceStr") #$19,500.00
    logging.debug("priceStr=%s", priceStr)
    # drop the whole fractional part (not only ".00"), otherwise "$19,500.50"
    # would end up as 1950050 once the non-digits are removed
    priceNoDecimalPoint = re.sub(r"\.\d*$", "", priceStr)
    logging.debug("priceNoDecimalPoint=%s", priceNoDecimalPoint)
    price = re.sub(r"[^\d]", "", priceNoDecimalPoint)
    logging.debug("price=%s", price)
    itemInfoDict['Price'] = price
    logging.info("itemInfoDict['Price']\t=%s", itemInfoDict['Price'])

    #(6) Year
    #<td><font size="2" face="Verdana"><strong>Year:</strong></font></td><td><font size="2" face="Verdana">
    #    2007</font></td>
    foundYear = re.search(r'<strong>Year:</strong></font></td><td><font\s+size="2"\s+face="Verdana">\s*(?P<year>\d+)\s*</font></td>', itemRespHtml)
    logging.debug("foundYear=%s", foundYear)
    if not foundYear:
        _abortOnMissingField("Fail to find year from %s", itemLink, itemRespHtml)

    itemInfoDict['Year'] = foundYear.group("year")
    logging.info("itemInfoDict['Year']\t=%s", itemInfoDict['Year'])

    #(7) Title
    #<font size="3" face="Verdana,arial">2006 Harley-Davidson FLSTC Heritage Softail Classic<br>$12,500.00</font>
    foundTitle = re.search(r'<font\s+size="3"\s+face="Verdana,arial">(?P<title>.+?)<br\s*/?>\$', itemRespHtml)
    logging.debug("foundTitle=%s", foundTitle)
    if not foundTitle:
        _abortOnMissingField("Fail to find title from %s", itemLink, itemRespHtml)

    title = foundTitle.group("title") #2006 Harley-Davidson FLSTC Heritage Softail Classic
    logging.debug("title=%s", title)
    itemInfoDict['Title'] = title
    logging.info("itemInfoDict['Title']\t=%s", itemInfoDict['Title'])

    #(8) Description
    #<td width="100%"><font size="2" face="Verdana">The bike is actually a Heritage FLSTI. ...</font> </td>
    foundDescription = re.search(r'<td\s+width="100%"><font\s+size="2"\s+face="Verdana">(?P<description>.+?)</font>\s*</td>', itemRespHtml)
    if not foundDescription:
        _abortOnMissingField("Fail to find description from %s", itemLink, itemRespHtml)

    description = foundDescription.group("description")
    logging.debug("description=%s", description)
    # the page declares: <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
    descriptionUnicode = description.decode("iso-8859-1", 'ignore')
    descHtmlDecoded = crifanLib.decodeHtmlEntity(descriptionUnicode)
    descHtmlFiltered = crifanLib.filterHtmlTag(descHtmlDecoded)
    descHtmlOnlyAscii = crifanLib.filterNonAsciiStr(descHtmlFiltered)
    itemInfoDict['Description'] = descHtmlOnlyAscii.strip()
    logging.debug("itemInfoDict['Description']=%s", itemInfoDict['Description'])

    #(9) Email
    # no email is extracted from the page, a fixed "0" is stored
    itemInfoDict['Email'] = "0"

    #(10) Mileage
    #<td><font size="2" face="Verdana"><strong>Miles:</strong></font></td><td><font size="2" face="Verdana">
    #    49,000 </font></td>
    # the value can also be "--":
    #http://www.wheelbynet.com/docs/auto/view_ad2.php3?ad_ref=auto58XXKHTS7098
    foundMiles = re.search(r'<td><font\s+size="2"\s+face="Verdana"><strong>Miles:</strong></font></td><td><font\s+size="2"\s+face="Verdana">\s*(?P<milesStr>[\d,\-]+)\s*</font></td>', itemRespHtml)
    logging.debug("foundMiles=%s", foundMiles)
    if foundMiles:
        milesStr = foundMiles.group("milesStr") #49,000
        logging.debug("milesStr=%s", milesStr)
        miles = re.sub(r"[^\d]", "", milesStr)
        # "--" leaves no digit at all -> store "0"
        itemInfoDict['Mileage'] = miles if miles else "0"
        logging.info("itemInfoDict['Mileage']\t=%s", itemInfoDict['Mileage'])
    else:
        # not fatal: e.g. boats have no Miles, only Hours
        #http://www.wheelbynet.com/docs/boat/view_ad2.php3?ad_ref=boat37K66CPNPI2H
        logging.debug("Fail to find mileage from %s", itemLink)
        logging.debug("itemRespHtml=%s", itemRespHtml)
        logging.debug("set Mileage to 0 for %s", itemLink)
        itemInfoDict['Mileage'] = "0"

    #(11) City
    #...>Contact Seller</a>)<br>Oak Ridge, New Jersey 07438<br>973-452-4859<br>
    foundCityStr = re.search(r">Contact Seller</a>\)<br\s*/?>(?P<cityStr>[\w ]+),[\w ]+?<br\s*/?>", itemRespHtml)
    logging.debug("foundCityStr=%s", foundCityStr)
    if not foundCityStr:
        _abortOnMissingField("Fail to find city from %s", itemLink, itemRespHtml)

    itemInfoDict['City'] = foundCityStr.group("cityStr")
    logging.info("itemInfoDict['City']\t=%s", itemInfoDict['City'])

    return itemInfoDict


def getSecIdFromCurType(curType):
    """
        map the vehicle type to the sec_id value used in the search url, e.g.:
        http://www.wheelbynet.com/docs/moto/search_moto275.php3?ad_image=&sec_id=1&sort=on&...&seller=private&B1=Search
        returns 0 for an unknown type
    """
    typeToSecIdDict = {
        "auto"  : 1,
        "boat"  : 2,
        "moto"  : 3,
        "rv"    : 4,
    }
    return typeToSecIdDict.get(curType, 0)


def _genSearchParaDict(curType, offset):
    """Build the query parameters shared by all search requests of curType."""
    #http://www.wheelbynet.com/docs/moto/search_moto275.php3?offset=25&sec_id=1&min_yr=1900&max_yr=2014&min_price=5000&max_price=5000000&seller=private&B1=Search&pg=2&tc=500
    return {
        'offset'    : offset,
        'sec_id'    : str(getSecIdFromCurType(curType)),
        'min_yr'    : "1900",
        'max_yr'    : "2014",
        'min_price' : gVal['minPrice'],
        'max_price' : gVal['maxPrice'],
        'seller'    : "private",
        'B1'        : "Search",
    }


def getTotalPageNum(curType):
    """
        get the number of search result pages for curType (auto/rv/moto/boat),
        read from the page links of the first search result page
        exits the script with -1 when no page link is found
    """
    #http://www.wheelbynet.com/docs/auto/index.html
    mainUrl = "http://www.wheelbynet.com/docs/" + curType + "/index.html"
    logging.debug("mainUrl=%s", mainUrl)
    # NOTE(review): the response is not used; presumably the index page is
    # visited first for its cookies -- confirm before removing this request
    crifanLib.getUrlRespHtml(mainUrl)

    paraDict = _genSearchParaDict(curType, "0")
    paraDict['pg'] = '1'
    paraDict['tc'] = '5000' # max search 5000 !
    searchUrl = crifanLib.genFullUrl(_SEARCH_BASE_URL, paraDict)
    logging.info("searchUrl=%s", searchUrl)
    searchRespHtml = crifanLib.getUrlRespHtml_multiTry(searchUrl, maxTryNum=50)

    # page links look like:
    # <font face=verdana,arial size=1><b>1</b> <a href="/docs/moto/search_moto275.php3?offset=25&sec_id=1&min_yr=1900&max_yr=2014&min_price=5000&max_price=5000000&seller=private&B1=Search&pg=2&tc=5000">2</a>
    # ...
    # <a href="/docs/moto/search_moto275.php3?offset=475&sec_id=1&min_yr=1900&max_yr=2014&min_price=5000&max_price=5000000&seller=private&B1=Search&pg=20&tc=5000">20</a>
    # the pattern has a single capturing group, so findall returns the page numbers
    foundPageLinkList = re.findall(r'<a href="/docs/\w+/search_\w+275\.php3\?offset=\d+&sec_id=\d+&min_yr=\d+&max_yr=\d+&min_price=\d+&max_price=\d+&seller=private&B1=Search&pg=(?P<pgNum>\d+)&tc=\d+">(?P=pgNum)</a>', searchRespHtml)
    logging.info("foundPageLinkList=%s", foundPageLinkList)
    #foundPageLinkList=['2', '3', '4', ..., '19', '20']
    if not foundPageLinkList:
        # NOTE(review): the current page is shown as <b>1</b> and not as a link,
        # so a result with one single page presumably also ends up here -- confirm
        logging.error("Can not find total number !")
        logging.debug("searchRespHtml=%s", searchRespHtml)
        sys.exit(-1)

    # the last link is the highest page number
    totalPageNumStr = foundPageLinkList[-1]
    logging.debug("Total page number string = %s", totalPageNumStr)
    totalPageNum = int(totalPageNumStr)
    logging.debug("totalPageNum=%d", totalPageNum)

    return totalPageNum


def getSinglePageHtml(curType, offset, maxRefetchNum=50):
    """
        get the html of one search result page
        input:
            curType       : auto/rv/moto/boat
            offset        : index of the first item of the page (0, 25, 50, ...)
            maxRefetchNum : how many times the page is fetched again when the
                            site answers with its "can not connect to mysql
                            database" page
        output:
            html of the search result page
        exits the script with -1 when the site still answers with the
        database error page after maxRefetchNum re-fetches
    """
    paraDict = _genSearchParaDict(curType, offset)
    singlePageSearchUrl = crifanLib.genFullUrl(_SEARCH_BASE_URL, paraDict)
    logging.info("singlePageSearchUrl=%s", singlePageSearchUrl)
    searchRespHtml = crifanLib.getUrlRespHtml_multiTry(singlePageSearchUrl, maxTryNum=50)

    # sometimes the site returns, instead of the results:
    # </form> // REDIRECT if can not connect to mysql database
    # <SCRIPT language="JavaScript1.1">
    # location.replace("http://www.wheelbynet.com/docs/main/server-upgrade.html");
    # </SCRIPT>
    # such a page is invalid, so fetch again (bounded, to never loop forever)
    refetchNum = 0
    while _MYSQL_ERROR_MARK in searchRespHtml:
        if refetchNum >= maxRefetchNum:
            logging.error("still occur: REDIRECT if can not connect to mysql database, after re-fetch %d times for %s", refetchNum, singlePageSearchUrl)
            sys.exit(-1)
        refetchNum += 1
        logging.warning("occur: REDIRECT if can not connect to mysql database, so re-fetch the html for %s", singlePageSearchUrl)
        searchRespHtml = crifanLib.getUrlRespHtml_multiTry(singlePageSearchUrl, maxTryNum=50)

    return searchRespHtml


def initExcludeZipCodeList():
    """
        parse the csv file gVal['exclueZipcodeFile'] and store the first column
        of each row (quotes removed, kept as string) into gVal['excludeZipCodeList']
    """
    gVal['excludeZipCodeList'] = []

    with open(gVal['exclueZipcodeFile'], 'r') as exZipCsvFile:
        logging.debug("exZipCsvFile=%s", exZipCsvFile)
        for row in csv.reader(exZipCsvFile):
            logging.debug("row=%s", row)
            if not row:
                # csv.reader yields [] for a blank line
                continue

            # kept as string: compared against the first 3 chars of the zip code
            curExCode = row[0].replace("'", "").replace('"', "").strip()
            logging.debug("curExCode=%s", curExCode)
            if curExCode:
                gVal['excludeZipCodeList'].append(curExCode)

    logging.debug("gVal['excludeZipCodeList']=%s", gVal['excludeZipCodeList'])


def outputInfoDictToFile(itemInfoDictList):
    """
        append one csv row per item info dict to the file gVal['csvFilename']
    """
    # MUST be in binary mode for the Python 2 csv module !!!
    with open(gVal['csvFilename'], 'ab+') as outputFp:
        csvWriter = csv.writer(outputFp, dialect='excel')
        for eachInfoDict in itemInfoDictList:
            fieldList = [eachInfoDict[eachField] for eachField in _CSV_FIELD_NAMES]
            logging.info("fieldList=%s", fieldList)
            csvWriter.writerow(fieldList)


def processEachPageHtml(curType, eachPageHtml):
    """
        process every item listed in one search result page:
        extract its info, update the statistics in gVal['allTypeInfoDict'][curType]
        and append all not omitted items to the output csv file
    """
    itemInfoDictList = []

    # <tr bgcolor="#ffffff"><td width=275><font face=verdana,arial size=2><a href="view_ad2.php3?ad_ref=autoG1RLDLQ8AQEM">Dodge Charger</a> <img src="../images/new28x11.gif" width=28 height=11 alt="Ad placed on Jun-30-2013"></font></td><td width=55><font face=verdana,arial size=1>2007</font></td>...</tr>
    # <tr><td NOWRAP colspan=5><hr size=1 NOSHADOW width=600></td></tr>
    # <tr bgcolor="#ffffff"><td width=275><font face=verdana,arial size=2><a href="view_ad2.php3?ad_ref=autoW7EE30W66V87">Chevrolet 210 2 dr sedan</a> </font></td>...</tr>
    foundAllAhref = re.findall(r'<a\s+href="(view_ad2\.php3\?ad_ref=\w+)">[^<>]+?</a>', eachPageHtml, re.I)
    if not foundAllAhref:
        logging.debug("Can not find any item link for curType=%s eachPageHtml=%s"%(curType, eachPageHtml))
        return

    # the links are relative to:
    #<BASE href="http://www.wheelbynet.com/docs/auto/" target="_top">
    hrefBase = "http://www.wheelbynet.com/docs/" + curType + "/"
    curTypeInfoDict = gVal['allTypeInfoDict'][curType]
    for eachAHref in foundAllAhref:
        itemLink = hrefBase + eachAHref
        itemInfoDict = processEachItem(itemLink)
        if itemInfoDict['omitted']:
            logging.info("Omit %s for %s", itemLink, itemInfoDict['omitReason'])
            curTypeInfoDict["omittedNum"] += 1
        else:
            itemInfoDictList.append(itemInfoDict)
            curTypeInfoDict["processedNum"] += 1

        gVal['curItemNum'] += 1

    #output info
    outputInfoDictToFile(itemInfoDictList)


def initOutputCsvFile():
    """
        create (or clear) the output file gVal['csvFilename'] and write the csv header
    """
    # 'w' : clear before, then write
    # MUST be in binary mode for the Python 2 csv module !!!
    with open(gVal['csvFilename'], 'wb') as outputFp:
        csvWriter = csv.writer(outputFp, dialect='excel')
        csvWriter.writerow(list(_CSV_FIELD_NAMES))


def main():
    """
        parse the command line, then for each vehicle type walk through all
        search result pages, save the items into the csv file and finally
        log the statistics
    """
    #support parameter
    newParser = argparse.ArgumentParser(description="Extarct wheelbynet's auto,rv,moto,boat info then save into csv.")
    newParser.add_argument("-e", "--exclueZipcodeFile", dest="exclueZipcodeFile", default="material/Excluded_area_codes.csv", help="Csv file contains the excluded zip code")
    newParser.add_argument("-i", "--minPrice", type=int, default=gConst['minPrice'], dest="minPrice", help="Minimal money for item")
    newParser.add_argument("-a", "--maxPrice", type=int, default=gConst['maxPrice'], dest="maxPrice", help="Maximum money for item")
    args = newParser.parse_args()

    #init values
    gVal['minPrice'] = args.minPrice
    gVal['maxPrice'] = args.maxPrice
    gVal['exclueZipcodeFile'] = args.exclueZipcodeFile
    logging.info("gVal['minPrice']=%d, gVal['maxPrice']=%d", gVal['minPrice'], gVal['maxPrice'])

    #init output file
    gVal['csvFilename'] = gVal['csvFilename'] + ".csv"
    initOutputCsvFile()

    #init
    crifanLib.initAutoHandleCookies()
    #here use gae 127.0.0.1:8087
    crifanLib.initProxyAndCookie({'http':"http://127.0.0.1:8087"})

    #init exclude zip code list
    initExcludeZipCodeList()

    # number of items in each search result page
    eachPageNum = 25

    # list(): the dict values are replaced while looping
    typeList = list(gVal['allTypeInfoDict'].keys())
    for curType in typeList:
        totalPageNum = getTotalPageNum(curType)
        logging.debug("totalPageNum=%d", totalPageNum)

        # upper bound only: the last page may hold less than eachPageNum items
        totalNum = eachPageNum * totalPageNum
        logging.debug("totalNum=%d", totalNum)

        gVal['allTypeInfoDict'][curType] = {
            "totalNum"      : totalNum,
            "omittedNum"    : 0,
            "processedNum"  : 0,
        }

        logging.info("%s", crifanLib.formatString("curType=%s totalNum=%d"%(curType,totalNum)))
        gVal['curTotalNum'] = totalNum

        for curPageIdx in range(totalPageNum):
            curOffset = curPageIdx * eachPageNum
            logging.debug("curOffset=%d", curOffset)
            gVal['curItemNum'] = curOffset + 1

            logging.info("%s", crifanLib.formatString("curType=%s,curPageIdx=%d,curOffset=%d"%(curType,curPageIdx,curOffset), paddingChar="-"))
            singlePageHtml = getSinglePageHtml(curType, curOffset)
            processEachPageHtml(curType, singlePageHtml)

        logging.info("Complete to process total %d %s", totalNum, curType)

    #done, output statistic info
    logging.info("%s", crifanLib.formatString("Statistic Info"))
    for curType in typeList:
        curTypeInfoDict = gVal['allTypeInfoDict'][curType]
        logging.info("%s", crifanLib.formatString("%s"%(curType), paddingChar="-"))
        logging.info("Total Number:\t%d", curTypeInfoDict['totalNum'])
        logging.info("Omitted Number:\t%d", curTypeInfoDict['omittedNum'])
        logging.info("Processed Number:\t%d", curTypeInfoDict['processedNum'])


###############################################################################
if __name__=="__main__":
    scriptSelfName = crifanLib.extractFilename(sys.argv[0])

    # everything (DEBUG and above) goes into <script name>.log
    logging.basicConfig(
                    level    = logging.DEBUG,
                    format   = 'LINE %(lineno)-4d  %(levelname)-8s %(message)s',
                    datefmt  = '%m-%d %H:%M',
                    filename = scriptSelfName + ".log",
                    filemode = 'w')
    # define a Handler which writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    # set a format which is simpler for console use
    formatter = logging.Formatter('LINE %(lineno)-4d : %(levelname)-8s %(message)s')
    # tell the handler to use this format
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)
    try:
        main()
    except Exception:
        # sys.exit() calls pass through untouched: their reason is already logged
        logging.exception("Unknown Error !")
        raise
【总结】
转载请注明:在路上 » 【代码分享】Python代码:scrape_wheelbynet_com – 抓取wheelbynet.com中符合特定规则的产品信息并保存为csv