【Background】
Wrote this earlier: scrape, from autoexplosion.com, the information of products that match specific rules,
then export the data to a CSV file.
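Before the full script, here is a minimal sketch of the fetch-parse-export pattern the script is built on. This is my own simplified illustration, assuming Python 2 with BeautifulSoup 3; the list-page URL mirrors the real one, but the link selector and the output filename demo.csv are placeholders:

    # minimal fetch -> parse -> CSV sketch (Python 2 + BeautifulSoup 3)
    # NOTE: demo.csv and the href pattern are illustrative placeholders
    import csv
    import re
    import urllib2
    from BeautifulSoup import BeautifulSoup

    # fetch one search-result page
    respHtml = urllib2.urlopen("http://autoexplosion.com/cars/buy/").read()
    soup = BeautifulSoup(respHtml)
    # find links that look like item detail pages, e.g. /cars/buy/150594.php
    itemLinks = soup.findAll(name="a", attrs={"href": re.compile("/cars/buy/\d+\.php")})

    # write one CSV row per found link
    outputFp = open("demo.csv", "wb")  # Python 2 csv module wants binary mode
    csvWriter = csv.writer(outputFp, dialect='excel')
    csvWriter.writerow(["Title", "URL"])
    for eachLink in itemLinks:
        csvWriter.writerow([eachLink.string, eachLink['href']])
    outputFp.close()

The real script below follows the same flow, plus paging, per-item detail pages, filtering, and retries.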
【scrape_autoexplosion_com Code Share】
1. Screenshots:
(1) Run output:
(2) Data exported to a CSV file:
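For reference, the first row of the exported CSV is the header written by initOutputCsvFile() in the code below:

    Lead Source,Ad Id,Batch Date,Phone,Price,Zip code,Year,Title,Description,Email,URL,Mileage,City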
2. Python project code download:
scrape_autoexplosion_com_2013-05-21.7z
3. Code share:
(1)scrape_autoexplosion_com.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------------------------------------
Function:   scrape for autoexplosion.com
Version:    2013-05-21
Author:     Crifan Li
Contact:    [email protected]
Usage:
        scrape_autoexplosion_com.py
        (1)for %Y-%m-%d
        scrape_autoexplosion_com.py -b 2013-05-04 -d 2013-05-21
        (2)for %Y-%m-%d %H:%M:%S
        scrape_autoexplosion_com.py -b "2013-05-04 00:00:00" -d "2013-05-20 00:00:00"
TODO:
1.
-------------------------------------------------------------------------------
"""

#--------------------------------const values-----------------------------------
gConst = {
    "xls" : {
        'fileName'  : "outputInfo.xls",
        'sheetName' : "outputInfo",
    },
    'domain'    : "http://autoexplosion.com",
    'priceFrom' : 4999,
    'priceTo'   : 999999,
    # 'beginDatetimeStr'  : "1970-01-01 00:00:00",
    # 'endDatetimeStr'    : "2039-12-30 00:00:00",
    'beginDatetimeStr'  : "1970-01-01",
    'endDatetimeStr'    : "2039-12-30",
};

gCfg = {
};

gVal = {
    "csvFilename" : "autoexplosionItemsInfo",
    'priceFrom' : 0,
    'priceTo'   : 0,
    #date time string
    'beginDatetimeStr'  : "",
    'endDatetimeStr'    : "",
    #converted to datetime type value
    'beginDatetime' : "",
    'endDatetime'   : "",
    'exclueZipcodeFile'  : "",
    'excludeZipCodeList' : [],
    'allTypeInfoDict' : {
        "cars"  : None, # is singleTypeInfoDict
        "RVs"   : None,
        "bikes" : None,
        "boats" : None,
    },
    #for show info
    'curItemNum'  : 0,
    'curTotalNum' : 0,
};

#---------------------------------import---------------------------------------
import re;
import sys;
sys.path.append("libs");
from BeautifulSoup import BeautifulSoup,Tag,CData;
import crifanLib;
import logging;
import urllib;
import json;
import os;
import argparse;
import codecs;
import csv;
import xlwt;
import xlrd;
#import xlutils;
from xlutils.copy import copy;
from datetime import datetime;

def getTypeAndIdFromUrl(itemLink):
    """
    get type from item url link
    input:
    http://autoexplosion.com/cars/buy/150594.php
    http://autoexplosion.com/bikes/buy/11812.php
    output:
    cars,150594
    bikes,11812
    """
    (mainType, adId) = ("", "");
    #http://autoexplosion.com/cars/buy/150594.php
    logging.debug("input itemLink=%s", itemLink);
    foundMainType = re.search("http://autoexplosion\.com/(?P<mainType>\w+)/buy/(?P<adId>\d+)\.php", itemLink);
    logging.debug("foundMainType=%s", foundMainType);
    if(foundMainType):
        mainType = foundMainType.group("mainType");
        adId = foundMainType.group("adId");
        #mainType = foundMainType.group(1);
        #adId = foundMainType.group(2);
        logging.debug("mainType=%s, adId=%s", mainType, adId); #cars
    else:
        logging.error("Fail to find mainType,adId from %s", itemLink);
        sys.exit(-1);
    return (mainType, adId);

def processEachItem(itemLink):
    """
    process each search item from its url
    extract all info
    input example:
    http://autoexplosion.com/cars/buy/150594.php
    http://autoexplosion.com/bikes/buy/11812.php
    """
    logging.info("%s", crifanLib.formatString("[%d/%d] %s"%(gVal['curItemNum'],gVal['curTotalNum'],itemLink), paddingChar="-"));

    #debug html tag
    #itemLink = "http://autoexplosion.com/cars/buy/150954.php";
    #itemLink = "http://autoexplosion.com/RVs/buy/9776.php";
    #itemLink = "http://autoexplosion.com/cars/buy/151366.php";

    #itemRespHtml = crifanLib.getUrlRespHtml(itemLink);
    itemRespHtml = crifanLib.getUrlRespHtml_multiTry(itemLink, maxTryNum=50);
    #logging.debug("itemRespHtml=%s", itemRespHtml);

    itemInfoDict = {
        'omitted'       : False,
        'omitReason'    : "",
        'Lead Source'   : "",
        'Ad Id'         : "",
        'Batch Date'    : "",
        'Phone'         : "",
        'Price'         : "",
        'Zip code'      : "",
        'Year'          : "",
        'Title'         : "",
        'Description'   : "",
        'Email'         : "",
        'URL'           : "",
        'Mileage'       : "",
        'City'          : "",
    };

    #check whether this page is invalid or not
    #http://autoexplosion.com/cars/buy/151366.php
    #<b>This listing has been suspended and is currently being reviewed.</b>
    suspendedNotice = "This listing has been suspended and is currently being reviewed";
    if(re.search(suspendedNotice, itemRespHtml)):
        itemInfoDict['omitted'] = True;
        itemInfoDict['omitReason'] = suspendedNotice;
        return itemInfoDict;

    #http://autoexplosion.com/RVs/buy/9764.php
    #This listing is no longer available.
    noLongerAvailableNotice = "This listing is no longer available";
    if(re.search(noLongerAvailableNotice, itemRespHtml)):
        itemInfoDict['omitted'] = True;
        itemInfoDict['omitReason'] = noLongerAvailableNotice;
        return itemInfoDict;

    #init
    locationStr = "";

    # URL
    noHttpUrl = itemLink.replace("http://", "");
    itemInfoDict['URL'] = noHttpUrl;

    soup = BeautifulSoup(itemRespHtml);

    #within time range or not
    #http://autoexplosion.com/bikes/buy/11812.php
    # <tr>
    #   <td valign="top" nowrap><b>Posted</b></td>
    #   <td valign="top">Feb. 19, 2013</td>
    # </tr>
    #http://autoexplosion.com/boats/buy/4270.php
    #Posted Apr. 25, 2013
    #http://autoexplosion.com/boats/buy/4262.php
    #Posted Apr. 5, 2013
    foundPosted = re.search('<td valign="top" nowrap><b>Posted</b></td>\s*<td valign="top">(?P<postDatetimeStr>[\w,\.\s]+?)</td>', itemRespHtml);
    if(foundPosted):
        postDatetimeStr = foundPosted.group("postDatetimeStr");
        logging.debug("postDatetimeStr=%s", postDatetimeStr);
        parsedPostDate = datetime.strptime(postDatetimeStr, "%b. %d, %Y");
        logging.debug("parsedPostDate=%s", parsedPostDate);
        if((parsedPostDate < gVal['beginDatetime']) or (parsedPostDate > gVal['endDatetime'])):
            omitReason = "PostDate=%s, not within range: %s <-> %s"%(parsedPostDate, gVal['beginDatetime'], gVal['endDatetime']);
            itemInfoDict['omitted'] = True;
            itemInfoDict['omitReason'] = omitReason;
            return itemInfoDict;
    else:
        logging.error("not found posted date for %s", itemLink);
        sys.exit(-1);

    #1. check should be omit or not
    #(1) has phone number
    #http://autoexplosion.com/cars/buy/150594.php
    # <tr>
    #   <td valign="top" nowrap><b>Phone</b></td>
    #   <td valign="top">
    #     Private
    #   </td>
    # </tr>
    #http://autoexplosion.com/cars/buy/150887.php
    # <tr>
    #   <td valign="top" nowrap><b>Phone</b></td>
    #   <td valign="top">
    #     <span itemprop="telephone">(210) 473-9820</span>
    #   </td>
    # </tr>
    #http://autoexplosion.com/boats/buy/4270.php
    # <tr>
    #   <td valign="top" nowrap><b>Phone</b></td>
    #   <td valign="top">
    #     (714) 532-0988 </td>
    # </tr>
    #foundPhone = soup.find(name="span", attrs={"itemprop":"telephone"});
    #foundPhone = re.search('<td valign="top" nowrap><b>Phone</b></td>\s*<td valign="top">\s*(<span itemprop="telephone">)?(?P<phoneStr>[\d\(\)\-\s]+)(</span>)?\s*</td>', itemRespHtml);
    foundPhone = re.search('<td valign="top" nowrap><b>Phone</b></td>\s*<td valign="top">\s*(<span itemprop="telephone">)?(?P<phoneStr>.+?)(</span>)?\s*</td>', itemRespHtml);
    logging.debug("foundPhone=%s", foundPhone);
    if(foundPhone):
        #itemInfoDict['Phone'] = foundPhone.string;
        phoneStr = foundPhone.group("phoneStr");
        logging.debug("phoneStr=%s", phoneStr);
        stripedPhoneStr = phoneStr.strip();
        logging.debug("stripedPhoneStr=%s", stripedPhoneStr);
        if(stripedPhoneStr == "Private"):
            itemInfoDict['omitted'] = True;
            itemInfoDict['omitReason'] = "Phone is Private";
            return itemInfoDict;
        else:
            onlyDigitPhone = re.sub("[^\d]", "", stripedPhoneStr);
            itemInfoDict['Phone'] = onlyDigitPhone; #(210) 473-9820
            logging.info("itemInfoDict['Phone']\t=%s", itemInfoDict['Phone']);
    else:
        logging.error("not found phone for %s", itemLink);
        sys.exit(-1);

    #(2) not in exclude zip code list
    #http://autoexplosion.com/cars/buy/150594.php
    # <tr>
    #   <td valign="top" nowrap><b>Location</b></td>
    #   <td valign="top" itemprop="address">Tampa, FL</td>
    # </tr>
    #http://autoexplosion.com/boats/buy/4270.php
    # <tr>
    #   <td valign="top" nowrap><b>Location</b></td>
    #   <td valign="top">Orange, CA</td>
    # </tr>
    #foundLocation = soup.find(name="td", attrs={"itemprop":"address"});
    #foundLocation = re.search('<td valign="top" nowrap><b>Location</b></td>\s*<td valign="top"( itemprop="address")?>(?P<location>[\w,]+)</td>', itemRespHtml);
    foundLocation = re.search('<td valign="top" nowrap><b>Location</b></td>\s*<td valign="top"( itemprop="address")?>(?P<location>.+?)</td>', itemRespHtml);
    logging.debug("foundLocation=%s", foundLocation);
    if(foundLocation):
        #locationStr = foundLocation.string; #Tampa, FL
        locationStr = foundLocation.group("location"); #Tampa, FL
        zipCode = crifanLib.getZipcodeFromLocation(locationStr);
        itemInfoDict['Zip code'] = zipCode;
        logging.info("itemInfoDict['Zip code']\t=%s", itemInfoDict['Zip code']);
        mainZipCode = zipCode[0:3];
        logging.debug("mainZipCode=%s", mainZipCode);
        if(mainZipCode in gVal['excludeZipCodeList']):
            logging.debug("mainZipCode=%s is in excludeZipCodeList, so omit this", mainZipCode);
            itemInfoDict['omitted'] = True;
            itemInfoDict['omitReason'] = "mainZipCode=%s in exclude list"%(mainZipCode);
            return itemInfoDict;
    else:
        logging.error("not found location for %s", itemLink);
        sys.exit(-1);

    #(3) make sure money is in valid range: 4999 - 999999
    #http://autoexplosion.com/cars/buy/150594.php
    # <tr itemprop="offers" itemscope itemtype="http://schema.org/Offer">
    #   <td valign="top" nowrap><b>Price</b></td>
    #   <td valign="top"><b class="cssHeader" itemprop="price">$9,000</b><meta itemprop="priceCurrency" content="USD" /></td>
    # </tr>
    #http://autoexplosion.com/boats/buy/4270.php
    # <tr>
    #   <td valign="top" nowrap><b>Price</b></td>
    #   <td valign="top"><b class="cssHeader">$5,000</b></td>
    # </tr>
    #foundPrice = soup.find(name="b", attrs={"class":"cssHeader", "itemprop":"price"});
    foundPrice = re.search('<td valign="top" nowrap><b>Price</b></td>\s*<td valign="top"><b class="cssHeader"[^<>]*?>(?P<price>.+?)</b>', itemRespHtml);
    logging.debug("foundPrice=%s", foundPrice);
    if(foundPrice):
        #priceDollar = foundPrice.string; #$19,999
        priceDollar = foundPrice.group("price"); #$19,999
        logging.debug("priceDollar=%s", priceDollar);
        priceDollarUni = unicode(priceDollar);
        price = priceDollarUni.replace("$", "").replace(",", "");
        logging.debug("price=%s", price);
        itemInfoDict['Price'] = price;
        logging.info("itemInfoDict['Price']\t=%s", itemInfoDict['Price']);
        priceInt = int(itemInfoDict['Price']);
        if(priceInt >= gVal['priceFrom'] and priceInt <= gVal['priceTo']):
            #correct
            logging.debug("item price indeed within range");
        else:
            logging.error("item price %d out of range, gVal['priceFrom']=%d, gVal['priceTo']=%d", priceInt, gVal['priceFrom'], gVal['priceTo']);
            sys.exit(-1);
    else:
        logging.error("not found price for %s", itemLink);
        sys.exit(-1);

    #2. prepare basic info
    #(1)mainType, adId
    (mainType, adId) = getTypeAndIdFromUrl(itemLink);

    #3. extract remaining infos
    #(1) Lead Source
    itemInfoDict['Lead Source'] = "autoexplosion-" + mainType;
    logging.info("itemInfoDict['Lead Source']=%s", itemInfoDict['Lead Source']);

    #(2) Ad Id
    itemInfoDict['Ad Id'] = adId;
    logging.info("itemInfoDict['Ad Id']\t=%s", itemInfoDict['Ad Id']);

    #(3) Batch Date
    itemInfoDict['Batch Date'] = datetime.now().strftime("%m/%d/%Y");
    logging.info("itemInfoDict['Batch Date']\t=%s", itemInfoDict['Batch Date']);

    #(4) Year
    # <tr>
    #   <td valign="top" nowrap><b>Year</b></td>
    #   <td valign="top">2004</td>
    # </tr>
    foundYear = re.search('<td valign="top" nowrap><b>Year</b></td>\s*<td valign="top">(?P<year>\d+)</td>', itemRespHtml);
    logging.debug("foundYear=%s", foundYear);
    if(foundYear):
        itemInfoDict['Year'] = foundYear.group("year");
        logging.info("itemInfoDict['Year']\t=%s", itemInfoDict['Year']);
    else:
        logging.error("Fail to find year from %s", itemLink);
        sys.exit(-1);

    #(5) Title
    #<span itemprop="name">2008 Acura Rdx</span>
    foundTitle = soup.find(name="span", attrs={"itemprop":"name"});
    logging.debug("foundTitle=%s", foundTitle);
    if(foundTitle):
        origTitle = foundTitle.string; #2008 Acura Rdx
        trueTitle = re.sub("^"+itemInfoDict['Year']+" (.+)$", r"\1", origTitle);
        #http://autoexplosion.com/bikes/buy/11722.php
        logging.debug("trueTitle=%s", trueTitle); #OC Choppers Super Stretch 124” Softail
        filteredTitleUni = crifanLib.filterNonAsciiStr(trueTitle); #OC Choppers Super Stretch 124 Softail
        itemInfoDict['Title'] = filteredTitleUni;
        logging.info("itemInfoDict['Title']\t=%s", itemInfoDict['Title']);
    else:
        logging.error("Fail to find title from %s", itemLink);
        sys.exit(-1);

    #(6) Description
    #http://autoexplosion.com/cars/buy/150594.php
    # <tr>
    #   <td valign="top" colspan="2" itemprop="description">
    #     2004 ACURA MDX Touring Package<br />
    #     V6 VTEC 3.5L 4 Wheel Drive<br />
    #     Transmission: Automatic<br />
    #     Mileage: 115000<br />
    #     Seating: 7 passenger<br />
    #     Bose Premium Sound 6 disk CD Changer<br />
    #     Roof Rack<br />
    #     Running Boards<br />
    #     Premium Wheels<br />
    #     Traction Control<br />
    #     ABS<br />
    #     Leather Interior<br />
    #     Rear Climate Control<br />
    #     <br />
    #     Kelley Blue Book Private Party Sale Value: $9,909 </td>
    # </tr>
    #http://autoexplosion.com/boats/buy/4270.php
    # <tr>
    #   <td valign="top" colspan="2"><b>Description</b></td>
    # </tr>
    # <tr>
    #   <td valign="top" colspan="2">
    #     Remanufactured Volvo 250 installed in 2009<br />
    #     Has radar with plotter<br />
    #     Outdrive needs sevices, cost will be deducted from sale price.<br />
    #     Boat is in the water. no trailer </td>
    # </tr>
    #http://autoexplosion.com/bikes/buy/11812.php
    # <tr>
    #   <td valign="top" colspan="2"><b>Description</b></td>
    # </tr>
    # <tr>
    #   <td valign="top" colspan="2" itemprop="description">
    #     2004 AMERICAN IRONHORSE SLAMMER<br />
    #     124 c.c. S&S<br />
    #     XRT CUSTOM PACKAGE<br />
    #     2,800 MILES<br />
    #     <br />
    #     Feel free to contact me with any questions : <a href="mailto:[email protected]">[email protected]</a> </td>
    # </tr>
    #foundDescription = soup.find(name="td", attrs={"valign":"top", "colspan":"2", "itemprop":"description"});
    foundDescriptionList = soup.findAll(name="td", attrs={"valign":"top", "colspan":"2"});
    #will find two
    #first is
    #<td valign="top" colspan="2"><b>Description</b></td>
    #second is real description content
    foundDescription = foundDescriptionList[1];
    logging.debug("foundDescription=%s", foundDescription);
    if(foundDescription):
        descContents = crifanLib.soupContentsToUnicode(foundDescription.contents);
        #descContents = foundDescription.renderContents();
        logging.debug("descContents=%s", descContents);
        descHtmlDecoded = crifanLib.decodeHtmlEntity(descContents);
        logging.debug("descHtmlDecoded=%s", descHtmlDecoded);
        descHtmlFiltered = crifanLib.filterHtmlTag(descHtmlDecoded);
        logging.debug("descHtmlFiltered=%s", descHtmlFiltered);
        #http://autoexplosion.com/cars/buy/150631.php
        # Peapack, NJ. \u2028\u2028Mrs. Onassis bought
        #UnicodeEncodeError: 'ascii' codec can't encode character u'\u2028' in position 318: ordinal not in range(128)
        #so remove special unicode char
        descHtmlFilteredUni = crifanLib.filterNonAsciiStr(descHtmlFiltered);
        itemInfoDict['Description'] = descHtmlFilteredUni;
        logging.debug("itemInfoDict['Description']=%s", itemInfoDict['Description']);
    else:
        logging.error("Fail to find description from %s", itemLink);
        sys.exit(-1);

    #(7) Email
    itemInfoDict['Email'] = "";

    #(8) Mileage
    if(mainType == "boats"):
        #http://autoexplosion.com/boats/buy/4270.php
        #ALL boats no Mileage
        itemInfoDict['Mileage'] = "0";
    else:
        #http://autoexplosion.com/cars/buy/150594.php
        # <tr>
        #   <td valign="top" nowrap><b>Mileage</b></td>
        #   <td valign="top">115,000 miles</td>
        # </tr>
        #http://autoexplosion.com/RVs/buy/9234.php
        # <tr>
        #   <td valign="top" nowrap><b>Mileage</b></td>
        #   <td valign="top">n/a</td>
        # </tr>
        #foundMileage = re.search('<td valign="top" nowrap><b>Mileage</b></td>\s*<td valign="top">(?P<mileage>[\d,]+) miles</td>', itemRespHtml);
        foundMileage = re.search('<td valign="top" nowrap><b>Mileage</b></td>\s*<td valign="top">(?P<mileageStr>[^<>]+)</td>', itemRespHtml);
        logging.debug("foundMileage=%s", foundMileage);
        if(foundMileage):
            mileageStr = foundMileage.group("mileageStr"); #115,000 miles or n/a
            foundMiles = re.search("(?P<mileage>[\d,]+) miles", mileageStr);
            logging.debug("foundMiles=%s", foundMiles);
            if(foundMiles):
                mileage = foundMiles.group("mileage");
                logging.debug("mileage=%s", mileage);
                mileage = mileage.replace(",", ""); #115000
                itemInfoDict['Mileage'] = mileage;
            else:
                #http://autoexplosion.com/RVs/buy/9234.php
                #n/a
                #itemInfoDict['Mileage'] = mileageStr;
                itemInfoDict['Mileage'] = "0";
        else:
            logging.error("Fail to find mileage from %s", itemLink);
            sys.exit(-1);
    logging.info("itemInfoDict['Mileage']\t=%s", itemInfoDict['Mileage']);

    #(9) City
    logging.debug("locationStr=%s", locationStr);
    if(locationStr):
        cityStateList = locationStr.split(",");
        city = cityStateList[0];
        itemInfoDict['City'] = city;
        logging.info("itemInfoDict['City']\t=%s", itemInfoDict['City']);
    else:
        logging.error("Fail to find city from %s", itemLink);
        sys.exit(-1);

    return itemInfoDict;

def getTotalNum(curType):
    totalNum = 0;

    #Method 1:
    #http://autoexplosion.com/cars/buy/advanced.php
    #process cars
    #to get cookie
    #http://autoexplosion.com/cars/buy/
    # carsBuyUrl = "http://autoexplosion.com/cars/buy/";
    # carsBuyRespHtml = crifanLib.getUrlRespHtml(carsBuyUrl);
    # logging.debug("carsBuyRespHtml=%s", carsBuyRespHtml);

    #http://autoexplosion.com/cars/buy/advanced.php
    #searchBaseUrl = "http://autoexplosion.com/cars/buy/advanced.php";
    searchBaseUrl = "http://autoexplosion.com/" + curType + "/buy/advanced.php";
    # searchBaseRespHtml = crifanLib.getUrlRespHtml(searchBaseUrl);
    # logging.debug("searchBaseRespHtml=%s", searchBaseRespHtml);

    #http://autoexplosion.com/cars/buy/advanced.php?zip=&radius=0&make%5B%5D=&model%5B%5D=&year_from=&year_to=&price_from=4999&price_to=999999&mileage=&posted_after=
    paraDict = {
        'zip'           : "",
        'radius'        : "0",
        'make%5B%5D'    : "",
        'model%5B%5D'   : "",
        'year_from'     : "",
        'year_to'       : "",
        'price_from'    : gVal['priceFrom'],
        'price_to'      : gVal['priceTo'],
        'mileage'       : "",
        'posted_after'  : "",
    };
    searchUrl = crifanLib.genFullUrl(searchBaseUrl, paraDict);
    logging.info("searchUrl=%s", searchUrl);
    headerDict = {
        #'Referer' : "http://autoexplosion.com/cars/buy/advanced.php",
        'Referer' : "http://autoexplosion.com/" + curType + "/buy/advanced.php",
    };
    #searchRespHtml = crifanLib.getUrlRespHtml(searchUrl, headerDict=headerDict);
    searchRespHtml = crifanLib.getUrlRespHtml_multiTry(searchUrl, headerDict=headerDict, maxTryNum=50);
    #logging.debug("searchRespHtml=%s", searchRespHtml);

    #<p>Your search returned <b class="cssColorMain">966</b>
    soup = BeautifulSoup(searchRespHtml);
    #logging.debug("soup=%s", soup);
    foundTotalNum = soup.find(name="b", attrs={"class":"cssColorMain"});
    logging.debug("foundTotalNum=%s", foundTotalNum);
    if(foundTotalNum):
        totalNum = foundTotalNum.string; #966
        totalNum = int(totalNum);
        logging.info("totalNum=%d", totalNum);
    else:
        logging.error("Can not find total num !");
        sys.exit(-1);

    return totalNum;

def getSinglePageHtml(curType, offset):
    #http://autoexplosion.com/cars/buy/results.php?go=1&price_from=4999&price_to=999999&offset=15
    #searchBaseUrl = "http://autoexplosion.com/cars/buy/results.php";
    searchBaseUrl = "http://autoexplosion.com/" + curType + "/buy/results.php";
    paraDict = {
        'go'            : "1",
        'price_from'    : gVal['priceFrom'],
        'price_to'      : gVal['priceTo'],
        'offset'        : str(offset),
    };
    searchUrl = crifanLib.genFullUrl(searchBaseUrl, paraDict);
    logging.info("searchUrl=%s", searchUrl);
    #searchRespHtml = crifanLib.getUrlRespHtml(searchUrl);
    searchRespHtml = crifanLib.getUrlRespHtml_multiTry(searchUrl, maxTryNum=50);
    #logging.debug("searchRespHtml=%s", searchRespHtml);
    return searchRespHtml;

def initExcludeZipCodeList():
    #parse csv file to generate the exclude zip code list
    gVal['excludeZipCodeList'] = [];
    exZipCsvFile = open(gVal['exclueZipcodeFile'], 'r');
    logging.debug("exZipCsvFile=%s", exZipCsvFile);
    exZipCsvReader = csv.reader(exZipCsvFile);
    logging.debug("exZipCsvReader=%s", exZipCsvReader);
    for row in exZipCsvReader:
        logging.debug("row=%s", row);
        firstContent = row[0];
        logging.debug("firstContent=%s", firstContent);
        filteredContent = firstContent.replace("'", "");
        logging.debug("filteredContent=%s", filteredContent);
        filteredContent = filteredContent.replace('"', "");
        logging.debug("filteredContent=%s", filteredContent);
        #curExCode = int(filteredContent);
        curExCode = filteredContent;
        logging.debug("curExCode=%s", curExCode);
        gVal['excludeZipCodeList'].append(curExCode);
    logging.debug("gVal['excludeZipCodeList']=%s", gVal['excludeZipCodeList']);
    exZipCsvFile.close();
    return ;

def outputInfoDictToFile(itemInfoDictList):
    #output all info dict list
    #outputFp = open(gVal['csvFilename'], 'a+');
    outputFp = open(gVal['csvFilename'], 'ab+'); # MUST in binary mode !!!
    csvWriter = csv.writer(outputFp, dialect='excel');
    for eachInfoDict in itemInfoDictList:
        fieldList = [];
        fieldList.append(eachInfoDict['Lead Source']);
        fieldList.append(eachInfoDict['Ad Id']);
        fieldList.append(eachInfoDict['Batch Date']);
        fieldList.append(eachInfoDict['Phone']);
        fieldList.append(eachInfoDict['Price']);
        fieldList.append(eachInfoDict['Zip code']);
        fieldList.append(eachInfoDict['Year']);
        fieldList.append(eachInfoDict['Title']);
        fieldList.append(eachInfoDict['Description']);
        fieldList.append(eachInfoDict['Email']);
        fieldList.append(eachInfoDict['URL']);
        fieldList.append(eachInfoDict['Mileage']);
        fieldList.append(eachInfoDict['City']);
        logging.info("fieldList=%s", fieldList);
        csvWriter.writerow(fieldList);
    outputFp.close();
    return ;

def processEachPageHtml(curType, eachPageHtml):
    #for each page to process it
    soup = BeautifulSoup(eachPageHtml);
    logging.debug("soup=%s", soup);
    #<table width="100%" border="0" cellspacing="0" cellpadding="0" class="cssTable">
    #foundCssTable = soup.find(id="table", attrs={"class":"cssTable"}); !!! not id, but name
    foundCssTable = soup.find(name="table", attrs={"class":"cssTable"});
    logging.debug("foundCssTable=%s", foundCssTable);
    #<td><a href="/cars/buy/150594.php">Acura Mdx</a></td>
    #foundAllItems = foundCssTable.findAll(name="a", attrs={"href":re.compile("/cars/buy/\d+\.php")});
    foundAllItems = foundCssTable.findAll(name="a", attrs={"href":re.compile("/" + curType + "/buy/\d+\.php")});
    logging.debug("foundAllItems=%s", foundAllItems);
    itemLen = len(foundAllItems);
    logging.debug("itemLen=%s", itemLen);
    itemInfoDictList = [];
    for eachItem in foundAllItems:
        logging.debug("eachItem=%s", eachItem);
        href = eachItem['href']; #/cars/buy/150594.php
        logging.debug("href=%s", href);
        itemLink = gConst['domain'] + href; #http://autoexplosion.com/cars/buy/150594.php
        logging.debug("itemLink=%s", itemLink);
        itemInfoDict = processEachItem(itemLink);
        if(not itemInfoDict['omitted']):
            itemInfoDictList.append(itemInfoDict);
            gVal['allTypeInfoDict'][curType]["processedNum"] += 1;
        else:
            logging.info("Omit %s for %s", itemLink, itemInfoDict['omitReason']);
            gVal['allTypeInfoDict'][curType]["omittedNum"] += 1;
        gVal['curItemNum'] += 1;
    outputInfoDictToFile(itemInfoDictList);
    return ;

def initOutputCsvFile():
    #init output file
    # 'a+': read,write,append
    # 'w' : clear before, then write
    #outputFp = open(gVal['csvFilename'], 'w');
    outputFp = open(gVal['csvFilename'], 'wb'); # MUST in binary mode !!!
    csvWriter = csv.writer(outputFp, dialect='excel');
    # itemInfoDict = {
    #     'Lead Source'   : "",
    #     'Ad Id'         : "",
    #     'Batch Date'    : "",
    #     'Phone'         : "",
    #     'Price'         : "",
    #     'Zip code'      : "",
    #     'Year'          : "",
    #     'Title'         : "",
    #     'Description'   : "",
    #     'Email'         : "",
    #     'URL'           : "",
    #     'Mileage'       : "",
    #     'City'          : "",
    # };
    csvHeaderList = [
        "Lead Source",
        "Ad Id",
        "Batch Date",
        "Phone",
        "Price",
        "Zip code",
        "Year",
        "Title",
        "Description",
        "Email",
        "URL",
        "Mileage",
        "City",
    ];
    csvWriter.writerow(csvHeaderList);
    outputFp.close();
    return ;

def main():
    #support parameter
    newParser = argparse.ArgumentParser(description="Extract autoexplosion's boats,cycles,RVs,cars info then save into csv.");
    newParser.add_argument("-e", "--exclueZipcodeFile", dest="exclueZipcodeFile", default="material/Example/Excluded_area_codes.csv", help="Csv file contains the excluded zip code");
    newParser.add_argument("-f", "--priceFrom", type=int, default=gConst['priceFrom'], dest="priceFrom", help="Minimal money for item");
    newParser.add_argument("-t", "--priceTo", type=int, default=gConst['priceTo'], dest="priceTo", help="Maximum money for item");
    newParser.add_argument("-b", "--beginDatetimeStr", default=gConst['beginDatetimeStr'], dest="beginDatetimeStr", help="Begin date time");
    newParser.add_argument("-d", "--endDatetimeStr", default=gConst['endDatetimeStr'], dest="endDatetimeStr", help="End date time");
    args = newParser.parse_args();
    argsDict = args.__dict__;
    for eachArg in argsDict.keys():
        exec(eachArg + " = args." + eachArg);

    #init values
    gVal['priceFrom'] = priceFrom;
    gVal['priceTo'] = priceTo;
    gVal['beginDatetimeStr'] = beginDatetimeStr;
    gVal['endDatetimeStr'] = endDatetimeStr;
    # gVal['endDatetime'] = datetime.strptime(gVal['endDatetime'], "%Y-%m-%d %H:%M:%S");
    # gVal['beginDatetime'] = datetime.strptime(gVal['beginDatetime'], "%Y-%m-%d %H:%M:%S");
    gVal['beginDatetime'] = datetime.strptime(gVal['beginDatetimeStr'], "%Y-%m-%d");
    gVal['endDatetime'] = datetime.strptime(gVal['endDatetimeStr'], "%Y-%m-%d");
    logging.info("gVal['beginDatetime']=%s", gVal['beginDatetime']);
    logging.info("gVal['endDatetime']=%s", gVal['endDatetime']);
    gVal['exclueZipcodeFile'] = exclueZipcodeFile;
    logging.info("gVal['priceFrom']=%d, gVal['priceTo']=%d", gVal['priceFrom'], gVal['priceTo']);

    #init output file
    gVal['csvFilename'] = gVal['csvFilename'] + "_" + gVal['beginDatetimeStr'] + "_to_" + gVal['endDatetimeStr'] + ".csv";
    initOutputCsvFile();

    #init
    crifanLib.initAutoHandleCookies();

    #init exclude zip code list
    initExcludeZipCodeList();

    eachPageNum = 15;

    #get total number of search item
    #typeList = ["cars", "RVs", "bikes", "boats"];
    typeList = gVal['allTypeInfoDict'].keys();
    for curType in typeList:
        totalNum = getTotalNum(curType);
        #debug
        #totalNum = 4;
        singleTypeInfoDict = {
            "totalNum"      : 0,
            "omittedNum"    : 0,
            "processedNum"  : 0,
        };
        singleTypeInfoDict["totalNum"] = totalNum;
        gVal['allTypeInfoDict'][curType] = singleTypeInfoDict;
        logging.info("%s", crifanLib.formatString("curType=%s totalNum=%d"%(curType,totalNum)));
        gVal['curTotalNum'] = totalNum;
        totalPageNum = totalNum/eachPageNum;
        if(totalNum%eachPageNum > 0):
            totalPageNum += 1;
        logging.debug("totalPageNum=%d", totalPageNum);
        for curPageIdx in range(totalPageNum):
            #init
            curOffset = curPageIdx * eachPageNum;
            logging.debug("curOffset=%d", curOffset);
            gVal['curItemNum'] = curOffset + 1;
            #debug
            # #http://autoexplosion.com/bikes/buy/results.php?go=1&price_to=999999&price_from=4999&offset=375
            # curOffset = 375;
            logging.info("%s", crifanLib.formatString("curType=%s,curPageIdx=%d,curOffset=%d"%(curType,curPageIdx,curOffset), paddingChar="-"));
            singlePageHtml = getSinglePageHtml(curType, curOffset);
            processEachPageHtml(curType, singlePageHtml);
        logging.info("Complete to process total %d %s", totalNum, curType);

    #done, output statistics info
    logging.info("%s", crifanLib.formatString("Statistic Info"));
    for curType in gVal['allTypeInfoDict'].keys():
        logging.info("%s", crifanLib.formatString("%s"%(curType), paddingChar="-"));
        logging.info("Total Number:\t%d", gVal['allTypeInfoDict'][curType]['totalNum']);
        logging.info("Omitted Number:\t%d", gVal['allTypeInfoDict'][curType]['omittedNum']);
        logging.info("Processed Number:\t%d", gVal['allTypeInfoDict'][curType]['processedNum']);

###############################################################################
if __name__=="__main__":
    scriptSelfName = crifanLib.extractFilename(sys.argv[0]);

    logging.basicConfig(
        level    = logging.DEBUG,
        format   = 'LINE %(lineno)-4d  %(levelname)-8s %(message)s',
        datefmt  = '%m-%d %H:%M',
        filename = scriptSelfName + ".log",
        filemode = 'w');
    # define a Handler which writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler();
    console.setLevel(logging.INFO);
    # set a format which is simpler for console use
    formatter = logging.Formatter('LINE %(lineno)-4d : %(levelname)-8s %(message)s');
    # tell the handler to use this format
    console.setFormatter(formatter);
    logging.getLogger('').addHandler(console);

    try:
        main();
    except:
        logging.exception("Unknown Error !");
        raise;
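For reference, per the script's own docstring and argparse setup, a typical run passes a begin/end date range; the price range (-f/-t) and the excluded-zip-code CSV (-e) are optional, with defaults as shown:

    scrape_autoexplosion_com.py -b 2013-05-04 -d 2013-05-21
    scrape_autoexplosion_com.py -b 2013-05-04 -d 2013-05-21 -f 4999 -t 999999 -e material/Example/Excluded_area_codes.csv

Note that the active code parses -b/-d with "%Y-%m-%d" only; the "%Y-%m-%d %H:%M:%S" form mentioned in the docstring corresponds to the commented-out strptime calls in main().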
【Summary】
The script walks the cars, RVs, bikes, and boats search results on autoexplosion.com; it omits listings that are suspended or no longer available, outside the given posted-date range, have a "Private" phone number, or fall in an excluded zip-code area, and appends each remaining item's fields to the output CSV file.