【Code Share】Python code: scrape_autoexplosion_com – scrape product info matching specific rules from autoexplosion.com and save it as CSV

【Background】

This script was written a while ago to scrape, from

http://autoexplosion.com

the product listings (cars, RVs, bikes, boats) that match specific rules – price within a given range, posted within a given date range, a non-private phone number, and a zip code not in an exclusion list –

and then export the collected info to a CSV file.

【Code Share: scrape_autoexplosion_com】

1. Screenshots:

(1) Runtime output:

[screenshot: scrape_autoexplosion_com.py running, console output]

(2) Data exported to a CSV file:

[screenshot: scrape_autoexplosion_com.py output saved as a CSV file]
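
For reference (in case the screenshot does not display here), the header row that initOutputCsvFile writes at the top of the exported CSV is:

Lead Source,Ad Id,Batch Date,Phone,Price,Zip code,Year,Title,Description,Email,URL,Mileage,City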

2. Download the Python project code:

scrape_autoexplosion_com_2013-05-21.7z

 

3. Code:

(1) scrape_autoexplosion_com.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------------------------------------
Function:
scrape for autoexplosion.com

Version:    2013-05-21
Author:     Crifan Li
Contact:    admin@crifan.com

Usage:
scrape_autoexplosion_com.py

(1)for %Y-%m-%d
scrape_autoexplosion_com.py -b 2013-05-04 -d 2013-05-21
(2)for %Y-%m-%d %H:%M:%S
scrape_autoexplosion_com.py -b "2013-05-04 00:00:00" -d "2013-05-20 00:00:00"

TODO:
1.

-------------------------------------------------------------------------------
"""
#--------------------------------const values-----------------------------------

gConst = {
    "xls"   : {
        'fileName'  : "outputInfo.xls",
        'sheetName' : "outputInfo",
    },
    
    'domain'    : "http://autoexplosion.com",
    
    'priceFrom' : 4999,
    'priceTo'   : 999999,

    # 'beginDatetimeStr' : "1970-01-01 00:00:00",
    # 'endDatetimeStr'   : "2039-12-30 00:00:00",
    'beginDatetimeStr' : "1970-01-01",
    'endDatetimeStr'   : "2039-12-30",
};

gCfg = {

};

gVal = {
    "csvFilename" : "autoexplosionItemsInfo",

    'priceFrom' : 0,
    'priceTo'   : 0,

    #date time string
    'beginDatetimeStr'  : "",
    'endDatetimeStr'    : "",
    #converted to datetime type value
    'beginDatetime'     : "",
    'endDatetime'       : "",

    'exclueZipcodeFile' : "",
    
    'excludeZipCodeList' : [],

    'allTypeInfoDict' : {
        "cars"  : None, # is singleTypeInfoDict
        "RVs"   : None,
        "bikes" : None,
        "boats" : None,
    },
    
    #for show info
    'curItemNum': 0, 
    'curTotalNum':0,
};

#---------------------------------import---------------------------------------
import re;
import sys;
sys.path.append("libs");
from BeautifulSoup import BeautifulSoup,Tag,CData;
import crifanLib;
import logging;
import urllib;
import json;
import os;
import argparse;
import codecs;

import csv;
import xlwt;
import xlrd;
#import xlutils;
from xlutils.copy import copy;

from datetime import datetime;

def getTypeAndIdFromUrl(itemLink):
    """
        get type from item url link
        input:
        http://autoexplosion.com/cars/buy/150594.php
        http://autoexplosion.com/bikes/buy/11812.php
        output:
        cars,150594
        bikes,11812
    """
    (mainType, adId) = ("", "");
    #http://autoexplosion.com/cars/buy/150594.php
    logging.debug("input itemLink=%s", itemLink);
    foundMainType = re.search("http://autoexplosion\.com/(?P<mainType>\w+)/buy/(?P<adId>\d+)\.php", itemLink);
    logging.debug("foundMainType=%s", foundMainType);
    if(foundMainType):
        mainType = foundMainType.group("mainType");
        adId = foundMainType.group("adId");
        #mainType = foundMainType.group(1);
        #adId = foundMainType.group(2);
        logging.debug("mainType=%s, adId=%s", mainType, adId); #cars
    else:
        logging.error("Fail to find mainType,adId from %s", itemLink);
        sys.exit(-1);

    return (mainType, adId)

def processEachItem(itemLink):
    """
        process each search item from its url
            extract all info
        input example:
        http://autoexplosion.com/cars/buy/150594.php
        http://autoexplosion.com/bikes/buy/11812.php
    """

    logging.info("%s", crifanLib.formatString("[%d/%d] %s"%(gVal['curItemNum'],gVal['curTotalNum'],itemLink), paddingChar="-"));
    
    #debug html tag
    #itemLink = "http://autoexplosion.com/cars/buy/150954.php";
    #itemLink = "http://autoexplosion.com/RVs/buy/9776.php";
    #itemLink = "http://autoexplosion.com/cars/buy/151366.php";
    
    #itemRespHtml = crifanLib.getUrlRespHtml(itemLink);
    itemRespHtml = crifanLib.getUrlRespHtml_multiTry(itemLink, maxTryNum=50);

    #logging.debug("itemRespHtml=%s", itemRespHtml);

    itemInfoDict = {
        'omitted'       : False,
        'omitReason'    : "",
        
        'Lead Source'   : "",
        'Ad Id'         : "",
        'Batch Date'    : "",
        'Phone'         : "",
        'Price'         : "",
        'Zip code'      : "",
        'Year'          : "",
        'Title'         : "",
        'Description'   : "",
        'Email'         : "",
        'URL'           : "",
        'Mileage'       : "",
        'City'          : "",
    };
    
    #check whether this page is invalid or not

    #http://autoexplosion.com/cars/buy/151366.php
    #<b>This listing has been suspended and is currently being reviewed.</b>
    suspendedNotice = "This listing has been suspended and is currently being reviewed";
    if(re.search(suspendedNotice, itemRespHtml)):
        itemInfoDict['omitted'] = True;
        itemInfoDict['omitReason'] = suspendedNotice;
        return itemInfoDict;
    
    #http://autoexplosion.com/RVs/buy/9764.php
    #This listing is no longer available.
    noLongerAvailableNotice = "This listing is no longer available";
    if(re.search(noLongerAvailableNotice, itemRespHtml)):
        itemInfoDict['omitted'] = True;
        itemInfoDict['omitReason'] = noLongerAvailableNotice;
        return itemInfoDict;

    #init
    locationStr = "";
    
    # URL
    noHttpUrl = itemLink.replace("http://", "");
    itemInfoDict['URL'] = noHttpUrl;
    
    soup = BeautifulSoup(itemRespHtml);

    #within time range or not
    #http://autoexplosion.com/bikes/buy/11812.php
    # <tr>
        # <td valign="top" nowrap><b>Posted</b></td>
        # <td valign="top">Feb. 19, 2013</td>
    # </tr>
    
    #http://autoexplosion.com/boats/buy/4270.php
    #Posted Apr. 25, 2013 
    
    #http://autoexplosion.com/boats/buy/4262.php
    #Posted Apr. 5, 2013 
    
    foundPosted = re.search('<td valign="top" nowrap><b>Posted</b></td>\s*<td valign="top">(?P<postDatetimeStr>[\w,\.\s]+?)</td>', itemRespHtml);
    if(foundPosted):
        postDatetimeStr = foundPosted.group("postDatetimeStr");
        logging.debug("postDatetimeStr=%s", postDatetimeStr);
        parsedPostDate = datetime.strptime(postDatetimeStr, "%b. %d, %Y");
        logging.debug("parsedPostDate=%s", parsedPostDate);
        if((parsedPostDate < gVal['beginDatetime']) or (parsedPostDate > gVal['endDatetime'])):
            omitReason = "PostDate=%s, not within range: %s <-> %s"%(parsedPostDate, gVal['beginDatetime'], gVal['endDatetime']);

            itemInfoDict['omitted'] = True;
            itemInfoDict['omitReason'] = omitReason;
            return itemInfoDict;
    else:
        logging.error("not found poste date for %s", itemLink);
        sys.exit(-1);

    #1. check should be omit or not
    #(1) has phone number
    #http://autoexplosion.com/cars/buy/150594.php
    # <tr>
        # <td valign="top" nowrap><b>Phone</b></td>
        # <td valign="top">
                # Private
            # </td>
    # </tr>
    
    #http://autoexplosion.com/cars/buy/150887.php
    # <tr>
        # <td valign="top" nowrap><b>Phone</b></td>
        # <td valign="top">
                # <span itemprop="telephone">(210) 473-9820</span>
            # </td>
    # </tr>
    
    #http://autoexplosion.com/boats/buy/4270.php
    # <tr>
        # <td valign="top" nowrap><b>Phone</b></td>
        # <td valign="top">
                # (714) 532-0988		</td>
    # </tr>
    #foundPhone = soup.find(name="span", attrs={"itemprop":"telephone"});
    #foundPhone = re.search('<td valign="top" nowrap><b>Phone</b></td>\s*<td valign="top">\s*(<span itemprop="telephone">)?(?P<phoneStr>[\d\(\)\-\s]+)(</span>)?\s*</td>', itemRespHtml);
    foundPhone = re.search('<td valign="top" nowrap><b>Phone</b></td>\s*<td valign="top">\s*(<span itemprop="telephone">)?(?P<phoneStr>.+?)(</span>)?\s*</td>', itemRespHtml);
    logging.debug("foundPhone=%s", foundPhone);
    if(foundPhone):
        #itemInfoDict['Phone'] = foundPhone.string;
        phoneStr = foundPhone.group("phoneStr");
        logging.debug("phoneStr=%s", phoneStr);
        stripedPhoneStr = phoneStr.strip();
        logging.debug("stripedPhoneStr=%s", stripedPhoneStr);
        if(stripedPhoneStr == "Private"):
            itemInfoDict['omitted'] = True;
            itemInfoDict['omitReason'] = "Phone is Private";
            return itemInfoDict;
        else:
            onlyDigitPhone = re.sub("[^\d]", "", stripedPhoneStr);
            itemInfoDict['Phone'] = onlyDigitPhone; #(210) 473-9820
            logging.info("itemInfoDict['Phone']\t=%s", itemInfoDict['Phone']);
    else:
        logging.error("not found phone for %s", itemLink);
        sys.exit(-1);
        
    #(2) not in exclude zip code list
    #http://autoexplosion.com/cars/buy/150594.php
    # <tr>
        # <td valign="top" nowrap><b>Location</b></td>
        # <td valign="top" itemprop="address">Tampa, FL</td>
    # </tr>
    
    #http://autoexplosion.com/boats/buy/4270.php
    # <tr>
        # <td valign="top" nowrap><b>Location</b></td>
        # <td valign="top">Orange, CA</td>
    # </tr>
    
    #foundLocation = soup.find(name="td", attrs={"itemprop":"address"});
    #foundLocation = re.search('<td valign="top" nowrap><b>Location</b></td>\s*<td valign="top"( itemprop="address")?>(?P<location>[\w,]+)</td>', itemRespHtml);
    foundLocation = re.search('<td valign="top" nowrap><b>Location</b></td>\s*<td valign="top"( itemprop="address")?>(?P<location>.+?)</td>', itemRespHtml);
    logging.debug("foundLocation=%s", foundLocation);
    if(foundLocation):
        #locationStr = foundLocation.string; #Tampa, FL
        locationStr = foundLocation.group("location"); #Tampa, FL
        zipCode = crifanLib.getZipcodeFromLocation(locationStr);
        
        itemInfoDict['Zip code'] = zipCode;
        logging.info("itemInfoDict['Zip code']\t=%s", itemInfoDict['Zip code']);
        
        mainZipCode = zipCode[0:3];
        logging.debug("mainZipCode=%s", mainZipCode);
        if(mainZipCode in gVal['excludeZipCodeList']):
            logging.debug("mainZipCode=%s is in excludeZipCodeList, so omit this", mainZipCode);
            itemInfoDict['omitted'] = True;
            itemInfoDict['omitReason'] = "mainZipCode=%s in exclude list"%(mainZipCode);
            return itemInfoDict;
    else:
        logging.error("not found location for %s", itemLink);
        sys.exit(-1);
    
    #(3) makesure money is in valid range: 4999 - 999999
    #http://autoexplosion.com/cars/buy/150594.php
    # <tr itemprop="offers" itemscope itemtype="http://schema.org/Offer">
        # <td valign="top" nowrap><b>Price</b></td>
        # <td valign="top"><b class="cssHeader" itemprop="price">$9,000</b><meta itemprop="priceCurrency" content="USD" /></td>
    # </tr>
    
    #http://autoexplosion.com/boats/buy/4270.php
    # <tr>
        # <td valign="top" nowrap><b>Price</b></td>
        # <td valign="top"><b class="cssHeader">$5,000</b></td>
    # </tr>
    #foundPrice = soup.find(name="b", attrs={"class":"cssHeader", "itemprop":"price"});
    foundPrice = re.search('<td valign="top" nowrap><b>Price</b></td>\s*<td valign="top"><b class="cssHeader"[^<>]*?>(?P<price>.+?)</b>', itemRespHtml);
    logging.debug("foundPrice=%s", foundPrice);
    if(foundPrice):
        #priceDollar = foundPrice.string; #$19,999
        priceDollar = foundPrice.group("price"); #$19,999
        logging.debug("priceDollar=%s", priceDollar);
        priceDollarUni = unicode(priceDollar);
        price = priceDollarUni.replace("$", "").replace(",", "");
        logging.debug("price=%s", price);
        
        itemInfoDict['Price'] = price;
        logging.info("itemInfoDict['Price']\t=%s", itemInfoDict['Price']);
        
        priceInt = int(itemInfoDict['Price']);
        if(priceInt >= gVal['priceFrom'] and priceInt <= gVal['priceTo']):
            #correct
            logging.debug("item price indeed with range");
        else:
            logging.error("item price %d out of range, gVal['priceFrom']=%d, gVal['priceTo']=%d", priceInt, gVal['priceFrom'], gVal['priceTo']);
            sys.exit(-1);
    else:
        logging.error("not found price for %s", itemLink);
        sys.exit(-1);
    
    #2. prepare basic info
    #(1)mainType, adId
    (mainType, adId) = getTypeAndIdFromUrl(itemLink);

    #3 extract remain infos
    #(1) Lead Source
    itemInfoDict['Lead Source'] = "autoexplosion-" + mainType;
    logging.info("itemInfoDict['Lead Source']=%s", itemInfoDict['Lead Source']);
    
    #(2) Ad Id
    itemInfoDict['Ad Id'] = adId;
    logging.info("itemInfoDict['Ad Id']\t=%s", itemInfoDict['Ad Id']);
    
    #(3) Batch Date
    itemInfoDict['Batch Date'] = datetime.now().strftime("%m/%d/%Y");
    logging.info("itemInfoDict['Batch Date']\t=%s", itemInfoDict['Batch Date']);
    
    #(4) Year
    # <tr>
        # <td valign="top" nowrap><b>Year</b></td>
        # <td valign="top">2004</td>
    # </tr>
    foundYear = re.search('<td valign="top" nowrap><b>Year</b></td>\s*<td valign="top">(?P<year>\d+)</td>', itemRespHtml);
    logging.debug("foundYear=%s", foundYear);
    if(foundYear):
        itemInfoDict['Year'] = foundYear.group("year");
        logging.info("itemInfoDict['Year']\t=%s", itemInfoDict['Year']);
    else:
        logging.error("Fail to find year from %s", itemLink);
        sys.exit(-1);

    #(5) Title
    #<span itemprop="name">2008 Acura Rdx</span>
    foundTitle = soup.find(name="span", attrs={"itemprop":"name"});
    logging.debug("foundTitle=%s", foundTitle);
    if(foundTitle):
        origTitle = foundTitle.string; #2008 Acura Rdx
        trueTitle = re.sub("^"+itemInfoDict['Year']+" (.+)$", r"\1", origTitle);
        #http://autoexplosion.com/bikes/buy/11722.php
        logging.debug("trueTitle=%s", trueTitle); #OC Choppers Super Stretch 124” Softail
        filteredTitleUni = crifanLib.filterNonAsciiStr(trueTitle); #OC Choppers Super Stretch 124 Softail
        itemInfoDict['Title'] = filteredTitleUni;
        logging.info("itemInfoDict['Title']\t=%s", itemInfoDict['Title']);
    else:
        logging.error("Fail to find title from %s", itemLink);
        sys.exit(-1);

    #(6) Description
    #http://autoexplosion.com/cars/buy/150594.php
    # <tr>
        # <td valign="top" colspan="2" itemprop="description">
                # 2004 ACURA MDX Touring Package<br />
    # V6 VTEC 3.5L 4 Wheel Drive<br />
    # Transmission: Automatic<br />
    # Mileage: 115000<br />
    # Seating: 7 passenger<br />
    # Bose Premium Sound 6 disk CD Changer<br />
    # Roof Rack<br />
    # Running Boards<br />
    # Premium Wheels<br />
    # Traction Control<br />
    # ABS<br />
    # Leather Interior<br />
    # Rear Climate Control<br />
    # <br />
    # Kelley Blue Book Private Party Sale Value: $9,909		</td>
    # </tr>
    
    #http://autoexplosion.com/boats/buy/4270.php
    # <tr>
        # <td valign="top" colspan="2"><b>Description</b></td>
    # </tr>
    # <tr>
        # <td valign="top" colspan="2">
                # Remanufactured Volvo 250 installed in 2009<br />
    # Has radar with plotter<br />
    # Outdrive needs sevices, cost will be deducted from sale price.<br />
    # Boat is in the water. no trailer		</td>
    # </tr>
    
    #http://autoexplosion.com/bikes/buy/11812.php
    # <tr>
        # <td valign="top" colspan="2"><b>Description</b></td>
    # </tr>
    # <tr>
        # <td valign="top" colspan="2" itemprop="description">
                # 2004 AMERICAN IRONHORSE SLAMMER<br />
    # 124 c.c. S&S<br />
    # XRT CUSTOM PACKAGE<br />
    # 2,800 MILES<br />
    # <br />
    # Feel free to contact me with any questions : <a href="mailto:marciano.lindahl6@hotmail.com">marciano.lindahl6@hotmail.com</a>		</td>
    # </tr>
    
    #foundDescription = soup.find(name="td", attrs={"valign":"top", "colspan":"2", "itemprop":"description"});
    foundDescriptionList = soup.findAll(name="td", attrs={"valign":"top", "colspan":"2"});
    #will find two
    #first is 
    #<td valign="top" colspan="2"><b>Description</b></td>
    #second is real description content
    foundDescription = foundDescriptionList[1];
    logging.debug("foundDescription=%s", foundDescription);

    if(foundDescription):
        descContents = crifanLib.soupContentsToUnicode(foundDescription.contents);
        #descContents = foundDescription.renderContents();
        logging.debug("descContents=%s", descContents);
        descHtmlDecoded = crifanLib.decodeHtmlEntity(descContents);
        logging.debug("descHtmlDecoded=%s", descHtmlDecoded);
        descHtmlFiltered = crifanLib.filterHtmlTag(descHtmlDecoded);
        logging.debug("descHtmlFiltered=%s", descHtmlFiltered);
        
        #http://autoexplosion.com/cars/buy/150631.php
        # Peapack, NJ. \u2028\u2028Mrs. Onassis bought
        #UnicodeEncodeError: 'ascii' codec can't encode character u'\u2028' in position 318: ordinal not in range(128)
        
        #so remove special unicode char
        descHtmlFilteredUni = crifanLib.filterNonAsciiStr(descHtmlFiltered);
        itemInfoDict['Description'] = descHtmlFilteredUni;
        
        logging.debug("itemInfoDict['Description']=%s", itemInfoDict['Description']);
    else:
        logging.error("Fail to find description from %s", itemLink);
        sys.exit(-1);

    #(7) Email
    itemInfoDict['Email'] = "";
    
    #(8) Mileage
    if(mainType == "boats"):
        #http://autoexplosion.com/boats/buy/4270.php
        #ALL boats no Mileage
        itemInfoDict['Mileage'] = "0";
    else:
        #http://autoexplosion.com/cars/buy/150594.php
        # <tr>
            # <td valign="top" nowrap><b>Mileage</b></td>
            # <td valign="top">115,000 miles</td>
        # </tr>
        
        #http://autoexplosion.com/RVs/buy/9234.php
        # <tr>
            # <td valign="top" nowrap><b>Mileage</b></td>
            # <td valign="top">n/a</td>
        # </tr>
        
        #foundMileage = re.search('<td valign="top" nowrap><b>Mileage</b></td>\s*<td valign="top">(?P<mileage>[\d,]+) miles</td>', itemRespHtml);
        foundMileage = re.search('<td valign="top" nowrap><b>Mileage</b></td>\s*<td valign="top">(?P<mileageStr>[^<>]+)</td>', itemRespHtml);
        logging.debug("foundMileage=%s", foundMileage);
        if(foundMileage):
            mileageStr = foundMileage.group("mileageStr"); #115,000 miles or n/a
            foundMiles = re.search("(?P<mileage>[\d,]+) miles", mileageStr);
            logging.debug("foundMiles=%s", foundMiles);
            if(foundMiles):
                mileage = foundMiles.group("mileage");
                logging.debug("mileage=%s", mileage);
                mileage = mileage.replace(",", ""); #115000
                itemInfoDict['Mileage'] = mileage;
            else:
                #http://autoexplosion.com/RVs/buy/9234.php
                #n/a
                #itemInfoDict['Mileage'] = mileageStr;
                itemInfoDict['Mileage'] = "0";
        else:
            logging.error("Fail to find mileage from %s", itemLink);
            sys.exit(-1);
    logging.info("itemInfoDict['Mileage']\t=%s", itemInfoDict['Mileage']);
    
    #(9) City
    logging.debug("locationStr=%s", locationStr);
    if(locationStr):
        cityStateList = locationStr.split(",");
        city = cityStateList[0];
        itemInfoDict['City'] = city;
        logging.info("itemInfoDict['City']\t=%s", itemInfoDict['City']);
    else:
        logging.error("Fail to find city from %s", itemLink);
        sys.exit(-1);
    
    return itemInfoDict;

def getTotalNum(curType):
    totalNum = 0;
    
    #Method 1: 
    #http://autoexplosion.com/cars/buy/advanced.php
    
    #process cars
    #to get cookie
    #http://autoexplosion.com/cars/buy/
    # carsBuyUrl = "http://autoexplosion.com/cars/buy/";
    # carsBuyRespHtml = crifanLib.getUrlRespHtml(carsBuyUrl);
    # logging.debug("carsBuyRespHtml=%s", carsBuyRespHtml);
    
    #http://autoexplosion.com/cars/buy/advanced.php
    
    #searchBaseUrl = "http://autoexplosion.com/cars/buy/advanced.php";
    searchBaseUrl = "http://autoexplosion.com/" + curType + "/buy/advanced.php";
    
    # searchBaseRespHtml = crifanLib.getUrlRespHtml(searchBaseUrl);
    # logging.debug("searchBaseRespHtml=%s", searchBaseRespHtml);
    
    #http://autoexplosion.com/cars/buy/advanced.php?zip=&radius=0&make%5B%5D=&model%5B%5D=&year_from=&year_to=&price_from=4999&price_to=999999&mileage=&posted_after=
    paraDict = {
        'zip'           : "",
        'radius'        : "0",
        'make%5B%5D'    : "",
        'model%5B%5D'   : "",
        'year_from'     : "",
        'year_to'       : "",
        'price_from'    : gVal['priceFrom'],
        'price_to'      : gVal['priceTo'],
        'mileage'       : "",
        'posted_after'  : "",
    };
    searchUrl = crifanLib.genFullUrl(searchBaseUrl, paraDict);
    logging.info("searchUrl=%s", searchUrl);
    headerDict = {
        #'Referer'       : "http://autoexplosion.com/cars/buy/advanced.php",
        'Referer'       : "http://autoexplosion.com/" + curType + "/buy/advanced.php",
    };
    #searchRespHtml = crifanLib.getUrlRespHtml(searchUrl, headerDict=headerDict);
    searchRespHtml = crifanLib.getUrlRespHtml_multiTry(searchUrl, headerDict=headerDict, maxTryNum=50);
    #logging.debug("searchRespHtml=%s", searchRespHtml);
    
    #<p>Your search returned <b class="cssColorMain">966</b>
    soup = BeautifulSoup(searchRespHtml);
    #logging.debug("soup=%s", soup);
    foundTotalNum = soup.find(name="b", attrs={"class":"cssColorMain"});
    logging.debug("foundTotalNum=%s", foundTotalNum);
    if(foundTotalNum):
        totalNum = foundTotalNum.string; #966
        totalNum = int(totalNum);
        logging.info("totalNum=%d", totalNum);
    else:
        logging.error("Can not find total num !");
        sys.exit(-1);

    return totalNum;

def getSinglePageHtml(curType, offset):
    #http://autoexplosion.com/cars/buy/results.php?go=1&price_from=4999&price_to=999999&offset=15
    #searchBaseUrl = "http://autoexplosion.com/cars/buy/results.php";
    searchBaseUrl = "http://autoexplosion.com/" + curType + "/buy/results.php";
    paraDict = {
        'go'            : "1",
        'price_from'    : gVal['priceFrom'],
        'price_to'      : gVal['priceTo'],
        'offset'        : str(offset),
    };
    searchUrl = crifanLib.genFullUrl(searchBaseUrl, paraDict);
    logging.info("searchUrl=%s", searchUrl);
    
    #searchRespHtml = crifanLib.getUrlRespHtml(searchUrl);
    searchRespHtml = crifanLib.getUrlRespHtml_multiTry(searchUrl, maxTryNum=50);
    #logging.debug("searchRespHtml=%s", searchRespHtml);

    return searchRespHtml;

def initExcludeZipCodeList():
    #parse csv file to generate the exclude zip code list
    gVal['excludeZipCodeList'] = [];
    exZipCsvFile = open(gVal['exclueZipcodeFile'], 'r');
    logging.debug("exZipCsvFile=%s", exZipCsvFile);
    exZipCsvReader = csv.reader(exZipCsvFile)
    logging.debug("exZipCsvReader=%s", exZipCsvReader);
    for row in exZipCsvReader:
        logging.debug("row=%s", row);
        firstContent = row[0];
        logging.debug("firstContent=%s", firstContent);
        filteredContent = firstContent.replace("'", "");
        logging.debug("filteredContent=%s", filteredContent);
        filteredContent = filteredContent.replace('"', "");
        logging.debug("filteredContent=%s", filteredContent);
        #curExCode = int(filteredContent);
        curExCode = filteredContent;
        logging.debug("curExCode=%s", curExCode);
        gVal['excludeZipCodeList'].append(curExCode);
    logging.debug("gVal['excludeZipCodeList']=%s", gVal['excludeZipCodeList']);
    
    return ;

def outputInfoDictToFile(itemInfoDictList):
    #output all info dict list
    #outputFp = open(gVal['csvFilename'], 'a+');
    outputFp = open(gVal['csvFilename'], 'ab+'); # MUST in binary mode !!!
    csvWriter = csv.writer(outputFp, dialect='excel');
    for eachInfoDict in itemInfoDictList:
        fieldList = [];
        fieldList.append(eachInfoDict['Lead Source']);
        fieldList.append(eachInfoDict['Ad Id']);
        fieldList.append(eachInfoDict['Batch Date']);
        fieldList.append(eachInfoDict['Phone']);
        fieldList.append(eachInfoDict['Price']);
        fieldList.append(eachInfoDict['Zip code']);
        fieldList.append(eachInfoDict['Year']);
        fieldList.append(eachInfoDict['Title']);
        fieldList.append(eachInfoDict['Description']);
        fieldList.append(eachInfoDict['Email']);
        fieldList.append(eachInfoDict['URL']);
        fieldList.append(eachInfoDict['Mileage']);
        fieldList.append(eachInfoDict['City']);
        logging.info("fieldList=%s", fieldList);

        csvWriter.writerow(fieldList);
    outputFp.close();
    
    return ;
    
def processEachPageHtml(curType, eachPageHtml):
    #for each page to process it

    soup = BeautifulSoup(eachPageHtml);
    logging.debug("soup=%s", soup);
    #<table width="100%" border="0" cellspacing="0" cellpadding="0" class="cssTable">
    #foundCssTable = soup.find(id="table", attrs={"class":"cssTable"}); !!! not id, but name
    foundCssTable = soup.find(name="table", attrs={"class":"cssTable"});
    logging.debug("foundCssTable=%s", foundCssTable);
    #<td><a href="/cars/buy/150594.php">Acura Mdx</a></td>
    #foundAllItems = foundCssTable.findAll(name="a", attrs={"href":re.compile("/cars/buy/\d+\.php")});
    foundAllItems = foundCssTable.findAll(name="a", attrs={"href":re.compile("/" + curType + "/buy/\d+\.php")});
    logging.debug("foundAllItems=%s", foundAllItems);
    itemLen = len(foundAllItems);
    logging.debug("itemLen=%s", itemLen);
    
    itemInfoDictList = [];
    for eachItem in foundAllItems:
        logging.debug("eachItem=%s", eachItem);
        href = eachItem['href']; #/cars/buy/150594.php
        logging.debug("href=%s", href);
        itemLink = gConst['domain'] + href; #http://autoexplosion.com/cars/buy/150594.php
        logging.debug("itemLink=%s", itemLink);
        itemInfoDict = processEachItem(itemLink);
        if(not itemInfoDict['omitted']):
            itemInfoDictList.append(itemInfoDict);
            gVal['allTypeInfoDict'][curType]["processedNum"] += 1;
        else:
            logging.info("Omit %s for %s", itemLink, itemInfoDict['omitReason']);
            gVal['allTypeInfoDict'][curType]["omittedNum"] += 1;
        
        gVal['curItemNum'] += 1;

    outputInfoDictToFile(itemInfoDictList);
    return ;
    
def initOutputCsvFile():
    #init output file
    # 'a+': read,write,append
    # 'w' : clear before, then write
    #outputFp = open(gVal['csvFilename'], 'w');
    outputFp = open(gVal['csvFilename'], 'wb'); # MUST in binary mode !!!
    csvWriter = csv.writer(outputFp, dialect='excel');
    # itemInfoDict = {
        # 'Lead Source'   : "",
        # 'Ad Id'         : "",
        # 'Batch Date'    : "",
        # 'Phone'         : "",
        # 'Price'         : "",
        # 'Zip code'      : "",
        # 'Year'          : "",
        # 'Title'         : "",
        # 'Description'   : "",
        # 'Email'         : "",
        # 'URL'           : "",
        # 'Mileage'       : "",
        # 'City'          : "",
    # };
    csvHeaderList = [
        "Lead Source",
        "Ad Id",
        "Batch Date",
        "Phone",
        "Price",
        "Zip code",
        "Year",
        "Title",
        "Description",
        "Email",
        "URL",
        "Mileage",
        "City",
    ];
    csvWriter.writerow(csvHeaderList);
    outputFp.close();
    return ;

def main():
    #support parameter
    newParser = argparse.ArgumentParser(description="Extract autoexplosion's cars, RVs, bikes and boats info, then save into csv.");
    newParser.add_argument("-e", "--exclueZipcodeFile", dest="exclueZipcodeFile", default="material/Example/Excluded_area_codes.csv", help="Csv file contains the excluded zip code");
    newParser.add_argument("-f", "--priceFrom", type=int, default=gConst['priceFrom'],   dest="priceFrom", help="Minimal money for item");
    newParser.add_argument("-t", "--priceTo", type=int, default=gConst['priceTo'],   dest="priceTo", help="Maximum money for item");
    newParser.add_argument("-b", "--beginDatetimeStr", default=gConst['beginDatetimeStr'], dest="beginDatetimeStr", help="Begin date time");
    newParser.add_argument("-d", "--endDatetimeStr", default=gConst['endDatetimeStr'], dest="endDatetimeStr", help="End date time");

    args = newParser.parse_args();
    argsDict = args.__dict__;
    for eachArg in argsDict.keys():
        exec(eachArg + " = args." + eachArg);

    #init values
    gVal['priceFrom'] = priceFrom;
    gVal['priceTo'] = priceTo;
    
    gVal['beginDatetimeStr'] = beginDatetimeStr;
    gVal['endDatetimeStr'] = endDatetimeStr;
    
    # gVal['endDatetime'] = datetime.strptime(gVal['endDatetime'], "%Y-%m-%d %H:%M:%S");
    # gVal['beginDatetime'] = datetime.strptime(gVal['beginDatetime'], "%Y-%m-%d %H:%M:%S");
    gVal['beginDatetime'] = datetime.strptime(gVal['beginDatetimeStr'], "%Y-%m-%d");
    gVal['endDatetime'] = datetime.strptime(gVal['endDatetimeStr'], "%Y-%m-%d");
    logging.info("gVal['beginDatetime']=%s", gVal['beginDatetime']);
    logging.info("gVal['endDatetime']=%s", gVal['endDatetime']);
    
    gVal['exclueZipcodeFile'] = exclueZipcodeFile;
    logging.info("gVal['priceFrom']=%d, gVal['priceTo']=%d", gVal['priceFrom'], gVal['priceTo']);
    
    #init output file
    gVal['csvFilename'] = gVal['csvFilename'] + "_" + gVal['beginDatetimeStr'] + "_to_" + gVal['endDatetimeStr'] + ".csv";
    initOutputCsvFile();
    
    #init
    crifanLib.initAutoHandleCookies();

    #init exclude zip code list
    initExcludeZipCodeList();
    
    eachPageNum = 15;
    #get total number of search item
    
    #typeList = ["cars", "RVs", "bikes", "boats"];
    typeList = gVal['allTypeInfoDict'].keys();
    for curType in typeList:
        totalNum = getTotalNum(curType);
        
        #debug
        #totalNum = 4;
        
        singleTypeInfoDict = {
            "totalNum"      : 0,
            "omittedNum"    : 0,
            "processedNum"  : 0,
        };
        singleTypeInfoDict["totalNum"] = totalNum;
        gVal['allTypeInfoDict'][curType] = singleTypeInfoDict;
        
        logging.info("%s", crifanLib.formatString("curType=%s totalNum=%d"%(curType,totalNum)));
        
        gVal['curTotalNum'] = totalNum;
        
        totalPageNum = totalNum/eachPageNum;
        if(totalNum%eachPageNum > 0):
            totalPageNum += 1;
        logging.debug("totalPageNum=%d", totalPageNum);
        for curPageIdx in range(totalPageNum):
            #init
            curOffset = curPageIdx * eachPageNum;
            logging.debug("curOffset=%d", curOffset);
            
            gVal['curItemNum'] = curOffset + 1;

            #debug
            # #http://autoexplosion.com/bikes/buy/results.php?go=1&price_to=999999&price_from=4999&offset=375
            # curOffset = 375;

            logging.info("%s", crifanLib.formatString("curType=%s,curPageIdx=%d,curOffset=%d"%(curType,curPageIdx,curOffset), paddingChar="-"));
            singlePageHtml = getSinglePageHtml(curType, curOffset);
            processEachPageHtml(curType, singlePageHtml);
        
        logging.info("Complete to process total %d %s", totalNum, curType);
    
    #done, output statistics info
    logging.info("%s", crifanLib.formatString("Statistic Info"));
    for curType in gVal['allTypeInfoDict'].keys():
        logging.info("%s", crifanLib.formatString("%s"%(curType), paddingChar="-"));
        logging.info("Total Number:\t%d", gVal['allTypeInfoDict'][curType]['totalNum']);
        logging.info("Omitted Number:\t%d", gVal['allTypeInfoDict'][curType]['omittedNum']);
        logging.info("Processed Number:\t%d", gVal['allTypeInfoDict'][curType]['processedNum']);
    
###############################################################################
if __name__=="__main__":
    scriptSelfName = crifanLib.extractFilename(sys.argv[0]);

    logging.basicConfig(
                    level    = logging.DEBUG,
                    format   = 'LINE %(lineno)-4d  %(levelname)-8s %(message)s',
                    datefmt  = '%m-%d %H:%M',
                    filename = scriptSelfName + ".log",
                    filemode = 'w');
    # define a Handler which writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler();
    console.setLevel(logging.INFO);
    # set a format which is simpler for console use
    formatter = logging.Formatter('LINE %(lineno)-4d : %(levelname)-8s %(message)s');
    # tell the handler to use this format
    console.setFormatter(formatter);
    logging.getLogger('').addHandler(console);
    try:
        main();
    except:
        logging.exception("Unknown Error !");
        raise;
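
To try the posted-date filtering from processEachItem in isolation, here is a minimal standalone sketch of the same logic. The helper name isWithinRange and the sample date strings are illustrative only; the real script extracts the "Posted" string from each listing page with a regex and takes the range from the -b / -d arguments.

from datetime import datetime

def isWithinRange(postDatetimeStr, beginStr="2013-05-04", endStr="2013-05-21"):
    # "Posted" strings on listing pages look like "Apr. 25, 2013"
    postedDate = datetime.strptime(postDatetimeStr, "%b. %d, %Y")
    # the -b / -d command line values use the "%Y-%m-%d" format
    beginDatetime = datetime.strptime(beginStr, "%Y-%m-%d")
    endDatetime = datetime.strptime(endStr, "%Y-%m-%d")
    # same check as processEachItem: omit if posted before begin or after end
    return beginDatetime <= postedDate <= endDatetime

if __name__ == "__main__":
    print(isWithinRange("Apr. 25, 2013"))  # False: posted before 2013-05-04
    print(isWithinRange("May. 10, 2013"))  # True: inside the range

For the full script, the argparse options above allow an invocation such as:

scrape_autoexplosion_com.py -f 4999 -t 999999 -b 2013-05-04 -d 2013-05-21 -e material/Example/Excluded_area_codes.csv

which, per the filename logic in main(), writes the results to autoexplosionItemsInfo_2013-05-04_to_2013-05-21.csv.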

 

【Summary】

The script queries each type's advanced search (cars, RVs, bikes, boats) on autoexplosion.com with the configured price range, walks the result pages 15 items at a time, and skips listings that are suspended, no longer available, posted outside the -b/-d date range, have a "Private" phone number, or whose zip code prefix appears in the exclusion CSV. For every remaining listing it writes Lead Source, Ad Id, Batch Date, Phone, Price, Zip code, Year, Title, Description, Email, URL, Mileage and City into the output CSV, and logs per-type statistics (total, omitted, processed) at the end.