#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------------------------------------
Function:
scrape wheelbynet.com

Version:    2013-07-05
Author:     Crifan Li
Contact:    admin@crifan.com

Usage:
scrape_wheelbynet_com.py

(1)for %Y-%m-%d
scrape_wheelbynet_com.py -b 2013-05-04 -d 2013-05-21
(2)for %Y-%m-%d %H:%M:%S
scrape_wheelbynet_com.py -b "2013-05-04 00:00:00" -d "2013-05-20 00:00:00"

TODO:
1.

-------------------------------------------------------------------------------
"""
#--------------------------------const values-----------------------------------

# Fixed settings that never change at runtime.
gConst = {
    # name of the output spreadsheet and of its single sheet
    "xls": {
        "fileName": "outputInfo.xls",
        "sheetName": "outputInfo",
    },

    # site root
    "domain": "http://www.wheelbynet.com",

    # default price window
    "minPrice": 5000,
    "maxPrice": 5000000,

    # NOTE: the date-range settings (beginDatetimeStr / endDatetimeStr)
    # are currently disabled.
}

# Placeholder for user configuration; nothing is stored here yet.
gCfg = {}

# Mutable state shared by the functions of this script.
gVal = {
    "csvFilename": "wheelbynetItemsInfo",

    # price window actually used for the search
    "minPrice": 0,
    "maxPrice": 0,

    # path of the CSV file listing zip-code prefixes to skip
    # (the key spelling is kept as-is because other code looks it up)
    "exclueZipcodeFile": "",

    # prefixes loaded from the file above
    "excludeZipCodeList": [],

    # one entry per category; each category has its own index page:
    #   http://www.wheelbynet.com/docs/auto/index.html
    #   http://www.wheelbynet.com/docs/rv/index.html
    #   http://www.wheelbynet.com/docs/moto/index.html
    #   http://www.wheelbynet.com/docs/boat/index.html
    # every value starts as None and is meant to hold a singleTypeInfoDict
    "allTypeInfoDict": dict.fromkeys(["auto", "rv", "moto", "boat"]),

    # progress counters, only used for log output
    "curItemNum": 0,
    "curTotalNum": 0,
}

#---------------------------------import---------------------------------------
import re;
import sys;
sys.path.append("libs");
from BeautifulSoup import BeautifulSoup,Tag,CData;
import crifanLib;
import logging;
import urllib;
import json;
import os;
import argparse;
import codecs;

import csv;
import xlwt;
import xlrd;
#import xlutils;
from xlutils.copy import copy;

from datetime import datetime;

def getTypeAndIdFromUrl(itemLink):
    """
        Split an ad detail URL into its category and its ad id.

        The ad_ref query value is the category name immediately followed by
        the id, so the category found in the path is required (through a
        back-reference) to be repeated at the start of ad_ref.

        input:
        http://www.wheelbynet.com/docs/auto/view_ad2.php3?ad_ref=autoG1RLDLQ8AQEM
        http://www.wheelbynet.com/docs/moto/view_ad2.php3?ad_ref=moto8EJ0VP28A9XK&motorcycle_make=Honda&motorcycle_model=Shadow+VT700
        output:
        ("auto", "G1RLDLQ8AQEM")
        ("moto", "8EJ0VP28A9XK")

        Returns a (mainType, adId) tuple of strings.
        Calls sys.exit(-1) (raises SystemExit) when the URL does not have
        the expected shape.
    """
    logging.debug("input itemLink=%s", itemLink)

    # Raw string so the regex escapes are not interpreted as (invalid)
    # Python string escapes. Any extra "&key=value" parameters after the id
    # need no pattern of their own: \w+ stops at the first "&".
    foundMainType = re.search(
        r"http://(www\.)?wheelbynet\.com/docs/(?P<mainType>[a-zA-Z]+)/view_ad2\.php3\?ad_ref=(?P=mainType)(?P<adId>\w+)",
        itemLink)
    logging.debug("foundMainType=%s", foundMainType)

    if not foundMainType:
        logging.error("Fail to find mainType,adId from %s", itemLink)
        sys.exit(-1)

    mainType = foundMainType.group("mainType")
    adId = foundMainType.group("adId")
    logging.debug("mainType=%s, adId=%s", mainType, adId)

    return (mainType, adId)

def _abortItemParse(errorMsg, itemLink, itemRespHtml):
    """
        Report an unrecoverable parse failure for one ad page and stop the script.

        errorMsg must contain exactly one %s, which is filled with itemLink.
        The page HTML is logged at debug level (so the regular expressions
        can be adjusted afterwards), then sys.exit(-1) is called.
    """
    logging.error(errorMsg, itemLink)
    logging.debug("itemRespHtml=%s", itemRespHtml)
    sys.exit(-1)


def processEachItem(itemLink):
    """
        Download one ad detail page and extract all of its fields.

        input example:
        http://www.wheelbynet.com/docs/auto/view_ad2.php3?ad_ref=autoG1RLDLQ8AQEM
        http://www.wheelbynet.com/docs/moto/view_ad2.php3?ad_ref=moto8EJ0VP28A9XK&motorcycle_make=Honda&motorcycle_model=Shadow+VT700

        Returns a dict with the keys:
            'omitted', 'omitReason',
            'Lead Source', 'Ad Id', 'Batch Date', 'Phone', 'Price',
            'Zip code', 'Year', 'Title', 'Description', 'Email', 'URL',
            'Mileage', 'City'
        When the first three digits of the zip code are listed in
        gVal['excludeZipCodeList'], the dict is returned early with
        'omitted' set to True and only 'URL', 'Zip code' and 'omitReason'
        filled in.

        A page without Miles (e.g. a boat ad, which only has Hours) gets
        Mileage "0". Any other field that can not be found is fatal: the
        failure is logged and the script exits through sys.exit(-1).

        NOTE: the site sometimes answers with a stub page
        ("REDIRECT if can not connect to mysql database"); such a page has
        no seller block, so it ends in the "not found location" exit.
    """
    logging.info("%s", crifanLib.formatString("[%d/%d] %s" % (gVal['curItemNum'], gVal['curTotalNum'], itemLink), paddingChar="-"))

    itemRespHtml = crifanLib.getUrlRespHtml_multiTry(itemLink, maxTryNum=50)

    itemInfoDict = {
        'omitted'       : False,
        'omitReason'    : "",

        'Lead Source'   : "",
        'Ad Id'         : "",
        'Batch Date'    : "",
        'Phone'         : "",
        'Price'         : "",
        'Zip code'      : "",
        'Year'          : "",
        'Title'         : "",
        'Description'   : "",
        'Email'         : "",
        'URL'           : "",
        'Mileage'       : "",
        'City'          : "",
    }

    (mainType, adId) = getTypeAndIdFromUrl(itemLink)
    logging.debug("mainType=%s, adId=%s", mainType, adId)
    itemInfoDict['URL'] = itemLink

    #1. seller location: decides whether the item is omitted
    # The seller block looks like (both <br> and <br /> occur):
    #   Joe (<a href="...">Contact Seller</a>)<br>Oak Ridge, New Jersey 07438<br>973-452-4859<br>
    #   Andrea (<a href="...">Contact Seller</a>)<br />Edmond, Oklahoma 7313<br />405-285-2472<br />
    # The og:postal-code meta tag is not used because some pages lack it.
    # City and zip code come from one match, so both are always read from
    # the same seller block; the zip code is not always 5 digits long.
    foundLocation = re.search(r'>Contact\s+Seller</a>\)<br\s*/?>(?P<cityStr>[\w ]+), [\w ]+ (?P<postalCode>\d+)<br\s*/?>', itemRespHtml)
    logging.debug("foundLocation=%s", foundLocation)
    if not foundLocation:
        _abortItemParse("not found location for %s", itemLink, itemRespHtml)

    postalCode = foundLocation.group("postalCode") #47833
    itemInfoDict['Zip code'] = postalCode
    logging.info("itemInfoDict['Zip code']\t=%s", itemInfoDict['Zip code'])

    mainZipCode = postalCode[0:3] #478
    logging.debug("mainZipCode=%s", mainZipCode)
    if mainZipCode in gVal['excludeZipCodeList']:
        logging.debug("mainZipCode=%s is in excludeZipCodeList, so omit this", mainZipCode)
        itemInfoDict['omitted'] = True
        itemInfoDict['omitReason'] = "mainZipCode=%s in exclude list" % (mainZipCode)
        return itemInfoDict

    #2. extract remaining infos

    #(1) Lead Source
    itemInfoDict['Lead Source'] = "wheelbynet-" + mainType
    logging.info("itemInfoDict['Lead Source']=%s", itemInfoDict['Lead Source'])

    #(2) Ad Id
    itemInfoDict['Ad Id'] = adId
    logging.info("itemInfoDict['Ad Id']\t=%s", itemInfoDict['Ad Id'])

    #(3) Batch Date: the day of this run
    itemInfoDict['Batch Date'] = datetime.now().strftime("%m/%d/%Y")
    logging.info("itemInfoDict['Batch Date']\t=%s", itemInfoDict['Batch Date'])

    #(4) Phone
    # The phone follows the zip code; formats seen so far:
    #   812-236-7971   (360) 414-1382   248/324-4566   217-67;2-6906   601.953.6681
    foundPhone = re.search(r'\d+<br\s*/?>(?P<phoneStr>[\./;\(\)\d\- ]+)<br\s*/?>', itemRespHtml)
    logging.debug("foundPhone=%s", foundPhone)
    if not foundPhone:
        _abortItemParse("not found phone for %s", itemLink, itemRespHtml)

    phoneStr = foundPhone.group("phoneStr")
    logging.debug("phoneStr=%s", phoneStr)
    # keep the digits only: 8122367971, 3604141382
    itemInfoDict['Phone'] = re.sub(r"[^\d]", "", phoneStr)
    logging.info("itemInfoDict['Phone']\t=%s", itemInfoDict['Phone'])

    #(5) Price
    #<strong><font size="3" face="Verdana,arial">2007 Dodge Charger<br />$19,500.00</font></strong>
    foundPrice = re.search(r'<br\s*/?>(?P<priceStr>\$[\d,\.]+)</font></strong>', itemRespHtml)
    logging.debug("foundPrice=%s", foundPrice)
    if not foundPrice:
        _abortItemParse("not found price for %s", itemLink, itemRespHtml)

    priceStr = foundPrice.group("priceStr") #$19,500.00
    logging.debug("priceStr=%s", priceStr)
    # Drop the cents before removing the non-digits. Stripping only a
    # run of zeros would turn "$19,500.50" into 1950050, so a one or two
    # digit fraction is removed as well.
    priceNoDecimalPoint = re.sub(r"\.(?:\d{1,2}|0+)$", "", priceStr)
    logging.debug("priceNoDecimalPoint=%s", priceNoDecimalPoint)
    price = re.sub(r"[^\d]", "", priceNoDecimalPoint)
    logging.debug("price=%s", price)
    itemInfoDict['Price'] = price
    logging.info("itemInfoDict['Price']\t=%s", itemInfoDict['Price'])

    #(6) Year
    # <td><font size="2" face="Verdana"><strong>Year:</strong></font></td><td><font size="2" face="Verdana">
    # 2007</font></td>
    foundYear = re.search(r'<strong>Year:</strong></font></td><td><font\s+size="2"\s+face="Verdana">\s*(?P<year>\d+)\s*</font></td>', itemRespHtml)
    logging.debug("foundYear=%s", foundYear)
    if not foundYear:
        _abortItemParse("Fail to find year from %s", itemLink, itemRespHtml)

    itemInfoDict['Year'] = foundYear.group("year")
    logging.info("itemInfoDict['Year']\t=%s", itemInfoDict['Year'])

    #(7) Title: the text in front of the price
    #<font size="3" face="Verdana,arial">2006 Harley-Davidson FLSTC Heritage Softail Classic<br>$12,500.00</font>
    foundTitle = re.search(r'<font\s+size="3"\s+face="Verdana,arial">(?P<title>.+?)<br\s*/?>\$', itemRespHtml)
    logging.debug("foundTitle=%s", foundTitle)
    if not foundTitle:
        _abortItemParse("Fail to find title from %s", itemLink, itemRespHtml)

    title = foundTitle.group("title")
    logging.debug("title=%s", title)
    itemInfoDict['Title'] = title
    logging.info("itemInfoDict['Title']\t=%s", itemInfoDict['Title'])

    #(8) Description
    # <td width="100%"><font size="2" face="Verdana">The bike is actually a Heritage FLSTI. ...</font>        </td>
    foundDescription = re.search(r'<td\s+width="100%"><font\s+size="2"\s+face="Verdana">(?P<description>.+?)</font>\s*</td>', itemRespHtml)
    if not foundDescription:
        _abortItemParse("Fail to find description from %s", itemLink, itemRespHtml)

    description = foundDescription.group("description")
    logging.debug("description=%s", description)
    # the page declares: <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
    descriptionUnicode = description.decode("iso-8859-1", 'ignore')
    # decode html entities, remove html tags, then drop the non-ascii chars
    descHtmlDecoded = crifanLib.decodeHtmlEntity(descriptionUnicode)
    descHtmlFiltered = crifanLib.filterHtmlTag(descHtmlDecoded)
    descHtmlOnlyAscii = crifanLib.filterNonAsciiStr(descHtmlFiltered)
    itemInfoDict['Description'] = descHtmlOnlyAscii.strip()
    logging.debug("itemInfoDict['Description']=%s", itemInfoDict['Description'])

    #(9) Email: not scraped, always the placeholder "0"
    itemInfoDict['Email'] = "0"

    #(10) Mileage
    # <td><font size="2" face="Verdana"><strong>Miles:</strong></font></td><td><font size="2" face="Verdana">
    # 49,000					</font></td>
    # some ads show "--" instead of a number
    foundMiles = re.search(r'<td><font\s+size="2"\s+face="Verdana"><strong>Miles:</strong></font></td><td><font\s+size="2"\s+face="Verdana">\s*(?P<milesStr>[\d,\-]+)\s*</font></td>', itemRespHtml)
    logging.debug("foundMiles=%s", foundMiles)
    if foundMiles:
        milesStr = foundMiles.group("milesStr") #49,000
        logging.debug("milesStr=%s", milesStr)
        # "--" has no digit at all, which counts as 0
        itemInfoDict['Mileage'] = re.sub(r"[^\d]", "", milesStr) or "0"
        logging.info("itemInfoDict['Mileage']\t=%s", itemInfoDict['Mileage'])
    else:
        # not fatal, for example:
        # http://www.wheelbynet.com/docs/boat/view_ad2.php3?ad_ref=boat37K66CPNPI2H
        # has no Miles, only Hours
        logging.debug("Fail to find mileage from %s", itemLink)
        logging.debug("itemRespHtml=%s", itemRespHtml)
        logging.debug("set Mileage to 0 for %s", itemLink)
        itemInfoDict['Mileage'] = "0"

    #(11) City: taken from the location matched in step 1
    itemInfoDict['City'] = foundLocation.group("cityStr")
    logging.info("itemInfoDict['City']\t=%s", itemInfoDict['City'])

    return itemInfoDict

def getSecIdFromCurType(curType):
    """
        Translate a category name into the sec_id value of the search url.

        The search url has the form:
        http://www.wheelbynet.com/docs/moto/search_moto275.php3?ad_image=&sec_id=1&sort=on&state=&sort5=entry_date&type_name=&make_name=&min_yr=1900&max_yr=2014&min_price=5000&max_price=5000000&seller=private&B1=Search
        where sec_id is 1 for auto, 2 for boat, 3 for moto and 4 for rv.

        Returns the sec_id as int, or 0 for any other curType.
    """
    typeToSecIdList = (
        ("auto", 1),
        ("boat", 2),
        ("moto", 3),
        ("rv",   4),
    )

    for (typeName, secId) in typeToSecIdList:
        if curType == typeName:
            return secId

    # unknown category
    return 0

def getTotalPageNum(curType):
    """
        Find out how many result pages the search of one category has.

        The first result page is requested with the price window taken from
        gVal['minPrice'] / gVal['maxPrice'], and the number of the last
        pagination link found in it is returned as int.

        The pagination links look like:
        <a href="/docs/moto/search_moto275.php3?offset=25&sec_id=1&min_yr=1900&max_yr=2014&min_price=5000&max_price=5000000&seller=private&B1=Search&pg=2&tc=5000">2</a>
        ...
        <a href="/docs/moto/search_moto275.php3?offset=475&sec_id=1&min_yr=1900&max_yr=2014&min_price=5000&max_price=5000000&seller=private&B1=Search&pg=20&tc=5000">20</a>
        which gives ['2', '3', ..., '20'], so 20 is returned.

        When no pagination link is found the script exits by sys.exit(-1).
        NOTE(review): a result that fits on one single page presumably has
        no such link and would also end here - to be confirmed.
    """
    totalPageNum = 0

    secIdStr = str(getSecIdFromCurType(curType))

    # request the index page of the category, like
    # http://www.wheelbynet.com/docs/auto/index.html
    # The response itself is not used.
    # NOTE(review): presumably only requested for a side effect (such as
    # cookies) - to be confirmed before removing the request.
    categoryIndexUrl = "http://www.wheelbynet.com/docs/" + curType + "/index.html"
    logging.debug("mainUrl=%s", categoryIndexUrl)
    crifanLib.getUrlRespHtml(categoryIndexUrl)

    # all categories use the same search script, only sec_id differs:
    # http://www.wheelbynet.com/docs/moto/search_moto275.php3?offset=25&sec_id=1&min_yr=1900&max_yr=2014&min_price=5000&max_price=5000000&seller=private&B1=Search&pg=2&tc=500
    firstPageParaDict = {
        'offset'        : "0",
        'sec_id'        : secIdStr,
        'min_yr'        : "1900",
        'max_yr'        : "2014",
        'min_price'     : gVal['minPrice'],
        'max_price'     : gVal['maxPrice'],
        'seller'        : "private",
        'B1'            : "Search",
        'pg'            : '1',
        'tc'            : '5000', # max search 5000 !
    }
    firstPageUrl = crifanLib.genFullUrl("http://www.wheelbynet.com/docs/moto/search_moto275.php3", firstPageParaDict)
    logging.info("searchUrl=%s", firstPageUrl)

    firstPageHtml = crifanLib.getUrlRespHtml_multiTry(firstPageUrl, maxTryNum=50)

    # only keep the links whose text is the same as their own pg value
    pageNumStrList = re.findall('<a href="/docs/\w+/search_\w+275\.php3\?offset=\d+&sec_id=\d+&min_yr=\d+&max_yr=\d+&min_price=\d+&max_price=\d+&seller=private&B1=Search&pg=(?P<pgNum>\d+)&tc=\d+">(?P=pgNum)</a>', firstPageHtml)
    logging.info("foundPageLinkList=%s", pageNumStrList)

    if not pageNumStrList:
        logging.error("Can not find total number !")
        logging.debug("searchRespHtml=%s", firstPageHtml)
        sys.exit(-1)
    else:
        # the last link is the one of the last page
        lastPageNumStr = pageNumStrList[-1]
        logging.debug("Total page number string = %s", lastPageNumStr)
        totalPageNum = int(lastPageNumStr)
        logging.debug("totalPageNum=%d", totalPageNum)

    return totalPageNum

def getSinglePageHtml(curType, offset):
    """
        Download the html of one search result page.

        curType selects the category (converted to sec_id), offset is put
        into the url unchanged as the offset parameter, and the price
        window comes from gVal['minPrice'] / gVal['maxPrice']. Example url:
        http://www.wheelbynet.com/docs/moto/search_moto275.php3?offset=25&sec_id=1&min_yr=1900&max_yr=2014&min_price=5000&max_price=5000000&seller=private&B1=Search&pg=2&tc=500
        (pg and tc are not sent here).

        Sometimes the server answers with a stub page instead:
            </form>	// REDIRECT if can not connect to mysql database
            <SCRIPT language="JavaScript1.1">
            location.replace("http://www.wheelbynet.com/docs/main/server-upgrade.html");
            </SCRIPT>
        In that case the page is requested again, with no limit on the
        number of rounds, until a normal page is received.

        Returns the html of the page.
    """
    pageParaDict = {
        'offset'        : offset,
        'sec_id'        : str(getSecIdFromCurType(curType)),
        'min_yr'        : "1900",
        'max_yr'        : "2014",
        'min_price'     : gVal['minPrice'],
        'max_price'     : gVal['maxPrice'],
        'seller'        : "private",
        'B1'            : "Search",
    }
    pageUrl = crifanLib.genFullUrl("http://www.wheelbynet.com/docs/moto/search_moto275.php3", pageParaDict)
    logging.info("singlePageSearchUrl=%s", pageUrl)

    # text that only shows up in the stub page
    dbErrorMark = "REDIRECT if can not connect to mysql database"

    pageHtml = crifanLib.getUrlRespHtml_multiTry(pageUrl, maxTryNum=50)
    while re.search(dbErrorMark, pageHtml):
        logging.warning("occur: REDIRECT if can not connect to mysql database, so re-fetch the html for %s", pageUrl)
        pageHtml = crifanLib.getUrlRespHtml_multiTry(pageUrl, maxTryNum=50)

    return pageHtml

def initExcludeZipCodeList():
    """
        Load the zip code prefixes to exclude from a csv file.

        The file name is read from gVal['exclueZipcodeFile']. Only the first
        column of each row is used; single and double quote characters and
        surrounding white space are removed from it. The codes are kept as
        strings, because they are compared with the first three characters
        of a zip code string.

        The result replaces gVal['excludeZipCodeList'].
        Empty rows (blank lines) and empty codes are skipped.
        Returns nothing; an error while opening the file is not caught.
    """
    # reset first, so a failed open never leaves the codes of a former call
    excludeZipCodeList = []
    gVal['excludeZipCodeList'] = excludeZipCodeList

    # "with" closes the file even when reading a row fails
    with open(gVal['exclueZipcodeFile'], 'r') as exZipCsvFile:
        logging.debug("exZipCsvFile=%s", exZipCsvFile)
        for row in csv.reader(exZipCsvFile):
            logging.debug("row=%s", row)
            if not row:
                # csv.reader gives [] for a blank line, row[0] would fail
                continue

            curExCode = row[0].replace("'", "").replace('"', "").strip()
            logging.debug("curExCode=%s", curExCode)
            if not curExCode:
                continue

            excludeZipCodeList.append(curExCode)

    logging.debug("gVal['excludeZipCodeList']=%s", gVal['excludeZipCodeList'])

    return

def outputInfoDictToFile(itemInfoDictList):
    """Append one csv row for each item info dict to the output csv file.

    The file named by gVal['csvFilename'] is opened in append mode and written
    with the csv 'excel' dialect. Each row is also logged at INFO level.

    Args:
        itemInfoDictList: list of dicts, each containing every key listed in
            fieldNameList below.

    Raises:
        KeyError: if an item info dict misses one of the expected keys.
    """
    # column order; must stay the same as the header written by initOutputCsvFile
    fieldNameList = (
        'Lead Source',
        'Ad Id',
        'Batch Date',
        'Phone',
        'Price',
        'Zip code',
        'Year',
        'Title',
        'Description',
        'Email',
        'URL',
        'Mileage',
        'City',
    )

    # MUST in binary mode !!!
    # 'with' makes sure the file is closed, even when a row fails to be written
    with open(gVal['csvFilename'], 'ab+') as outputFp:
        csvWriter = csv.writer(outputFp, dialect='excel')
        for eachInfoDict in itemInfoDictList:
            fieldList = [eachInfoDict[fieldName] for fieldName in fieldNameList]
            logging.info("fieldList=%s", fieldList)
            csvWriter.writerow(fieldList)

    return
    
def processEachPageHtml(curType, eachPageHtml):
    """Process every item linked from one search result page.

    Finds all 'view_ad2.php3?ad_ref=...' links in the page, calls
    processEachItem for each of them, updates the processedNum/omittedNum
    counters of gVal['allTypeInfoDict'][curType] and gVal['curItemNum'], then
    appends all non-omitted items to the csv file via outputInfoDictToFile.
    When the page contains no item link, only a debug message is logged.

    Args:
        curType: type name used both as key of gVal['allTypeInfoDict'] and as
            path part of the item url, e.g. "auto".
        eachPageHtml: html of a single search result page.
    """
    # sample of a single item row in the page:
    # <tr bgcolor="#ffffff"><td width=275><font face=verdana,arial size=2><a href="view_ad2.php3?ad_ref=autoG1RLDLQ8AQEM">Dodge Charger</a> ...</font></td>...</tr>
    # raw string, so \s \. \? \w are passed to the regex engine as-is
    foundAllAhref = re.findall(r'<a\s+href="(view_ad2\.php3\?ad_ref=\w+)">[^<>]+?</a>', eachPageHtml, re.I)
    if not foundAllAhref:
        logging.debug("Can not find any item link for curType=%s eachPageHtml=%s", curType, eachPageHtml)
        return

    itemInfoDictList = []
    # the page contains: <BASE href="http://www.wheelbynet.com/docs/auto/" target="_top">
    # so the relative links are resolved against the per-type docs directory
    hrefBase = "http://www.wheelbynet.com/docs/" + curType + "/"
    for eachAHref in foundAllAhref:
        itemLink = hrefBase + eachAHref
        itemInfoDict = processEachItem(itemLink)
        if itemInfoDict['omitted']:
            logging.info("Omit %s for %s", itemLink, itemInfoDict['omitReason'])
            gVal['allTypeInfoDict'][curType]["omittedNum"] += 1
        else:
            itemInfoDictList.append(itemInfoDict)
            gVal['allTypeInfoDict'][curType]["processedNum"] += 1

        gVal['curItemNum'] += 1

    #output info
    outputInfoDictToFile(itemInfoDictList)

    return
    
def initOutputCsvFile():
    """Create (or truncate) the output csv file and write its header row.

    The file is named by gVal['csvFilename'] and written with the csv 'excel'
    dialect. Mode 'wb' clears any previous content before writing.
    """
    # header columns; outputInfoDictToFile writes the item fields in this same order
    csvHeaderList = [
        "Lead Source",
        "Ad Id",
        "Batch Date",
        "Phone",
        "Price",
        "Zip code",
        "Year",
        "Title",
        "Description",
        "Email",
        "URL",
        "Mileage",
        "City",
    ]

    # MUST in binary mode !!!
    # 'with' makes sure the file is closed, even when writing the header fails
    with open(gVal['csvFilename'], 'wb') as outputFp:
        csvWriter = csv.writer(outputFp, dialect='excel')
        csvWriter.writerow(csvHeaderList)

    return
    
def main():
    """Scrape all item types of wheelbynet.com and save the kept items into csv.

    Steps:
    1. parse the command line options and store them into gVal
    2. create the output csv file and write its header
    3. init cookie and proxy handling of crifanLib
    4. load the excluded zip codes
    5. for each type in gVal['allTypeInfoDict'], fetch and process every
       search result page
    6. log the total/omitted/processed numbers of each type
    """
    #support parameter
    newParser = argparse.ArgumentParser(description="Extarct wheelbynet's auto,rv,moto,boat info then save into csv.")
    newParser.add_argument("-e", "--exclueZipcodeFile", dest="exclueZipcodeFile", default="material/Excluded_area_codes.csv", help="Csv file contains the excluded zip code")
    newParser.add_argument("-i", "--minPrice", type=int, default=gConst['minPrice'], dest="minPrice", help="Minimal money for item")
    newParser.add_argument("-a", "--maxPrice", type=int, default=gConst['maxPrice'], dest="maxPrice", help="Maximum money for item")
    # NOTE: the -b/--beginDatetimeStr and -d/--endDatetimeStr options are currently disabled

    args = newParser.parse_args()

    #init values
    # read the parsed values directly from args, instead of using exec() to
    # create local variables (which does not work as a function in Python 3)
    gVal['minPrice'] = args.minPrice
    gVal['maxPrice'] = args.maxPrice
    gVal['exclueZipcodeFile'] = args.exclueZipcodeFile
    logging.info("gVal['minPrice']=%d, gVal['maxPrice']=%d", gVal['minPrice'], gVal['maxPrice'])

    #init output file
    gVal['csvFilename'] = gVal['csvFilename'] + ".csv"
    initOutputCsvFile()

    #init
    crifanLib.initAutoHandleCookies()
    #here use gae 127.0.0.1:8087
    crifanLib.initProxyAndCookie({'http':"http://127.0.0.1:8087"})

    #init exclude zip code list
    initExcludeZipCodeList()

    # number of items in each search result page
    eachPageNum = 25

    # snapshot of the type names, for the values of the dict are replaced in the loop
    typeList = list(gVal['allTypeInfoDict'])
    for curType in typeList:
        totalPageNum = getTotalPageNum(curType)
        logging.debug("totalPageNum=%d", totalPageNum)
        # upper bound: computed as if every page (including the last one) is full
        totalNum = eachPageNum * totalPageNum
        logging.debug("totalNum=%d", totalNum)

        gVal['allTypeInfoDict'][curType] = {
            "totalNum"      : totalNum,
            "omittedNum"    : 0,
            "processedNum"  : 0,
        }

        logging.info("%s", crifanLib.formatString("curType=%s totalNum=%d"%(curType,totalNum)))

        gVal['curTotalNum'] = totalNum

        for curPageIdx in range(totalPageNum):
            #init
            curOffset = curPageIdx * eachPageNum
            logging.debug("curOffset=%d", curOffset)

            # 1-based number of the first item in current page, for show info
            gVal['curItemNum'] = curOffset + 1

            logging.info("%s", crifanLib.formatString("curType=%s,curPageIdx=%d,curOffset=%d"%(curType,curPageIdx,curOffset), paddingChar="-"))
            singlePageHtml = getSinglePageHtml(curType, curOffset)
            processEachPageHtml(curType, singlePageHtml)

        logging.info("Complete to process total %d %s", totalNum, curType)

    #done, output statistic info
    logging.info("%s", crifanLib.formatString("Statistic Info"))
    for curType in gVal['allTypeInfoDict']:
        singleTypeInfoDict = gVal['allTypeInfoDict'][curType]
        logging.info("%s", crifanLib.formatString("%s"%(curType), paddingChar="-"))
        logging.info("Total Number:\t%d", singleTypeInfoDict['totalNum'])
        logging.info("Omitted Number:\t%d", singleTypeInfoDict['omittedNum'])
        logging.info("Processed Number:\t%d", singleTypeInfoDict['processedNum'])
    
###############################################################################
# Script entry: set up logging to both a log file and the console, then run main()
if __name__=="__main__":
    # name of this script, used as base name of the log file
    # NOTE(review): presumably extractFilename drops the directory and the suffix -- confirm in crifanLib
    scriptSelfName = crifanLib.extractFilename(sys.argv[0]);

    # root logger: write DEBUG and higher messages into <scriptSelfName>.log,
    # filemode 'w' overwrites the log file of the previous run
    # note: datefmt takes no effect here, for the format contains no %(asctime)s
    logging.basicConfig(
                    level    = logging.DEBUG,
                    format   = 'LINE %(lineno)-4d  %(levelname)-8s %(message)s',
                    datefmt  = '%m-%d %H:%M',
                    filename = scriptSelfName + ".log",
                    filemode = 'w');
    # define a Handler which writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler();
    console.setLevel(logging.INFO);
    # set a format which is simpler for console use
    formatter = logging.Formatter('LINE %(lineno)-4d : %(levelname)-8s %(message)s');
    # tell the handler to use this format
    console.setFormatter(formatter);
    # attach the console handler to the root logger
    logging.getLogger('').addHandler(console);
    try:
        main();
    except:
        # log the traceback of any error into the log file and the console,
        # then re-raise it, so the script still terminates with the error
        logging.exception("Unknown Error !");
        raise;