【代码分享】Python代码:download_gcgis_map_pic – 从gcgis.org中的地图图片中提取信息并保存到excel文件

【背景】

这是之前写的一个脚本,用来处理:

http://www.gcgis.org/webmappub/titleWF.aspx

http://www.greenvillecounty.org/vrealpr24/clRealProp.ASP?WCI=tplRealSearch&WCE=Form1&WCU=

等地址,

根据地图编号(Map ID)查询房产信息,提取相关字段并保存为Excel文件,同时下载对应的地图图片。

 

【download_gcgis_map_pic代码分享】

1.截图:

(1)运行效果:

download_gcgis_map_pic.py run ui

(2)保存信息为excel文件:

download_gcgis_map_pic.py map pic info excel

(3)下载的地图图片:

download_gcgis_map_pic.py map pic

 

2.Python项目代码下载:

download_gcgis_map_pic_2012-11-13.7z

 

3.代码分享:

(1)download_gcgis_map_pic.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------------------------------------
Function:
Web Scrape 11-10-12
https://www.elance.com/j/web-scrape/35102090/

Version:    2012-11-13
Author:     Crifan Li
Contact:    http://www.crifan.com/about/me/

-------------------------------------------------------------------------------
"""

#---------------------------------import---------------------------------------
import re;
import sys;
sys.path.append("libs");

from BeautifulSoup import BeautifulSoup,Tag,CData;
import crifanLib;
import logging;
import xlwt;


def searchFromTwoTd(htmlToSearch, keyName, doHtmlDecode=True):
    """Return the text of the <td> cell that directly follows a label <td>.

    The page lays fields out as a styled label cell followed by a value cell:
        <td style="...">Mail Addr</td>
        <td style="width:35%">224 STONE LAKE DR               </td>

    htmlToSearch -- HTML text to search
    keyName      -- literal label text inside the first <td> (e.g. "Mail Addr")
    doHtmlDecode -- when True, a non-empty value is passed through
                    crifanLib.decodeHtmlEntity(..., decodedEncoding="GBK")

    Returns the stripped value, or "" when the label is not found or the
    value cell holds only whitespace.
    """
    # keyName is literal label text, not a regex fragment: escape it so that
    # labels containing ".", "(", "$" etc. match exactly.
    labelPattern = re.escape(str(keyName))
    foundTwoTd = re.search(
        r'<td\s+?style=".+?">' + labelPattern
        + r'</td>\s*?<td(\s+?style=".+?")?>(?P<foundValue>.+?)</td>',
        htmlToSearch)
    if not foundTwoTd:
        return ""

    foundValue = foundTwoTd.group("foundValue").strip()
    if foundValue and doHtmlDecode:
        # e.g. turns "707  &nbsp;GORDON ST EXT" into plain text
        # http://fredericiana.com/2010/10/08/decoding-html-entities-to-text-in-python/
        foundValue = crifanLib.decodeHtmlEntity(foundValue, decodedEncoding="GBK")

    return foundValue

def extractCommonInfo(searchRespHtml):
    """Collect the per-property fields from the search result page.

    Every field sits in a label/value pair of table cells, e.g.
        <td style="background-color:#004080;...">Owner 1</td>
        <td>HOOPER JOAN KIRKSEY           </td>
    and is looked up with searchFromTwoTd().

    Returns a dict keyed by the output column name; a field missing from the
    page (or blank) comes back as "".
    """
    # (output key, label text on the page) -- sample values seen on the page
    fieldLabelPairs = (
        ('Mail Addr',       "Mail Addr"),            # 224 STONE LAKE DR
        ('Mail City',       "City"),                 # GREENVILLE
        ('Owner1',          "Owner 1"),              # HOOPER JOAN KIRKSEY
        ('Owner2',          "Owner 2"),              # (blank)
        ('Mail Zip',        "Zip"),                  # 29609
        ('Desc',            "Desc"),                 # 1,PT2,7
        ('Loc',             "Loc"),                  # 707  GORDON ST EXT
        ('Acreage',         "Acreage"),              # 0.49
        ('SqFt',            "Sq Footage"),           # 0
        ('Deed Date',       "Deed Date"),            # 05/11/2007
        ('LandUse',         "Land Use"),             # 6800
        ('Fair Market Val', "Fair Market Value"),    # (blank)
        ('Sale Price',      "Sales Price"),          # &nbsp;
        ('Tax Val',         "Taxable Market Value"), # $24,500
        ('Bath',            "Num Bathrooms"),        # 0
        ('Bed',             "Num Bedrooms"),         # 0
        ('RollBack',        "Total Rollback"),       # $0.00
        ('Half Bath',       "Num Half Baths"),       # 0
        ('Assmt Class',     "Assmt Class"),          # OT
    )

    commonInfoDict = {}
    for outputKey, pageLabel in fieldLabelPairs:
        commonInfoDict[outputKey] = searchFromTwoTd(searchRespHtml, pageLabel)

    return commonInfoDict
    
def extractOutstandingInfo(searchRespHtml):
    (yearsOutstanding, accountNo, amountOutstanding) = ("", "", "");
    #original html:
                # <tr>
              # <td width="33%" bgcolor="#004080" align="middle" ><strong><font color="#ffffff">&nbsp;Years
            # Outstanding</font></strong></td>

              # <td width="33%" bgcolor="#004080" align="middle" ><strong><font color="#ffffff">&nbsp;Account
            # No</font></strong></td>

              # <td width="34%" bgcolor="#004080" align="right" ><strong><font color="#ffffff">&nbsp;Amount
            # Outstanding</font></strong></td>
                # </tr></table></tr></table>
    # <table border='0' width="100%" cellspacing='0' cellpadding='0'>
    # <tr>
      # <td width='33%' bgcolor='#f5f5f5' align="center">
      # 2011
      # </td>
                 
      # <td width='33%' bgcolor='#f5f5f5' align="center">
      # 201100011448477001
      # </td>
                                
      # <td width='34%' bgcolor='#f5f5f5' align="right">
      # $1,438.60
      # </td>
                                                            
    # </tr>
    # </table>
    #
    foundYearAccountAmount = re.search('<td .+?>&nbsp;Years\s*?Outstanding.+?</td>\s*?<td .+?>&nbsp;Account\s*?No.+?</td>\s*?<td .+?>&nbsp;Amount\s*?Outstanding.+?</td>.+?<td .+?>\s*?(?P<yearsOutstanding>\d+)\s*</td>\s*?<td .+?>\s*?(?P<accountNo>\d+)\s*</td>\s*?<td .+?>\s*?(?P<amountOutstanding>[$,\.\d]+?)\s*</td>', searchRespHtml, re.S);
    print "foundYearAccountAmount=",foundYearAccountAmount;
    if(foundYearAccountAmount):
        yearsOutstanding = foundYearAccountAmount.group("yearsOutstanding");
        accountNo = foundYearAccountAmount.group("accountNo");
        amountOutstanding = foundYearAccountAmount.group("amountOutstanding");
        print "yearsOutstanding=%s, accountNo=%s, amountOutstanding=%s"%(yearsOutstanding, accountNo, amountOutstanding);
    
    return (yearsOutstanding, accountNo, amountOutstanding);

def getHtmlByMapIdAndYear(mapId, year):
    """Submit the greenvillecounty.org real-property search form.

    mapId -- map number to search for (sent as txt_MapNo)
    year  -- tax year to query (sent as SelectYear)

    Returns whatever crifanLib.getUrlRespHtml() returns for the POST
    (presumably the response HTML).
    """
    # Captured form submission this mirrors:
    # SelectYear=2012&txt_Name=&txt_Street=&txt_MapNo=0230000400502&txt_Subdiv=&B1=Submit&txt_Voided_MApNo=&SelectSalesYear=ALL&txt_Sales_SheetNo=
    formFields = {
        'SelectYear'    : str(year),
        'txt_Name'      : "",
        'txt_Street'    : "",
        'txt_MapNo'     : str(mapId),
        'txt_Subdiv'    : "",
        'B1'            : "Submit",
        'txt_Voided_MApNo'  : "",
        'SelectSalesYear'   : "ALL",
        'txt_Sales_SheetNo' : "",
    }
    return crifanLib.getUrlRespHtml(
        "http://www.greenvillecounty.org/vrealpr24/clRealProp.ASP?WCI=tplRealSearch&WCE=Form1&WCU=",
        formFields)

def processEachMapId(mapId):
    searchRespHtml = getHtmlByMapIdAndYear(mapId, 2012);

    crifanLib.printCurrentCookies();

    commonInfoDict = extractCommonInfo(searchRespHtml);
    
    (yearsOutstanding, accountNo, amountOutstanding) = extractOutstandingInfo(searchRespHtml);

    # infoDict = {
        # 'MapID'     : "",
        # 'Owner1'    : "",
        # 'Owner2'    : "",
        # 'Acreage'   : "",
        # 'Mail Addr' : "",
        # 'Mail City' : "",
        # 'Mail Zip'  : "",
        # 'Desc'      : "",
        # 'Loc'       : "",
        # 'Deed Date' : "",
        # 'Sale Price': "",
        # 'LandUse'   : "",
        # 'Bath'      : "",
        # 'Bed'       : "",
        # 'Half Bath' : "",
        # 'SqFt'      : "",
        # 'Fair Market Val'   : "",
        # 'Tax Val'   : "",
        # 'RollBack'  : "",
        # 'Assmt Class'   : "",
        # '2012 Outstanding'  : "",
        # '2011 Outstanding'  : "",
        # '2010 Outstanding'  : "",
    # };
    
    # add for 2012
    commonInfoDict['2012 Outstanding'] = amountOutstanding;

    for eachYear in [2010, 2011]:
        print "eachYear=",eachYear;
        eachYearRespHtml = getHtmlByMapIdAndYear(mapId, eachYear);
        #print "eachYear=%d, eachYearRespHtml=%s"%(eachYear, eachYearRespHtml);

        (yearsOutstanding, accountNo, amountOutstanding) = extractOutstandingInfo(eachYearRespHtml);
        
        crifanLib.printCurrentCookies();

        commonInfoDict[str(eachYear)+' Outstanding'] = amountOutstanding;
    
    # finally add the mapId
    commonInfoDict["MapID"] = str(mapId);
    #commonInfoDict["MapID"] = mapId;
    
    print "commonInfoDict=",commonInfoDict;
    return commonInfoDict;

def downloadMap(mapId, loc):
    print "mapId=%s, loc=%s"%(mapId, loc);
    
    # 1. get cookie: ASP.NET_SessionId
    titleWfUrl = "http://www.gcgis.org/webmappub/titleWF.aspx";
    titleWfRespHtml = crifanLib.getUrlRespHtml(titleWfUrl);
    #print "titleWfRespHtml=",titleWfRespHtml;
    crifanLib.printCurrentCookies();
    #now got Cookie ASP.NET_SessionId=fbh0kcewbftsszzn3pvzqm45

    # 2. [11/77] to get __VIEWSTATE for later get pic url use
    postBackForGetPicUrl = "http://www.gcgis.org/webmappub/PostBack_WebForm.aspx";
    postBackForGetPicUrlRespHtml = crifanLib.getUrlRespHtml(postBackForGetPicUrl);
    
    #<input type="hidden" name="__EVENTVALIDATION" id="__EVENTVALIDATION" value="/wEWKgLzgKHkDAL9mLzBCQKh1vObCgLe35/qBgKW7cXfDgL5xd3gBQL0n+S1CQLZu8/YAgKm7eu0CAKF5em0CAKD8ZLwBgLi6JDwBgL5qq6zBgK2tK6zBgL5qqrTBgK2tKrTBgKD9sPKDQLd+ZO6BwLtmaqoDAKt+fHFBgKL9J2oCQKD39/ZBQLl9OXbAgLP9/udCwLP94+8AwLP96OyBQLP97eoBwLP9+vVBwLP9//LCQLFtoCeBQK/+qWjBAKco4j/BALOs5TJCALCspqrDAL2t82UCAKCjbz2BgKojfk8AqGk/dQLAp6frLMEAvHm8qAOAo3H37oJApS+ipsPn1bbnGxpyN+MEOuP9Zpsv+45KhQ=" />
    foundEventValidationForGetPic = re.search('<input\s*?type="hidden"\s*?name="__EVENTVALIDATION"\s*?id="__EVENTVALIDATION"\s*?value="(?P<eventValidationForGetPic>.+?)"\s*?/>', postBackForGetPicUrlRespHtml);
    print "foundEventValidationForGetPic=",foundEventValidationForGetPic;
    if(foundEventValidationForGetPic):
        eventValidationForGetPic = foundEventValidationForGetPic.group("eventValidationForGetPic");
        print "eventValidationForGetPic=",eventValidationForGetPic;

    #<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="/wEPDwUKLTY4MjA2ODAyNmRk95maMKre9V09JJjbi9r8vCJQ1m0=" />
    foundViewStateForGetPic = re.search('<input\s*?type="hidden"\s*?name="__VIEWSTATE"\s*?id="__VIEWSTATE"\s*?value="(?P<viewStateForGetPic>.+?)"\s*?/>', postBackForGetPicUrlRespHtml);
    print "foundViewStateForGetPic=",foundViewStateForGetPic;
    if(foundViewStateForGetPic):
        viewStateForGetPic = foundViewStateForGetPic.group("viewStateForGetPic");
        print "viewStateForGetPic=",viewStateForGetPic;

    # 3. [59/77] get pic url
    postBackUrl = "http://www.gcgis.org/webmappub/PostBack_WebForm.aspx";
    # __VIEWSTATE=%2FwEPDwUKLTY4MjA2ODAyNmRk95maMKre9V09JJjbi9r8vCJQ1m0%3D
    # __EVENTVALIDATION=%2FwEWKgLzgKHkDAL9mLzBCQKh1vObCgLe35%2FqBgKW7cXfDgL5xd3gBQL0n%2BS1CQLZu8%2FYAgKm7eu0CAKF5em0CAKD8ZLwBgLi6JDwBgL5qq6zBgK2tK6zBgL5qqrTBgK2tKrTBgKD9sPKDQLd%2BZO6BwLtmaqoDAKt%2BfHFBgKL9J2oCQKD39%2FZBQLl9OXbAgLP9%2FudCwLP94%2B8AwLP96OyBQLP97eoBwLP9%2BvVBwLP9%2F%2FLCQLFtoCeBQK%2F%2BqWjBAKco4j%2FBALOs5TJCALCspqrDAL2t82UCAKCjbz2BgKojfk8AqGk%2FdQLAp6frLMEAvHm8qAOAo3H37oJApS%2BipsPn1bbnGxpyN%2BMEOuP9Zpsv%2B45KhQ%3D
    # govUser_hid=false
    # validUser_hid=true
    # accountInUse_hid=
    # command_hidden=zoomToStartExtent
    # mapImageHeight_hidden=431
    # mapImageWidth_hidden=1088
    # mapImageSrc_hidden=
    # xMinMap_hidden=0
    # yMinMap_hidden=0
    # xMaxMap_hidden=0
    # yMaxMap_hidden=0
    # x1_hidden=0
    # y1_hidden=0
    # x2_hidden=0
    # y2_hidden=0
    # layerIds_hid=
    # layersVisible_hid=
    # activeLayerId_hid=
    # vmlLayerID_hid=0
    # vmlObjectID_hid=0
    # pointXyText_hid=
    # vmlFeatureClass_hid=
    # vmlGeometry1_hid=
    # vmlGeometry2_hid=
    # vmlGeometry3_hid=
    # vmlGeometry4_hid=
    # vmlGeometry5_hid=
    # vmlGeometry6_hid=
    # aPoly_hid=
    # aLine_hid=
    # aCircle_hid=
    # aPoint_hid=
    # aText_hid=
    # numSelect_hid=0
    # sQuery_hid=
    # addBuffer_hid=false
    # sBuffer_hid=0
    # resultLayerId_hid=
    # compsMapList_hid=undefined
    # theme_hid=
    # errMsg_hid=
    postData = {
        '__VIEWSTATE'       : viewStateForGetPic,
        '__EVENTVALIDATION' : eventValidationForGetPic, 
        'govUser_hid'       : "false",
        'validUser_hid'     : "true",
        'command_hidden'    : "zoomToStartExtent",
        'mapImageHeight_hidden' : "431",
        'mapImageWidth_hidden'  : "1088",
        
        'xMinMap_hidden'        : "0",
        'yMinMap_hidden'        : "0",
        'xMaxMap_hidden'        : "0",
        'yMaxMap_hidden'        : "0",
        
        'x1_hidden'             : "0",
        'y1_hidden'             : "0",
        'x2_hidden'             : "0",
        'y2_hidden'             : "0",
        
        'vmlLayerID_hid'        : "0",
        'vmlObjectID_hid'       : "0",
        'numSelect_hid'         : "0",
        'sBuffer_hid'           : "0",
    };
    postBackUrlRespHtml = crifanLib.getUrlRespHtml(postBackUrl, postData);
    print "postBackUrlRespHtml=",postBackUrlRespHtml;
    #resp html contain: 
    # <input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="/wEPDwUKLTY4MjA2ODAyNmRk95maMKre9V09JJjbi9r8vCJQ1m0=" />
    # <input type="hidden" name="__EVENTVALIDATION" id="__EVENTVALIDATION" value="/wEWKgLzgKHkDAL9mLzBCQKh1vObCgLe35/qBgKW7cXfDgL5xd3gBQL0n+S1CQLZu8/YAgKm7eu0CAKF5em0CAKD8ZLwBgLi6JDwBgL5qq6zBgK2tK6zBgL5qqrTBgK2tKrTBgKD9sPKDQLd+ZO6BwLtmaqoDAKt+fHFBgKL9J2oCQKD39/ZBQLl9OXbAgLP9/udCwLP94+8AwLP96OyBQLP97eoBwLP9+vVBwLP9//LCQLFtoCeBQK/+qWjBAKco4j/BALOs5TJCALCspqrDAL2t82UCAKCjbz2BgKojfk8AqGk/dQLAp6frLMEAvHm8qAOAo3H37oJApS+ipsPn1bbnGxpyN+MEOuP9Zpsv+45KhQ=" />
    # <input type="hidden" name="govUser_hid" id="govUser_hid" value="False" />
    # <input type="hidden" name="validUser_hid" id="validUser_hid" value="True" />
    # <input type="hidden" name="accountInUse_hid" id="accountInUse_hid" />
    # <input type="hidden" name="command_hidden" id="command_hidden" value="zoomToStartExtent" />
    # <input type="hidden" name="mapImageHeight_hidden" id="mapImageHeight_hidden" value="431" />
    # <input type="hidden" name="mapImageWidth_hidden" id="mapImageWidth_hidden" value="1088" />
    # <input type="hidden" name="mapImageSrc_hidden" id="mapImageSrc_hidden" value="http://www.gcgis.org/output/webmappub_zs-gisims130202756373.jpg" />
    # <input type="hidden" name="xMinMap_hidden" id="xMinMap_hidden" value="1210744.77958237" />
    # <input type="hidden" name="yMinMap_hidden" id="yMinMap_hidden" value="958500" />
    # <input type="hidden" name="xMaxMap_hidden" id="xMaxMap_hidden" value="1911255.22041763" />
    # <input type="hidden" name="yMaxMap_hidden" id="yMaxMap_hidden" value="1236000" />
    # <input type="hidden" name="x1_hidden" id="x1_hidden" value="0" />
    # <input type="hidden" name="y1_hidden" id="y1_hidden" value="0" />
    # <input type="hidden" name="x2_hidden" id="x2_hidden" value="0" />
    # <input type="hidden" name="y2_hidden" id="y2_hidden" value="0" />
    # <input type="hidden" name="layerIds_hid" id="layerIds_hid" />
    # <input type="hidden" name="layersVisible_hid" id="layersVisible_hid" />
    # <input type="hidden" name="activeLayerId_hid" id="activeLayerId_hid" value="25" />
    # <input type="hidden" name="vmlLayerID_hid" id="vmlLayerID_hid" value="0" />
    # <input type="hidden" name="vmlObjectID_hid" id="vmlObjectID_hid" value="0" />
    # <input type="hidden" name="pointXyText_hid" id="pointXyText_hid" />
    # <input type="hidden" name="vmlFeatureClass_hid" id="vmlFeatureClass_hid" />
    # <input type="hidden" name="vmlGeometry1_hid" id="vmlGeometry1_hid" />
    # <input type="hidden" name="vmlGeometry2_hid" id="vmlGeometry2_hid" />
    # <input type="hidden" name="vmlGeometry3_hid" id="vmlGeometry3_hid" />
    # <input type="hidden" name="vmlGeometry4_hid" id="vmlGeometry4_hid" />
    # <input type="hidden" name="vmlGeometry5_hid" id="vmlGeometry5_hid" />
    # <input type="hidden" name="vmlGeometry6_hid" id="vmlGeometry6_hid" />
    # <input type="hidden" name="aPoly_hid" id="aPoly_hid" />
    # <input type="hidden" name="aLine_hid" id="aLine_hid" />
    # <input type="hidden" name="aCircle_hid" id="aCircle_hid" />
    # <input type="hidden" name="aPoint_hid" id="aPoint_hid" />
    # <input type="hidden" name="aText_hid" id="aText_hid" />
    # <input type="hidden" name="numSelect_hid" id="numSelect_hid" value="0" />
    # <input type="hidden" name="sQuery_hid" id="sQuery_hid" />
    # <input type="hidden" name="addBuffer_hid" id="addBuffer_hid" value="false" />
    # <input type="hidden" name="sBuffer_hid" id="sBuffer_hid" value="0" />
    # <input type="hidden" name="resultLayerId_hid" id="resultLayerId_hid" />
    # <input type="hidden" name="compsMapList_hid" id="compsMapList_hid" value="undefined" />
    # <input type="hidden" name="theme_hid" id="theme_hid" />
    # <input type="hidden" name="errMsg_hid" id="errMsg_hid" />

    gisImgUrl = "";
    
    foundGisImgUrl = re.search('<input\s*?type="hidden"\s*?name="mapImageSrc_hidden"\s*?id="mapImageSrc_hidden"\s*?value="(?P<gisImgUrl>.+?)"\s*?/>', postBackUrlRespHtml);
    print "foundGisImgUrl=",foundGisImgUrl;
    if(foundGisImgUrl):
        gisImgUrl = foundGisImgUrl.group("gisImgUrl");
        print "gisImgUrl=",gisImgUrl;
        
        getImgName = gisImgUrl.split("/")[-1];
        print "getImgName=",getImgName;
        crifanLib.manuallyDownloadFile(gisImgUrl, getImgName);
    
    # 4. [61/77] find __EVENTVALIDATION and __VIEWSTATE
    findLocUrl = "http://www.gcgis.org/webmappub/find.aspx?govUser=false&validUser=true";
    getFindLocUrlRespHtml = crifanLib.getUrlRespHtml(findLocUrl);
    #<input type="hidden" name="__EVENTVALIDATION" id="__EVENTVALIDATION" value="/wEWCQKbucpfAum77Y8DAr66s9ELAv2YvMEJAqHW85sKAu2ZqqgMAoCf7Z8EAq358cUGAov0nagJEOHv/rftLPy2jxNqDJgxJaj/dyE=" />
    foundEventValidation = re.search('<input\s*?type="hidden"\s*?name="__EVENTVALIDATION"\s*?id="__EVENTVALIDATION"\s*?value="(?P<eventValidation>.+?)"\s*?/>', getFindLocUrlRespHtml);
    print "foundEventValidation=",foundEventValidation;
    if(foundEventValidation):
        eventValidation = foundEventValidation.group("eventValidation");
        print "eventValidation=",eventValidation;

    #<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="/wEPDwUKMTUzMjUxNjgxMg9kFgICAw9kFgICAQ9kFgJmD2QWAgIBDw8WAh4EVGV4dGRkZBgBBQ5maW5kX011bHRpVmlldw8PZGZkyFe0rX+rEa+u5GulCVJ7mrLSWG8=" />
    foundViewState = re.search('<input\s*?type="hidden"\s*?name="__VIEWSTATE"\s*?id="__VIEWSTATE"\s*?value="(?P<viewState>.+?)"\s*?/>', getFindLocUrlRespHtml);
    print "foundViewState=",foundViewState;
    if(foundViewState):
        viewState = foundViewState.group("viewState");
        print "viewState=",viewState;

    
    # 5. [66/77] do search
    # __EVENTTARGET=
    # __EVENTARGUMENT=
    # __VIEWSTATE=%2FwEPDwUKMTUzMjUxNjgxMg9kFgICAw9kFgICAQ9kFgJmD2QWAgIBDw8WAh4EVGV4dGRkZBgBBQ5maW5kX011bHRpVmlldw8PZGZkyFe0rX%2BrEa%2Bu5GulCVJ7mrLSWG8%3D
    # __EVENTVALIDATION=%2FwEWCQKbucpfAum77Y8DAr66s9ELAv2YvMEJAqHW85sKAu2ZqqgMAoCf7Z8EAq358cUGAov0nagJEOHv%2FrftLPy2jxNqDJgxJaj%2FdyE%3D
    # find_TextBox=707++GORDON+ST+EXT
    # find_Button=Search
    # govUser_hid=false
    # validUser_hid=true
    # activeLayerId_hid=25
    # findText_hid=
    # vmlLayerID_hid=
    # vmlObjectID_hid=
    
    #findTextBox = urllib.quote_plus(loc);
    findTextBox = loc;
    print "findTextBox=",findTextBox;
    postData = {
        #'__EVENTTARGET'     : "",
        #'__EVENTARGUMENT'   : "",
        '__VIEWSTATE'       : viewState, #%2FwEPDwUKMTUzMjUxNjgxMg9kFgICAw9kFgICAQ9kFgJmD2QWAgIBDw8WAh4EVGV4dGRkZBgBBQ5maW5kX011bHRpVmlldw8PZGZkyFe0rX%2BrEa%2Bu5GulCVJ7mrLSWG8%3D
        '__EVENTVALIDATION' : eventValidation, #%2FwEWCQKbucpfAum77Y8DAr66s9ELAv2YvMEJAqHW85sKAu2ZqqgMAoCf7Z8EAq358cUGAov0nagJEOHv%2FrftLPy2jxNqDJgxJaj%2FdyE%3D
        'find_TextBox'      : findTextBox, #707++GORDON+ST+EXT
        'find_Button'       : "Search",
        'govUser_hid'       : "false",
        'validUser_hid'     : "true",
        'activeLayerId_hid' : "25",
        #"findText_hid"      : "",
        #"vmlLayerID_hid"    : "",
        #"vmlObjectID_hid"   : "",
    };
    headerDict = {
        "Referer"   :   "http://www.gcgis.org/webmappub/find.aspx?govUser=false&validUser=true",
    };
    postFindLocUrlRespHtml = crifanLib.getUrlRespHtml(findLocUrl, postData, headerDict);
    print "postFindLocUrlRespHtml=",postFindLocUrlRespHtml;
    
    # [76/77] get real pic
    #postBackUrl = "http://www.gcgis.org/webmappub/PostBack_WebForm.aspx";
    # __VIEWSTATE=%2FwEPDwUKLTY4MjA2ODAyNmRk95maMKre9V09JJjbi9r8vCJQ1m0%3D
    # __EVENTVALIDATION=%2FwEWKgLzgKHkDAL9mLzBCQKh1vObCgLe35%2FqBgKW7cXfDgL5xd3gBQL0n%2BS1CQLZu8%2FYAgKm7eu0CAKF5em0CAKD8ZLwBgLi6JDwBgL5qq6zBgK2tK6zBgL5qqrTBgK2tKrTBgKD9sPKDQLd%2BZO6BwLtmaqoDAKt%2BfHFBgKL9J2oCQKD39%2FZBQLl9OXbAgLP9%2FudCwLP94%2B8AwLP96OyBQLP97eoBwLP9%2BvVBwLP9%2F%2FLCQLFtoCeBQK%2F%2BqWjBAKco4j%2FBALOs5TJCALCspqrDAL2t82UCAKCjbz2BgKojfk8AqGk%2FdQLAp6frLMEAvHm8qAOAo3H37oJApS%2BipsPn1bbnGxpyN%2BMEOuP9Zpsv%2B45KhQ%3D
    # govUser_hid=false
    # validUser_hid=true
    # accountInUse_hid=
    # command_hidden=zoomToFeature
    # mapImageHeight_hidden=431
    # mapImageWidth_hidden=1088
    # mapImageSrc_hidden=http%3A%2F%2Fwww.gcgis.org%2Foutput%2Fwebmappub_zs-gisims130202756373.jpg
    # xMinMap_hidden=1210744.77958237
    # yMinMap_hidden=958500
    # xMaxMap_hidden=1911255.22041763
    # yMaxMap_hidden=1236000
    # x1_hidden=2000
    # y1_hidden=19661
    # x2_hidden=0
    # y2_hidden=0
    # layerIds_hid=
    # layersVisible_hid=
    # activeLayerId_hid=25
    # vmlLayerID_hid=2000
    # vmlObjectID_hid=19661
    # pointXyText_hid=
    # vmlFeatureClass_hid=
    # vmlGeometry1_hid=
    # vmlGeometry2_hid=
    # vmlGeometry3_hid=
    # vmlGeometry4_hid=
    # vmlGeometry5_hid=
    # vmlGeometry6_hid=
    # aPoly_hid=
    # aLine_hid=
    # aCircle_hid=
    # aPoint_hid=
    # aText_hid=
    # numSelect_hid=0
    # sQuery_hid=
    # addBuffer_hid=false
    # sBuffer_hid=0
    # resultLayerId_hid=
    # compsMapList_hid=undefined
    # theme_hid=
    # errMsg_hid=
    postData = {
        '__VIEWSTATE'       : viewStateForGetPic,
        '__EVENTVALIDATION' : eventValidationForGetPic, 
        'govUser_hid'       : "false",
        'validUser_hid'     : "true",
        'command_hidden'    : "zoomToFeature",
        'mapImageHeight_hidden' : "431",
        'mapImageWidth_hidden'  : "1088",
        
        'mapImageSrc_hidden'    : gisImgUrl,
        
        'xMinMap_hidden'        : "1210744.77958237",
        'yMinMap_hidden'        : "958500",
        'xMaxMap_hidden'        : "1911255.22041763",
        'yMaxMap_hidden'        : "1236000",
        
        'x1_hidden'             : "2000",
        'y1_hidden'             : "19661",
        'x2_hidden'             : "0",
        'y2_hidden'             : "0",
        
        'activeLayerId_hid'     : "25",
        
        'vmlLayerID_hid'        : "2000",
        'vmlObjectID_hid'       : "19661",
        'numSelect_hid'         : "0",
        'sBuffer_hid'           : "0",
        
        'addBuffer_hid'         : "false",
        'compsMapList_hid'      : "undefined",
    };
    postBackUrlForGetPicRespHtml = crifanLib.getUrlRespHtml(postBackUrl, postData);
    print "postBackUrlForGetPicRespHtml=",postBackUrlForGetPicRespHtml;
    
    if(gisImgUrl):
        #http://www.gcgis.org/output/webmappub_zs-gisims130202756373.jpg
    
        #download second time, this time, this pic is what we real want
        getImgName = gisImgUrl.split("/")[-1];
        print "getImgName=",getImgName;
        #webmappub_zs-gisims130202756373.jpg
        realImgName = "real_" + getImgName;
        print "realImgName=",realImgName;
        crifanLib.manuallyDownloadFile(gisImgUrl, realImgName);
        print "Download real pic OK";
        ddddddddd
        
    return ;
    
    
def outputInfoDictList(allInfoDictList):

    #init output excel file
    excelFilename = "extractedRealPropertyInfo.xls";
    
    
    
    #https://groups.google.com/forum/?fromgroups=#!topic/python-excel/8kCUw2y8PrU
        
    # badBG = xlwt.Pattern();
    # badBG.SOLID_PATTERN = 0x34
    # badBG.NO_PATTERN = 0x34
    # badBG.pattern_fore_colour = 0x34
    # badBG.pattern_back_colour = 0x34

    # badFontStyle = xlwt.XFStyle()
    # badFontStyle.Pattern = badBG

    # sheet1.write(1,1,'hello world', badFontStyle) 
        
    
    #https://github.com/python-excel/xlwt/blob/master/xlwt/Cell.py
    #not find background color
    
    #https://github.com/python-excel/xlwt/blob/master/xlwt/Formatting.py
    #blueBackgroundPattern = xlwt.Pattern();
    #blueBackgroundPattern.pattern_back_colour = 0x34;
    #blueBackgroundPattern.SOLID_PATTERN = 0x34
    #blueBackgroundPattern.NO_PATTERN = 0x34
    #blueBackgroundPattern.pattern_fore_colour = "red"
    #blueBackgroundPattern.pattern_back_colour = "blue";
    
    
    
    # #https://groups.google.com/forum/?fromgroups=#!topic/python-excel/8kCUw2y8PrU
    # badBG = xlwt.Pattern()
    # badBG.pattern = badBG.SOLID_PATTERN
    # #badBG.pattern_fore_colour = 3
    # #badBG.pattern_fore_colour = "blue";
    # badBG.pattern_fore_colour = 3;
    # badFontStyle = xlwt.XFStyle()
    # badFontStyle.pattern = badBG;
    # styleBlueBkg = badFontStyle;
    
    #styleBlueBkg = xlwt.easyxf('font: color-index red, bold on');
    #styleBlueBkg = xlwt.easyxf('font: background-color-index red, bold on');
    #styleBlueBkg = xlwt.easyxf('pattern: pattern solid, fore_colour red;');
    #styleBlueBkg = xlwt.easyxf('pattern: pattern solid, fore_colour blue;');
    #styleBlueBkg = xlwt.easyxf('pattern: pattern solid, fore_colour light_blue;');
    #styleBlueBkg = xlwt.easyxf('pattern: pattern solid, fore_colour pale_blue;');
    #styleBlueBkg = xlwt.easyxf('pattern: pattern solid, fore_colour dark_blue;');
    #styleBlueBkg = xlwt.easyxf('pattern: pattern solid, fore_colour dark_blue_ega;');
    #styleBlueBkg = xlwt.easyxf('pattern: pattern solid, fore_colour ice_blue;');
    styleBlueBkg = xlwt.easyxf('pattern: pattern solid, fore_colour sky_blue;');

    #styleBlueBkg = xlwt.easyxf('pattern: pattern solid, fore_colour ocean_blue; font: bold on;'); # 80% like
    
    
    #blueBkgFontStyle = xlwt.XFStyle()
    #blueBkgFontStyle.Pattern = blueBackgroundPattern;
    #styleBlueBkg = blueBkgFontStyle;
    
    styleBold   = xlwt.easyxf('font: bold on');
    
    wb = xlwt.Workbook();
    ws = wb.add_sheet('realPropertyInfo');
    
    #write header
    # infoDict = {
        # Sequence
        # 'MapID'     : "",
        # 'Owner1'    : "",
        # 'Owner2'    : "",
        # 'Acreage'   : "",
        # 'Mail Addr' : "",
        # 'Mail City' : "",
        # 'Mail Zip'  : "",
        # 'Desc'      : "",
        # 'Loc'       : "",
        # 'Deed Date' : "",
        # 'Sale Price': "",
        # 'LandUse'   : "",
        # 'Bath'      : "",
        # 'Bed'       : "",
        # 'Half Bath' : "",
        # 'SqFt'      : "",
        # 'Fair Market Val'   : "",
        # 'Tax Val'   : "",
        # 'RollBack'  : "",
        # 'Assmt Class'   : "",
        # '2012 Outstanding'  : "",
        # '2011 Outstanding'  : "",
        # '2010 Outstanding'  : "",
    # };
    
    ws.write(0, 0, "Sequence",  styleBlueBkg);
    ws.write(0, 1, "MapID",     styleBlueBkg);

    ws.write(0, 2, "Owner1",    styleBold);
    ws.write(0, 3, "Owner2",    styleBold);
    ws.write(0, 4, "Acreage",   styleBold);
    ws.write(0, 5, "Mail Addr", styleBold);
    ws.write(0, 6, "Mail City", styleBold);
    ws.write(0, 7, "Mail Zip",  styleBold);
    ws.write(0, 8, "Desc",      styleBold);
    ws.write(0, 9, "Loc",       styleBold);
    ws.write(0, 10, "Deed Date",styleBold);
    ws.write(0, 11, "Sale Price", styleBold);
    ws.write(0, 12, "LandUse",  styleBold);
    ws.write(0, 13, "Bath",     styleBold);
    ws.write(0, 14, "Bed",      styleBold);
    ws.write(0, 15, "Half Bath",styleBold);
    ws.write(0, 16, "SqFt",     styleBold);
    ws.write(0, 17, "Fair Market Val",  styleBold);
    ws.write(0, 18, "Tax Val",          styleBold);
    ws.write(0, 19, "RollBack",         styleBold);
    ws.write(0, 20, "Assmt Class",      styleBold);
    ws.write(0, 21, "2012 Outstanding", styleBold);
    ws.write(0, 22, "2011 Outstanding", styleBold);
    ws.write(0, 23, "2010 Outstanding", styleBold);
    
    #output extracted info
    print "Outputing extracted info to excel file ",excelFilename;
    for index,eachInfoDict in enumerate(allInfoDictList):
        number = index + 1;
        numberStr = str(number);
        
        #eachInfoDict['Sequence'] = numberStr;
        #ws.write(number, 0, eachInfoDict['Sequence']);
        
        ws.write(number, 0, numberStr);
        
        mapId = eachInfoDict['MapID'];
        print "mapId=",mapId;
        ws.write(number, 1, mapId);
        
        ws.write(number, 2, eachInfoDict['Owner1']);
        ws.write(number, 3, eachInfoDict['Owner2']);
        ws.write(number, 4, eachInfoDict['Acreage']);
        ws.write(number, 5, eachInfoDict['Mail Addr']);
        ws.write(number, 6, eachInfoDict['Mail City']);
        ws.write(number, 7, eachInfoDict['Mail Zip']);
        ws.write(number, 8, eachInfoDict['Desc']);
        
        loc = eachInfoDict['Loc'];
        print "loc=",loc;
        ws.write(number, 9, loc);
        
        ws.write(number, 10, eachInfoDict['Deed Date']);
        ws.write(number, 11, eachInfoDict['Sale Price']);
        ws.write(number, 12, eachInfoDict['LandUse']);
        ws.write(number, 13, eachInfoDict['Bath']);
        ws.write(number, 14, eachInfoDict['Bed']);
        ws.write(number, 15, eachInfoDict['Half Bath']);
        ws.write(number, 16, eachInfoDict['SqFt']);
        ws.write(number, 17, eachInfoDict['Fair Market Val']);
        ws.write(number, 18, eachInfoDict['Tax Val']);
        ws.write(number, 19, eachInfoDict['RollBack']);
        ws.write(number, 20, eachInfoDict['Assmt Class']);
        ws.write(number, 21, eachInfoDict['2012 Outstanding']);
        ws.write(number, 22, eachInfoDict['2011 Outstanding']);
        ws.write(number, 23, eachInfoDict['2010 Outstanding']);
        
        #fetch map
        downloadMap(mapId, loc);

    wb.save(excelFilename);
    return;
    
def main():
    """Scrape the hard-coded map numbers and export the results.

    Enables crifanLib's automatic cookie handling, runs processEachMapId()
    for each map number in order, then hands the collected dicts to
    outputInfoDictList().
    """
    crifanLib.initAutoHandleCookies()

    mapIdList = [
        "0230000400502",
        "0230000300400",
        "0230000509400",
    ]
    allInfoDictList = [processEachMapId(eachMapId) for eachMapId in mapIdList]

    outputInfoDictList(allInfoDictList)
    
###############################################################################
if __name__=="__main__":
    # The log file sits next to the console output and is named after the
    # script; filemode 'w' starts it afresh on every run.
    logFilename = crifanLib.extractFilename(sys.argv[0]) + ".log"
    logging.basicConfig(
        level=logging.DEBUG,
        format='LINE %(lineno)-4d  %(levelname)-8s %(message)s',
        datefmt='%m-%d %H:%M',
        filename=logFilename,
        filemode='w')

    # Second handler: echo INFO and above to the console stream, using a
    # slightly simpler line format than the file.
    consoleHandler = logging.StreamHandler()
    consoleHandler.setLevel(logging.INFO)
    consoleHandler.setFormatter(
        logging.Formatter('LINE %(lineno)-4d : %(levelname)-8s %(message)s'))
    logging.getLogger('').addHandler(consoleHandler)

    try:
        main()
    except:
        # top-level boundary: record the traceback in the log, then re-raise
        # so the process still fails visibly
        logging.exception("Unknown Error !")
        raise

 

 

【总结】



发表评论

电子邮件地址不会被公开。 必填项已用*标注

无觅相关文章插件,快速提升流量