【代码分享】Python代码:scrape_chaosgroup_contact(Python 2.x版本和Python 3.x版本) – 抓取chaosgroup.com中的联系人信息保存为excel

【背景】

之前写的,前后共写了两个版本的:

Python 2.x版本

Python 3.x版本

去抓取

http://www.chaosgroup.com/

中联系人信息,并保存为excel文件

 

【scrape_chaosgroup_contact 代码分享】

1.截图:

(1)运行效果:

scrape_chaosgroup_contact_py2.py run ui

(2)保存为excel文件:

scrape_chaosgroup_contact save excel file

 

2.Python项目代码下载:

scrape_chaosgroup_contact_py2.7z

 

scrape_chaosgroup_contact_py3.7z

 

3.代码分享:

(1)Python 2.x版本的:scrape_chaosgroup_contact_py2.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------------------------------------
Collect all data from a webpage
https://www.elance.com/j/collect-all-data-from-webpage/34563264/

Version:    2012-10-25
Author:     Crifan Li
Contact:    http://www.crifan.com/about/me/

-------------------------------------------------------------------------------
"""

#---------------------------------import---------------------------------------
import os;
import re;
import sys;
sys.path.append("libs/crifan");
sys.path.append("libs/thirdparty");
import math;
import time;
import codecs;
import logging;
import urllib;
from datetime import datetime,timedelta;
from optparse import OptionParser;
from string import Template,replace;
import xml;
from xml.sax import saxutils;

import crifanLib;
from BeautifulSoup import BeautifulSoup,Tag,CData;

import xlwt;

#--------------------------------const values-----------------------------------
# Version tag of this script.
__VERSION__ = "v0.1"

# Constant values; nothing is defined here yet.
gConst = {}

#----------------------------------global values--------------------------------
# Global values; nothing is defined here yet.
gVal = {}

#--------------------------configurable values---------------------------------
# Configurable values; nothing is defined here yet.
gCfg = {}

#--------------------------functions--------------------------------------------


#------------------------------------------------------------------------------
def main():
    """Scrape the contact entries of the chaosgroup.com purchase page into an .xls file.

    Downloads the purchase page, collects every element whose class is
    "countryInfo" and extracts from each one:

    * country - text of the entry's <h3>
    * name    - text of the element with class "name"
    * phone / fax - parsed from the element with class "phone"
    * email, maxLink (link URL), vRay (link text) - parsed from the element
      with class "web"
    * address - parsed from the element with class "addr"

    All entries are then written, one row each below a header row, to the
    sheet "AllContactInfo" of the workbook "allExtractedWebsiteData.xls"
    (relative path).

    Parsing is strict: when an expected element or pattern is missing, the
    problem is logged and the script terminates through sys.exit(2).
    """
    mainUrl = "http://www.chaosgroup.com/en/2/purchase.html?g=0&pID=1"

    # Raw strings hand the backslashes (\s, \#, \|) to the regex engine
    # untouched; the patterns are compiled once and reused for every entry.
    #
    # Phone markup seen on the page:
    # case 1:
    #   <p class="phone"><strong>phone:</strong>&nbsp;800.206.7886<br />
    #   <strong>fax:</strong>&nbsp;503-295-6533</p>
    # case 2:
    #   <p class="phone"><strong>phone:</strong>&nbsp; +1 800 854 4496 or outside US +1 407 833 0600<br />
    #   <strong>fax:</strong>&nbsp;+1 813 283 4906
    #   </p>
    # case 3:
    #   <p class="phone"><strong>phone:</strong>&nbsp;604 682 6639 x105 <br /><strong>phone:</strong>  toll-free 1 800 682 6639 x105<br />
    #   <strong>fax:</strong>&nbsp;</p>
    phoneFaxPattern = re.compile(
        r"<strong>phone:</strong>&nbsp;(?P<phone>.+)<br />\s*?<strong>fax:</strong>&nbsp;(?P<fax>.*)</p>",
        re.S)

    # Web markup seen on the page:
    #   <p class="web"><strong>e-mail:</strong>&nbsp;<a href="#">sales@cinesysinc.com</a><br />
    #   <strong>V-Ray|Max link:</strong>&nbsp;<a target="_blank" href="http://www.cinesysinc.com/page3/page20/page20.html">Cinesys</a>
    #   </p>
    emailInfoPattern = re.compile(
        r'<strong>e-mail:</strong>&nbsp;<a href="\#">(?P<email>.+)</a><br />\s*<strong>V-Ray\|Max link:</strong>&nbsp;<a target="_blank" href="(?P<maxLink>.+)">(?P<vRay>.+)</a>')

    # Address markup seen on the page:
    #   <p class="addr">
    #       <strong>address:</strong>&nbsp;740 SW 21st Ave, Suite #310<br />
    #       Portland 97205 Oregon;<br />
    #       USA				</p>
    addressPattern = re.compile(
        r'<p class="addr">\s*<strong>address:</strong>&nbsp;(?P<address>.+)</p>',
        re.S)

    logging.debug("mainUrl=%s", mainUrl)
    respHtml = crifanLib.getUrlRespHtml(mainUrl)
    logging.debug("respHtml=%s", respHtml)
    soup = BeautifulSoup(respHtml)
    foundAllItems = soup.findAll(attrs={"class": "countryInfo"})
    logging.debug("foundAllItems=%s", foundAllItems)
    logging.info("Total found %d contact info", len(foundAllItems))

    allItemsDictList = []
    for i, eachItemSoup in enumerate(foundAllItems):
        itemDict = {
            'country': "",
            'name': "",
            'phone': "",
            'fax': "",
            'email': "",
            'vRay': "",
            'maxLink': "",
            'address': "",
        }

        itemDict['country'] = eachItemSoup.h3.string
        logging.debug("itemDict['country']=%s", itemDict['country'])

        # --- name ---
        foundName = eachItemSoup.find(attrs={"class": "name"})
        if not foundName:
            logging.error("Can not find name")
            sys.exit(2)
        itemDict['name'] = foundName.string
        logging.debug("itemDict['name']=%s", itemDict['name'])

        # --- phone and fax ---
        foundPhone = eachItemSoup.find(attrs={"class": "phone"})
        logging.debug("foundPhone=%s", foundPhone)
        if not foundPhone:
            logging.error("Can not find phone")
            sys.exit(2)
        foundPhoneUni = unicode(foundPhone)
        logging.debug("foundPhoneUni=%s", foundPhoneUni)
        foundPhoneFax = phoneFaxPattern.search(foundPhoneUni)
        logging.debug("foundPhoneFax=%s", foundPhoneFax)
        if not foundPhoneFax:
            logging.error("Can not find phone and fax")
            sys.exit(2)
        itemDict['phone'] = foundPhoneFax.group("phone").strip()
        itemDict['fax'] = foundPhoneFax.group("fax").strip()
        logging.debug("phone=%s,fax=%s", itemDict['phone'], itemDict['fax'])

        # --- e-mail and V-Ray|Max link ---
        foundWeb = eachItemSoup.find(attrs={"class": "web"})
        logging.debug("foundWeb=%s", foundWeb)
        if not foundWeb:
            logging.error("Can not find web")
            sys.exit(2)
        foundWebUni = unicode(foundWeb)
        logging.debug("foundWebUni=%s", foundWebUni)
        foundEmailInfo = emailInfoPattern.search(foundWebUni)
        logging.debug("foundEmailInfo=%s", foundEmailInfo)
        if not foundEmailInfo:
            logging.error("Can not find email info")
            sys.exit(2)
        itemDict['email'] = foundEmailInfo.group("email").strip()
        itemDict['maxLink'] = foundEmailInfo.group("maxLink").strip()
        itemDict['vRay'] = foundEmailInfo.group("vRay").strip()
        logging.debug("email=%s,maxLink=%s,vRay=%s", itemDict['email'], itemDict['maxLink'], itemDict['vRay'])

        # --- address ---
        foundAddr = eachItemSoup.find(attrs={"class": "addr"})
        logging.debug("foundAddr=%s", foundAddr)
        if not foundAddr:
            logging.error("Can not find addr")
            sys.exit(2)
        foundAddress = addressPattern.search(unicode(foundAddr))
        if not foundAddress:
            logging.error("Can not find address")
            sys.exit(2)
        # Drop the <br /> separators between the address lines.
        itemDict['address'] = foundAddress.group("address").replace("<br />", "").strip()
        logging.debug("address=%s", itemDict['address'])

        logging.debug("----------------- parse [%d] OK: %s", i, itemDict)
        # i is a zero-based index; report the number of entries done so far.
        logging.info("Successfully processed %d contact info", i + 1)
        allItemsDictList.append(itemDict)

    # Output into excel: (header title, itemDict key) for each column, in order.
    columns = (
        ("Country", 'country'),
        ("Name", 'name'),
        ("Phone", 'phone'),
        ("Fax", 'fax'),
        ("Email", 'email'),
        ("Vray", 'vRay'),
        ("MaxLink", 'maxLink'),
        ("Address", 'address'),
    )
    styleBoldRed = xlwt.easyxf('font: name Times New Roman, color-index red, bold on')

    wb = xlwt.Workbook()
    ws = wb.add_sheet('AllContactInfo')

    # Row 0 is the header; data rows start at row 1.
    for col, (title, _) in enumerate(columns):
        ws.write(0, col, title, styleBoldRed)
    for idx, eachItemDict in enumerate(allItemsDictList):
        for col, (_, key) in enumerate(columns):
            ws.write(idx + 1, col, eachItemDict[key])

    excelFilename = "allExtractedWebsiteData.xls"
    logging.info("Now save all data info excel file: %s", excelFilename)
    wb.save(excelFilename)

###############################################################################
if __name__ == "__main__":
    # Base name for the log file, derived from the script path in argv[0].
    scriptSelfName = crifanLib.extractFilename(sys.argv[0])

    # Root logger: DEBUG and above go to "<scriptSelfName>.log"; filemode 'w'
    # starts a fresh file on every run.
    logging.basicConfig(
        level=logging.DEBUG,
        format='LINE %(lineno)-4d  %(levelname)-8s %(message)s',
        datefmt='%m-%d %H:%M',
        filename=scriptSelfName + ".log",
        filemode='w')

    # Additional handler that echoes INFO and above to the console
    # (StreamHandler defaults to sys.stderr), using its own format.
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    console.setFormatter(logging.Formatter('LINE %(lineno)-4d : %(levelname)-8s %(message)s'))
    logging.getLogger('').addHandler(console)

    try:
        main()
    except Exception:
        # Only genuine errors are reported here. main() has already logged
        # the reason before each sys.exit(2), and SystemExit is not an
        # Exception subclass, so it passes through without being logged
        # again as "Unknown Error !".
        logging.exception("Unknown Error !")
        raise

(2)Python 3.x版本的:scrape_chaosgroup_contact_py3.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------------------------------------
[Function]
Collect all data from a webpage
https://www.elance.com/j/collect-all-data-from-webpage/34563264/

Version:    2012-10-25
Author:     Crifan Li
Contact:    http://www.crifan.com/about/me/

[NOTE]
This script is for Python 3.x
before you can use this script, should do:

1.install bs4(BeautifulSoup version 4)
http://www.crummy.com/software/BeautifulSoup/bs4/download/beautifulsoup4-4.1.3.tar.gz
->
setup.py install

2. install xlwt3
http://pypi.python.org/pypi/xlwt3/0.1.0
->
http://pypi.python.org/packages/source/x/xlwt3/xlwt3-0.1.0.tar.gz
->
setup.py install

3. modify installed xlwt3
after install, change 
Python32\Lib\site-packages\
->
xlwt3\BIFFRecords.py
->
WriteAccessRecord -> __init__
from :
        self._rec_data = pack('%ds%ds' % (uowner_len, 0x70 - uowner_len),
                              uowner, b' '*(0x70 - uowner_len)) # (to_py3): added b'...'
to:
        self._rec_data = pack('%ds%ds' % (uowner_len, 0x70 - uowner_len),
                              uowner.encode("utf-8"), b' '*(0x70 - uowner_len)) # (to_py3): added b'...'

-------------------------------------------------------------------------------
"""

#---------------------------------import---------------------------------------
import os;
import re;
import sys;
sys.path.append("libs/crifan");
sys.path.append("libs/thirdparty");
import math;
import time;
import codecs;
import logging;
import urllib.request, urllib.parse, urllib.error;
from datetime import datetime,timedelta;
from optparse import OptionParser;
import xml;
from xml.sax import saxutils;

import crifanLib;
#from BeautifulSoup import BeautifulSoup,Tag,CData;
from bs4 import BeautifulSoup,Tag,CData;

#import xlwt;
import xlwt3 as xlwt;

#--------------------------------const values-----------------------------------
# Version tag of this script.
__VERSION__ = "v0.1"

# Constant values; nothing is defined here yet.
gConst = {}

#----------------------------------global values--------------------------------
# Global values; nothing is defined here yet.
gVal = {}

#--------------------------configurable values---------------------------------
# Configurable values; nothing is defined here yet.
gCfg = {}

#------------------------------------------------------------------------------
def main():
    """Scrape the contact entries of the chaosgroup.com purchase page into an .xls file.

    Downloads the purchase page, collects every element whose class is
    "countryInfo" and extracts from each one:

    * country - text of the entry's <h3>
    * name    - text of the element with class "name"
    * phone / fax - parsed from the element with class "phone"
    * email, maxLink (link URL), vRay (link text) - parsed from the element
      with class "web"
    * address - parsed from the element with class "addr"

    All entries are then written, one row each below a header row, to the
    sheet "AllContactInfo" of the workbook "allExtractedWebsiteData.xls"
    (relative path).

    Parsing is strict: when an expected element or pattern is missing, the
    problem is logged and the script terminates through sys.exit(2).
    """
    mainUrl = "http://www.chaosgroup.com/en/2/purchase.html?g=0&pID=1"

    # The patterns below are matched against str(tag) as produced by bs4,
    # which differs from the raw page markup. In the captured samples the
    # &nbsp; entity shows up as a no-break space character, "<br />" as
    # "<br/>", and the <a> attributes as href first, target second. That is
    # why the patterns do not mention &nbsp; and accept "<br\s*/>".
    # Raw strings hand the backslashes (\s, \#, \|) to the regex engine
    # untouched; the patterns are compiled once and reused for every entry.
    #
    # Captured sample (UTF-8 encoded):
    #   b'<p class="phone"><strong>phone:</strong>\xc2\xa0800.206.7886<br/>\n<strong>fax:</strong>\xc2\xa0503-295-6533</p>'
    phoneFaxPattern = re.compile(
        r"<strong>phone:</strong>(?P<phone>.+)<br\s*/>\s*?<strong>fax:</strong>(?P<fax>.*)</p>",
        re.S)

    # Captured sample (UTF-8 encoded):
    #   b'<p class="web"><strong>e-mail:</strong>\xc2\xa0<a href="#">info@3dv.com</a><br/>\n<strong>V-Ray|Max link:</strong>\xc2\xa0<a href="http://www.3dv.com/#/Rendering_Solutions/Chaos_Group/VRay/" target="_blank">3DV Corporation</a>\n</p>'
    emailInfoPattern = re.compile(
        r'<strong>e-mail:</strong>.*?<a href="\#">(?P<email>.+)</a><br\s*/>\s*<strong>V-Ray\|Max link:</strong>.*?<a href="(?P<maxLink>.+)" target="_blank">(?P<vRay>.+)</a>')

    # Captured sample (UTF-8 encoded):
    #   b'<p class="addr">\n<strong>address:</strong>\xc2\xa0Kiacheli, 26<br/>\r\n\t\t\t\t\tTbilisi 0108 ;<br/>\r\n\t\t\t\t\tGeorgia\t\t\t\t</p>'
    addressPattern = re.compile(
        r'<p class="addr">\s*<strong>address:</strong>(?P<address>.+)</p>',
        re.S)
    lineBreakPattern = re.compile(r"<br\s*/>")

    logging.debug("mainUrl=%s", mainUrl)
    respHtml = crifanLib.getUrlRespHtml(mainUrl)
    soup = BeautifulSoup(respHtml, from_encoding="UTF-8")
    foundAllItems = soup.findAll(attrs={"class": "countryInfo"})
    logging.info("Total found %d contact info", len(foundAllItems))

    allItemsDictList = []
    for i, eachItemSoup in enumerate(foundAllItems):
        itemDict = {
            'country': "",
            'name': "",
            'phone': "",
            'fax': "",
            'email': "",
            'vRay': "",
            'maxLink': "",
            'address': "",
        }

        itemDict['country'] = eachItemSoup.h3.string

        # --- name ---
        foundName = eachItemSoup.find(attrs={"class": "name"})
        if not foundName:
            logging.error("Can not find name")
            sys.exit(2)
        itemDict['name'] = foundName.string

        # --- phone and fax ---
        foundPhone = eachItemSoup.find(attrs={"class": "phone"})
        if not foundPhone:
            logging.error("Can not find phone")
            sys.exit(2)
        foundPhoneFax = phoneFaxPattern.search(str(foundPhone))
        if not foundPhoneFax:
            logging.error("Can not find phone and fax")
            sys.exit(2)
        itemDict['phone'] = foundPhoneFax.group("phone").strip()
        itemDict['fax'] = foundPhoneFax.group("fax").strip()

        # --- e-mail and V-Ray|Max link ---
        foundWeb = eachItemSoup.find(attrs={"class": "web"})
        if not foundWeb:
            logging.error("Can not find web")
            sys.exit(2)
        foundEmailInfo = emailInfoPattern.search(str(foundWeb))
        if not foundEmailInfo:
            logging.error("Can not find email info")
            sys.exit(2)
        itemDict['email'] = foundEmailInfo.group("email").strip()
        itemDict['maxLink'] = foundEmailInfo.group("maxLink").strip()
        itemDict['vRay'] = foundEmailInfo.group("vRay").strip()

        # --- address ---
        foundAddr = eachItemSoup.find(attrs={"class": "addr"})
        if not foundAddr:
            logging.error("Can not find addr")
            sys.exit(2)
        foundAddress = addressPattern.search(str(foundAddr))
        if not foundAddress:
            logging.error("Can not find address")
            sys.exit(2)
        # Drop the <br/> separators between the address lines.
        itemDict['address'] = lineBreakPattern.sub("", foundAddress.group("address")).strip()

        # i is a zero-based index; report the number of entries done so far.
        logging.info("Successfully processed %d contact info", i + 1)
        allItemsDictList.append(itemDict)

    # Output into excel: (header title, itemDict key) for each column, in order.
    columns = (
        ("Country", 'country'),
        ("Name", 'name'),
        ("Phone", 'phone'),
        ("Fax", 'fax'),
        ("Email", 'email'),
        ("Vray", 'vRay'),
        ("MaxLink", 'maxLink'),
        ("Address", 'address'),
    )
    styleBoldRed = xlwt.easyxf('font: name Times New Roman, color-index red, bold on')

    wb = xlwt.Workbook()
    ws = wb.add_sheet('AllContactInfo')

    # Row 0 is the header; data rows start at row 1.
    for col, (title, _) in enumerate(columns):
        ws.write(0, col, title, styleBoldRed)
    for idx, eachItemDict in enumerate(allItemsDictList):
        for col, (_, key) in enumerate(columns):
            ws.write(idx + 1, col, eachItemDict[key])

    excelFilename = "allExtractedWebsiteData.xls"
    logging.info("Now save all data info excel file: %s", excelFilename)
    wb.save(excelFilename)

###############################################################################
if __name__ == "__main__":
    # Base name for the log file, derived from the script path in argv[0].
    scriptSelfName = crifanLib.extractFilename(sys.argv[0])

    # Root logger: DEBUG and above go to "<scriptSelfName>.log"; filemode 'w'
    # starts a fresh file on every run.
    logging.basicConfig(
        level=logging.DEBUG,
        format='LINE %(lineno)-4d  %(levelname)-8s %(message)s',
        datefmt='%m-%d %H:%M',
        filename=scriptSelfName + ".log",
        filemode='w')

    # Additional handler that echoes INFO and above to the console
    # (StreamHandler defaults to sys.stderr), using its own format.
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    console.setFormatter(logging.Formatter('LINE %(lineno)-4d : %(levelname)-8s %(message)s'))
    logging.getLogger('').addHandler(console)

    try:
        main()
    except Exception:
        # Only genuine errors are reported here. main() has already logged
        # the reason before each sys.exit(2), and SystemExit is not an
        # Exception subclass, so it passes through without being logged
        # again as "Unknown Error !".
        logging.exception("Unknown Error !")
        raise

 

【总结】



发表评论

电子邮件地址不会被公开。 必填项已用*标注

无觅相关文章插件,快速提升流量