【代码分享】Python代码:scrape_html_to_json – 从本地html中抓取信息导出为各种形式的json字符串

【背景】

之前写的,去处理本地已有的一个html文件,

然后把提取出来的信息,导出为各种形式的json字符串。

 

【scrape_html_to_json代码分享】

1.截图:

(1)运行效果:

scrape_html_to_json.py run ui

(2)输出的各种json字符串:

A。无格式化,无缩进:

[{"yearMonth": {"month": {"string": "November", "value": "11"}, "year": {"string": "2012", "value": "2012"}}, "reservedMonthList": ["2", "3", "8", "9", "10", "11", "12", "13", "17", "18", "19", "20", "21", "22", "23"]}, {"yearMonth": {"month": {"string": "December", "value": "12"}, "year": {"string": "2012", "value": "2012"}}, "reservedMonthList": ["7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "21", "22", "23", "24", "25", "26", "27", "28", "30", "31"]}]

B。普通的:

$calendar = {"listing_id1":{"1":
    {"start_date": 2/11/2012,
    "end_date": 3/11/2012,
    "status": reserved
    },"2":
    {"start_date": 8/11/2012,
    "end_date": 13/11/2012,
    "status": reserved
    },"3":
    {"start_date": 17/11/2012,
    "end_date": 23/11/2012,
    "status": reserved
    },},

"listing_id2":{"1":
    {"start_date": 7/12/2012,
    "end_date": 16/12/2012,
    "status": reserved
    },"2":
    {"start_date": 21/12/2012,
    "end_date": 28/12/2012,
    "status": reserved
    },"3":
    {"start_date": 30/12/2012,
    "end_date": 31/12/2012,
    "status": reserved
    },},

"listing_id3":{"1":
    {"start_date": 1/1/2013,
    "end_date": 10/1/2013,
    "status": reserved
    },},

"listing_id4":{"1":
    {"start_date": 1/2/2013,
    "end_date": 27/2/2013,
    "status": reserved
    },},

"listing_id5":{},

"listing_id6":{"1":
    {"start_date": 2/4/2013,
    "end_date": 30/4/2013,
    "status": reserved
    },},

"listing_id7":{"1":
    {"start_date": 1/5/2013,
    "end_date": 31/5/2013,
    "status": reserved
    },},

"listing_id8":{"1":
    {"start_date": 1/6/2013,
    "end_date": 30/6/2013,
    "status": reserved
    },},

"listing_id9":{"1":
    {"start_date": 1/7/2013,
    "end_date": 31/7/2013,
    "status": reserved
    },},

"listing_id10":{"1":
    {"start_date": 1/8/2013,
    "end_date": 31/8/2013,
    "status": reserved
    },},

"listing_id11":{"1":
    {"start_date": 1/9/2013,
    "end_date": 30/9/2013,
    "status": reserved
    },},

"listing_id12":{"1":
    {"start_date": 1/10/2013,
    "end_date": 31/10/2013,
    "status": reserved
    },},

}

C。带缩进的格式化的json:

[
 {
  "yearMonth": {
   "month": {
    "string": "November", 
    "value": "11"
   }, 
   "year": {
    "string": "2012", 
    "value": "2012"
   }
  }, 
  "reservedMonthList": [
   "2", 
   "3", 
   "8", 
   "9", 
   "10", 
   "11", 
   "12", 
   "13", 
   "17", 
   "18", 
   "19", 
   "20", 
   "21", 
   "22", 
   "23"
  ]
 }, 
 {
  "yearMonth": {
   "month": {
    "string": "December", 
    "value": "12"
   }, 
   "year": {
    "string": "2012", 
    "value": "2012"
   }
  }, 
  "reservedMonthList": [
   "7", 
   "8", 
   "9", 
   "10", 
   "11", 
   "12", 
   "13", 
   "14", 
   "15", 
   "16", 
   "21", 
   "22", 
   "23", 
   "24", 
   "25", 
   "26", 
   "27", 
   "28", 
   "30", 
   "31"
  ]
 }
]

 

注:以上内容不全部相同。只是为了显示效果。

 

2.Python项目代码下载:

scrape_html_to_json_2012-11-08.7z

 

3.代码分享:

(1)scrape_html_to_json.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------------------------------------
Function:
Web Scraper
https://www.elance.com/j/web-scraper/35025238/

Version:    2012-11-08
Author:     Crifan Li
Contact:    http://www.crifan.com/about/me/

-------------------------------------------------------------------------------
"""

#---------------------------------import---------------------------------------
import re;
import sys;
sys.path.append("libs");
#import urllib;
import codecs;
from string import Template,replace;
import json;

from BeautifulSoup import BeautifulSoup,Tag,CData;

#------------------------------------------------------------------------------
# from ['2','3','8','9','10','11','12','13','17','18','19','20','21','22','23']
# to 
#[
#{
#'startDay': 2,
#'endDay'  : 3,
#},
#{
#'startDay': 8,
#'endDay'  : 13,
#},
#{
#'startDay': 17,
#'endDay'  : 23,
#},
#]
def generateDurationDictList(monthList):
    """Group reserved day numbers into runs of consecutive days.

    Args:
        monthList: list of day-of-month values, each convertible with int()
            (e.g. ['2', '3', '8', '9']). May be empty or None.

    Returns:
        A list of dicts {'startDay': int, 'endDay': int}, one per run of
        consecutive days, ordered by startDay. An isolated day yields a run
        whose startDay equals its endDay. Empty/None input yields [].

    Raises:
        ValueError / TypeError: propagated from int() when an item is not a
            valid integer value.
    """
    durationMonthDictList = []
    if not monthList:
        return durationMonthDictList

    # Sort numerically and drop duplicates, so that a repeated day can neither
    # split a run nor emit the same run twice.
    dayIntList = sorted(set(int(eachDayStr) for eachDayStr in monthList))

    curStartDay = dayIntList[0]
    curEndDay = curStartDay
    for currentDay in dayIntList[1:]:
        if currentDay == curEndDay + 1:
            # still inside the current run: just extend it
            curEndDay = currentDay
            continue
        # gap found: close the current run and open a new one
        durationMonthDictList.append({
            'startDay': curStartDay,
            'endDay': curEndDay,
        })
        curStartDay = currentDay
        curEndDay = currentDay

    # Always flush the run that is still open. The previous implementation only
    # did so when the last day extended a run, which silently dropped a
    # trailing isolated day (and the only day of a one-element list).
    durationMonthDictList.append({
        'startDay': curStartDay,
        'endDay': curEndDay,
    })
    return durationMonthDictList

#------------------------------------------------------------------------------
def generateOutputCalendar(MonthDictList):
    """Render the scraped months as the custom "$calendar = {...}" text.

    Each month becomes a "listing_idN" entry (N counts from 1) holding one
    numbered entry per reserved duration, as computed by
    generateDurationDictList from the month's 'reservedMonthList'.

    Note that the produced text is not strict JSON: the date and status
    values are emitted unquoted and every entry is followed by a comma.

    Args:
        MonthDictList: list of month dicts carrying 'yearMonth' (with
            ['month']['value'] and ['year']['value']) and 'reservedMonthList'.

    Returns:
        The whole calendar text as a single string.
    """
    # one template serves every duration entry
    durationTemplate = Template(""""${number}":
    {"start_date": ${startDay}/${startMonth}/${startYear},
    "end_date": ${endDay}/${endMonth}/${endYear},
    "status": reserved
    },""")

    monthChunks = []
    for listingNo, monthInfo in enumerate(MonthDictList, 1):
        reservedDurations = generateDurationDictList(monthInfo['reservedMonthList'])

        durationChunks = []
        for durationNo, duration in enumerate(reservedDurations, 1):
            # a duration never crosses a month boundary, so both ends share
            # the month and year of the month being processed
            durationChunks.append(durationTemplate.substitute(
                number=durationNo,
                startDay=duration['startDay'],
                startMonth=monthInfo['yearMonth']['month']['value'],
                startYear=monthInfo['yearMonth']['year']['value'],
                endDay=duration['endDay'],
                endMonth=monthInfo['yearMonth']['month']['value'],
                endYear=monthInfo['yearMonth']['year']['value'],
            ))

        monthChunks.append(
            '"listing_id' + str(listingNo) + '":{'
            + "".join(durationChunks)
            + "},"
            + "\r\n\r\n"
        )

    return "$calendar = {" + "".join(monthChunks) + "}"
#------------------------------------------------------------------------------
def generateOutputCalendarJsonNoIndent(MonthDictList):
    """Return MonthDictList serialized by json.dumps with its default
    (single-line, no indent) formatting."""
    return json.dumps(MonthDictList)


#------------------------------------------------------------------------------
def main():
    testEntryUrl = "http://testingsite.com/CalendarViewPublic.asp?HouseID=39";
    foundSingleAttrFromUrl = re.search("http:.+?\?(?P<singleAttr>\w+)=.*?", testEntryUrl);
    #print "foundSingleAttrFromUrl=",foundSingleAttrFromUrl;
    if(foundSingleAttrFromUrl):
        singleAttr = foundSingleAttrFromUrl.group("singleAttr");
        print "Extract singleAttr=%s from testEntryUrl=%s"%(singleAttr, testEntryUrl);
        
    testFilename = "testfiles/test_scrape.htm";
    htmlFile = codecs.open(testFilename, 'r', "UTF-8");
    #print "htmlFile=",htmlFile;
    testHtml = htmlFile.read();
    #print "testHtml=",testHtml;
    soup = BeautifulSoup(testHtml);
    
    #<table border="0" cellpadding="2" cellspacing="0" class="text" width="100%">
    foundAllMonthHeader = soup.findAll(name="table", attrs={"class":"text"});
    #print "foundAllMonthHeader=",foundAllMonthHeader;
    monthHeaderLen = len(foundAllMonthHeader);
    #print "monthHeaderLen=",monthHeaderLen;
    #<table border="1" class="CalendarCellActive" cellpadding="2" cellspacing="0" style=" border: 1px solid navy; table-layout:fixed" width="100%">
    foundAllMonthContent = soup.findAll(name="table", attrs={"class":"CalendarCellActive"});
    #print "foundAllMonthContent=",foundAllMonthContent;
    monthContentLen = len(foundAllMonthContent);
    #print "monthContentLen=",monthContentLen;
    print "Total found %d month's info of reserved days"%(monthContentLen);
    
    MonthDictList = [];
    for i,eachMonthHeader in enumerate(foundAllMonthHeader):
        singleMonthDict = {
            'yearMonth' :{
                'year'  : {
                    'value' : "",
                    'string': "",
                },
                'month' : {
                    'value' : "",
                    'string': "",
                },
            },
            'reservedMonthList':[], # each one is singel string of month
        };
        
        
        #Note:
        #here, actually, the simplest method to extract the year and month label is:
        #just find two label, then consider the first is month and second is year
        
        # foundTwoLabel = eachMonthHeader.findAll("label");
        # print "foundTwoLabel=",foundTwoLabel;
        # monthLabel = foundTwoLabel[0];
        # yearLabel = foundTwoLabel[1];
        # monthStr = monthLabel.string;
        # yearStr = yearLabel.string;
        # print "monthStr=",monthStr; # monthStr= November
        # print "yearStr=",yearStr; # yearStr= 2012
        
        # but that kind of method is not safe and robust
        #so use following code
        
        # <td style="padding-left:0" width="60%"><label>November</label>
        # <input type="Hidden" id="cboMonth1" name="cboMonth1" value="11">
        # </td><td style="padding-right:0;" width="40%">
            # <label>2012</label>
            # <input type="Hidden" id="cboYear1" name="cboYear1" value="2012">
        # </td>
        foundCboMonth = eachMonthHeader.find("input", {"id":re.compile("cboMonth\d+")});
        #print "foundCboMonth=",foundCboMonth;
        monthValue = foundCboMonth['value'];
        #print "monthValue=",monthValue;
        tdMonth = foundCboMonth.parent;
        #print "tdMonth=",tdMonth;
        tdMonthLabel = tdMonth.label;
        #print "tdMonthLabel=",tdMonthLabel;
        monthStr = tdMonthLabel.string;
        #print "monthStr=",monthStr;
        
        foundCboYear = eachMonthHeader.find("input", {"id":re.compile("cboYear\d+")});
        #print "foundCboYear=",foundCboYear;
        yearValue = foundCboYear['value'];
        #print "yearValue=",yearValue;
        tdYear = foundCboYear.parent;
        #print "tdYear=",tdYear;
        tdYearLabel = tdYear.label;
        #print "tdYearLabel=",tdYearLabel;
        yearStr = tdYearLabel.string;
        #print "yearStr=",yearStr;
        
        singleMonthDict['yearMonth']['month']['string'] = monthStr;
        singleMonthDict['yearMonth']['month']['value'] = monthValue;
        singleMonthDict['yearMonth']['year']['string'] = yearStr;
        singleMonthDict['yearMonth']['year']['value'] = yearValue;
        
        # extract the necessary content: the reserved days
        eachMonthContent = foundAllMonthContent[i];
        #<td align="center" class="CalendarCellReserved" id="dd1">2</td>
        foundAllReservedCell = eachMonthContent.findAll("td", {"class":"CalendarCellReserved"});
        #print "foundAllReservedCell=",foundAllReservedCell;
        reservedCellNum = len(foundAllReservedCell);
        #print "reservedCellNum=",reservedCellNum;
        for eachReservedCell in foundAllReservedCell:
            cellVal = eachReservedCell.string;
            #print "cellVal=",cellVal;
            singleMonthDict['reservedMonthList'].append(cellVal);
            
        #print "singleMonthDict=",singleMonthDict;
        
        MonthDictList.append(singleMonthDict);
        #print str(i+1) + "="*79;
        print "Processed %d month's info"%(i+1);
    
    # generate output string
    
    generatedCalendarStr = generateOutputCalendar(MonthDictList);
    #print "generatedCalendarStr=",generatedCalendarStr;
    outputFileName = "generatedCalerdarString.txt";
    print "Exporting generated calendar string into %s"%(outputFileName);
    outputFile = codecs.open(outputFileName, 'w', 'utf-8');
    outputFile.write(generatedCalendarStr);
    outputFile.close();
    print "Has exported calendar string into %s"%(outputFileName);
    
    
    # Note:
    # only makesure your expected output is somthing like:
    # {"start_date": "11/7/2012",
    # "end_date": "11/9/2012",
    # "status": "reserved"
    # },
    # not :
    # {"start_date": 11/7/2012,
    # "end_date": 11/9/2012,
    # "status": reserved
    # },
    # then I can use json to ouptut PRETTY-PRINTED dict string
    
    
    #------------------------------------------------------------------------------
    def generateOutputCalendarJsonIndent(MonthDictList):
        jsonDumpsIndent = json.dumps(MonthDictList, indent=1);
        #print "jsonDumpsIndent=",jsonDumpsIndent;
        return jsonDumpsIndent;
    
    # json ouput demo
    demoDictList = MonthDictList[0:2];
    jsonDumpsIndentStr = json.dumps(demoDictList, indent=1);
    outputFile_json_indent = "treeLikeWithIndentJsonString.txt";
    outputFile_json_indent = codecs.open(outputFile_json_indent, 'w', 'utf-8');
    outputFile_json_indent.write(jsonDumpsIndentStr);
    outputFile_json_indent.close();
    
    #tttttttt
    
    generatedCalendarJsonIndentStr = generateOutputCalendarJsonIndent(demoDictList);
    print "type(generatedCalendarJsonIndentStr)=",type(generatedCalendarJsonIndentStr);
    #print "generatedCalendarJsonIndentStr=",generatedCalendarJsonIndentStr;
    outputFileName_json_indent = "generatedCalerdarString_json_indent.txt";
    print "Exporting generated calendar string json indent into %s"%(outputFileName_json_indent);
    outputFile_json_indent = codecs.open(outputFileName_json_indent, 'w', 'utf-8');
    outputFile_json_indent.write(generatedCalendarJsonIndentStr);
    outputFile_json_indent.close();

    generatedCalendarJsonNoIndentStr = generateOutputCalendarJsonNoIndent(demoDictList);
    print "type(generatedCalendarJsonNoIndentStr)=",type(generatedCalendarJsonNoIndentStr);
    #print "generatedCalendarJsonNoIndentStr=",generatedCalendarJsonNoIndentStr;
    outputFileName_json_noIndent = "generatedCalerdarString_json_noIndent.txt";
    print "Exporting generated calendar string json no indent into %s"%(outputFileName_json_noIndent);
    outputFile_json_noIndent = codecs.open(outputFileName_json_noIndent, 'w', 'utf-8');
    outputFile_json_noIndent.write(generatedCalendarJsonNoIndentStr);
    outputFile_json_noIndent.close();

###############################################################################
if __name__ == "__main__":
    # run the scraper only when this file is executed as a script
    main()

 

【总结】



发表评论

电子邮件地址不会被公开。 必填项已用*标注

无觅相关文章插件,快速提升流量