【代码分享】C#代码:ScrapeFiverrComments – 抓取fiverr.com中帖子的评论

【背景】

之前写的,用于抓取:

http://fiverr.com

中帖子

(比如:

http://fiverr.com/bizgrowthcoach/provide-a-startup-checklist-and-project-plan

)的评论。

 

注:

此代码是在该网站改版之前写的;

且是没有完成的;

只是贴出来,供参考而已->其中有些关于SgmlReader等函数的使用,可供参考;

 

【ScrapeFiverrComments代码分享】

1.截图:

frmScrapeFiverrComments main ui

2.项目代码下载:

ScrapeFiverrComments_2013-02-28_uncompleted.7z

 

3.代码分享:

(1)frmScrapeFiverrComments.cs

/*
 * [File]
 * frmScrapeFiverrComments.cs
 *
 * [Function]
 * fiverr.com comments scraper
 *
 * [Note]
 *
 * [Update]
 * 2013-02-28
 *
 * [Author]
 * Crifan Li
 *
 * [Contact]
 * http://www.crifan.com/contact_me/
 *
 */

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;

using System.Web;
using System.Xml;
using Sgml;
using System.IO;

using Excel = Microsoft.Office.Interop.Excel;
using Microsoft.Office.Interop.Excel;

/*
 * icons:
 * 
 * search/find
 * http://www.easyicon.cn/icondetail/106/
 * 
 * stop
 * http://www.easyicon.cn/icondetail/568811/
 * 
 * crawler
 * http://www.easyicon.cn/icondetail/13685/
 * 
 * login
 * http://www.easyicon.cn/icondetail/500811/
 * 
 * send mail
 * http://www.easyicon.cn/icondetail/538560/
 */

namespace ScrapeFiverrComments
{
    public partial class frmScrapeFiverrComments : Form
    {
        // Helper library instance; the handlers in this form use it to fetch pages
        // (getUrlRespHtml) and to run regex extraction (extractSingleStr).
        public crifanLib crifanLib;
        // Number of comments on a full page; a shorter page is treated as the last one.
        static int constPageGigNumber = 40;

        /// <summary>
        /// Creates the form: registers the assembly-resolve hook, builds the
        /// designer-generated controls and instantiates the helper library.
        /// </summary>
        public frmScrapeFiverrComments()
        {
            // Registered first so the hook is already in place while the rest of
            // the form is being constructed.
            AppDomain.CurrentDomain.AssemblyResolve += CurrentDomain_AssemblyResolve;

            InitializeComponent();

            this.crifanLib = new crifanLib();
        }

        /// <summary>
        /// AssemblyResolve hook: tries to load a missing assembly from the byte[]
        /// resources embedded in this application's Properties.Resources.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="args">Carries the name of the assembly that failed to resolve.</param>
        /// <returns>
        /// The assembly loaded from the embedded resource, or null when no matching
        /// resource exists so the runtime can continue its normal failure handling.
        /// </returns>
        System.Reflection.Assembly CurrentDomain_AssemblyResolve(object sender, ResolveEventArgs args)
        {
            // Reduce "Name, Version=..., Culture=..." (or "Name.dll") to the bare name.
            string dllName = args.Name.Contains(",") ? args.Name.Substring(0, args.Name.IndexOf(',')) : args.Name.Replace(".dll", "");

            // Resource keys use '_' where the assembly name has '.'.
            dllName = dllName.Replace(".", "_");

            // Satellite resource assemblies are not embedded; do not try to resolve them.
            if (dllName.EndsWith("_resources"))
            {
                return null;
            }

            System.Resources.ResourceManager rm = new System.Resources.ResourceManager(GetType().Namespace + ".Properties.Resources", System.Reflection.Assembly.GetExecutingAssembly());

            byte[] bytes;
            try
            {
                // "as" instead of a cast: a resource of another type is treated as "not found".
                bytes = rm.GetObject(dllName) as byte[];
            }
            catch (System.Resources.MissingManifestResourceException)
            {
                // No resource set available at all: this handler cannot help.
                return null;
            }

            if (bytes == null)
            {
                // Not one of the embedded assemblies. Returning null (instead of
                // letting Assembly.Load(null) throw) reports "unresolved" to the runtime.
                return null;
            }

            return System.Reflection.Assembly.Load(bytes);
        }

        /// <summary>
        /// Configures the comment-author grid: two columns (username, profile url),
        /// a fixed-width centered row header, and fill-mode column sizing.
        /// </summary>
        private void initDataGridView()
        {
            DataGridView grid = dgvCmtAuthorList;

            grid.ColumnCount = 2;

            // Row header: fixed 80px wide, centered content, not resizable by the user.
            grid.RowHeadersWidth = 80;
            grid.RowHeadersDefaultCellStyle.Alignment = DataGridViewContentAlignment.MiddleCenter;
            grid.RowHeadersWidthSizeMode = DataGridViewRowHeadersWidthSizeMode.DisableResizing;

            grid.AutoSizeColumnsMode = DataGridViewAutoSizeColumnsMode.Fill;

            // Column 1: username
            DataGridViewColumn usernameColumn = grid.Columns[0];
            usernameColumn.HeaderText = "Username";
            usernameColumn.Width = 160;

            // Column 2: profile url, sized to what is left of the group box
            // after the row header, the username column and a 20px margin.
            DataGridViewColumn profileUrlColumn = grid.Columns[1];
            profileUrlColumn.HeaderText = "Profile Url";
            int reservedWidth = grid.RowHeadersWidth + usernameColumn.Width + 20;
            profileUrlColumn.Width = grbCmtAuthorList.Width - reservedWidth;
        }

        /// <summary>
        /// Form Load handler: sets up the author grid and disables the login group box.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void frmScrapeFiverrComments_Load(object sender, EventArgs e)
        {
            initDataGridView();

            // The login group is switched off at startup.
            // NOTE(review): presumably because loginFiverrCom is still a stub - confirm.
            grbLogin.Enabled = false;
            //txbMessageToSend.Enabled = false;
            //btnSendMessage.Enabled = false;
        }

        /// <summary>
        /// Parses an HTML string with SgmlReader and loads the result into an
        /// XmlDocument so it can be queried with XPath.
        /// </summary>
        /// <param name="html">HTML text to parse; must not be null.</param>
        /// <returns>The loaded document (whitespace preserved, no external resolver).</returns>
        XmlDocument htmlToXmlDoc(string html)
        {
            // create document
            XmlDocument doc = new XmlDocument();
            doc.PreserveWhitespace = true;
            // No resolver: external DTDs/entities referenced by the page are not fetched.
            doc.XmlResolver = null;

            // The StringReader is only needed while the document is being loaded,
            // so dispose it deterministically once Load has consumed the input.
            using (StringReader htmlReader = new StringReader(html))
            {
                // setup SgmlReader
                Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
                sgmlReader.DocType = "HTML";
                sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
                // Lower-case folding is what lets the XPath queries use lower-case
                // element names regardless of the casing used in the page.
                sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
                sgmlReader.InputStream = htmlReader;

                doc.Load(sgmlReader);
            }

            return doc;
        }

        // Gig comment author info: one commenter extracted from a gig page.
        public struct gigCmtAuthorInfo
        {
            // Commenter's user name (the text of the anchor inside the "rater-username" div).
            public string username;
            // Absolute profile URL, built as "http://fiverr.com" + the anchor's href.
            public string profileUrl;
        };


        /// <summary>
        /// Scrape button handler: downloads the gig page given in the URL text box,
        /// extracts every comment author (username + profile url) into the grid and
        /// keeps requesting further comment pages until a short page is returned.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void btnScrape_Click(object sender, EventArgs e)
        {
            //http://fiverr.com/bizgrowthcoach/provide-a-startup-checklist-and-project-plan
            string curGigUrl = txbGigUrl.Text;
            if (string.IsNullOrEmpty(curGigUrl) || (curGigUrl.Trim().Length == 0))
            {
                // Nothing to scrape.
                return;
            }

            int curPageNumber = 0;
            bool needGetMorePage = true;

            while (needGetMorePage)
            {
                // Every page (first or follow-up) has to be downloaded; parsing an
                // empty string would make htmlToXmlDoc throw.
                // NOTE(review): follow-up pages may need extra request headers
                // (an earlier draft passed a header dictionary) - confirm against the site.
                string gigUrlRespHtml = crifanLib.getUrlRespHtml(curGigUrl);
                if (string.IsNullOrEmpty(gigUrlRespHtml))
                {
                    break;
                }

                XmlDocument xmlDoc = htmlToXmlDoc(gigUrlRespHtml);

                XmlNamespaceManager m = new XmlNamespaceManager(xmlDoc.NameTable);
                m.AddNamespace("w3org", "http://www.w3.org/1999/xhtml");

                // One comment looks like:
                //<li class="rating-block ">
                //    <div class= " rating-text">
                //        <div>
                //            <div class="rater-username">
                //                    <a href="/azza1200" rel="nofollow">azza1200</a>
                //                    <span class="time-ago titled" title="1361846351"></span>
                //            </div>
                //            <div class="comment-block"> ... </div>
                //        </div>
                //    </div>
                //</li>
                XmlNodeList ratingBlockList = xmlDoc.SelectNodes("//w3org:li[@class='rating-block ']", m);
                if ((ratingBlockList == null) || (ratingBlockList.Count == 0))
                {
                    break;
                }

                foreach (XmlNode ratingBlockNode in ratingBlockList)
                {
                    XmlNode rateUsernameNode = ratingBlockNode.SelectSingleNode(".//w3org:div[@class='rater-username']", m);
                    if (rateUsernameNode == null)
                    {
                        continue;
                    }

                    XmlNode aNode = rateUsernameNode.SelectSingleNode(".//w3org:a[@rel|href]", m);
                    if ((aNode == null) || (aNode.Attributes == null))
                    {
                        continue;
                    }

                    // The XPath above also matches anchors that only carry "rel",
                    // so the href attribute must be checked before it is read.
                    XmlAttribute hrefAttribute = aNode.Attributes["href"];
                    if (hrefAttribute == null)
                    {
                        continue;
                    }

                    gigCmtAuthorInfo cmtAuthorInfo = new gigCmtAuthorInfo();
                    cmtAuthorInfo.username = aNode.InnerText;
                    cmtAuthorInfo.profileUrl = "http://fiverr.com" + hrefAttribute.Value; //http://fiverr.com/azza1200

                    storeCommentAuthorInfo(cmtAuthorInfo);

                    //update UI
                    System.Windows.Forms.Application.DoEvents();
                }

                // A page with fewer comments than a full page is the last one.
                if (ratingBlockList.Count < constPageGigNumber)
                {
                    break;
                }

                //update for next page
                curPageNumber++;
                int offsetNumber = constPageGigNumber * curPageNumber;

                // NOTE: an alternative paging endpoint is
                // http://fiverr.com/gigs/748824/load_ratings?offset=40&show_work_sample=false
                // (the gig id appears in the order form's action URL); it is not used here.
                string titlePart = "";
                if (!crifanLib.extractSingleStr(@"http://fiverr\.com/\w+/([\w-]+)", curGigUrl, out titlePart))
                {
                    // Without the title part no next-page URL can be built; stop
                    // instead of re-requesting the same page forever.
                    break;
                }

                //http://fiverr.com/gigs/provide-a-startup-checklist-and-project-plan?offset=40
                curGigUrl = "http://fiverr.com/gigs/" + titlePart + "?offset=" + offsetNumber.ToString();
            }
        }

        /// <summary>
        /// Checks whether a username is absent from the first column of the author grid.
        /// </summary>
        /// <param name="username">Username to look for (compared case-sensitively).</param>
        /// <returns>true when no row holds this username; false when one does.</returns>
        private bool userNotExist(string username)
        {
            foreach (DataGridViewRow row in dgvCmtAuthorList.Rows)
            {
                // A cell can hold no value (e.g. the grid's "new row" placeholder);
                // skip it instead of dereferencing null.
                object cellValue = row.Cells[0].Value;
                if (cellValue == null)
                {
                    continue;
                }

                if (string.Equals(cellValue.ToString(), username))
                {
                    return false;
                }
            }

            return true;
        }

        /// <summary>
        /// Appends a comment author to the grid unless the username is already listed,
        /// then selects and scrolls to the last row and renumbers all row headers.
        /// </summary>
        /// <param name="cmtAuthorInfo">Username and profile url of the comment author.</param>
        void storeCommentAuthorInfo(gigCmtAuthorInfo cmtAuthorInfo)
        {
            // Duplicate usernames are ignored.
            if (!userNotExist(cmtAuthorInfo.username))
            {
                return;
            }

            DataGridViewRowCollection rows = dgvCmtAuthorList.Rows;
            rows.Add(cmtAuthorInfo.username, cmtAuthorInfo.profileUrl);

            // Highlight the last row and bring it into view.
            int lastRowIndex = rows.Count - 1;
            rows[lastRowIndex].Selected = true;
            dgvCmtAuthorList.FirstDisplayedScrollingRowIndex = lastRowIndex;

            // Row headers show a 1-based running number.
            int rowNumber = 0;
            foreach (DataGridViewRow row in rows)
            {
                rowNumber++;
                row.HeaderCell.Value = String.Format("{0}", rowNumber);
            }
        }

        /// <summary>
        /// Save button handler: exports the author grid (header row plus all cells)
        /// to "ScrapedGigCommentsAuthorList.xls" in the current directory through
        /// Excel automation, then shows the file selected in Explorer.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void btnSaveAll_Click(object sender, EventArgs e)
        {
            object misValue = System.Reflection.Missing.Value;

            string currentPath = System.Environment.CurrentDirectory;
            string outputFilename = "ScrapedGigCommentsAuthorList.xls";
            string fullFilename = Path.Combine(currentPath, outputFilename);

            Excel.Application xlApp = null;
            Excel.Workbook xlWorkBook = null;
            Excel.Worksheet xlWorkSheet = null;
            Excel.Range headerRow = null;
            Excel.Range allColumn = null;
            bool workbookClosed = false;

            try
            {
                // Exactly one Excel instance: creating a second one would leave the
                // first running as an orphaned EXCEL.EXE process.
                xlApp = new Excel.Application();
                xlWorkBook = xlApp.Workbooks.Add(misValue);
                xlWorkSheet = (Excel.Worksheet)xlWorkBook.Worksheets.get_Item(1);

                //save header (Excel cells are 1-based)
                for (int col = 0; col < dgvCmtAuthorList.ColumnCount; col++)
                {
                    xlWorkSheet.Cells[1, col + 1] = dgvCmtAuthorList.Columns[col].HeaderText;
                }

                //save cells, starting on the sheet row below the header
                for (int row = 0; row < dgvCmtAuthorList.RowCount; row++)
                {
                    for (int col = 0; col < dgvCmtAuthorList.ColumnCount; col++)
                    {
                        DataGridViewCell cell = dgvCmtAuthorList[col, row];
                        xlWorkSheet.Cells[row + 2, col + 1] = cell.Value;
                    }
                }

                //formatting
                //header to bold
                headerRow = xlWorkSheet.get_Range("1:1", System.Type.Missing);
                headerRow.Font.Bold = true;

                //auto adjust column width (according to content)
                allColumn = xlWorkSheet.Columns;
                allColumn.AutoFit();

                xlWorkBook.SaveAs(fullFilename, Excel.XlFileFormat.xlWorkbookNormal, misValue, misValue, misValue, misValue, Excel.XlSaveAsAccessMode.xlExclusive, XlSaveConflictResolution.xlLocalSessionChanges, misValue, misValue, misValue, misValue);
                xlWorkBook.Close(true, misValue, misValue);
                workbookClosed = true;
            }
            finally
            {
                // Always shut Excel down, even when the export failed half-way,
                // so no hidden Excel process is left behind.
                if ((xlWorkBook != null) && !workbookClosed)
                {
                    try
                    {
                        // Discard the unsaved workbook so Quit does not wait on a save prompt.
                        xlWorkBook.Close(false, misValue, misValue);
                    }
                    catch (System.Runtime.InteropServices.COMException)
                    {
                        // Best-effort cleanup; the exception that got us here is the one to report.
                    }
                }

                if (xlApp != null)
                {
                    xlApp.Quit();
                }

                // Release the COM wrappers, innermost objects first.
                if (allColumn != null)
                {
                    releaseObject(allColumn);
                }
                if (headerRow != null)
                {
                    releaseObject(headerRow);
                }
                if (xlWorkSheet != null)
                {
                    releaseObject(xlWorkSheet);
                }
                if (xlWorkBook != null)
                {
                    releaseObject(xlWorkBook);
                }
                if (xlApp != null)
                {
                    releaseObject(xlApp);
                }
            }

            // Quote the path so directories containing spaces or commas are handled.
            System.Diagnostics.Process.Start("Explorer.exe", "/select,\"" + fullFilename + "\"");
        }

        /// <summary>
        /// Releases the runtime callable wrapper of a COM object and then forces a
        /// garbage collection. Failures are reported in a message box, not thrown.
        /// </summary>
        /// <param name="obj">COM object to release; null is ignored.</param>
        private void releaseObject(object obj)
        {
            if (obj == null)
            {
                // Nothing to release; avoids a spurious error dialog for objects
                // that were never created.
                return;
            }

            try
            {
                System.Runtime.InteropServices.Marshal.ReleaseComObject(obj);
            }
            catch (Exception ex)
            {
                MessageBox.Show("Exception Occured while releasing object " + ex.ToString());
            }
            finally
            {
                GC.Collect();
            }
        }

        /// <summary>
        /// Clear button handler: removes every row from the comment-author grid.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void btnClearAll_Click(object sender, EventArgs e)
        {
            dgvCmtAuthorList.Rows.Clear();
        }

        /// <summary>
        /// Login button handler: attempts a login with the entered credentials and
        /// enables the message box and send button only when the login succeeded.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void btnLogin_Click(object sender, EventArgs e)
        {
            bool loggedIn = loginFiverrCom(txbUsername.Text, txbPassword.Text);

            // Messaging controls are available exactly when the login worked.
            txbMessageToSend.Enabled = loggedIn;
            btnSendMessage.Enabled = loggedIn;
        }

        /// <summary>
        /// Placeholder for the fiverr.com login: no request is made yet and the
        /// result is always false.
        /// </summary>
        /// <param name="username">Account user name (currently unused).</param>
        /// <param name="password">Account password (currently unused).</param>
        /// <returns>Always false.</returns>
        private bool loginFiverrCom(string username, string password)
        {
            return false;
        }
    }
}

 

【总结】



发表评论

电子邮件地址不会被公开。 必填项已用*标注

无觅相关文章插件,快速提升流量