【背景】
写了个爬虫,爬取 fiverr.com 上的 gig 信息,
然后将所抓取数据,导出成excel和csv。
下面把代码分享出来。
供参考。
【FiverComScraper 代码】
1.截图:
2.完整项目代码下载:
FiverrComScraper_2013-03-08_onlyScrapeFiverrCom_beforeWebsiteChange.7z
3.源码:
(1)frmFiverrComScraper.cs
/*
* [File]
* frmFiverrComScraper.cs
*
* [Function]
* fiverr.com scraper
*
* [Note]
*
* [Update]
* 2013-03-08
*
* [Author]
* Crifan Li
*
* [Contact]
* https://www.crifan.com/contact_me/
*
*/
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using Sgml;
using System.Xml;
using System.IO;
using System.Web;
using Excel = Microsoft.Office.Interop.Excel;
using Microsoft.Office.Interop.Excel;
/*
* icons:
*
* search/find
* http://www.easyicon.cn/icondetail/106/
*
* stop
* http://www.easyicon.cn/icondetail/568811/
*
* excel
* http://www.easyicon.cn/icondetail/1087666/
*
* csv
* http://www.easyicon.cn/icondetail/558199/
*
* help
* http://www.easyicon.cn/icondetail/12270/
*/
namespace FiverComScraper
{
public partial class frmFiverrComScraper : Form
{
// Helper library instance (created in the constructor); used in this file for
// getUrlRespHtml() and extractSingleStr().
public crifanLib crifanLib;
// Button column holding the per-row "Buy Now" button; created in the constructor and
// appended to dgvSearchResult in frmFiverrComScraper_Load.
public DataGridViewButtonColumn gigUrlColumn = null;
// Column index of the button column: the 12 text columns occupy 0..11, the button column
// is added after them.
public static int girUrlColumnIdx = 12;
// true while the search loops should keep processing gigs/pages; the Pause and Stop
// handlers set it to false to make the loops exit.
bool needGetMore = true;
// true from the start of a background download until m_bgWorker_RunWorkerCompleted
// clears it; callers spin on this flag while pumping the message loop.
bool bWorkNotCompleted = true;
// Html of the most recently completed background download.
private string curRespHtml = "";
// State of the search workflow; updateUI() enables/disables the Search, Pause and Stop
// buttons according to the current value.
enum search_status
{
// no search running; the Search button starts a new one
SEARCH_STATUS_STOPPED,
// a search is in progress
SEARCH_STATUS_SEARCHING,
// search interrupted by Pause; the Search button ("Continue Search") resumes it
SEARCH_STATUS_PAUSED
};
// Current workflow state; the form starts in the stopped state.
search_status curSearchStatus = search_status.SEARCH_STATUS_STOPPED;
// Progress of the running search, kept in a field so that a paused search can be resumed
// from the same result page and node.
public struct search_info
{
// result page number sent as the "page" query parameter (a new search starts at 1)
public int pageNum;
// full url of the search result page being processed
public string searchUrl;
// html returned for searchUrl
public string searchRespHtml;
// searchRespHtml converted to an XmlDocument by htmlToXmlDoc()
public XmlDocument xmlDoc;
// namespace manager for xmlDoc; maps prefix "w3org" to http://www.w3.org/1999/xhtml
public XmlNamespaceManager m;
// nodes selected with "//w3org:div[@data-gig_id]", one per gig on the page
public XmlNodeList gigDataList;
// index into gigDataList of the gig to process next
public int nodeIdx;
};
// Progress of the current (or paused) search; replaced with a fresh instance on a new search.
search_info curSearchInfo = new search_info();
/// <summary>
/// Creates the form. The AssemblyResolve hook is installed first, ahead of
/// InitializeComponent(); afterwards the crifanLib helper and the (not yet attached)
/// gig url button column are instantiated.
/// </summary>
public frmFiverrComScraper()
{
    AppDomain.CurrentDomain.AssemblyResolve += CurrentDomain_AssemblyResolve;

    InitializeComponent();

    this.crifanLib = new crifanLib();
    this.gigUrlColumn = new DataGridViewButtonColumn();
}
/// <summary>
/// AssemblyResolve handler: loads an assembly that is stored as a byte[] entry in this
/// project's Properties.Resources.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="args">Carries the display name (or file name) of the assembly being resolved.</param>
/// <returns>
/// The loaded assembly, or null when the request is for a "*.resources" assembly or when no
/// matching byte[] resource exists.
/// </returns>
System.Reflection.Assembly CurrentDomain_AssemblyResolve(object sender, ResolveEventArgs args)
{
    // "Name, Version=..., Culture=..." -> "Name"; a bare "Name.dll" -> "Name"
    int commaPos = args.Name.IndexOf(',');
    string dllName = (commaPos >= 0)
        ? args.Name.Substring(0, commaPos)
        : args.Name.Replace(".dll", "");

    // the resource key is the assembly name with '.' replaced by '_'
    dllName = dllName.Replace(".", "_");
    if (dllName.EndsWith("_resources"))
    {
        return null;
    }

    System.Resources.ResourceManager rm = new System.Resources.ResourceManager(
        GetType().Namespace + ".Properties.Resources",
        System.Reflection.Assembly.GetExecutingAssembly());

    // GetObject returns null for an unknown key; passing that to Assembly.Load would throw
    // from inside the resolve handler, so report "not resolved" instead.
    byte[] bytes = rm.GetObject(dllName) as byte[];
    if (bytes == null)
    {
        return null;
    }
    return System.Reflection.Assembly.Load(bytes);
}
/// <summary>
/// Form Load handler: configures the result grid (12 text columns plus the "Gig Url" button
/// column) and brings the buttons in line with the current search status.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void frmFiverrComScraper_Load(object sender, EventArgs e)
{
    // header text and width of the text columns, in display order
    string[] headerTexts =
    {
        "Title",               //(1)title
        "Seller Rating",       //(2)seller rating ( based on 1-100% format )
        "Estimated Delivery",  //(3)estimated delivery ( based on 24 hours - 7days format )
        "Gig Rating",          //(4)gig rating ( based on 1-100% )
        "Orders in Queue",     //(5)orders in queue ( based on 0-9999 format )
        "Seller Level",        //(6)level of the seller ( 1-3 )
        "Has Video",           //(7)has video ( yes or no )
        "Is Express Gig",      //(8)express gigs ( yes or no )
        "Country Flag",        //(9)country flag
        "Positive Reviews",    //(10)positive reviews ( based on 1-9999 )
        "Negative Reviews",    //(10)negative reviews ( based on 1-9999 )
        "Is Top Rated Seller"  //(11)top rated seller ( yes or no )
    };
    int[] columnWidths = { 100, 49, 66, 47, 54, 47, 42, 55, 106, 57, 60, 50 };

    //DataGridView init
    dgvSearchResult.ColumnCount = headerTexts.Length;
    dgvSearchResult.RowHeadersWidth = 60;
    dgvSearchResult.RowHeadersDefaultCellStyle.Alignment = DataGridViewContentAlignment.MiddleCenter;
    dgvSearchResult.RowHeadersWidthSizeMode = DataGridViewRowHeadersWidthSizeMode.DisableResizing;
    dgvSearchResult.AutoSizeColumnsMode = DataGridViewAutoSizeColumnsMode.None;
    dgvSearchResult.AutoSizeRowsMode = DataGridViewAutoSizeRowsMode.AllCellsExceptHeaders;

    // the title column takes whatever width is left over
    dgvSearchResult.Columns[0].AutoSizeMode = DataGridViewAutoSizeColumnMode.Fill;

    for (int colIdx = 0; colIdx < headerTexts.Length; colIdx++)
    {
        DataGridViewColumn column = dgvSearchResult.Columns[colIdx];
        column.HeaderText = headerTexts[colIdx];
        column.Width = columnWidths[colIdx];
    }

    //(12)gig url: a button column appended after the text columns
    gigUrlColumn.HeaderText = "Gig Url";
    gigUrlColumn.Text = "Buy Now";
    gigUrlColumn.Width = 106;
    dgvSearchResult.Columns.Add(gigUrlColumn);

    updateUI();
}
/// <summary>
/// Synchronises the Search / Pause / Stop buttons with <c>curSearchStatus</c>.
/// An unrecognised status leaves the buttons untouched.
/// </summary>
private void updateUI()
{
    bool searchEnabled;
    string searchCaption;
    bool pauseEnabled;
    bool stopEnabled;

    switch (curSearchStatus)
    {
        case search_status.SEARCH_STATUS_STOPPED:
            searchEnabled = true;
            searchCaption = "Search";
            pauseEnabled = false;
            stopEnabled = false;
            break;
        case search_status.SEARCH_STATUS_PAUSED:
            searchEnabled = true;
            searchCaption = "Continue Search";
            pauseEnabled = false;
            stopEnabled = true;
            break;
        case search_status.SEARCH_STATUS_SEARCHING:
            searchEnabled = false;
            searchCaption = "Searching";
            pauseEnabled = true;
            stopEnabled = true;
            break;
        default:
            return;
    }

    btnSearch.Enabled = searchEnabled;
    btnSearch.Text = searchCaption;
    btnPause.Enabled = pauseEnabled;
    btnStop.Enabled = stopEnabled;
}
/// <summary>
/// Converts an html string into an XmlDocument by reading it through an SgmlReader.
/// </summary>
/// <param name="html">Html source text.</param>
/// <returns>The loaded document (whitespace preserved, element names folded to lower case).</returns>
XmlDocument htmlToXmlDoc(string html)
{
    // reader side: html doctype, lower-cased names, all whitespace kept
    Sgml.SgmlReader htmlReader = new Sgml.SgmlReader();
    htmlReader.DocType = "HTML";
    htmlReader.WhitespaceHandling = WhitespaceHandling.All;
    htmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
    htmlReader.InputStream = new StringReader(html);

    // document side: no external resolver, whitespace kept
    XmlDocument parsedDoc = new XmlDocument();
    parsedDoc.XmlResolver = null;
    parsedDoc.PreserveWhitespace = true;
    parsedDoc.Load(htmlReader);

    return parsedDoc;
}
/// <summary>
/// Downloads one gig page, extracts the result fields from its html and appends them to
/// the result grid through storeGigInfo().
/// </summary>
/// <param name="gigUrl">Absolute url of the gig page; also stored as the row's gig url.</param>
/// <remarks>
/// A page element that cannot be found leaves the corresponding field empty / 0 / false
/// instead of raising a NullReferenceException. When the download returned no html the
/// gig is skipped and no row is added.
/// </remarks>
private void processEachGig(string gigUrl)
{
    gigInfo singleGigInfo = new gigInfo();

    //(12)gig url
    singleGigInfo.gigUrl = gigUrl;

    // download in the background and keep pumping window messages until the
    // completed handler clears the flag
    getUrlRespHtml_bw(gigUrl);
    while (bWorkNotCompleted)
    {
        System.Windows.Forms.Application.DoEvents();
    }
    string gigHtml = curRespHtml;
    if (String.IsNullOrEmpty(gigHtml))
    {
        // nothing downloaded -> nothing to parse
        return;
    }

    XmlDocument xmlDoc = htmlToXmlDoc(gigHtml);
    XmlNamespaceManager m = new XmlNamespaceManager(xmlDoc.NameTable);
    m.AddNamespace("w3org", "http://www.w3.org/1999/xhtml");

    //(1)title
    // <div class="gig-title-g"> ... <h1 itemprop="name"> I will seo critique your website ... for $5 </h1>
    XmlNode titleNode = xmlDoc.SelectSingleNode("//w3org:h1[@itemprop='name']", m);
    singleGigInfo.title = gigNodeText(titleNode).Trim();

    //(2)seller rating ( based on 1-100% format )
    // <div class='user-rate'>rated <span class='colored green'>99%</span></div>
    XmlNode userRateNode = xmlDoc.SelectSingleNode("//w3org:div[@class='user-rate']", m);
    string userRateValue = "";
    if (crifanLib.extractSingleStr(@"(\d+)%", gigNodeText(userRateNode), out userRateValue))
    {
        singleGigInfo.sellerRating = Int32.Parse(userRateValue);
    }

    //(3)estimated delivery ( based on 24 hours - 7days format )
    // <li class="delv-time"><div><span class='big-txt'>2</span> <span class='mid-txt'>days</span> ...
    // <li class="delv-time"><div><span class='big-txt'>24</span> <span class='mid-txt'>hrs</span> ...
    XmlNode delvTimeNode = xmlDoc.SelectSingleNode("//w3org:li[@class='delv-time']", m);
    if (delvTimeNode != null)
    {
        string delvAmount = gigNodeText(delvTimeNode.SelectSingleNode(".//w3org:span[@class='big-txt']", m));
        string delvUnit = gigNodeText(delvTimeNode.SelectSingleNode(".//w3org:span[@class='mid-txt']", m));
        singleGigInfo.estimatedDeliveryStr = delvAmount + " " + delvUnit;
    }
    else
    {
        singleGigInfo.estimatedDeliveryStr = "";
    }

    //(4)gig rating ( based on 1-100% )
    // <li class="gig-rating"><span class="big-txt">100<span class='mid-txt'>%</span></span> ...
    // not rated yet: <li class="gig-rating"><span class="big-txt not-availale">N/A</span> ...
    XmlNode gigRatingNode = xmlDoc.SelectSingleNode(".//w3org:li[@class='gig-rating']", m);
    string gigRatingValue = "";
    if (crifanLib.extractSingleStr(@"(\d+)%", gigNodeText(gigRatingNode), out gigRatingValue))
    {
        singleGigInfo.gigRating = Int32.Parse(gigRatingValue);
    }
    else
    {
        singleGigInfo.gigRating = 0;
    }

    //(5)orders in queue ( based on 0-9999 format )
    // <li class="queue "><div class="big-txt">4<span class="mid-txt">in queue</span></div> ...
    XmlNode queueNode = xmlDoc.SelectSingleNode(".//w3org:li[@class='queue ']", m);
    if (queueNode != null)
    {
        XmlNode queueBigTxtNode = queueNode.SelectSingleNode(".//w3org:div[@class='big-txt']", m);
        string queueValue = "";
        if (crifanLib.extractSingleStr(@"(\d+)", gigNodeText(queueBigTxtNode), out queueValue))
        {
            singleGigInfo.ordersInQueue = Int32.Parse(queueValue);
        }
    }
    else
    {
        // empty queue is marked up as: <li class="queue not-availale"> ... 0 in queue ...
        XmlNode queueNoneNode = xmlDoc.SelectSingleNode(".//w3org:li[@class='queue not-availale']", m);
        if (queueNoneNode != null)
        {
            singleGigInfo.ordersInQueue = 0;
        }
        else
        {
            //seems some error
            MessageBox.Show("Error while find orders in queue!");
        }
    }

    //(6)level of the seller ( 1-3 )
    //(11)top rated seller ( yes or no )
    // <li class="badge-container top_rated_seller"> / <li class="badge-container level_two_seller">
    // a gig page may contain no badge-container at all -> level 0
    XmlNode badgeLevelOneNode = xmlDoc.SelectSingleNode(".//w3org:li[@class='badge-container level_one_seller']", m);
    XmlNode badgeLevelTwoNode = xmlDoc.SelectSingleNode(".//w3org:li[@class='badge-container level_two_seller']", m);
    XmlNode badgeTopRatedNode = xmlDoc.SelectSingleNode(".//w3org:li[@class='badge-container top_rated_seller']", m);
    int badgeLevel = 0;
    bool isTopRatedSeller = false;
    if (badgeLevelOneNode != null)
    {
        badgeLevel = 1;
    }
    else if (badgeLevelTwoNode != null)
    {
        badgeLevel = 2;
    }
    else if (badgeTopRatedNode != null)
    {
        badgeLevel = 3;
        isTopRatedSeller = true;
    }
    singleGigInfo.sellerLevel = badgeLevel;
    singleGigInfo.isTopRatedSeller = isTopRatedSeller;

    //(7)has video ( yes or no )
    // <div class="play-trigger"><a href="http://api.dmcloud.net/embed/..." class="vid-play"></a> ...
    singleGigInfo.hasVideo = (xmlDoc.SelectSingleNode(".//w3org:div[@class='play-trigger']", m) != null);

    //(8)express gigs ( yes or no )
    // <div class='express'>EXPRESS DELIVERY</div>
    singleGigInfo.isExpressGig = (xmlDoc.SelectSingleNode(".//w3org:div[@class='express']", m) != null);

    //(9)country flag
    // <li class="user-det"> ... <span class='flag in' title="India"></span> ...
    // <li class="user-det"> ... <span class='flag vn' title="Viet Nam"></span> ...
    XmlNode userDetNode = xmlDoc.SelectSingleNode(".//w3org:li[@class='user-det']", m);
    string userDetXmlTxt = (userDetNode == null) ? "" : userDetNode.InnerXml;
    string countryTxt = "";
    if (crifanLib.extractSingleStr(@"<span class=""flag \w+"" title=""([a-zA-Z ]+)""", userDetXmlTxt, out countryTxt))
    {
        singleGigInfo.coutryFlag = countryTxt;
    }

    //(10)positive and negative reviews ( based on 1-9999 )
    // <li class="thumbs"><div class="gig-stats-numbers"><span itemprop="ratingValue" content="5.0">684</span> ... POSITIVE REVIEWS
    // <li class="thumbs"><div class="gig-stats-numbers"><span itemprop="reviewCount" content="690">6</span> ... NEGATIVE REVIEWS
    // some gigs have neither element -> both counts are 0
    XmlNode positiveNode = xmlDoc.SelectSingleNode(".//w3org:span[@itemprop='ratingValue']", m);
    XmlNode negativeNode = xmlDoc.SelectSingleNode(".//w3org:span[@itemprop='reviewCount']", m);
    int positiveCount;
    int negativeCount;
    if ((positiveNode != null) && (negativeNode != null)
        && Int32.TryParse(positiveNode.InnerText, out positiveCount)
        && Int32.TryParse(negativeNode.InnerText, out negativeCount))
    {
        singleGigInfo.positiveReviews = positiveCount;
        singleGigInfo.negativeReviews = negativeCount;
    }
    else
    {
        singleGigInfo.positiveReviews = 0;
        singleGigInfo.negativeReviews = 0;
    }

    storeGigInfo(singleGigInfo);

    //update UI
    System.Windows.Forms.Application.DoEvents();
}

// Returns the InnerText of a node, or an empty string when the node was not found.
private static string gigNodeText(XmlNode node)
{
    return (node == null) ? "" : node.InnerText;
}
// Data scraped from one gig page; filled by processEachGig() and written to the result
// grid by storeGigInfo().
public struct gigInfo
{
// gig title (text of the h1 with itemprop='name', trimmed)
public string title;
// seller rating: the number in front of '%' in the 'user-rate' element
public int sellerRating;
// delivery estimate as "<amount> <unit>", e.g. "2 days" or "24 hrs"
public string estimatedDeliveryStr;
// gig rating: the number in front of '%'; 0 when the gig is not rated yet
public int gigRating;
// number of orders currently in the seller's queue
public int ordersInQueue;
// 0 = no badge, 1 = level one seller, 2 = level two seller, 3 = top rated seller
public int sellerLevel;
// true when the page contains a 'play-trigger' element
public bool hasVideo;
// true when the page contains an 'express' element
public bool isExpressGig;
// country name taken from the title attribute of the seller's flag element
public string coutryFlag;
// count of positive reviews; 0 when the page shows no review counters
public int positiveReviews;
// count of negative reviews; 0 when the page shows no review counters
public int negativeReviews;
// true when the seller carries the 'top_rated_seller' badge
public bool isTopRatedSeller;
// url of the gig page
public string gigUrl;
};
/// <summary>
/// Starts downloading <paramref name="url"/> on a BackgroundWorker and returns immediately.
/// <c>bWorkNotCompleted</c> is raised before the worker is started; the completed handler
/// is responsible for clearing it and for publishing the html in <c>curRespHtml</c>.
/// </summary>
/// <param name="url">Url to download.</param>
private void getUrlRespHtml_bw(string url)
{
    BackgroundWorker fetchWorker = new BackgroundWorker();
    fetchWorker.DoWork += m_bgWorker_DoWork;
    fetchWorker.RunWorkerCompleted += m_bgWorker_RunWorkerCompleted;

    // mark the download as pending, then hand the url to the worker
    bWorkNotCompleted = true;
    fetchWorker.RunWorkerAsync(url);
}
/// <summary>
/// DoWork handler of the download worker: fetches the url passed as the worker argument
/// with crifanLib.getUrlRespHtml() and hands the html back as the worker result.
/// </summary>
/// <param name="sender">The BackgroundWorker (unused).</param>
/// <param name="e">Argument holds the url (string); Result receives the html.</param>
private void m_bgWorker_DoWork(object sender, DoWorkEventArgs e)
{
    e.Result = crifanLib.getUrlRespHtml((string)e.Argument);
}
// ProgressChanged handler for the download worker: re-asserts the "download still
// pending" flag.
// NOTE(review): getUrlRespHtml_bw only subscribes DoWork and RunWorkerCompleted, so no
// subscription of this handler is visible in this file — confirm whether it is still used.
void m_bgWorker_ProgressChanged(object sender, ProgressChangedEventArgs e)
{
bWorkNotCompleted = true;
}
/// <summary>
/// RunWorkerCompleted handler of the download worker: publishes the downloaded html in
/// <c>curRespHtml</c> and clears <c>bWorkNotCompleted</c> so that the waiting caller can
/// continue.
/// </summary>
/// <param name="sender">The BackgroundWorker (unused).</param>
/// <param name="e">Outcome of the background download.</param>
/// <remarks>
/// The flag is cleared for every outcome. When the download failed or was cancelled the
/// result is an empty string; leaving the flag set in those cases would keep the callers'
/// wait loops spinning forever.
/// </remarks>
private void m_bgWorker_RunWorkerCompleted(object sender, RunWorkerCompletedEventArgs e)
{
    // e.Result must not be touched when e.Error is set or the work was cancelled
    // (the property getter throws in those cases), so check them first.
    if ((e.Error != null) || e.Cancelled)
    {
        curRespHtml = "";
    }
    else
    {
        curRespHtml = (e.Result == null) ? "" : e.Result.ToString();
    }

    bWorkNotCompleted = false;
}
/// <summary>
/// Click handler of the Search / "Continue Search" button. From the stopped state it
/// clears the grid and starts a new search at result page 1; from the paused state it
/// first finishes the gigs left on the current result page. It then keeps fetching result
/// pages until Pause/Stop clears <c>needGetMore</c> or a page contains no gigs.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
/// <remarks>
/// The page number is advanced only once per fully processed page (inside
/// processPendingGigs), so pausing and resuming does not skip a result page. When a result
/// page has no gigs (or could not be downloaded) the search ends and the status returns
/// to stopped.
/// </remarks>
private void btnSearch_Click(object sender, EventArgs e)
{
    string fiverMainUrl = "http://fiverr.com";

    if (curSearchStatus == search_status.SEARCH_STATUS_PAUSED)
    {
        //continue search
        needGetMore = true;
        curSearchStatus = search_status.SEARCH_STATUS_SEARCHING;
        updateUI();

        // finish what is left of the page that was being processed when paused
        processPendingGigs(fiverMainUrl);
    }
    else if (curSearchStatus == search_status.SEARCH_STATUS_STOPPED)
    {
        // new search -> clear previously searched result
        clearSearchResult();
        curSearchStatus = search_status.SEARCH_STATUS_SEARCHING;
        updateUI();
        curSearchInfo = new search_info();
        curSearchInfo.pageNum = 1;
        needGetMore = true;
    }
    else
    {
        //unexpected status
        return;
    }

    while (needGetMore)
    {
        //http://fiverr.com/gigs/search?utf8=%E2%9C%93&query=seo&x=15&y=13&page=2
        curSearchInfo.searchUrl = "http://fiverr.com/gigs/search?utf8=%E2%9C%93"
            + "&query=" + HttpUtility.UrlEncode(txbKeyword.Text)
            + "&page=" + curSearchInfo.pageNum.ToString();

        // download in the background and keep pumping window messages meanwhile
        getUrlRespHtml_bw(curSearchInfo.searchUrl);
        while (bWorkNotCompleted)
        {
            System.Windows.Forms.Application.DoEvents();
        }
        curSearchInfo.searchRespHtml = curRespHtml;

        bool pageHasGigs = false;
        if (!String.IsNullOrEmpty(curSearchInfo.searchRespHtml))
        {
            curSearchInfo.xmlDoc = htmlToXmlDoc(curSearchInfo.searchRespHtml);
            curSearchInfo.m = new XmlNamespaceManager(curSearchInfo.xmlDoc.NameTable);
            curSearchInfo.m.AddNamespace("w3org", "http://www.w3.org/1999/xhtml");
            curSearchInfo.gigDataList = curSearchInfo.xmlDoc.SelectNodes("//w3org:div[@data-gig_id]", curSearchInfo.m);
            curSearchInfo.nodeIdx = 0;
            // SelectNodes yields an empty list (not null) when nothing matches
            pageHasGigs = (curSearchInfo.gigDataList != null) && (curSearchInfo.gigDataList.Count > 0);
        }

        if (!pageHasGigs)
        {
            // no (more) results -> the search is finished
            needGetMore = false;
            curSearchStatus = search_status.SEARCH_STATUS_STOPPED;
            updateUI();
            break;
        }

        processPendingGigs(fiverMainUrl);
    }
}

// Processes the gigs of the current result page, starting at curSearchInfo.nodeIdx, until
// the page is exhausted or needGetMore is cleared. When the last gig of the page has been
// handled the page number is advanced; calling it again for an already finished page does
// nothing. Gigs whose title link cannot be located are skipped.
private void processPendingGigs(string siteRootUrl)
{
    XmlNodeList gigNodes = curSearchInfo.gigDataList;
    if ((gigNodes == null) || (curSearchInfo.nodeIdx >= gigNodes.Count))
    {
        return;
    }

    while (needGetMore && (curSearchInfo.nodeIdx < gigNodes.Count))
    {
        XmlNode gigNode = gigNodes[curSearchInfo.nodeIdx];

        //<div class="gig-title approved"> <h2><a href="/daica85/give-you-an-advance-seo-techniques-ebook">...</a></h2>
        XmlNode titleDivNode = gigNode.SelectSingleNode(".//w3org:div[@class='gig-title approved']", curSearchInfo.m);
        XmlNode linkNode = (titleDivNode == null)
            ? null
            : titleDivNode.SelectSingleNode(".//w3org:h2/w3org:a", curSearchInfo.m);
        XmlAttribute hrefAttr = ((linkNode == null) || (linkNode.Attributes == null))
            ? null
            : linkNode.Attributes["href"];

        if (hrefAttr != null)
        {
            processEachGig(siteRootUrl + hrefAttr.Value);
        }

        curSearchInfo.nodeIdx++;
    }

    if (curSearchInfo.nodeIdx >= gigNodes.Count)
    {
        //update for next page
        curSearchInfo.pageNum++;
    }
}
/// <summary>
/// Click handler of the Pause button: while searching, switches to the paused state,
/// refreshes the buttons and asks the search loops to stop fetching. Ignored in any other
/// state.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnPause_Click(object sender, EventArgs e)
{
    if (curSearchStatus != search_status.SEARCH_STATUS_SEARCHING)
    {
        return;
    }

    curSearchStatus = search_status.SEARCH_STATUS_PAUSED;
    updateUI();
    // the progress itself stays in curSearchInfo so that the search can be continued
    needGetMore = false;
}
/// <summary>
/// Click handler of the Stop button: from the searching or paused state, switches to the
/// stopped state, refreshes the buttons and asks the search loops to stop fetching.
/// Ignored when already stopped.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnStopSearching_Click(object sender, EventArgs e)
{
    bool searchActive = (curSearchStatus == search_status.SEARCH_STATUS_SEARCHING);
    bool searchPaused = (curSearchStatus == search_status.SEARCH_STATUS_PAUSED);
    if (!searchActive && !searchPaused)
    {
        return;
    }

    curSearchStatus = search_status.SEARCH_STATUS_STOPPED;
    updateUI();
    needGetMore = false;
}
/// <summary>
/// Appends one scraped gig as a new row of the result grid, selects and scrolls to that
/// row, and renumbers the row headers (1-based).
/// </summary>
/// <param name="singleGigInfo">The scraped gig data.</param>
/// <remarks>
/// The row is addressed through the index returned by Rows.Add rather than
/// "Rows.Count - 1", which would point at the new-row placeholder if the grid shows one.
/// All three boolean columns are rendered as "yes"/"no".
/// </remarks>
void storeGigInfo(gigInfo singleGigInfo)
{
    int newRowIdx = dgvSearchResult.Rows.Add(
        singleGigInfo.title,
        singleGigInfo.sellerRating,
        singleGigInfo.estimatedDeliveryStr,
        singleGigInfo.gigRating,
        singleGigInfo.ordersInQueue,
        singleGigInfo.sellerLevel,
        singleGigInfo.hasVideo ? "yes" : "no",
        singleGigInfo.isExpressGig ? "yes" : "no",
        singleGigInfo.coutryFlag,
        singleGigInfo.positiveReviews,
        singleGigInfo.negativeReviews,
        singleGigInfo.isTopRatedSeller ? "yes" : "no");

    DataGridViewRow newRow = dgvSearchResult.Rows[newRowIdx];

    // the button shows a fixed caption; the real url travels in the cell's Tag
    DataGridViewCell gigUrlCell = newRow.Cells[girUrlColumnIdx];
    gigUrlCell.Value = "Buy Now";
    gigUrlCell.Tag = singleGigInfo.gigUrl;

    newRow.Selected = true;
    dgvSearchResult.FirstDisplayedScrollingRowIndex = newRowIdx;

    //draw the row index
    for (int rowIdx = 0; rowIdx < dgvSearchResult.Rows.Count; rowIdx++)
    {
        dgvSearchResult.Rows[rowIdx].HeaderCell.Value = String.Format("{0}", rowIdx + 1);
    }
}
/// <summary>
/// CellContentClick handler of the result grid: a click on a cell of the gig url button
/// column opens the url stored in that cell's Tag with the default handler.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Identifies the clicked cell.</param>
private void dgvSearchResult_CellContentClick(object sender, DataGridViewCellEventArgs e)
{
    // header clicks report RowIndex -1; only the button column is of interest
    if ((e.RowIndex < 0) || (e.ColumnIndex != girUrlColumnIdx))
    {
        return;
    }

    object gigUrl = dgvSearchResult.Rows[e.RowIndex].Cells[e.ColumnIndex].Tag;
    if (gigUrl == null)
    {
        // no url attached to this cell -> nothing to open
        return;
    }

    System.Diagnostics.Process.Start(gigUrl.ToString());
}
/// <summary>
/// Releases a COM object reference (used for the Excel interop objects) and then forces a
/// garbage collection.
/// </summary>
/// <param name="obj">The COM object to release; null is ignored.</param>
/// <remarks>
/// Failures of ReleaseComObject are reported in a message box and not rethrown.
/// </remarks>
private void releaseObject(object obj)
{
    if (obj == null)
    {
        // nothing to release; avoids the ArgumentNullException (and its error dialog)
        return;
    }

    try
    {
        System.Runtime.InteropServices.Marshal.ReleaseComObject(obj);
    }
    catch (Exception ex)
    {
        MessageBox.Show("Exception Occured while releasing object " + ex.ToString());
    }
    finally
    {
        // kept from the original implementation: collect right after releasing
        GC.Collect();
    }
}
/// <summary>
/// Click handler that exports the result grid to "fiverrComScrapedResult.xls" through
/// Excel automation: header row (bold) plus one row per grid row, with the gig url column
/// filled from each cell's Tag. Afterwards the saved file is revealed in Explorer.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
/// <remarks>
/// Exactly one Excel instance is started, and it is quit and released in a finally block
/// so that a failure during the export does not leave an Excel process behind.
/// </remarks>
private void btnSaveAll_Click(object sender, EventArgs e)
{
    object misValue = System.Reflection.Missing.Value;
    Excel.Application xlApp = null;
    Excel.Workbook xlWorkBook = null;
    Excel.Worksheet xlWorkSheet = null;
    Excel.Range headerRow = null;
    bool workbookClosed = false;
    string fullFilename;

    try
    {
        xlApp = new Excel.Application();
        xlWorkBook = xlApp.Workbooks.Add(misValue);
        xlWorkSheet = (Excel.Worksheet)xlWorkBook.Worksheets.get_Item(1);

        //save header (Excel cells are 1-based)
        for (int col = 0; col < dgvSearchResult.ColumnCount; col++)
        {
            xlWorkSheet.Cells[1, col + 1] = dgvSearchResult.Columns[col].HeaderText;
        }

        //save cells, starting below the header row
        for (int row = 0; row < dgvSearchResult.RowCount; row++)
        {
            if (dgvSearchResult.Rows[row].IsNewRow)
            {
                // the empty placeholder row holds no data
                continue;
            }
            for (int col = 0; col < dgvSearchResult.ColumnCount; col++)
            {
                DataGridViewCell cell = dgvSearchResult[col, row];
                if (col == girUrlColumnIdx)
                {
                    // the button cell only shows "Buy Now"; the url is kept in its Tag
                    xlWorkSheet.Cells[row + 2, col + 1] = (cell.Tag == null) ? "" : cell.Tag.ToString();
                }
                else
                {
                    xlWorkSheet.Cells[row + 2, col + 1] = cell.Value;
                }
            }
        }

        //formatting: header to bold
        headerRow = xlWorkSheet.get_Range("1:1", System.Type.Missing);
        headerRow.Font.Bold = true;

        string outputFilename = "fiverrComScrapedResult.xls";
        fullFilename = Path.Combine(getSaveFolder(), outputFilename);
        xlWorkBook.SaveAs(fullFilename, Excel.XlFileFormat.xlWorkbookNormal, misValue, misValue, misValue, misValue, Excel.XlSaveAsAccessMode.xlExclusive, Excel.XlSaveConflictResolution.xlLocalSessionChanges, misValue, misValue, misValue, misValue);
        xlWorkBook.Close(true, misValue, misValue);
        workbookClosed = true;
    }
    finally
    {
        if ((xlWorkBook != null) && !workbookClosed)
        {
            // export failed half-way: discard the workbook
            xlWorkBook.Close(false, misValue, misValue);
        }
        if (xlApp != null)
        {
            xlApp.Quit();
        }
        if (headerRow != null)
        {
            releaseObject(headerRow);
        }
        if (xlWorkSheet != null)
        {
            releaseObject(xlWorkSheet);
        }
        if (xlWorkBook != null)
        {
            releaseObject(xlWorkBook);
        }
        if (xlApp != null)
        {
            releaseObject(xlApp);
        }
    }

    openFolderAndSelectFile(fullFilename);
}
/// <summary>
/// Opens Windows Explorer with the given file selected.
/// </summary>
/// <param name="fullFilename">Full path of the file to reveal.</param>
private void openFolderAndSelectFile(string fullFilename)
{
    // quote the path so that blanks or commas in it are not split into separate arguments
    System.Diagnostics.Process.Start("Explorer.exe", "/select,\"" + fullFilename + "\"");
}
/// <summary>
/// Asks the user for the output folder with the folder browser dialog.
/// </summary>
/// <returns>
/// The folder picked in the dialog, or the process' current directory (read before the
/// dialog is shown) when the dialog was not confirmed with OK.
/// </returns>
private string getSaveFolder()
{
    string fallbackFolder = System.Environment.CurrentDirectory;

    System.Windows.Forms.DialogResult answer = fbdSaveFolder.ShowDialog();
    if (answer != System.Windows.Forms.DialogResult.OK)
    {
        return fallbackFolder;
    }
    return fbdSaveFolder.SelectedPath;
}
/// <summary>
/// Click handler that exports the result grid to "fiverrComScrapedResult.csv" (UTF-8,
/// comma separated): one header line followed by one line per grid row, with the gig url
/// column filled from each cell's Tag. Afterwards the saved file is revealed in Explorer.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
/// <remarks>
/// Fields containing a comma, a double quote or a line break are quoted (embedded quotes
/// doubled) so that such text, e.g. a gig title, cannot shift the columns. Lines no longer
/// end with a trailing delimiter. The writer is disposed even when writing fails.
/// </remarks>
private void btnExportToCsv_Click(object sender, EventArgs e)
{
    //settings
    const string delimiter = ",";
    string outputFilename = "fiverrComScrapedResult.csv";
    string fullFilename = Path.Combine(getSaveFolder(), outputFilename);

    int columnCount = dgvSearchResult.Columns.Count;
    string[] fields = new string[columnCount];

    using (StreamWriter csvStreamWriter = new StreamWriter(fullFilename, false, System.Text.Encoding.UTF8))
    {
        //output header data
        for (int col = 0; col < columnCount; col++)
        {
            fields[col] = csvEscape(dgvSearchResult.Columns[col].HeaderText);
        }
        csvStreamWriter.WriteLine(String.Join(delimiter, fields));

        //output rows data
        for (int rowIdx = 0; rowIdx < dgvSearchResult.Rows.Count; rowIdx++)
        {
            DataGridViewRow row = dgvSearchResult.Rows[rowIdx];
            if (row.IsNewRow)
            {
                // the empty placeholder row holds no data
                continue;
            }
            for (int col = 0; col < columnCount; col++)
            {
                DataGridViewCell cell = row.Cells[col];
                // the button cell only shows "Buy Now"; the url is kept in its Tag
                object rawValue = (col == girUrlColumnIdx) ? cell.Tag : cell.Value;
                fields[col] = csvEscape(Convert.ToString(rawValue));
            }
            csvStreamWriter.WriteLine(String.Join(delimiter, fields));
        }
    }

    //after save file
    openFolderAndSelectFile(fullFilename);
}

// Quotes a csv field when it contains a comma, double quote or line break; embedded double
// quotes are doubled. null becomes an empty field.
private static string csvEscape(string field)
{
    if (String.IsNullOrEmpty(field))
    {
        return "";
    }
    if (field.IndexOfAny(new char[] { ',', '"', '\r', '\n' }) < 0)
    {
        return field;
    }
    return "\"" + field.Replace("\"", "\"\"") + "\"";
}
// Removes every row from the result grid (the column definitions stay in place).
private void clearSearchResult()
{
dgvSearchResult.Rows.Clear();
}
// Click handler of the "clear all" button: empties the result grid via clearSearchResult().
private void btnClearAll_Click(object sender, EventArgs e)
{
clearSearchResult();
}
/// <summary>
/// Click handler of the Help button: opens the online help page with the default handler
/// for urls.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnHelp_Click(object sender, EventArgs e)
{
    System.Diagnostics.Process.Start("http://giggladiator.com/help");
}
}
}
(2)
转载请注明:在路上 » 【代码分享】C#代码:FiverComScraper – 只抓取fiverr.com,网站改版之前