【背景】
之前写了个C#程序,从Amazon中抓取数据。
此版本是完全从网页中抓取产品信息的。
【ScrapeAmazonProduct代码分享】
1.截图:
2.完整项目代码下载:
ScrapeAmazonProduct_2013-06-11_scrapeFromHtml.zip
3.代码分享:
(1)frmScrapeAmazonProduct.cs
/*
* [File]
* frmScrapeAmazonProduct.cs
*
* [Function]
* Scrape products data from Amazon
*
* [Author]
* Crifan Li
*
* [Date]
* 2013-06-11
*
* [Contact]
* https://www.crifan.com/contact_me/
*/
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Web;
using System.Net;
using System.Xml;
using System.IO;
using HtmlAgilityPack;
using System.Text.RegularExpressions;
using Excel = Microsoft.Office.Interop.Excel;
using Microsoft.Office.Interop.Excel;
using NLog;
using NLog.Targets;
using NLog.Config;
namespace ScrapeAmazonProduct
{
public partial class frmScrapeAmazonProduct : Form
{
// All data collected for one Amazon product.
// Filled in by extractProductInfo and written to the output Excel file by appendInfoToFile.
struct AmazonProductInfo
{
public string url; // product page url, records which product this entry belongs to
public string title;
public string description;
// bullet points: the page may have 5 or more, but only the first 5 are recorded
public string[] bulletArr;
// full filenames of the downloaded pictures: 5 or more may exist, but only the first 5 are recorded
public string[] imgFullnameArr;
// product keyword fields, up to 3
public string[] keywordFieldArr; // each field is less than 50 chars, keywords separated by ','
// highest price among all (up to 8) sellers
public float highestPrice;
// true when one of the sellers is named "Amazon.com"
public bool isOneSellerIsAmazon;
public int reviewNumber;
// true when a non-empty best seller rank list was extracted from the product page
public bool isBestSeller;
};
//for debug
// NOTE(review): in the visible part of this file lineNumber is only referenced by the commented-out log() helper
private int lineNumber = 1;
// file name of the output Excel workbook
string outputExcelFilename = "AmazonProductInfo.xls";
// name of the output sub folder; combined with Environment.CurrentDirectory in frmScrapeAmazonProduct_Load
string constOutputFolderName = "output";
// full path of the output Excel workbook, computed in frmScrapeAmazonProduct_Load
string outputExcelFullFilename = "";
// absolute path of the output folder, computed in frmScrapeAmazonProduct_Load
string absOutputFolder = "";
// full path of the log file, computed in frmScrapeAmazonProduct_Load and used by initLogger
string gLogFilename;
// prefix added to the relative urls found in Amazon pages
public static string constAmazonDomainUrl = "http://www.amazon.com";
// a product is only valid when its buyer number is greater than this value (see checkBuyerNumber)
public static int rule_minimalBuyerNumber = 8;
// a product is only valid when the summed "left in stock" number is greater than this value (see checkTotalUnitNumber)
public static int rule_totalUnitNumber = 50;
//check max length for each bullet < 100 (or 90?)
public static int rule_maxLenEachBullet = 100;
// maximum allowed product dimension, in cm (see checkDimension)
public static float rule_dimensionMaxLengthCm = 80.0F;
public static float rule_dimensionMaxWidthCm = 80.0F;
public static float rule_dimensionMaxHeightCm = 80.0F;
// maximum length of a single keyword field, passed to amazonLib.extractProductKeywordField
public static int rule_maxSingleKeywordFieldLen = 50;
// main (search) category key -> best seller category key, filled by initMainCategoryToBestSellerCategoryMapping
Dictionary<string, string> gMainCatMappingBestSellerCatDict;
public crifanLib crl;
public crifanLibAmazon amazonLib;
// category lists extracted from the Amazon pages in initSearchCategory
List<crifanLibAmazon.categoryItem> mainCategoryList;
List<crifanLibAmazon.categoryItem> bestSellerCategoryList;
//for log
// created in initLogger; null until frmScrapeAmazonProduct_Load has run
public Logger gLogger = null;
/// <summary>
/// Creates the form: hooks the embedded-dll resolver, creates the helper libraries
/// and then builds the designer-generated controls.
/// </summary>
public frmScrapeAmazonProduct()
{
    // for loading embedded dlls, step (1): register the resolve handler before anything else runs
    AppDomain.CurrentDomain.AssemblyResolve += CurrentDomain_AssemblyResolve;

    this.crl = new crifanLib();
    this.amazonLib = new crifanLibAmazon();

    // the category mapping is built later, in the form Load handler
    this.gMainCatMappingBestSellerCatDict = null;

    this.InitializeComponent();
}
//!!! for load embedded dll: (2) implement this handler
/// <summary>
/// AssemblyResolve handler: tries to load the requested assembly from the byte[] resources
/// embedded in "&lt;namespace&gt;.Properties.Resources".
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="args">Carries the name of the assembly that failed to resolve.</param>
/// <returns>
/// The assembly loaded from the embedded resource, or null when the request is for a
/// satellite "*.resources" assembly or no matching byte[] resource exists.
/// </returns>
System.Reflection.Assembly CurrentDomain_AssemblyResolve(object sender, ResolveEventArgs args)
{
    // "Name, Version=..., Culture=..." -> "Name" ; "Name.dll" -> "Name"
    string dllName = args.Name.Contains(",") ? args.Name.Substring(0, args.Name.IndexOf(',')) : args.Name.Replace(".dll", "");
    // resource identifiers use '_' where the assembly name has '.'
    dllName = dllName.Replace(".", "_");
    if (dllName.EndsWith("_resources", StringComparison.Ordinal))
    {
        return null;
    }

    System.Resources.ResourceManager rm = new System.Resources.ResourceManager(GetType().Namespace + ".Properties.Resources", System.Reflection.Assembly.GetExecutingAssembly());
    // GetObject returns null when the resource does not exist: in that case return null
    // ("not resolved by this handler") instead of crashing inside Assembly.Load(null)
    byte[] bytes = rm.GetObject(dllName) as byte[];
    if (bytes == null)
    {
        return null;
    }

    return System.Reflection.Assembly.Load(bytes);
}
/// <summary>
/// Extracts the main (search) category list and the best seller category list from Amazon,
/// and binds the main category list to the search category combo box.
/// </summary>
private void initSearchCategory()
{
    //http://www.amazon.com/ref=nb_sb_noss_null
    string regularCategoryMainUrl = "http://www.amazon.com/ref=nb_sb_noss_null";
    mainCategoryList = amazonLib.extractMainCategoryList(regularCategoryMainUrl);

    bool gotMainCategories = (mainCategoryList != null) && (mainCategoryList.Count > 0);
    if (!gotMainCategories)
    {
        gLogger.Fatal("can not find main category list");
    }
    else
    {
        // show the category items in the combo box, displayed by their "name" member
        cmbSearchCategory.DataSource = mainCategoryList;
        cmbSearchCategory.DisplayMember = "name";
    }

    // alternative url: http://www.amazon.com/Best-Sellers/zgbs/ref=zg_bs_tab
    string bestSellerMainUrl = "http://www.amazon.com/Best-Sellers/zgbs";
    bestSellerCategoryList = amazonLib.extractBestSellerCategoryList(bestSellerMainUrl);
}
/// <summary>
/// Builds the lookup table: main (search) category key -> best seller category key.
/// </summary>
private void initMainCategoryToBestSellerCategoryMapping()
{
    // Main categories without a best seller counterpart (not mapped):
    //   instant-video, collectibles, financial
    // Best seller categories without a main category counterpart (not mapped):
    //   photo, kitchen
    gMainCatMappingBestSellerCatDict = new Dictionary<string, string>
    {
        { "appliances", "appliances" },
        { "mobile-apps", "mobile" },
        { "arts-crafts", "arts" },
        { "automotive", "automotive" },
        { "baby-products", "baby" },
        { "beauty", "beauty" },
        { "stripbooks", "books" },
        { "mobile", "wireless" },
        { "apparel", "apparel" },
        { "computers", "pc" },
        { "electronics", "electronics" },
        { "gift-cards", "gift" },
        { "grocery", "grocery" },
        { "hpc", "hpc" },
        { "garden", "home" },
        { "industrial", "industrial" },
        { "jewelry", "jewelry" },
        { "digital-text", "digital" },
        { "magazines", "magazines" },
        { "movies-tv", "movies" },
        { "digital-music", "dmusic" },      // MP3 Music
        { "popular", "music" },             // Music
        { "mi", "musical" },                // Musical Instruments
        { "office-products", "office" },
        { "lawngarden", "lawn" },
        { "pets", "pet" },
        { "shoes", "shoes" },
        { "software", "software" },
        { "sporting", "sporting" },
        { "tools", "hi" },                  // Tools & Home Improvement -> Home Improvement
        { "toys-and-games", "toys" },
        { "videogames", "videogames" },
        { "watches", "watches" },
    };
}
/// <summary>
/// Configures NLog programmatically with two targets and stores the logger in gLogger:
/// the rtbLog rich text box receives Info and above, the log file (gLogFilename) receives Trace and above.
/// </summary>
private void initLogger()
{
    // Layout renderer references:
    //https://github.com/nlog/nlog/wiki/Layout-renderers
    //https://github.com/nlog/nlog/wiki/Level-Layout-Renderer
    // one layout shared by both targets: [timestamp][LEVEL padded to 5 chars] message
    string commonLayout = "[${date:format=yyyy-MM-dd HH\\:mm\\:ss}][${pad:padding=5:inner=${level:uppercase=true}}] ${message}";

    // target 1: the rich text box on this form
    RichTextBoxTarget rtbTarget = new RichTextBoxTarget
    {
        FormName = "frmScrapeAmazonProduct",    // winform class name
        ControlName = "rtbLog",                 // RichTextBox control/variable name
        Layout = commonLayout
    };

    // target 2: the log file, whose full path was prepared by the caller in gLogFilename
    FileTarget fileTarget = new FileTarget
    {
        FileName = gLogFilename,
        Layout = commonLayout
    };

    LoggingConfiguration logConfig = new LoggingConfiguration();
    logConfig.AddTarget("richTextBox", rtbTarget);
    logConfig.AddTarget("logFile", fileTarget);

    // rules: which minimal level goes to which target
    logConfig.LoggingRules.Add(new LoggingRule("*", LogLevel.Info, rtbTarget));
    logConfig.LoggingRules.Add(new LoggingRule("*", LogLevel.Trace, fileTarget));

    // activate the configuration, then fetch the (unnamed) logger used everywhere in this form
    LogManager.Configuration = logConfig;
    gLogger = LogManager.GetLogger("");
}
/// <summary>
/// Form Load handler: prepares the output folder and file names, then initializes
/// the logger, the category mapping and the category lists (in that order).
/// </summary>
private void frmScrapeAmazonProduct_Load(object sender, EventArgs e)
{
    //1. output directory: <current directory>/output, created on first run
    absOutputFolder = Path.Combine(Environment.CurrentDirectory, constOutputFolderName);
    bool outputFolderExists = Directory.Exists(absOutputFolder);
    if (!outputFolderExists)
    {
        Directory.CreateDirectory(absOutputFolder);
    }
    outputExcelFullFilename = Path.Combine(absOutputFolder, outputExcelFilename);

    //2. log filename, prefixed with the start time, e.g. "2013-06-11_153647_log.txt"
    string timestamp = DateTime.Now.ToString("yyyy-MM-dd_HHmmss");
    gLogFilename = Path.Combine(absOutputFolder, timestamp + "_log.txt");

    //3. logger (needs gLogFilename)
    initLogger();

    //4. main category -> best seller category mapping
    initMainCategoryToBestSellerCategoryMapping();

    //5. category lists (logs through gLogger, so it must run after initLogger)
    initSearchCategory();
}
//private void print(string info)
//{
// rtbLog.Text = rtbLog.Text + info + Environment.NewLine;
// System.Windows.Forms.Application.DoEvents();
//}
//private void log(string info)
//{
// rtbLog.Text = "[" + lineNumber.ToString() + "]"
// + info
// + Environment.NewLine
// + rtbLog.Text;
// lineNumber++;
// System.Windows.Forms.Application.DoEvents();
//}
/// <summary>
/// Checks that the product has more than rule_minimalBuyerNumber buyers.
/// </summary>
/// <param name="productHtml">Html of the product page.</param>
/// <param name="invalidReason">Empty when valid; otherwise why the check failed.</param>
/// <param name="usedAndNewUrl">Url of the "used and new" offer listing, as extracted by amazonLib.</param>
/// <returns>True when the buyer number was found and is greater than rule_minimalBuyerNumber.</returns>
private bool checkBuyerNumber(string productHtml, out string invalidReason, out string usedAndNewUrl)
{
    int buyerNumber;
    if (!amazonLib.extractProductBuyerNumberAndNewUrl(productHtml, out buyerNumber, out usedAndNewUrl))
    {
        invalidReason = "Not found buyer number string and used and new url";
        return false;
    }

    if (buyerNumber > rule_minimalBuyerNumber)
    {
        invalidReason = "";
        return true;
    }

    // the rule is "more than", so a buyer number equal to the limit is also rejected:
    // say "not more than" instead of the misleading "less than"
    invalidReason = String.Format("Buyer Number is {0}, not more than {1}", buyerNumber, rule_minimalBuyerNumber);
    return false;
}
//http://www.amazon.com/gp/offer-listing/B0083PWAPW/ref=dp_olp_all_mbc?ie=UTF8&condition=all
/// <summary>
/// Checks that the sellers on the offer listing page have, in total, more than
/// rule_totalUnitNumber units left in stock.
/// For each "add to cart" form on the page the form is POSTed (without following the redirect),
/// the redirect target page is fetched and its "Only N left in stock" number is summed up.
/// </summary>
/// <param name="productUrl">Offer listing url, e.g. http://www.amazon.com/gp/offer-listing/B0083PWAPW/ref=dp_olp_all_mbc?ie=UTF8&amp;condition=all</param>
/// <param name="invalidReason">"No error" when valid; otherwise why the check failed.</param>
/// <returns>True as soon as the summed stock number exceeds rule_totalUnitNumber.</returns>
private bool checkTotalUnitNumber(string productUrl, out string invalidReason)
{
    bool isTotal50UnitNum = false;
    // set when the loop stops because a page could not be processed (invalidReason then holds the detail)
    bool abortedWithError = false;
    int totalNumber = 0;
    invalidReason = "Unknow error for checkTotalUnitNumber";

    string respHtml = crl.getUrlRespHtml_multiTry(productUrl);
    /*
    Each offer is an "add to cart" form like:
    <form method="POST" action="/gp/item-dispatch/ref=olp_atc_used_1" >
    <input type="hidden" name="session-id" value="178-3505985-4680803">
    <input type="hidden" name="itemCount" value="1">
    <input type="hidden" name="offeringID.1" value="...">
    ...
    <input type="image" src="..." alt="Add to cart" name="submit.addToCart" .../>
    </form>
    */
    HtmlAgilityPack.HtmlDocument htmlDoc = crl.htmlToHtmlDoc(respHtml);
    HtmlNodeCollection postItemNodeList = htmlDoc.DocumentNode.SelectNodes("//form[starts-with(@action, '/gp/item-dispatch/ref=') and @method='POST']");
    if (postItemNodeList == null)
    {
        invalidReason = "Can not found /gp/item-dispatch post item";
        return false;
    }

    foreach (HtmlNode postItemNode in postItemNodeList)
    {
        // "/gp/item-dispatch/ref=olp_atc_used_1" -> "http://www.amazon.com/gp/item-dispatch/ref=olp_atc_used_1"
        string itemDispatchUrl = constAmazonDomainUrl + postItemNode.Attributes["action"].Value;

        HtmlNodeCollection inputTypeNodeList = postItemNode.SelectNodes(".//input[@type='hidden' and @name and @value]");
        if (inputTypeNodeList == null)
        {
            // placeholder must be {0}: {1} with a single argument throws FormatException
            invalidReason = String.Format("Can not find input tag for node: {0}", postItemNode.InnerHtml);
            abortedWithError = true;
            break;
        }

        /*
        The resulting POST data looks like:
        session-id=178-3505985-4680803
        &qid=
        &sr=
        &signInToHUC=0
        &metric-asin.1616550414=1
        &registryItemID.1=
        &registryID.1=
        &itemCount=1
        &offeringID.1=...
        &isAddon=0
        &submit.addToCart.x=63
        &submit.addToCart.y=7
        */
        Dictionary<string, string> postDict = new Dictionary<string, string>();
        foreach (HtmlNode inputTypeNode in inputTypeNodeList)
        {
            // indexer instead of Add: a repeated input name must not abort the whole check
            postDict[inputTypeNode.Attributes["name"].Value] = inputTypeNode.Attributes["value"].Value;
        }
        // click position of the "add to cart" image button
        postDict["submit.addToCart.x"] = "63";
        postDict["submit.addToCart.y"] = "7";

        Dictionary<string, string> headerDict = new Dictionary<string, string>();
        // no auto redirect: the redirect target is read from the Location header below
        headerDict.Add("AllowAutoRedirect", "false");
        headerDict.Add("Referer", productUrl);

        string viewHtmlUrl = null;
        // only the Location header is needed: dispose the response right away to free the connection
        using (HttpWebResponse resp = crl.getUrlResponse(itemDispatchUrl, headerDict, postDict))
        {
            if (resp != null)
            {
                viewHtmlUrl = resp.Headers["Location"];
            }
        }
        if (String.IsNullOrEmpty(viewHtmlUrl))
        {
            invalidReason = "Not found viewHtmlUrl";
            abortedWithError = true;
            break;
        }

        // the page contains: <div class="hlb-scarcity red">Only 8 left in stock.</div>
        respHtml = crl.getUrlRespHtml_multiTry(viewHtmlUrl);
        htmlDoc = crl.htmlToHtmlDoc(respHtml);
        HtmlNode hlbScarcityNode = htmlDoc.DocumentNode.SelectSingleNode("//div[starts-with(@class, 'hlb-scarcity')]");
        if (hlbScarcityNode == null)
        {
            // the first seller is Amazon, which shows no hlb-scarcity; all others must have it
            continue;
        }

        string leftInStockStr = hlbScarcityNode.InnerText; //Only 1 left in stock.
        string leftNumberStr = "";
        int leftNumberInt;
        if (!crl.extractSingleStr(@"Only (\d+) left in stock", leftInStockStr, out leftNumberStr)
            || !Int32.TryParse(leftNumberStr, out leftNumberInt))
        {
            invalidReason = "Can not find remaining number";
            abortedWithError = true;
            break;
        }

        totalNumber += leftNumberInt;
        if (totalNumber > rule_totalUnitNumber)
        {
            isTotal50UnitNum = true;
            invalidReason = "No error";
            break;
        }
    }

    if (!isTotal50UnitNum && !abortedWithError)
    {
        // all sellers were processed but the stock is too low: report that instead of "Unknow error"
        invalidReason = String.Format("Total unit number is {0}, not more than {1}", totalNumber, rule_totalUnitNumber);
    }

    return isTotal50UnitNum;
}
/// <summary>
/// Checks that the product weight could be extracted and does not exceed 2.5 kilogram
/// (the "no more than 5 pounds" rule).
/// </summary>
/// <param name="productUrl">Url of the product page (not used by this check).</param>
/// <param name="productHtml">Html of the product page.</param>
/// <param name="invalidReason">Empty when valid; otherwise why the check failed.</param>
/// <returns>True when a positive weight was found and it is within the limit.</returns>
private bool checkWeight(string productUrl, string productHtml, out string invalidReason)
{
    const float maxKiloGram = 2.5F;
    float kiloGram = amazonLib.extractProductWeight(productHtml);

    // a non-positive value means: no weight found, or not a recognizable number
    if (!(kiloGram > 0.0F))
    {
        invalidReason = "Not found weight string or unrecognized weight number";
        return false;
    }

    if (!(kiloGram <= maxKiloGram))
    {
        invalidReason = String.Format("Weight is {0} kilogram, more than 5 pounds", kiloGram);
        return false;
    }

    // consistent with checkBuyerNumber: no reason text is left behind for a valid product
    invalidReason = "";
    return true;
}
/// <summary>
/// Checks that the product dimension does not exceed
/// rule_dimensionMaxLengthCm x rule_dimensionMaxWidthCm x rule_dimensionMaxHeightCm.
/// A product without a dimension is also considered valid (the weight check decides then).
/// </summary>
/// <param name="productUrl">Url of the product page (not used by this check).</param>
/// <param name="productHtml">Html of the product page.</param>
/// <param name="invalidReason">Empty when valid; otherwise why the check failed.</param>
/// <returns>True when no dimension was found or all three sizes are within the limits.</returns>
private bool checkDimension(string productUrl, string productHtml, out string invalidReason)
{
    // consistent with checkBuyerNumber: no reason text is left behind for a valid product
    invalidReason = "";

    crifanLibAmazon.productDimension dimensionCm = amazonLib.extractProductDimension(productHtml);
    if (!(dimensionCm.length > 0.0F))
    {
        // even if there is no dimension, consider the product valid as long as the weight is valid
        return true;
    }

    bool isWithinLimits =
        (dimensionCm.length <= rule_dimensionMaxLengthCm) &&
        (dimensionCm.width <= rule_dimensionMaxWidthCm) &&
        (dimensionCm.height <= rule_dimensionMaxHeightCm);
    if (!isWithinLimits)
    {
        invalidReason = String.Format("Dimension: {0}cm x {1}cm x {2}cm invalid for exceed max: {3}cm x {4}cm x {5}cm",
            dimensionCm.length, dimensionCm.width, dimensionCm.height,
            rule_dimensionMaxLengthCm, rule_dimensionMaxWidthCm, rule_dimensionMaxHeightCm);
    }

    return isWithinLimits;
}
/// <summary>
/// Runs all product rules in order and stops at the first one that fails:
/// buyer number, total unit number, weight, dimension.
/// </summary>
/// <param name="productUrl">Url of the product page.</param>
/// <param name="productHtml">Html of the product page.</param>
/// <param name="invalidReason">Reason text produced by the last check that was run.</param>
/// <param name="usedAndNewUrl">"Used and new" offer listing url found by the buyer number check.</param>
/// <returns>True only when every check passed.</returns>
private bool checkProductValid(string productUrl, string productHtml, out string invalidReason, out string usedAndNewUrl)
{
    //1. buyer number > 8 ; also yields the offer listing url needed by the next check
    if (!checkBuyerNumber(productHtml, out invalidReason, out usedAndNewUrl))
    {
        return false;
    }

    //2. total unit number > 50
    if (!checkTotalUnitNumber(usedAndNewUrl, out invalidReason))
    {
        return false;
    }

    //3. no more than 5 pounds (2.5 kg)
    if (!checkWeight(productUrl, productHtml, out invalidReason))
    {
        return false;
    }

    //4. dimension less than 80cmX80cmX80cm
    return checkDimension(productUrl, productHtml, out invalidReason);
}
// Progress callback handed to crl.downloadFile by downloadPictures.
// Currently does nothing: the progress bar update below is commented out.
public void updateProgress(int percentage)
{
//pgbDownload.Value = percentage;
}
/// <summary>
/// Downloads all product images found in the product page into
/// &lt;output folder&gt;/download/&lt;ASIN&gt;/.
/// </summary>
/// <param name="productUrl">Url of the product page; the ASIN used as folder name is extracted from it.</param>
/// <param name="respHtml">Html of the product page.</param>
/// <param name="picFullnameList">
/// Full local filename for every extracted image url (same order), or null when no image url was found.
/// </param>
public void downloadPictures(string productUrl, string respHtml, out string[] picFullnameList)
{
    picFullnameList = null;

    string productAsin = "";
    if (!amazonLib.extractAsinFromProductUrl(productUrl, out productAsin) || String.IsNullOrEmpty(productAsin))
    {
        // without an ASIN the pictures end up directly in the download root folder
        gLogger.Warn("Can not extract ASIN from " + productUrl);
        productAsin = "";
    }

    // create the download folder for this product
    string downloadRootPath = Path.Combine(absOutputFolder, "download");
    string downloadFullPath = Path.Combine(downloadRootPath, productAsin);
    if (!Directory.Exists(downloadFullPath))
    {
        Directory.CreateDirectory(downloadFullPath);
    }

    string[] imageUrlList = amazonLib.extractProductImageList(respHtml);
    gLogger.Info("Extracted image url list:");
    if (imageUrlList == null)
    {
        gLogger.Error("No image url for " + productUrl);
        return;
    }

    picFullnameList = new string[imageUrlList.Length];
    for (int idx = 0; idx < imageUrlList.Length; idx++)
    {
        string imageUrl = imageUrlList[idx];
        gLogger.Info(String.Format("[{0}]={1}", idx, imageUrl));

        string picFilename = crl.extractFilenameFromUrl(imageUrl);
        string picFullFilename = Path.Combine(downloadFullPath, picFilename);

        string errorStr = "";
        gLogger.Info(String.Format("Downloading {0} to {1}", imageUrl, picFullFilename));
        crl.downloadFile(imageUrl, picFullFilename, out errorStr, updateProgress);
        // NOTE(review): assumes downloadFile reports a failure through a non-empty errorStr - confirm in crifanLib
        if (!String.IsNullOrEmpty(errorStr))
        {
            gLogger.Error(String.Format("Download {0} failed: {1}", imageUrl, errorStr));
        }

        // the target filename is recorded in any case, as before
        picFullnameList[idx] = picFullFilename;
    }
}
/*
* productUrl=http://www.amazon.com/Kindle-Paperwhite-Touch-light/dp/B007OZNZG0/ref=lp_1055398_1_1?ie=UTF8&qid=1370510177&sr=1-1
* usedAndNewUrl=http://www.amazon.com/gp/offer-listing/B007OZNZG0/ref=dp_olp_all_mbc?ie=UTF8&condition=all
*/
/// <summary>
/// Collects everything that is saved for a valid product: title, description, 5 bullets,
/// downloaded pictures, highest seller price, whether Amazon is a seller, keyword fields,
/// review number and best seller flag.
/// </summary>
/// <param name="productUrl">Url of the product page.</param>
/// <param name="productHtml">Html of the product page.</param>
/// <param name="usedAndNewUrl">Url of the "used and new" offer listing, used for the seller info.</param>
/// <returns>The filled product info.</returns>
private AmazonProductInfo extractProductInfo(string productUrl, string productHtml, string usedAndNewUrl)
{
    gLogger.Info("Extracting info for " + productUrl);

    //init
    AmazonProductInfo productInfo = new AmazonProductInfo();
    productInfo.url = productUrl;
    productInfo.highestPrice = 0.0F;
    productInfo.isOneSellerIsAmazon = false;
    // the arrays must be initialized: when e.g. only 4 bullets are found, the 5th entry must not stay null
    productInfo.bulletArr = new string[5];
    crl.emptyStringArray(productInfo.bulletArr);
    productInfo.imgFullnameArr = new string[5];
    crl.emptyStringArray(productInfo.imgFullnameArr);
    productInfo.keywordFieldArr = new string[3];
    crl.emptyStringArray(productInfo.keywordFieldArr);

    //1. title
    productInfo.title = amazonLib.extractProductTitle(productHtml);
    gLogger.Info("Title=" + productInfo.title);

    //2. description and 5 bullets
    List<string> bulletList;
    bool gotBullets = amazonLib.extractProductBulletList(productHtml, out bulletList);
    if (bulletList == null)
    {
        bulletList = new List<string>();
    }
    gLogger.Info("Extracted Bullets=" + gotBullets);

    string description;
    bool gotDescription = amazonLib.extractProductDescription(productHtml, out description);
    if (description == null)
    {
        description = "";
    }
    gLogger.Info("Got Description=" + gotDescription);

    /*
     * 1. has bullets: the first 5 bullets are used (a page may have more or less than 5);
     *    when there is no description, all bullets (one per line) become the description
     * 2. no bullets, has description: the description is split at '.' and the first 5 parts become the bullets
     * 3. neither: nothing to fill
     */
    bool hasDescription = description != "";
    IList<string> bulletSource = null;
    if (bulletList.Count > 0)
    {
        bulletSource = bulletList;
        if (!hasDescription)
        {
            StringBuilder allBullets = new StringBuilder();
            foreach (string bulletStr in bulletList)
            {
                allBullets.Append(bulletStr).Append(Environment.NewLine);
            }
            description = allBullets.ToString();
        }
    }
    else if (hasDescription)
    {
        bulletSource = description.Split('.');
    }
    else
    {
        gLogger.Warn("Found neither description nor bullets for " + productUrl);
    }
    productInfo.description = description;

    if (bulletSource != null)
    {
        int bulletCount = Math.Min(bulletSource.Count, productInfo.bulletArr.Length);
        for (int idx = 0; idx < bulletCount; idx++)
        {
            productInfo.bulletArr[idx] = bulletSource[idx];
        }
    }

    // enforce the max length of each bullet (a null entry is normalized to empty)
    for (int idx = 0; idx < productInfo.bulletArr.Length; idx++)
    {
        string bullet = productInfo.bulletArr[idx];
        if (bullet == null)
        {
            productInfo.bulletArr[idx] = "";
        }
        else if (bullet.Length > rule_maxLenEachBullet)
        {
            productInfo.bulletArr[idx] = bullet.Substring(0, rule_maxLenEachBullet);
        }
    }

    //3. download the pictures, record the first 5
    string[] picFullnameList = null;
    downloadPictures(productUrl, productHtml, out picFullnameList);
    if (picFullnameList != null)
    {
        int imageCount = Math.Min(picFullnameList.Length, productInfo.imgFullnameArr.Length);
        for (int idx = 0; idx < imageCount; idx++)
        {
            productInfo.imgFullnameArr[idx] = picFullnameList[idx];
        }
    }

    //4. seller info: price and name
    List<crifanLibAmazon.productSellerInfo> allSellerInfoList;
    if (amazonLib.extractAllSellerInfo(usedAndNewUrl, out allSellerInfoList) && (allSellerInfoList != null))
    {
        foreach (crifanLibAmazon.productSellerInfo eachSellerInfo in allSellerInfoList)
        {
            //(1) highest price
            if (eachSellerInfo.price > productInfo.highestPrice)
            {
                productInfo.highestPrice = eachSellerInfo.price;
            }
            //(2) one of the sellers is Amazon, i.e. the seller name is Amazon.com
            // static Equals: a seller without a name must not crash the extraction
            if (String.Equals(eachSellerInfo.name, "Amazon.com", StringComparison.OrdinalIgnoreCase))
            {
                productInfo.isOneSellerIsAmazon = true;
            }
        }
    }
    else
    {
        gLogger.Debug("not found seller info for " + usedAndNewUrl);
    }
    gLogger.Info("Highest Price=" + productInfo.highestPrice);
    gLogger.Info("One of Seller is Amazon=" + productInfo.isOneSellerIsAmazon);

    //5. 3 keyword fields, generated from the title
    productInfo.keywordFieldArr = amazonLib.extractProductKeywordField(productInfo.title, productInfo.keywordFieldArr.Length, rule_maxSingleKeywordFieldLen);
    gLogger.Info("Keyword Field List:");
    if (productInfo.keywordFieldArr != null)
    {
        for (int idx = 0; idx < productInfo.keywordFieldArr.Length; idx++)
        {
            gLogger.Info(String.Format("[{0}]={1}", idx, productInfo.keywordFieldArr[idx]));
        }
    }

    //6. product review
    productInfo.reviewNumber = amazonLib.extractProductReviewNumber(productHtml);
    gLogger.Info("ReviewNumber=" + productInfo.reviewNumber);

    //7. best seller: true when the product has at least one best seller rank
    List<crifanLibAmazon.productBestRank> bestSellerRankList = amazonLib.extractProductBestSellerRankList(productHtml);
    productInfo.isBestSeller = (bestSellerRankList != null) && (bestSellerRankList.Count > 0);
    if (!productInfo.isBestSeller)
    {
        // do not dereference the list here: it may be null
        gLogger.Debug("bestSellerRankList is null or count not > 0 : " + ((bestSellerRankList == null) ? "null" : "empty"));
    }
    gLogger.Info("Is BestSeller=" + productInfo.isBestSeller);

    return productInfo;
}
/// <summary>
/// Creates a new Excel workbook that only contains the (bold, auto-fitted) header row
/// and saves it as excelFullFilename. An existing file is deleted first.
/// </summary>
/// <param name="excelFullFilename">Full path of the workbook to create.</param>
private void createOutputFile(string excelFullFilename)
{
    gLogger.Info("Creating ouput file " + excelFullFilename);

    bool isAutoFit = true;
    bool isHeaderBold = true;
    const int excelRowHeader = 1;       // Excel rows and columns are 1-based
    const int excelColumnHeader = 1;
    const int constBullerLen = 5;
    const int constImgNameListLen = 5;

    // if the file exists, remove it
    if (File.Exists(excelFullFilename))
    {
        File.Delete(excelFullFilename);
    }

    // header names, in column order
    List<string> headerNameList = new List<string>();
    headerNameList.Add("Title");
    headerNameList.Add("Description");
    for (int bulletNum = 1; bulletNum <= constBullerLen; bulletNum++)
    {
        headerNameList.Add("Bullet" + bulletNum.ToString());
    }
    for (int imgNum = 1; imgNum <= constImgNameListLen; imgNum++)
    {
        headerNameList.Add("ImageFilename" + imgNum.ToString());
    }
    headerNameList.Add("HighestPrice");
    headerNameList.Add("OneSellerIsAmazon");
    headerNameList.Add("ReviewNumber");
    headerNameList.Add("IsBestSeller");

    object misValue = System.Reflection.Missing.Value;
    Excel.Application xlApp = null;
    Excel.Workbook xlWorkBook = null;
    Excel.Worksheet xlWorkSheet = null;
    bool isWorkBookClosed = false;
    try
    {
        // exactly one Excel instance: a second one would stay behind as an orphaned EXCEL.EXE process
        xlApp = new Excel.Application();
        xlWorkBook = xlApp.Workbooks.Add(misValue);
        xlWorkSheet = (Excel.Worksheet)xlWorkBook.Worksheets.get_Item(1);

        // save header
        for (int idx = 0; idx < headerNameList.Count; idx++)
        {
            xlWorkSheet.Cells[excelRowHeader, excelColumnHeader + idx] = headerNameList[idx];
        }

        //formatting
        //(1) header to bold
        if (isHeaderBold)
        {
            Excel.Range headerRow = xlWorkSheet.get_Range("1:1", System.Type.Missing);
            headerRow.Font.Bold = true;
            crl.releaseObject(headerRow);
        }
        //(2) auto adjust column width (according to content)
        if (isAutoFit)
        {
            Excel.Range allColumn = xlWorkSheet.Columns;
            allColumn.AutoFit();
            crl.releaseObject(allColumn);
        }

        //output
        xlWorkBook.SaveAs(excelFullFilename,
            XlFileFormat.xlWorkbookNormal,
            misValue,
            misValue,
            misValue,
            misValue,
            XlSaveAsAccessMode.xlExclusive,
            XlSaveConflictResolution.xlLocalSessionChanges,
            misValue,
            misValue,
            misValue,
            misValue);
        xlWorkBook.Close(true, misValue, misValue);
        isWorkBookClosed = true;
    }
    finally
    {
        // always shut Excel down and release the COM objects, also when something above failed
        if ((xlWorkBook != null) && !isWorkBookClosed)
        {
            // discard the half-written workbook so that Quit does not wait for a save prompt
            xlWorkBook.Close(false, misValue, misValue);
        }
        if (xlApp != null)
        {
            xlApp.Quit();
        }
        if (xlWorkSheet != null)
        {
            crl.releaseObject(xlWorkSheet);
        }
        if (xlWorkBook != null)
        {
            crl.releaseObject(xlWorkBook);
        }
        if (xlApp != null)
        {
            crl.releaseObject(xlApp);
        }
    }
}
/// <summary>
/// Appends one product as a new row below the used range of the first sheet of an
/// existing Excel workbook and saves the workbook.
/// The column layout matches the header written by createOutputFile.
/// </summary>
/// <param name="fullFilename">Full path of the existing workbook.</param>
/// <param name="productInfo">The product to append.</param>
private void appendInfoToFile(string fullFilename, AmazonProductInfo productInfo)
{
    gLogger.Info("Saving product info for " + productInfo.url);

    const int excelRowHeader = 1;       // Excel rows and columns are 1-based
    const int excelColumnHeader = 1;
    const int constBullerLen = 5;
    const int constImgNameListLen = 5;

    Excel.Application xlApp = null;
    Excel.Workbook xlWorkBook = null;
    Excel.Worksheet xlWorkSheet = null;
    Excel.Range usedRange = null;
    object missingVal = System.Reflection.Missing.Value;
    bool isWorkBookClosed = false;
    try
    {
        xlApp = new Microsoft.Office.Interop.Excel.Application();
        //http://msdn.microsoft.com/zh-cn/library/microsoft.office.interop.excel.workbooks.open%28v=office.11%29.aspx
        xlWorkBook = xlApp.Workbooks.Open(
            Filename: fullFilename,
            ReadOnly: false,
            Origin: Excel.XlPlatform.xlWindows, //xlMacintosh/xlWindows/xlMSDOS
            Editable: true,
            Notify: false,
            AddToMru: true, //True to add this workbook to the list of recently used files
            Local: true,
            CorruptLoad: missingVal //xlNormalLoad/xlRepairFile/xlExtractData
            );

        // get the first sheet (could also be fetched by sheet name)
        xlWorkSheet = (Excel.Worksheet)xlWorkBook.Worksheets.get_Item(1);
        usedRange = xlWorkSheet.UsedRange;
        int usedRowCount = usedRange.Rows.Count;

        int curColumnIdx = 0 + excelColumnHeader; // start from the first column
        // !!! the 1-based offset must be added, otherwise the last used row (header or previous product) is overwritten
        int curRrowIdx = usedRowCount + excelRowHeader;

        xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.title;
        xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.description;

        int bulletListLen = (productInfo.bulletArr == null) ? 0 : Math.Min(productInfo.bulletArr.Length, constBullerLen);
        for (int bulletIdx = 0; bulletIdx < bulletListLen; bulletIdx++)
        {
            xlWorkSheet.Cells[curRrowIdx, curColumnIdx + bulletIdx] = productInfo.bulletArr[bulletIdx];
        }
        // always skip the full header width, so the following columns stay below their headers
        curColumnIdx = curColumnIdx + constBullerLen;

        int imgNameListLen = (productInfo.imgFullnameArr == null) ? 0 : Math.Min(productInfo.imgFullnameArr.Length, constImgNameListLen);
        for (int imgIdx = 0; imgIdx < imgNameListLen; imgIdx++)
        {
            xlWorkSheet.Cells[curRrowIdx, curColumnIdx + imgIdx] = productInfo.imgFullnameArr[imgIdx];
        }
        curColumnIdx = curColumnIdx + constImgNameListLen;

        xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.highestPrice;
        xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.isOneSellerIsAmazon;
        xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.reviewNumber;
        xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.isBestSeller;

        // Save() instead of SaveAs(): SaveAs pops up an "overwrite?" window, even with
        // ConflictResolution set to xlLocalSessionChanges
        xlWorkBook.Save();
        xlWorkBook.Close(SaveChanges: true);
        isWorkBookClosed = true;
    }
    finally
    {
        // always shut Excel down and release the COM objects: without Quit() every saved
        // product leaves one EXCEL.EXE process behind
        if ((xlWorkBook != null) && !isWorkBookClosed)
        {
            // discard the unsaved changes so that Quit does not wait for a save prompt
            xlWorkBook.Close(SaveChanges: false);
        }
        if (xlApp != null)
        {
            xlApp.Quit();
        }
        if (usedRange != null)
        {
            crl.releaseObject(usedRange);
        }
        if (xlWorkSheet != null)
        {
            crl.releaseObject(xlWorkSheet);
        }
        if (xlWorkBook != null)
        {
            crl.releaseObject(xlWorkBook);
        }
        if (xlApp != null)
        {
            crl.releaseObject(xlApp);
        }
    }
}
//Save one product record into the output Excel file.
//If the output file does not exist yet it is created first (createOutputFile is
//expected to write the header), then the record is appended.
//productInfo: the extracted product data to store
private void saveProductInfo(AmazonProductInfo productInfo)
{
    bool outputFileMissing = !File.Exists(outputExcelFullFilename);
    if (outputFileMissing)
    {
        //nothing to append to yet -> create the file before the first record
        createOutputFile(outputExcelFullFilename);
    }

    //always append, whether the file was just created or already existed
    appendInfoToFile(outputExcelFullFilename, productInfo);
}
//Validate a single product page; when it is valid, extract its info and save it.
//productUrl: url of the product page, e.g.
//http://www.amazon.com/Silver-Linings-Playbook/dp/B00CL68QVQ/ref=sr_1_2?s=instant-video&ie=UTF8&qid=1368688342&sr=1-2
private void checkAndExtractForSingleProduct(string productUrl)
{
    //debug: to test one fixed product, override productUrl here, e.g.
    //productUrl = "http://www.amazon.com/Paderno-World-Cuisine-A4982799-Tri-Blade/dp/B0007Y9WHQ/ref=lp_1055398_1_3?ie=UTF8&qid=1370596558&sr=1-3";

    //download the page once; the same html is used for validation and extraction
    string pageHtml = crl.getUrlRespHtml_multiTry(productUrl);

    string rejectReason;
    string usedAndNewLink;
    if (!checkProductValid(productUrl, pageHtml, out rejectReason, out usedAndNewLink))
    {
        //rejected -> only log why, nothing is saved
        gLogger.Info(String.Format("-INVALID- product={0}, reason={1}", productUrl, rejectReason));
        return;
    }

    gLogger.Info("+VALID+ Product=" + productUrl);
    AmazonProductInfo extractedInfo = extractProductInfo(productUrl, pageHtml, usedAndNewLink);
    saveProductInfo(extractedInfo);
}
//Validate one variation of a product; when it is valid, extract its info,
//make sure the title carries the variation label, and save it.
//singleVariationItem: the variation to process (its url and label are used)
private void checkAndExtractForSingleVariation(crifanLibAmazon.variationItem singleVariationItem)
{
    string variationUrl = singleVariationItem.url;
    gLogger.Info("processing variation " + variationUrl);

    //download the page once; the same html is used for validation and extraction
    string pageHtml = crl.getUrlRespHtml_multiTry(variationUrl);

    string rejectReason;
    string usedAndNewLink;
    if (!checkProductValid(variationUrl, pageHtml, out rejectReason, out usedAndNewLink))
    {
        gLogger.Info(String.Format("Invalid product={0}, reason={1}",variationUrl, rejectReason));
        return;
    }

    gLogger.Info("Valid product=" + variationUrl);
    AmazonProductInfo extractedInfo = extractProductInfo(variationUrl, pageHtml, usedAndNewLink);

    //Some titles already end with the variation label and are left untouched, e.g.
    //http://www.amazon.com/GE-MWF-Refrigerator-Filter-1-Pack/dp/B000AST3AK/ref=lp_1055398_1_4?ie=UTF8&qid=1370574186&sr=1-4
    //  -> GE MWF Refrigerator Water Filter, 1-Pack
    //http://www.amazon.com/gp/product/B003BIG0DO/ref=twister_B000AST3AK?ie=UTF8&psc=1
    //  -> GE SmartWater MWF Refrigerator Water Filter, 2-Pack
    //Others share one title across variations, so the label is appended to tell them apart, e.g.
    //http://www.amazon.com/Thermos-Insulated-18-Ounce-Stainless-Steel-Hydration/dp/B000FJ9DOK/ref=lp_1055398_1_6?ie=UTF8&qid=1370574186&sr=1-6
    //and
    //http://www.amazon.com/gp/product/B0057FQCNC/ref=twister_B000FJ9DOK?ie=UTF8&psc=1
    bool titleLacksLabel = !extractedInfo.title.EndsWith(singleVariationItem.label);
    if (titleLacksLabel)
    {
        extractedInfo.title = extractedInfo.title + ", " + singleVariationItem.label;
    }

    saveProductInfo(extractedInfo);
}
//Process every product listed on one search result page.
//For each listed product: when it has variations, every variation is checked and
//extracted separately; otherwise the product itself is checked and extracted.
//singlePageHtml: html of one search result page
private void processSinglePageHtml(string singlePageHtml)
{
    List<crifanLibAmazon.searchResultItem> foundItems;
    if (!amazonLib.extractSearchItemList(singlePageHtml, out foundItems))
    {
        //no item list could be extracted from this page -> nothing to do
        return;
    }

    foreach (crifanLibAmazon.searchResultItem foundItem in foundItems)
    {
        string itemUrl = foundItem.productUrl;
        gLogger.Info("processing single product url " + itemUrl);

        crifanLibAmazon.productVariationInfo variationInfo;
        if (!amazonLib.checkVariation(itemUrl, out variationInfo))
        {
            //no variation -> only the current single product, process it directly
            gLogger.Info("no variation for " + itemUrl);
            checkAndExtractForSingleProduct(itemUrl);
            continue;
        }

        //has variations -> process each of them
        List<crifanLibAmazon.variationItem> variations = variationInfo.variationList;
        gLogger.Info(String.Format("Total {0} variations for {1}", variations.Count, itemUrl));
        foreach (crifanLibAmazon.variationItem variation in variations)
        {
            checkAndExtractForSingleVariation(variation);
        }
    }
}
//Walk through all result pages of one search category.
//Each page is downloaded and handed to processSinglePageHtml, then the next-page url
//extracted from that page is followed. The walk stops when no next page is reported,
//when the next-page extraction fails, or when the next page url equals the current one.
//curPageSearchUrl: url of the first result page of the category, e.g.
//http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3dinstant-video
private void processEachSearchCategory(string curPageSearchUrl)
{
    gLogger.Info("processing search category " + curPageSearchUrl);

    while (true)
    {
        //first:
        //http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3dinstant-video
        //then:
        //http://www.amazon.com/s/ref=sr_pg_2?rh=n%3A2625373011%2Cn%3A%212644981011%2Cn%3A%212644982011%2Cn%3A2858778011&page=2&ie=UTF8&qid=1368697688
        string eachPageHtml = crl.getUrlRespHtml_multiTry(curPageSearchUrl);
        processSinglePageHtml(eachPageHtml);

        string nextPageUrl;
        if (!amazonLib.extractNextPageUrl(eachPageHtml, out nextPageUrl))
        {
            //something wrong
            break;
        }

        if (String.IsNullOrEmpty(nextPageUrl))
        {
            //no next page reported -> this was the last page
            break;
        }

        if (nextPageUrl == curPageSearchUrl)
        {
            //next page points back to the current page -> would loop forever
            gLogger.Info("next page url is same as current page, stop paging: " + nextPageUrl);
            break;
        }

        //Advance to the next page. Without this the same page was downloaded and
        //processed again and again as long as it reported a next page.
        //NOTE(review): assumes extractNextPageUrl returns a url that can be fetched as-is
        //(absolute, like the examples above) - confirm against crifanLibAmazon.
        curPageSearchUrl = nextPageUrl;
    }
}
//Find the best seller category that corresponds to a main category.
//The lookup uses the static mapping gMainCatMappingBestSellerCatDict
//(main category key -> best seller category key) and then searches
//bestSellerCategoryList for an item whose key equals the mapped key (ignoring case).
//mainCateoryItem: the main category to look up
//bestSellerCateoryItem: the matched best seller category; a fresh empty item when nothing matched
//returns: true when a matching best seller category was found
public bool findMatchedBestSellerCategoryItem(crifanLibAmazon.categoryItem mainCateoryItem, out crifanLibAmazon.categoryItem bestSellerCateoryItem)
{
    bestSellerCateoryItem = new crifanLibAmazon.categoryItem();

    //Method 1: static mapping
    bool mappingUsable = (gMainCatMappingBestSellerCatDict != null) && (gMainCatMappingBestSellerCatDict.Count > 0);
    if (!mappingUsable)
    {
        return false;
    }

    if (!gMainCatMappingBestSellerCatDict.ContainsKey(mainCateoryItem.Key))
    {
        return false;
    }

    string mappedKey = gMainCatMappingBestSellerCatDict[mainCateoryItem.Key];
    foreach (crifanLibAmazon.categoryItem candidate in bestSellerCategoryList)
    {
        if (mappedKey.Equals(candidate.Key, StringComparison.CurrentCultureIgnoreCase))
        {
            bestSellerCateoryItem = candidate;
            return true;
        }
    }

    //Method 2 (not enabled): dynamically search bestSellerCategoryList for an item whose
    //key equals mainCateoryItem.Key (ignoring case). Keys that differ between the two
    //lists would need special handling, e.g.
    //  Main Category : Best Seller
    //  mobile-apps : mobile
    //  arts-crafts : arts
    //  baby-products : baby
    //  stripbooks : books
    //  mobile : wireless
    //  ...
    return false;
}
//Search one category: first its own result pages, then the result pages of the
//corresponding best seller category when one can be found.
//singleCateoryItem: the category to search (its Key and Url are used)
private void searchSingleCategory(crifanLibAmazon.categoryItem singleCateoryItem)
{
    //1. general category
    //key example: instant-video
    //url example: http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3dinstant-video
    //NOTE(review): the generated url is not used afterwards; the category's own Url is
    //what gets processed. The call is kept as-is - confirm whether it can be removed.
    string urlFromCategoryKey = amazonLib.generateMainCategoryUrlFromCategoryKey(singleCateoryItem.Key);
    processEachSearchCategory(singleCateoryItem.Url);

    //2. Best Sellers
    crifanLibAmazon.categoryItem matchedBestSeller;
    bool hasBestSeller = findMatchedBestSellerCategoryItem(singleCateoryItem, out matchedBestSeller);
    if (!hasBestSeller)
    {
        gLogger.Info("NOT found corrsponding best seller item category url, for: " + singleCateoryItem.Url);
    }
    else
    {
        gLogger.Info("Found corrsponding best seller item category url=" + matchedBestSeller.Url);
        processEachSearchCategory(matchedBestSeller.Url);
    }

    //3. Movers & Shakers (not implemented)
    //would process the movers & shakers url when the category key is in moversShakersCategoryList

    //4. Top Rated (not implemented)
    //would process the top rated url when the category key is in topRatedCategoryList
}
//Click handler of the search button: search the category currently selected in
//cmbSearchCategory; does nothing when no category is selected.
//Related listing pages:
//http://www.amazon.com/Best-Sellers/zgbs/ref=zg_bs_tab
//http://www.amazon.com/gp/movers-and-shakers/ref=zg_bs_tab
//http://www.amazon.com/gp/top-rated/ref=zg_bs_tab
private void btnSearch_Click(object sender, EventArgs e)
{
    if (cmbSearchCategory.SelectedIndex < 0)
    {
        //nothing selected
        return;
    }

    //a category is selected -> search it
    crifanLibAmazon.categoryItem chosenCategory = (crifanLibAmazon.categoryItem)cmbSearchCategory.SelectedItem;
    searchSingleCategory(chosenCategory);
}
}
}
【总结】
转载请注明:在路上 » 【代码分享】C#代码:ScrapeAmazonProduct – 抓取Amazon产品数据(完全从网页中抓取)