【背景】
之前已经有:
【代码分享】C#代码:ScrapeAmazonProduct – 抓取Amazon产品数据(完全从网页中抓取)
这个是其升级版:
主要改为从AWS的API中抓取数据,其次再从网页中抓取。
【ScrapeAmazonProduct代码分享】
1.截图:
2.项目代码下载:
ScrapeAmazonProduct_2013-09-10_scrapeFromAwsApi.7z
3.代码分享:
(1)frmScrapeAmazonProduct.cs
/*
* [File]
* frmScrapeAmazonProduct.cs
*
* [Function]
* Scrape products data from Amazon, mainly from AWS API, partially from html
*
* [Author]
* Crifan Li
*
* [Date]
* 2013-09-10
*
* [Contact]
* https://www.crifan.com/contact_me/
*/
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Web;
using System.Net;
using System.Xml;
using System.IO;
using HtmlAgilityPack;
using System.Text.RegularExpressions;
using Excel = Microsoft.Office.Interop.Excel;
using Microsoft.Office.Interop.Excel;
using NLog;
using NLog.Targets;
using NLog.Config;
namespace ScrapeAmazonProduct
{
public partial class frmScrapeAmazonProduct : Form
{
//Plain data record holding everything scraped for a single Amazon product.
//Filled by extractProductInfo and written to the output Excel file by appendInfoToFile.
struct AmazonProductInfo
{
public string url; //product page url; identifies which product this record belongs to
public string title;
public string description;
//5 bullets
public string[] bulletArr; // total 5 (a product may have more, but only 5 are recorded)
//5 downloaded pictures
//NOTE: despite the name, extractProductInfo stores the local full filenames of the downloaded pictures here
public string[] imgUrlArr; // total 5 (a product may have more, but only 5 are recorded)
//product keyword fields, up to 3
public string[] keywordFieldArr; //each field is less than 50 chars; keywords separated by ','
//cheapest price of total (up to 8) sellers
public float cheapestPrice;
public bool isOneSellerIsAmazon; //true when one of the sellers is named "Amazon.com"
public int reviewNumber;
public bool isBestSeller;
};
//for debug
//private int lineNumber = 1;
//name of the output folder created under the current directory (see initOutputRootFolder)
string defaultOutputFolderName = "output";
//name of the image sub-folder created under the output folder (see afterChangeOutputFolder)
string defaultOutputImageFolderName = "images";
//full path of the current log file; set by initLoggerFilename, consumed by initLogger
string gLogFilename;
public static string constAmazonDomainUrl = "http://www.amazon.com";
//filter rules; all of them are parsed from the settings text boxes by initRules
public static int rule_minimalBuyerNumber;
public static int rule_totalUnitNumber;
public static int rule_maxLenEachBullet;
public static int rule_maxDescriptionLen;
public static float rule_dimensionMaxLengthCm;
public static float rule_dimensionMaxWidthCm;
public static float rule_dimensionMaxHeightCm;
public static int rule_maxSingleKeywordFieldLen;
public static float rule_maxWeightPounds;
//presumably maps a main category to its best seller category; its only visible initialization is commented out -- TODO confirm usage
Dictionary<string, string> gMainCatMappingBestSellerCatDict;
//helper libraries, instantiated in the constructor
public crifanLib crl;
public crifanLibAmazon amazonLib;
//List<crifanLibAmazon.categoryItem> mainCategoryList;
List<crifanLibAmazon.categoryItem> bestSellerCategoryList;
//for log; created by initLogger, so it is null until afterChangeOutputFolder has run
public Logger gLogger = null;
//need continue search or not
bool needContinueSearch = true;
//created empty in the constructor
List<TreeNode> curSelTreeNodeList;
//state of the search; updateUI enables/disables the buttons and settings according to it
enum search_status
{
SEARCH_STATUS_STOPPED,
SEARCH_STATUS_SEARCHING,
//SEARCH_STATUS_PAUSED
};
//the form starts in the stopped state
search_status curSearchStatus = search_status.SEARCH_STATUS_STOPPED;
//AWS API wrapper, created and initialized in the constructor
public crifanLibAws aws;
List<crifanLibAws.awsBrowseNode> gMainBrowserNodeList;
List<string> gProcessedAsinList; //make sure every ASIN stored here is upper case
//initialized to 1 in the constructor
public int gCurItemNum;
//Constructor: registers the embedded-dll resolver, creates the helper libraries,
//initializes the AWS API wrapper and the per-run state, then builds the form controls.
public frmScrapeAmazonProduct()
{
//!!! for load embedded dll: (1) register resolve handler
//this is done first, before the helper library objects below are created
AppDomain.CurrentDomain.AssemblyResolve += new ResolveEventHandler(CurrentDomain_AssemblyResolve);
crl = new crifanLib();
amazonLib = new crifanLibAmazon();
aws = new crifanLibAws();
//gMainCatMappingBestSellerCatDict = null;
//init AWS API
//NOTE(review): these are placeholders; real credentials should not be committed to source
string awsAccessKeyId = "your aws access key id";
string awsSecretKey = "your aws secret key";
string awsAssociateTag = "your aws associate tag";
crifanLibAws.awsEndpoint usEndpoint = crifanLibAws.awsEndpoint.US;
//note: even if 2011-08-02 is passed in, the response xmlns still is:
//http://webservices.amazon.com/AWSECommerceService/2011-08-01
//so only 2011-08-01 is used here
string awsApiVersion = "2011-08-01";
aws.initAws(awsAccessKeyId, awsSecretKey, awsAssociateTag, usEndpoint, awsApiVersion);
gProcessedAsinList = new List<string>();
gCurItemNum = 1;
curSelTreeNodeList = new List<TreeNode>();
InitializeComponent();
}
//!!! for load embedded dll: (2) implement this handler
//AssemblyResolve handler: tries to load an assembly from this application's
//embedded resources ("<namespace>.Properties.Resources").
//The resource key is the simple assembly name with every '.' replaced by '_'.
//Returns the loaded assembly, or null when the request is for a satellite
//"*.resources" assembly or when no matching byte[] resource is embedded,
//so that the runtime can continue with its normal failure handling.
System.Reflection.Assembly CurrentDomain_AssemblyResolve(object sender, ResolveEventArgs args)
{
    //"Some.Lib, Version=..., Culture=..." -> "Some.Lib" ; "Some.Lib.dll" -> "Some.Lib"
    string dllName = args.Name.Contains(",") ? args.Name.Substring(0, args.Name.IndexOf(',')) : args.Name.Replace(".dll", "");
    dllName = dllName.Replace(".", "_");
    if (dllName.EndsWith("_resources", StringComparison.Ordinal))
    {
        return null;
    }

    System.Resources.ResourceManager rm = new System.Resources.ResourceManager(GetType().Namespace + ".Properties.Resources", System.Reflection.Assembly.GetExecutingAssembly());
    //GetObject returns null for an unknown key; Assembly.Load((byte[])null) would then
    //throw ArgumentNullException from inside the resolve handler
    byte[] bytes = rm.GetObject(dllName) as byte[];
    if (bytes == null)
    {
        return null;
    }

    return System.Reflection.Assembly.Load(bytes);
}
//Sync the search/stop buttons and the settings group box with curSearchStatus.
//Stopped   -> "Search" enabled, stop disabled, settings editable.
//Searching -> "Searching" disabled, stop enabled, settings locked.
//Any other status leaves the controls untouched.
private void updateUI()
{
    bool isStopped = (curSearchStatus == search_status.SEARCH_STATUS_STOPPED);
    bool isSearching = (curSearchStatus == search_status.SEARCH_STATUS_SEARCHING);
    if (!isStopped && !isSearching)
    {
        return;
    }

    btnSearch.Enabled = isStopped;
    btnSearch.Text = isStopped ? "Search" : "Searching";
    btnStop.Enabled = !isStopped;
    grbSettings.Enabled = isStopped;
}
//Builds the full path of the log file and stores it in gLogFilename.
//File name is "<yyyy-MM-dd_HHmmss>_log_<category>.txt" when the currently selected
//browse node has a name, otherwise "<yyyy-MM-dd_HHmmss>_log.txt"; it is placed in
//the folder shown in txbOutputFolder.
private void initLoggerFilename()
{
    crifanLibAws.awsBrowseNode curSelectedBrowserNode = getCurSelBrowserNode();
    string searchCategoryName = curSelectedBrowserNode.Name;

    string curDatetimeStr = String.Format("{0:yyyy-MM-dd_HHmmss}", DateTime.Now); //"2013-06-11_142102"

    string logBaseName;
    if (!string.IsNullOrEmpty(searchCategoryName))
    {
        //the category name comes from outside and may contain characters that are not
        //allowed in a file name (e.g. '/', ':', '?'); replace them so the log file can be created
        foreach (char invalidChar in Path.GetInvalidFileNameChars())
        {
            searchCategoryName = searchCategoryName.Replace(invalidChar, '_');
        }
        logBaseName = curDatetimeStr + "_log_" + searchCategoryName + ".txt"; //"2013-06-11_153647_log_Automotive.txt"
    }
    else
    {
        logBaseName = curDatetimeStr + "_log.txt"; //"2013-06-11_153647_log.txt"
    }

    gLogFilename = Path.Combine(txbOutputFolder.Text, logBaseName);
}
//Configures NLog programmatically and stores the resulting logger in gLogger:
// - a RichTextBox target bound to control "rtbLog" of form "frmScrapeAmazonProduct", level Info and above
// - a file target writing to gLogFilename, level Trace and above
//Both targets share the same "[date][LEVEL] message" layout.
//Layout renderer reference: https://github.com/nlog/nlog/wiki/Layout-renderers
private void initLogger()
{
    const string sharedLayout = "[${date:format=yyyy-MM-dd HH\\:mm\\:ss}][${pad:padding=5:inner=${level:uppercase=true}}] ${message}";

    //targets
    RichTextBoxTarget uiTarget = new RichTextBoxTarget
    {
        FormName = "frmScrapeAmazonProduct", // winform class name
        ControlName = "rtbLog",              // RichTextBox control/variable name
        Layout = sharedLayout
    };
    FileTarget diskTarget = new FileTarget
    {
        FileName = gLogFilename,
        Layout = sharedLayout
    };

    //configuration: register targets, then one rule per target
    LoggingConfiguration nlogConfig = new LoggingConfiguration();
    nlogConfig.AddTarget("richTextBox", uiTarget);
    nlogConfig.AddTarget("logFile", diskTarget);
    nlogConfig.LoggingRules.Add(new LoggingRule("*", LogLevel.Info, uiTarget));
    nlogConfig.LoggingRules.Add(new LoggingRule("*", LogLevel.Trace, diskTarget));

    //activate
    LogManager.Configuration = nlogConfig;
    gLogger = LogManager.GetLogger("");
}
//Reads every filter rule from its settings text box into the static rule_* fields.
//Parsing throws (FormatException / OverflowException) when a text box does not hold a valid number.
public void initRules()
{
    //buyer / stock rules
    rule_minimalBuyerNumber = int.Parse(txbMinBuyerNum.Text);
    rule_totalUnitNumber = int.Parse(txbTotalUnitNum.Text);

    //text length rules
    rule_maxLenEachBullet = int.Parse(txbEachBulletMaxLen.Text);
    rule_maxDescriptionLen = int.Parse(txbMaxDescriptionLen.Text);

    //dimension rules (cm)
    //NOTE(review): the max LENGTH is parsed from txbDimensionHeight, same as the max height below.
    //This looks like a copy-paste mistake; confirm the name of the length text box and use it here.
    rule_dimensionMaxLengthCm = Single.Parse(txbDimensionHeight.Text);
    rule_dimensionMaxWidthCm = Single.Parse(txbDimensionWidth.Text);
    rule_dimensionMaxHeightCm = Single.Parse(txbDimensionHeight.Text);

    //keyword / weight rules
    rule_maxSingleKeywordFieldLen = int.Parse(txbSingleKeywordFieldMaxLen.Text);
    rule_maxWeightPounds = Single.Parse(txbMaxWeightPounds.Text);
}
//Makes sure "<current directory>/<defaultOutputFolderName>" exists and shows
//its absolute path in txbOutputFolder.
public void initOutputRootFolder()
{
    string outputRoot = Path.Combine(Environment.CurrentDirectory, defaultOutputFolderName);

    bool alreadyExists = Directory.Exists(outputRoot);
    if (!alreadyExists)
    {
        Directory.CreateDirectory(outputRoot);
    }

    txbOutputFolder.Text = outputRoot;
}
//Returns the output Excel file path: folder from txbOutputFolder joined with the name from txbExcelFilename.
private string getCurrentOutputFullFilename()
{
    string outputFolder = txbOutputFolder.Text;
    string excelFilename = txbExcelFilename.Text;
    string fullFilename = Path.Combine(outputFolder, excelFilename);
    return fullFilename;
}
//To be run whenever the output folder is (re)set:
// 1. recompute the log file name
// 2. (re)create the logger
// 3. make sure the image sub-folder exists under the output folder
private void afterChangeOutputFolder()
{
    initLoggerFilename();
    initLogger();

    string imageFolder = Path.Combine(txbOutputFolder.Text, defaultOutputImageFolderName);
    bool imageFolderExists = Directory.Exists(imageFolder);
    if (!imageFolderExists)
    {
        Directory.CreateDirectory(imageFolder);
    }
}
//Form Load handler: runs the start-up initialization steps in order.
//The order matters: the logger is created inside afterChangeOutputFolder and
//must exist before initAwsCategory is called.
private void frmScrapeAmazonProduct_Load(object sender, EventArgs e)
{
//1. init rules from the settings text boxes
initRules();
//2. init output root folder
initOutputRootFolder();
//3. init log filename, logger and image folder
afterChangeOutputFolder();
//4. init AWS category
//!!! must init logger first
initAwsCategory();
//5. update UI
updateUI();
//(ad-hoc commented-out debug calls for single ASINs were removed from here)
}
//Checks that the product has more buyers than rule_minimalBuyerNumber.
//productHtml   : html of the product page
//invalidReason : "" when valid, otherwise a human readable reason
//usedAndNewUrl : the "used and new" offer listing url extracted together with the buyer number
//returns true only when the buyer number was found and is strictly greater than the rule value
private bool checkBuyerNumber(string productHtml, out string invalidReason, out string usedAndNewUrl)
{
    invalidReason = "Unknow error for checkBuyerNumber";
    usedAndNewUrl = "";

    int buyerNumber = 0;
    bool found = amazonLib.extractProductBuyerNumberAndNewUrl(productHtml, out buyerNumber, out usedAndNewUrl);
    if (!found)
    {
        invalidReason = "Not found buyer number string and used and new url";
        return false;
    }

    bool enoughBuyers = buyerNumber > rule_minimalBuyerNumber;
    invalidReason = enoughBuyers
        ? ""
        : String.Format("Buyer Number is {0}, less than {1}", buyerNumber, rule_minimalBuyerNumber);
    return enoughBuyers;
}
//Checks the availability text of a product page; only text starting with "In stock" is accepted.
//Example product urls:
//http://www.amazon.com/gp/offer-listing/B0083PWAPW/ref=dp_olp_all_mbc?ie=UTF8&condition=all
//"http://www.amazon.com/Frigidaire-FRA052XT7-000-BTU-Window-Conditioner/dp/B003F4TH6G/ref=lp_3737671_1_1?ie=UTF8&qid=1371183851&sr=1-1"
//http://www.amazon.com/gp/product/B0009IQXFO/ref=olp_product_details?ie=UTF8
//"http://www.amazon.com/gp/product/B00A49TQPC"
//
//productUrl    : url of the product page; it is downloaded again here
//invalidReason : "No Error" when valid; otherwise the availability text found,
//                or "Can not find 'In Stock.'" when no availability node exists
//returns true when the availability text starts with "in stock" (case-insensitive)
private bool checkTotalUnitNumber(string productUrl, out string invalidReason)
{
    const string strNoError = "No Error";
    invalidReason = strNoError;

    string respHtml = crl.getUrlRespHtml_multiTry(productUrl);
    HtmlAgilityPack.HtmlDocument htmlDoc = crl.htmlToHtmlDoc(respHtml);

    //old page layout:
    //<div class="buying" style="padding-bottom: 0.75em;">
    //  <span class="availGreen">In Stock.</span>
    //or
    //  <span class="availGreen">Only 2 left in stock.</span>
    HtmlNode availabilityNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@class='buying']/span[@class='availGreen']");
    if (availabilityNode == null)
    {
        //new page layout, e.g. http://www.amazon.com/gp/product/B005SSWKMK
        //<div id="availability">
        //  <div class="a-color-available a-size-medium">
        //    In Stock.
        //  </div>
        //</div>
        availabilityNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@id='availability']/div");
    }

    if (availabilityNode == null)
    {
        invalidReason = "Can not find 'In Stock.'";
        gLogger.Debug(invalidReason + " for " + productUrl);
        return false;
    }

    //" \t\t\t\t\t\tIn Stock.\t\t " -> "In Stock."
    string availabilityText = availabilityNode.InnerText.Trim();

    //also accepts e.g. "In stock but may require an extra 1-2 days to process."
    //Ordinal comparison on purpose: this is fixed English page text, and a culture-sensitive
    //ignore-case compare fails to match "In" with "in" under Turkish/Azeri cultures
    if (availabilityText.StartsWith("in stock", StringComparison.OrdinalIgnoreCase))
    {
        return true;
    }

    //anything else, e.g. "Only N left in stock.", is considered invalid
    invalidReason = availabilityText;
    gLogger.Debug("availGreen is " + availabilityText + " for " + productUrl);
    return false;
}
//Checks that the product weight extracted from the page does not exceed rule_maxWeightPounds.
//productUrl    : not used by this check
//productHtml   : html of the product page
//invalidReason : reason text when the check fails; left at its "Unknow error" default when the check passes
//returns true when a positive weight was found and it is within the limit
private bool checkWeight(string productUrl, string productHtml, out string invalidReason)
{
    invalidReason = "Unknow error for checkWeight";

    float maxKiloGram = crl.poundToKiloGram(rule_maxWeightPounds);
    float kiloGram = amazonLib.extractProductWeight(productHtml);

    bool weightFound = kiloGram > 0.0F;
    if (!weightFound)
    {
        invalidReason = "Not found weight string or unrecognized weight number";
        return false;
    }

    if (kiloGram <= maxKiloGram)
    {
        return true;
    }

    invalidReason = String.Format("Weight is {0} kilogram, more than {1} pounds({2} kilograms)", kiloGram, rule_maxWeightPounds, maxKiloGram);
    return false;
}
//Checks that the product dimension extracted from the page fits inside the rule_dimensionMax*Cm box.
//productUrl    : not used by this check
//productHtml   : html of the product page
//invalidReason : reason text when the check fails; left at its "Unknow error" default when the check passes
//returns true when every side is within its limit, and also when no dimension was found
private bool checkDimension(string productUrl, string productHtml, out string invalidReason)
{
    invalidReason = "Unknow error for checkDimension";

    crifanLibAmazon.productDimension dimensionCm = amazonLib.extractProductDimension(productHtml);

    bool dimensionFound = dimensionCm.length > 0.0F;
    if (!dimensionFound)
    {
        // even if no dimension, also consider it as valid one if the weight is valid
        return true;
    }

    crifanLibAmazon.productDimension dimensionMaxCm = new crifanLibAmazon.productDimension();
    dimensionMaxCm.length = rule_dimensionMaxLengthCm;
    dimensionMaxCm.width = rule_dimensionMaxWidthCm;
    dimensionMaxCm.height = rule_dimensionMaxHeightCm;

    bool lengthFits = dimensionCm.length <= dimensionMaxCm.length;
    bool widthFits = dimensionCm.width <= dimensionMaxCm.width;
    bool heightFits = dimensionCm.height <= dimensionMaxCm.height;
    if (lengthFits && widthFits && heightFits)
    {
        return true;
    }

    invalidReason = String.Format("Dimension: {0}cm x {1}cm x {2}cm invalid for exceed max: {3}cm x {4}cm x {5}cm",
        dimensionCm.length, dimensionCm.width, dimensionCm.height,
        dimensionMaxCm.length, dimensionMaxCm.width, dimensionMaxCm.height);
    return false;
}
//Runs all product filters in order and stops at the first one that fails:
// 1. buyer number   2. availability ("total unit number")   3. weight   4. dimension
//productUrl    : url of the product page
//productHtml   : html of the product page
//invalidReason : reason reported by the last filter that was run
//usedAndNewUrl : offer listing url extracted by the buyer number check
//returns true only when every filter passes
private bool checkProductValid(string productUrl, string productHtml, out string invalidReason, out string usedAndNewUrl)
{
    invalidReason = "";
    usedAndNewUrl = "";

    //1. check buyer number
    if (!checkBuyerNumber(productHtml, out invalidReason, out usedAndNewUrl))
    {
        return false;
    }

    //2. check total unit number (availability)
    if (!checkTotalUnitNumber(productUrl, out invalidReason))
    {
        return false;
    }

    //3. check weight
    if (!checkWeight(productUrl, productHtml, out invalidReason))
    {
        return false;
    }

    //4. check dimension
    return checkDimension(productUrl, productHtml, out invalidReason);
}
//Progress callback that downloadPictures passes to crl.downloadFile.
//Currently a no-op: the progress bar update below is commented out.
public void updateProgress(int percentage)
{
//pgbDownload.Value = percentage;
}
//Downloads every product image found in the page html into "<output folder>/<ASIN>/".
//productUrl      : url of the product page; the ASIN is extracted from it for the folder name
//respHtml        : html of the product page
//picFullnameList : full local filename for each image url (same order); null when no image url was found.
//                  An entry is recorded even when its download reported an error.
public void downloadPictures(string productUrl, string respHtml, out string[] picFullnameList)
{
    picFullnameList = null;

    string productAsin = "";
    if (!amazonLib.extractAsinFromProductUrl(productUrl, out productAsin))
    {
        //previously ignored silently; without an ASIN the pictures end up directly in the output folder
        gLogger.Warn("Can not extract ASIN from " + productUrl + ", pictures will be saved into the output folder itself");
        if (productAsin == null)
        {
            productAsin = ""; //Path.Combine does not accept null
        }
    }

    //create folder
    string picFolderFullPath = Path.Combine(txbOutputFolder.Text, productAsin);
    if (!Directory.Exists(picFolderFullPath))
    {
        Directory.CreateDirectory(picFolderFullPath);
    }

    string[] imageUrlList = amazonLib.extractProductImageList(respHtml);
    gLogger.Info("Extracted image url list:");
    if (imageUrlList == null)
    {
        gLogger.Error("No image url for " + productUrl);
        return;
    }

    picFullnameList = new string[imageUrlList.Length];
    for (int idx = 0; idx < imageUrlList.Length; idx++)
    {
        string imageUrl = imageUrlList[idx];
        gLogger.Info(String.Format("[{0}]={1}", idx, imageUrl));

        string picFilename = crl.extractFilenameFromUrl(imageUrl);
        string picFullFilename = Path.Combine(picFolderFullPath, picFilename);

        string errorStr = "";
        gLogger.Info(String.Format("Downloading {0} to {1}", imageUrl, picFullFilename));
        crl.downloadFile(imageUrl, picFullFilename, out errorStr, updateProgress);
        //assumes downloadFile leaves errorStr empty on success -- TODO confirm against crifanLib
        if (!string.IsNullOrEmpty(errorStr))
        {
            gLogger.Warn(String.Format("Download {0} reported error: {1}", imageUrl, errorStr));
        }

        picFullnameList[idx] = picFullFilename;
    }
}
//Creates a new Excel workbook at excelFullFilename that contains only the bold header row:
//Title, Description, Bullet1..5, ImageFilename1..5, CheapestPrice, OneSellerIsAmazon, ReviewNumber, IsBestSeller.
//An existing file with the same name is deleted first.
//Excel is always shut down and its COM objects released, even when an exception is thrown.
private void createOutputFile(string excelFullFilename)
{
    gLogger.Info("Creating ouput file " + excelFullFilename);

    const int headerRowIdx = 1;      //Excel rows and columns are 1-based
    const int firstColumnIdx = 1;
    const int constBulletLen = 5;
    const int constImgNameListLen = 5;
    object misValue = System.Reflection.Missing.Value;

    //if exist remove it
    if (File.Exists(excelFullFilename))
    {
        File.Delete(excelFullFilename);
    }

    //only ONE Excel instance is created here: the original code created two and never
    //quit the first one, leaving an orphaned EXCEL.EXE process behind
    Excel.Application xlApp = null;
    Excel.Workbook xlWorkBook = null;
    Excel.Worksheet xlWorkSheet = null;
    bool isSaved = false;
    try
    {
        xlApp = new Excel.Application();
        xlWorkBook = xlApp.Workbooks.Add(misValue);
        xlWorkSheet = (Excel.Worksheet)xlWorkBook.Worksheets.get_Item(1);

        //save header
        int curColumnIdx = firstColumnIdx;
        xlWorkSheet.Cells[headerRowIdx, curColumnIdx++] = "Title";
        xlWorkSheet.Cells[headerRowIdx, curColumnIdx++] = "Description";
        for (int bulletNum = 1; bulletNum <= constBulletLen; bulletNum++)
        {
            xlWorkSheet.Cells[headerRowIdx, curColumnIdx++] = "Bullet" + bulletNum.ToString();
        }
        for (int imgNum = 1; imgNum <= constImgNameListLen; imgNum++)
        {
            xlWorkSheet.Cells[headerRowIdx, curColumnIdx++] = "ImageFilename" + imgNum.ToString();
        }
        xlWorkSheet.Cells[headerRowIdx, curColumnIdx++] = "CheapestPrice";
        xlWorkSheet.Cells[headerRowIdx, curColumnIdx++] = "OneSellerIsAmazon";
        xlWorkSheet.Cells[headerRowIdx, curColumnIdx++] = "ReviewNumber";
        xlWorkSheet.Cells[headerRowIdx, curColumnIdx++] = "IsBestSeller";

        //formatting: header to bold
        //(no column auto fit here; it is done later when rows are appended)
        Range headerRow = xlWorkSheet.get_Range("1:1", System.Type.Missing);
        headerRow.Font.Bold = true;

        //output
        xlWorkBook.SaveAs(excelFullFilename,
            XlFileFormat.xlWorkbookNormal,
            misValue,
            misValue,
            misValue,
            misValue,
            XlSaveAsAccessMode.xlExclusive,
            XlSaveConflictResolution.xlLocalSessionChanges,
            misValue,
            misValue,
            misValue,
            misValue);
        isSaved = true;
    }
    finally
    {
        if (xlWorkBook != null)
        {
            //on failure close without saving, so Excel has nothing left to prompt for
            xlWorkBook.Close(isSaved, misValue, misValue);
        }
        if (xlApp != null)
        {
            xlApp.Quit();
        }
        if (xlWorkSheet != null)
        {
            crl.releaseObject(xlWorkSheet);
        }
        if (xlWorkBook != null)
        {
            crl.releaseObject(xlWorkBook);
        }
        if (xlApp != null)
        {
            crl.releaseObject(xlApp);
        }
    }
}
//Appends one row with the given product info to the first sheet of an existing Excel file.
//The column layout matches the header written by createOutputFile:
//Title, Description, 5 bullet columns, 5 image filename columns, CheapestPrice,
//OneSellerIsAmazon, ReviewNumber, IsBestSeller.
//fullFilename : full path of the existing Excel file
//productInfo  : the product data to store
//Excel is always shut down and its COM objects released, even when an exception is thrown.
private void appendInfoToFile(string fullFilename, AmazonProductInfo productInfo)
{
    gLogger.Info("Saving product info for " + productInfo.url);

    const int excelRowHeader = 1;    //Excel rows and columns are 1-based
    const int excelColumnHeader = 1;
    const int constBulletLen = 5;
    const int constImgNameListLen = 5;
    object missingVal = System.Reflection.Missing.Value;

    Excel.Application xlApp = null;
    Excel.Workbook xlWorkBook = null;
    Excel.Worksheet xlWorkSheet = null;
    bool isSaved = false;
    try
    {
        xlApp = new Microsoft.Office.Interop.Excel.Application();
        //http://msdn.microsoft.com/zh-cn/library/microsoft.office.interop.excel.workbooks.open%28v=office.11%29.aspx
        xlWorkBook = xlApp.Workbooks.Open(
            Filename: fullFilename,
            ReadOnly: false,
            Origin: Excel.XlPlatform.xlWindows, //xlMacintosh/xlWindows/xlMSDOS
            Editable: true,
            Notify: false,
            AddToMru: true, //True to add this workbook to the list of recently used files
            Local: true,
            CorruptLoad: missingVal //xlNormalLoad/xlRepairFile/xlExtractData
            );

        //Get the first sheet
        xlWorkSheet = (Excel.Worksheet)xlWorkBook.Worksheets.get_Item(1);
        Excel.Range range = xlWorkSheet.UsedRange;
        int usedRowCount = range.Rows.Count;

        // !!! must add excelRowHeader=1, otherwise the last used row (header or previous product) is overwritten
        int curRowIdx = usedRowCount + excelRowHeader;
        int curColumnIdx = 0 + excelColumnHeader; //start from column begin

        xlWorkSheet.Cells[curRowIdx, curColumnIdx++] = productInfo.title;
        xlWorkSheet.Cells[curRowIdx, curColumnIdx++] = productInfo.description;

        //bullets: write at most 5, but ALWAYS advance by the 5 header columns, so a shorter
        //(or missing) array can not shift the following values under the wrong header
        string[] bullets = productInfo.bulletArr ?? new string[0];
        int bulletCount = Math.Min(bullets.Length, constBulletLen);
        for (int bulletIdx = 0; bulletIdx < bulletCount; bulletIdx++)
        {
            xlWorkSheet.Cells[curRowIdx, curColumnIdx + bulletIdx] = bullets[bulletIdx];
        }
        curColumnIdx = curColumnIdx + constBulletLen;

        //image filenames: same handling as bullets
        string[] imgNames = productInfo.imgUrlArr ?? new string[0];
        int imgNameCount = Math.Min(imgNames.Length, constImgNameListLen);
        for (int imgIdx = 0; imgIdx < imgNameCount; imgIdx++)
        {
            xlWorkSheet.Cells[curRowIdx, curColumnIdx + imgIdx] = imgNames[imgIdx];
        }
        curColumnIdx = curColumnIdx + constImgNameListLen;

        xlWorkSheet.Cells[curRowIdx, curColumnIdx++] = productInfo.cheapestPrice;
        xlWorkSheet.Cells[curRowIdx, curColumnIdx++] = productInfo.isOneSellerIsAmazon;
        xlWorkSheet.Cells[curRowIdx, curColumnIdx++] = productInfo.reviewNumber;
        xlWorkSheet.Cells[curRowIdx, curColumnIdx++] = productInfo.isBestSeller;

        //auto adjust first column width (according to content)
        Range firstColumn = xlWorkSheet.get_Range("A1");
        firstColumn.EntireColumn.AutoFit();

        //Save() instead of SaveAs(): SaveAs pops up an overwrite confirmation
        //even with ConflictResolution set to xlLocalSessionChanges
        xlWorkBook.Save();
        isSaved = true;
    }
    finally
    {
        if (xlWorkBook != null)
        {
            //on failure close without saving, so Excel has nothing left to prompt for
            xlWorkBook.Close(SaveChanges: isSaved);
        }
        if (xlApp != null)
        {
            //was missing in the original code: without Quit() one EXCEL.EXE process
            //stayed alive for every saved product
            xlApp.Quit();
        }
        if (xlWorkSheet != null)
        {
            crl.releaseObject(xlWorkSheet);
        }
        if (xlWorkBook != null)
        {
            crl.releaseObject(xlWorkBook);
        }
        if (xlApp != null)
        {
            crl.releaseObject(xlApp);
        }
    }
}
//Saves one product into the output Excel file (folder from txbOutputFolder, name from txbExcelFilename).
//The file, including its header row, is created on first use; the product is then appended as a new row.
private void saveProductInfo(AmazonProductInfo productInfo)
{
    string excelPath = Path.Combine(txbOutputFolder.Text, txbExcelFilename.Text);

    bool excelFileExists = File.Exists(excelPath);
    if (!excelFileExists)
    {
        //first product: create the file and add the header
        createOutputFile(excelPath);
    }

    appendInfoToFile(excelPath, productInfo);
}
/*
* Extracts all recorded fields of one product: title, description, up to 5 bullets,
* downloaded pictures, cheapest seller price, whether Amazon is a seller, keyword
* fields, review number and best seller flag.
*
* productUrl    : url of the product page, e.g.
*                 http://www.amazon.com/Kindle-Paperwhite-Touch-light/dp/B007OZNZG0/ref=lp_1055398_1_1?ie=UTF8&qid=1370510177&sr=1-1
* productHtml   : html of the product page
* usedAndNewUrl : offer listing url, e.g.
*                 http://www.amazon.com/gp/offer-listing/B007OZNZG0/ref=dp_olp_all_mbc?ie=UTF8&condition=all
* productInfo   : the extracted data; only meaningful when true is returned
*
* returns false (product is to be omitted) when neither description nor bullets exist,
* when no valid seller price is found, or when the product has no best seller rank
*/
private bool extractProductInfo(string productUrl, string productHtml, string usedAndNewUrl, out AmazonProductInfo productInfo)
{
    gLogger.Info("Extracting info for " + productUrl);

    //init
    productInfo = new AmazonProductInfo();
    productInfo.url = productUrl;
    productInfo.cheapestPrice = float.MaxValue; //sentinel: "no price found yet"
    productInfo.isOneSellerIsAmazon = false;
    //arrays are pre-filled, so unused trailing entries are never left null
    productInfo.bulletArr = new string[5];
    crl.emptyStringArray(productInfo.bulletArr);
    productInfo.imgUrlArr = new string[5];
    crl.emptyStringArray(productInfo.imgUrlArr);
    productInfo.keywordFieldArr = new string[3];
    crl.emptyStringArray(productInfo.keywordFieldArr);

    //1. title
    productInfo.title = amazonLib.extractProductTitle(productHtml);
    gLogger.Info("Title=" + productInfo.title);

    //2. description and 5 bullets
    List<string> bulletList;
    bool gotBullets = amazonLib.extractProductBulletList(productHtml, out bulletList);
    gLogger.Info("Extracted Bullets=" + gotBullets);
    if (bulletList == null)
    {
        bulletList = new List<string>(); //guard: treat a null out value as "no bullets"
    }

    string description;
    bool gotDescription = amazonLib.extractProductDescription(productHtml, out description);
    gLogger.Info("Got Description=" + gotDescription);
    if (description == null)
    {
        description = ""; //guard: treat a null out value as "no description"
    }

    /*
    * 1. has description, has bullets : use both as they are
    * 2. no description, has bullets  : description = all bullets, one per line
    * 3. has description, no bullets  : bullets = description split at '.'
    * 4. no description, no bullets   : something wrong, omit the product
    * In every case only the first 5 bullets are recorded.
    */
    bool hasDescription = description != "";
    bool hasBullets = bulletList.Count > 0;
    if (!hasDescription && !hasBullets)
    {
        return false;
    }

    IList<string> bulletSource;
    if (hasBullets)
    {
        bulletSource = bulletList;
        if (!hasDescription)
        {
            StringBuilder allBullets = new StringBuilder();
            foreach (string bulletStr in bulletList)
            {
                allBullets.Append(bulletStr).Append(Environment.NewLine);
            }
            description = allBullets.ToString();
        }
    }
    else
    {
        bulletSource = description.Split('.');
    }
    productInfo.description = description;

    //record the first 5 bullets, each cut to the configured max length
    int bulletCount = Math.Min(bulletSource.Count, productInfo.bulletArr.Length);
    for (int idx = 0; idx < bulletCount; idx++)
    {
        string bulletStr = bulletSource[idx] ?? "";
        if (bulletStr.Length > rule_maxLenEachBullet)
        {
            bulletStr = bulletStr.Substring(0, rule_maxLenEachBullet);
        }
        productInfo.bulletArr[idx] = bulletStr;
    }
    //check max length for whole description ?

    //3. download pics, record the local filenames of the first 5
    string[] picFullnameList = null;
    downloadPictures(productUrl, productHtml, out picFullnameList);
    if ((picFullnameList != null) && (picFullnameList.Length > 0))
    {
        int imageCount = Math.Min(picFullnameList.Length, productInfo.imgUrlArr.Length);
        for (int idx = 0; idx < imageCount; idx++)
        {
            productInfo.imgUrlArr[idx] = picFullnameList[idx];
        }
    }

    //4. extract product seller info: price and name
    List<crifanLibAmazon.productSellerInfo> allSellerInfoList = new List<crifanLibAmazon.productSellerInfo>();
    if (!amazonLib.extractAllSellerInfo(usedAndNewUrl, out allSellerInfoList))
    {
        gLogger.Info(String.Format("Omit this {0} for not found seller info for {1} ", productUrl, usedAndNewUrl));
        return false;
    }
    if ((allSellerInfoList == null) || (allSellerInfoList.Count == 0))
    {
        gLogger.Info(String.Format("Omit this {0} for found seller info but is invalid for {1} ", productUrl, usedAndNewUrl));
        return false;
    }
    foreach (crifanLibAmazon.productSellerInfo eachSellerInfo in allSellerInfoList)
    {
        //(1) calc cheapest price
        if (eachSellerInfo.price < productInfo.cheapestPrice)
        {
            productInfo.cheapestPrice = eachSellerInfo.price;
        }
        //(2) find whether one of the sellers is Amazon, i.e. its name is: Amazon.com
        //static String.Equals is safe for a null seller name
        if (String.Equals(eachSellerInfo.name, "Amazon.com", StringComparison.OrdinalIgnoreCase))
        {
            productInfo.isOneSellerIsAmazon = true;
        }
    }
    if (productInfo.cheapestPrice.CompareTo(float.MaxValue) == 0)
    {
        //still the sentinel -> no seller had a usable price
        gLogger.Info(String.Format("Omit this {0} for not find valid cheapest price for {1} ", productUrl, usedAndNewUrl));
        return false;
    }
    gLogger.Info("Cheapest Price=" + productInfo.cheapestPrice);
    gLogger.Info("One of Seller is Amazon=" + productInfo.isOneSellerIsAmazon);

    //5. 3 keyword fields
    productInfo.keywordFieldArr = amazonLib.extractProductKeywordField(productInfo.title, productInfo.keywordFieldArr.Length, rule_maxSingleKeywordFieldLen);
    gLogger.Info("Keyword Field List:");
    if ((productInfo.keywordFieldArr != null) && (productInfo.keywordFieldArr.Length > 0))
    {
        for (int idx = 0; idx < productInfo.keywordFieldArr.Length; idx++)
        {
            gLogger.Info(String.Format("[{0}]={1}", idx, productInfo.keywordFieldArr[idx]));
        }
    }

    //6. product review
    productInfo.reviewNumber = amazonLib.extractProductReviewNumber(productHtml: productHtml);
    gLogger.Info("ReviewNumber=" + productInfo.reviewNumber);

    //7. product best seller rank number list
    List<crifanLibAmazon.productBestRank> bestSellerRankList = amazonLib.extractProductBestSellerRankList(productHtml);
    if ((bestSellerRankList == null) || (bestSellerRankList.Count == 0))
    {
        //the original code called bestSellerRankList.ToString() here, which throws
        //NullReferenceException exactly in the case where the list is null
        gLogger.Debug("bestSellerRankList is null or count not > 0");
        gLogger.Info(String.Format("Omit this {0} for bestSellerRankList is empty", productUrl));
        return false;
    }
    productInfo.isBestSeller = true;
    gLogger.Info("Is BestSeller=" + productInfo.isBestSeller);

    return true;
}
//check whether each product valid or not
//if valid, extract product info
//http://www.amazon.com/Silver-Linings-Playbook/dp/B00CL68QVQ/ref=sr_1_2?s=instant-video&ie=UTF8&qid=1368688342&sr=1-2
// Downloads one product page, validates it with checkProductValid and,
// when it is valid and extractProductInfo succeeds, saves the extracted info.
// productUrl: URL of the product page to process.
private void checkAndExtractForSingleProduct(string productUrl)
{
    // fetch the page through the multi-try helper
    string pageHtml = crl.getUrlRespHtml_multiTry(productUrl);

    string rejectReason;
    string offerListUrl;
    bool passedCheck = checkProductValid(productUrl, pageHtml, out rejectReason, out offerListUrl);
    if (!passedCheck)
    {
        gLogger.Info(String.Format("-INVALID- product={0}, reason={1}", productUrl, rejectReason));
        return;
    }

    gLogger.Info("+VALID+ Product=" + productUrl);

    // only save when every required piece of info could be extracted
    AmazonProductInfo extractedInfo;
    bool extractedOk = extractProductInfo(productUrl, pageHtml, offerListUrl, out extractedInfo);
    if (extractedOk)
    {
        saveProductInfo(extractedInfo);
    }
}
//check whether each product variation valid or not
//if valid, extract product info
// Same flow as for a single product, but for one variation of a product:
// download the variation page, validate it, extract its info and save it.
// Before saving, the variation label is appended to the title unless the
// title already ends with that label.
private void checkAndExtractForSingleVariation(crifanLibAmazon.variationItem singleVariationItem)
{
    string variationUrl = singleVariationItem.url;
    gLogger.Info("processing variation " + variationUrl);

    string pageHtml = crl.getUrlRespHtml_multiTry(variationUrl);

    string rejectReason;
    string offerListUrl;
    if (!checkProductValid(variationUrl, pageHtml, out rejectReason, out offerListUrl))
    {
        gLogger.Info(String.Format("Invalid product={0}, reason={1}", variationUrl, rejectReason));
        return;
    }

    gLogger.Info("Valid product=" + variationUrl);

    AmazonProductInfo extractedInfo;
    if (!extractProductInfo(variationUrl, pageHtml, offerListUrl, out extractedInfo))
    {
        return;
    }

    // Some variation pages already carry the label in the title, e.g.
    //   "GE MWF Refrigerator Water Filter, 1-Pack"
    // others share one title across all variations, so the label is added here
    // to keep the saved titles distinguishable.
    bool titleAlreadyLabelled = extractedInfo.title.EndsWith(singleVariationItem.label);
    if (!titleAlreadyLabelled)
    {
        extractedInfo.title = extractedInfo.title + ", " + singleVariationItem.label;
    }

    saveProductInfo(extractedInfo);
}
// Processes every search result found in one search-result page.
// For each result: if amazonLib.checkVariation reports variations, each
// variation is processed on its own; otherwise the product itself is processed.
// Stops early as soon as needContinueSearch is cleared.
private void processSinglePageHtml(string curPageSearchUrl, string singlePageHtml)
{
    List<crifanLibAmazon.searchResultItem> pageItems;
    if (!amazonLib.extractSearchItemList(curPageSearchUrl, singlePageHtml, out pageItems))
    {
        return;
    }

    foreach (crifanLibAmazon.searchResultItem resultItem in pageItems)
    {
        if (!needContinueSearch)
        {
            break;
        }

        string itemUrl = resultItem.productUrl;
        gLogger.Info("processing single product url " + itemUrl);

        crifanLibAmazon.productVariationInfo foundVariations;
        bool hasVariations = amazonLib.checkVariation(itemUrl, out foundVariations);
        if (!hasVariations)
        {
            // plain product without variations -> handle it directly
            gLogger.Info("no variation for " + itemUrl);
            checkAndExtractForSingleProduct(itemUrl);
            continue;
        }

        // product has several variations -> handle each of them
        List<crifanLibAmazon.variationItem> allVariations = foundVariations.variationList;
        gLogger.Info(String.Format("Total {0} variations for {1}", allVariations.Count, itemUrl));
        foreach (crifanLibAmazon.variationItem oneVariation in allVariations)
        {
            if (!needContinueSearch)
            {
                break;
            }
            checkAndExtractForSingleVariation(oneVariation);
        }
    }
}
//"http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3dappliances"
// Walks every sub category of the given search category and processes all
// of its result pages, following the "next page" link from page to page.
// curPageSearchUrl: URL of the search category, e.g.
//   http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3dappliances
// The walk stops as soon as needContinueSearch is cleared.
private void processEachSearchCategory(string curPageSearchUrl)
{
    gLogger.Info("processing search category " + curPageSearchUrl);

    // find all level 1 child category url list
    List<crifanLibAmazon.categoryItem> subCategoryList = amazonLib.extractSubCategoryList(curPageSearchUrl);
    if (subCategoryList == null)
    {
        // nothing to walk through
        return;
    }

    foreach (crifanLibAmazon.categoryItem subCategory in subCategoryList)
    {
        if (!needContinueSearch)
        {
            break;
        }

        // first page of this sub category, e.g.
        //   http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3dinstant-video
        // following pages look like:
        //   http://www.amazon.com/s/ref=sr_pg_2?rh=n%3A2625373011...&page=2&ie=UTF8&qid=1368697688
        curPageSearchUrl = subCategory.Url;

        while (needContinueSearch)
        {
            string eachPageHtml = crl.getUrlRespHtml_multiTry(curPageSearchUrl);
            processSinglePageHtml(curPageSearchUrl, eachPageHtml);

            string nextPageUrl;
            if (!amazonLib.extractNextPageUrl(curPageSearchUrl, eachPageHtml, out nextPageUrl))
            {
                // something wrong while looking for the next page
                break;
            }

            if (string.IsNullOrEmpty(nextPageUrl))
            {
                // no more pages for this sub category
                break;
            }

            if (nextPageUrl == curPageSearchUrl)
            {
                // next page points at the page just processed -> would loop forever
                break;
            }

            // advance to the next page; without this the same page would be
            // fetched and processed again and again
            curPageSearchUrl = nextPageUrl;
        }
    }
}
//find matched best seller category for input main category item
// Looks up the best seller category that corresponds to a main category.
// The main category key is mapped through gMainCatMappingBestSellerCatDict,
// then bestSellerCategoryList is scanned for an item whose Key equals the
// mapped key (case-insensitive, current culture).
// Returns true and sets bestSellerCateoryItem on a match; otherwise returns
// false and bestSellerCateoryItem is a freshly constructed categoryItem.
public bool findMatchedBestSellerCategoryItem(crifanLibAmazon.categoryItem mainCateoryItem, out crifanLibAmazon.categoryItem bestSellerCateoryItem)
{
    bestSellerCateoryItem = new crifanLibAmazon.categoryItem();

    // static mapping table is required
    if ((gMainCatMappingBestSellerCatDict == null) || (gMainCatMappingBestSellerCatDict.Count <= 0))
    {
        return false;
    }

    if (!gMainCatMappingBestSellerCatDict.ContainsKey(mainCateoryItem.Key))
    {
        return false;
    }

    string mappedKey = gMainCatMappingBestSellerCatDict[mainCateoryItem.Key];
    foreach (crifanLibAmazon.categoryItem candidate in bestSellerCategoryList)
    {
        if (mappedKey.Equals(candidate.Key, StringComparison.CurrentCultureIgnoreCase))
        {
            bestSellerCateoryItem = candidate;
            return true;
        }
    }

    return false;
}
// Searches one main category (e.g. key "instant-video"): first the category
// itself, then the matching best seller category when one can be found.
private void searchSingleCategory(crifanLibAmazon.categoryItem singleCateoryItem)
{
    // NOTE(review): the generated general category url
    // (like http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3dinstant-video)
    // is not used afterwards; the call is kept as it was.
    amazonLib.generateMainCategoryUrlFromCategoryKey(singleCateoryItem.Key);

    // 1. the category itself
    processEachSearchCategory(singleCateoryItem.Url);

    // 2. the corresponding best seller category, if any
    crifanLibAmazon.categoryItem matchedBestSeller;
    bool hasBestSeller = findMatchedBestSellerCategoryItem(singleCateoryItem, out matchedBestSeller);
    if (!hasBestSeller)
    {
        gLogger.Info("NOT found corrsponding best seller item category url, for: " + singleCateoryItem.Url);
        return;
    }

    gLogger.Info("Found corrsponding best seller item category url=" + matchedBestSeller.Url);
    processEachSearchCategory(matchedBestSeller.Url);
}
// Click handler of the search button.
// Only acts while the search is stopped: switches to the searching state,
// re-initializes log/counters, runs the AWS category search and finally
// switches back to the stopped state.
private void btnSearch_Click(object sender, EventArgs e)
{
    if (curSearchStatus != search_status.SEARCH_STATUS_STOPPED)
    {
        return;
    }

    needContinueSearch = true;

    //start search
    curSearchStatus = search_status.SEARCH_STATUS_SEARCHING;
    updateUI();

    try
    {
        cleanAndReinitBeforeSearch();
        awsCategorySearch();
    }
    finally
    {
        //end search
        //done in finally so an exception thrown while searching can not leave
        //the form stuck in the searching state
        curSearchStatus = search_status.SEARCH_STATUS_STOPPED;
        updateUI();
    }
}
// Resets per-search state: restarts the item counter at 1, empties the log
// text box, then re-initializes the log filename and the logger (in that order).
private void cleanAndReinitBeforeSearch()
{
    gCurItemNum = 1;
    rtbLog.Text = "";

    // the filename is re-initialized first, then the logger itself
    initLoggerFilename();
    initLogger();
}
// Click handler: lets the user pick a new output folder.
// Nothing changes unless the folder dialog is confirmed with OK.
private void btnChangeOutputFolder_Click(object sender, EventArgs e)
{
    if (fbdOutputFolder.ShowDialog() != System.Windows.Forms.DialogResult.OK)
    {
        // cancelled (or closed) -> keep the current folder
        return;
    }

    txbOutputFolder.Text = fbdOutputFolder.SelectedPath;
    afterChangeOutputFolder();
}
// Click handler: passes the output folder to crl.openFileDirectly,
// but only when that folder exists.
private void btnOpenOutputFolder_Click(object sender, EventArgs e)
{
    if (!Directory.Exists(txbOutputFolder.Text))
    {
        return;
    }

    crl.openFileDirectly(txbOutputFolder.Text);
}
// Click handler: calls crl.openFolderAndSelectFile with the output excel file
// when it exists, otherwise with the output folder.
private void btnBrowserOutputFile_Click(object sender, EventArgs e)
{
    string excelFullPath = Path.Combine(txbOutputFolder.Text, txbExcelFilename.Text);

    gLogger.Debug("In btnBrowserOutputFile_Click:");
    gLogger.Debug("OutputFolder=" + txbOutputFolder.Text);
    gLogger.Debug("ExcelFilename=" + txbExcelFilename.Text);
    gLogger.Debug("currentOutputFullFilename=" + excelFullPath);

    // fall back to the folder itself when the file has not been created yet
    string pathToShow = File.Exists(excelFullPath) ? excelFullPath : txbOutputFolder.Text;
    crl.openFolderAndSelectFile(pathToShow);
}
// Click handler: passes the output excel file to crl.openFileDirectly when it
// exists; otherwise, when the output folder exists, passes that folder to
// crl.openFolderAndSelectFile. Does nothing when neither exists.
private void btnOpenOutputFile_Click(object sender, EventArgs e)
{
    string excelFullPath = Path.Combine(txbOutputFolder.Text, txbExcelFilename.Text);

    if (File.Exists(excelFullPath))
    {
        crl.openFileDirectly(excelFullPath);
        return;
    }

    if (Directory.Exists(txbOutputFolder.Text))
    {
        crl.openFolderAndSelectFile(txbOutputFolder.Text);
    }
}
// Click handler of the stop button.
// Only acts while a search is running: marks the search as stopped, refreshes
// the UI and clears needContinueSearch so the running loops bail out.
private void btnStop_Click(object sender, EventArgs e)
{
    bool searchIsRunning = (curSearchStatus == search_status.SEARCH_STATUS_SEARCHING);
    if (!searchIsRunning)
    {
        return;
    }

    curSearchStatus = search_status.SEARCH_STATUS_STOPPED;
    updateUI();

    // tell the search loops to stop
    needContinueSearch = false;
}
/****************************** AWS ********************************/
// Builds the list of root category names, in a fixed order.
// Reference: http://docs.aws.amazon.com/AWSECommerceService/latest/DG/BrowseNodeIDs.html
// initAwsCategory pairs this list with buildMainBrowserNodeIdList by index,
// so entries must stay in the same order/count as the id list.
// Names that are left out are noted next to the place they would occupy.
private List<string> buildMainBrowserNodeNameList()
{
    return new List<string>
    {
        "Apparel",
        "Appliances",
        "ArtsAndCrafts",
        "Automotive",
        "Baby",
        "Beauty",
        "Books",
        "Classical",
        "Collectibles",
        // left out: "DigitalMusic"
        //   Code=AWS.InvalidParameterValue, Message=195208011 is not a valid value for BrowseNodeId
        "DVD",
        "Electronics",
        "ForeignBooks",
        "Garden",
        // left out: "GourmetFood"
        //   Code=AWS.InvalidParameterValue, Message=3580501 is not a valid value for BrowseNodeId
        "Grocery",
        "HealthPersonalCare",
        "Hobbies",
        "Home",
        // id 285080 was reported as "not a valid value for BrowseNodeId" for this one
        "HomeGarden",
        // left out: "Home & Kitchen"
        //   see https://www.crifan.com/amazon_asw_api_usage_notice/
        //   (PetSupplies=1063498 is just sub category of Home & Kitchen=1055398)
        "HomeImprovement",
        "Industrial",
        "Jewelry",
        "KindleStore",
        "Kitchen",
        // "LawnGarden" was replaced by "LawnAndGarden", see
        // https://www.crifan.com/aws_searchindex_has_not_support_lawngarden_changed_to_lawnandgarden/
        "LawnAndGarden",
        "Lighting",
        "Magazines",
        "Miscellaneous",
        "MobileApps",
        // left out: "MP3Downloads"
        //   Code=AWS.InvalidParameterValue, Message=195211011 is not a valid value for BrowseNodeId
        "Music",
        "MusicalInstruments",
        "OfficeProducts",
        // left out: "OutdoorLiving", "PCHardware"
        "PetSupplies",
        // left out: "Photo"
        "Shoes",
        "Software",
        "SoftwareVideoGames",
        "SportingGoods",
        "Tools",
        "Toys",
        // left out: "VHS"
        //   Code=AWS.InvalidParameterValue, Message=404272 is not a valid value for BrowseNodeId
        "Video",
        // left out: "VideoGames"
        "Watches"
        // left out: "Wireless", "WirelessAccessories"
    };
}
//tmp only support US
//later will add: CA CN DE ES FR IN IT JP UK
// Builds the list of root BrowseNodeIds, in a fixed order.
// Reference: http://docs.aws.amazon.com/AWSECommerceService/latest/DG/BrowseNodeIDs.html
// initAwsCategory pairs this list with buildMainBrowserNodeNameList by index;
// the trailing comment on each entry is the name at the same index there.
// An empty id makes initAwsCategory skip that category.
private List<string> buildMainBrowserNodeIdList()
{
    return new List<string>
    {
        "1036592",      // Apparel
        "2619525011",   // Appliances
        "2617941011",   // ArtsAndCrafts
        "15690151",     // Automotive
        "165796011",    // Baby
        "11055981",     // Beauty
        "1000",         // Books
        "301668",       // Classical
        "4991425011",   // Collectibles
        // left out: "195208011"
        //   Code=AWS.InvalidParameterValue, Message=195208011 is not a valid value for BrowseNodeId
        "2625373011",   // DVD
        "493964",       // Electronics
        "",             // ForeignBooks
        "",             // Garden
        // left out: "3580501"
        //   Code=AWS.InvalidParameterValue, Message=3580501 is not a valid value for BrowseNodeId
        "16310101",     // Grocery
        "3760931",      // HealthPersonalCare
        "",             // Hobbies
        "",             // Home
        // "285080" was reported as not a valid value for BrowseNodeId;
        // 1055398 (Home & Kitchen) is used in its place, see
        // https://www.crifan.com/amazon_asw_api_usage_notice/
        "1055398",      // HomeGarden
        "",             // HomeImprovement
        "228239",       // Industrial
        "3880591",      // Jewelry
        "133141011",    // KindleStore
        "1063498",      // Kitchen
        "2972638011",   // LawnAndGarden
        "",             // Lighting
        "599872",       // Magazines
        "10304191",     // Miscellaneous
        "2350149011",   // MobileApps
        // left out: "195211011"
        //   Code=AWS.InvalidParameterValue, Message=195211011 is not a valid value for BrowseNodeId
        "301668",       // Music
        "11091801",     // MusicalInstruments
        "1084128",      // OfficeProducts
        // http://www.browsenodes.com/node-2619533011.html
        "2619533011",   // PetSupplies
        "",             // Shoes
        "409488",       // Software
        "",             // SoftwareVideoGames
        "3375251",      // SportingGoods
        "468240",       // Tools
        // 493964 is not the root category for toys, see
        // https://www.crifan.com/aws_api_toys_browsernodeid_493964_not_root_category/
        "165793011",    // Toys
        // left out: "404272"
        //   Code=AWS.InvalidParameterValue, Message=404272 is not a valid value for BrowseNodeId
        "130",          // Video
        "377110011"     // Watches
    };
}
// Initializes one tree node from an AWS BrowseNodeLookup done for the browse
// node stored in the tree node's Tag:
//  - Text and Tag of the tree node are replaced with the looked-up browse node
//    (a root tree node keeps its original name),
//  - one child tree node is appended per child browse node in the response.
// When the response carries no name for the node itself, only a debug message
// is logged and the tree node is left untouched.
private void initSingleTreeNode(TreeNode curTreeNode)
{
    crifanLibAws.awsBrowseNode taggedBrowseNode = (crifanLibAws.awsBrowseNode)curTreeNode.Tag;
    crifanLibAws.awsBrowseNodeLookupResp browseNodeLookupResp = aws.awsGetBrowseNodeLookupResp(taggedBrowseNode.BrowseNodeId);

    if (string.IsNullOrEmpty(browseNodeLookupResp.selfBrowseNode.Name))
    {
        gLogger.Debug("can not get BrowseNodeLookup Response for singleRootBrowseNodeId=" + taggedBrowseNode.BrowseNodeId);
        return;
    }

    // no parent -> root TreeNode -> keep the original (initialized root category) name;
    // any other node takes the name that came back from the lookup
    if (curTreeNode.Parent == null)
    {
        browseNodeLookupResp.selfBrowseNode.Name = taggedBrowseNode.Name;
    }

    curTreeNode.Text = browseNodeLookupResp.selfBrowseNode.Name;
    curTreeNode.Tag = browseNodeLookupResp.selfBrowseNode;

    bool gotChildren = (browseNodeLookupResp.Children != null) && (browseNodeLookupResp.Children.Count > 0);
    if (!gotChildren)
    {
        gLogger.Info(String.Format("Category [{0}] : No chilren", curTreeNode.Text));
        return;
    }

    // show every child browse node as a sub node in the tree
    foreach (crifanLibAws.awsBrowseNode childBrowseNode in browseNodeLookupResp.Children)
    {
        TreeNode childTreeNode = new TreeNode();
        childTreeNode.Text = childBrowseNode.Name;
        childTreeNode.Tag = childBrowseNode;
        childTreeNode.ContextMenuStrip = cmsSelection;
        curTreeNode.Nodes.Add(childTreeNode);
    }

    gLogger.Info(String.Format("Category [{0}] : {1} chilren", curTreeNode.Text, browseNodeLookupResp.Children.Count));
}
// Builds gMainBrowserNodeList from the root category name/id lists (paired by
// index; names whose id is empty are skipped with a debug message) and adds
// one root tree node per resulting browse node to trvCategoryTree.
private void initAwsCategory()
{
    List<string> rootNames = buildMainBrowserNodeNameList();
    List<string> rootIds = buildMainBrowserNodeIdList();

    gMainBrowserNodeList = new List<crifanLibAws.awsBrowseNode>();

    // 1. pair up name and id, keep only those that have an id
    for (int pairIdx = 0; pairIdx < rootNames.Count; pairIdx++)
    {
        string rootName = rootNames[pairIdx];
        string rootId = rootIds[pairIdx];

        if (string.IsNullOrEmpty(rootId))
        {
            gLogger.Debug(String.Format("browser node id is empty for name={0} ", rootName));
            continue;
        }

        crifanLibAws.awsBrowseNode rootBrowseNode = new crifanLibAws.awsBrowseNode();
        rootBrowseNode.Name = rootName;
        rootBrowseNode.BrowseNodeId = rootId;
        gMainBrowserNodeList.Add(rootBrowseNode);
    }

    // 2. show each root browse node as a root node of the category tree
    int displayNum = 0;
    foreach (crifanLibAws.awsBrowseNode rootBrowseNode in gMainBrowserNodeList)
    {
        displayNum++;
        gLogger.Trace(String.Format("[{0:D2}]{1}\t\t\t={2}", displayNum, rootBrowseNode.Name, rootBrowseNode.BrowseNodeId));

        TreeNode rootTreeNode = new TreeNode();
        rootTreeNode.Text = rootBrowseNode.Name;
        rootTreeNode.Tag = rootBrowseNode;
        rootTreeNode.ContextMenuStrip = cmsSelection;
        trvCategoryTree.Nodes.Add(rootTreeNode);
    }
}
//get input TreeNode's BrowseNode's SearchIndex
// Returns the SearchIndex for a tree node: the Name of the browse node stored
// in the Tag of the node's root tree node (root names are the SearchIndex
// values). Returns "" when no root tree node is found.
private string getSearchIndex(TreeNode curTreeNode)
{
    TreeNode topMostTreeNode = crl.findRootTreeNode(curTreeNode);
    if (topMostTreeNode == null)
    {
        return "";
    }

    crifanLibAws.awsBrowseNode topMostBrowseNode = (crifanLibAws.awsBrowseNode)topMostTreeNode.Tag;
    return topMostBrowseNode.Name;
}
//get input TreeNode's BrowseNode's full category name
//something like:
// xxx -> xxx -> xxx
// Returns the full category path of a tree node, built from the Text of the
// node and all of its ancestors, root first, joined with " -> ":
//   root -> child -> grandchild
private string getFullCategoryName(TreeNode curTreeNode)
{
    string categoryPath = curTreeNode.Text;

    // climb from the direct parent up to the root, prepending each level
    for (TreeNode ancestor = curTreeNode.Parent; ancestor != null; ancestor = ancestor.Parent)
    {
        categoryPath = ancestor.Text + " -> " + categoryPath;
    }

    return categoryPath;
}
// Returns the browse node stored in the Tag of the currently selected tree
// node, or a freshly constructed awsBrowseNode when nothing is selected.
private crifanLibAws.awsBrowseNode getCurSelBrowserNode()
{
    if (trvCategoryTree.SelectedNode == null)
    {
        // nothing selected
        // (gLogger is not used here: it may not have been initialized yet)
        return new crifanLibAws.awsBrowseNode();
    }

    return (crifanLibAws.awsBrowseNode)trvCategoryTree.SelectedNode.Tag;
}
// Searches one browse node through the AWS API and processes every returned item.
// curBrowseNodeId:     BrowseNodeId to search in
// curSearchIndex:      SearchIndex to pass along with the browse node id
// curFullCategoryName: only used for the start/end log lines
// Page 1 is always requested; pages 2..N follow, where N is the TotalPages
// value of the first page capped at 10 pages. A missing or non-numeric
// TotalPages means only page 1 is processed.
// Stops early as soon as needContinueSearch is cleared.
private void searchSingleBrowseNodeId(string curBrowseNodeId, string curSearchIndex, string curFullCategoryName = "")
{
    string strFullCategoryName = String.Format("FullCategoryName={0}", curFullCategoryName);
    string strFormattedFullCategoryName = crl.formatString(strFullCategoryName, '=');
    string strStartSearch = String.Format("Start search for BrowseNodeId={0}, SearchIndex={1}", curBrowseNodeId, curSearchIndex);
    string strFormattedStartSearch = crl.formatString(strStartSearch, '=');
    gLogger.Info(strFormattedStartSearch);
    gLogger.Info(strFormattedFullCategoryName);

    //get first page search result
    string strFirstPageNum = "1";
    crifanLibAws.awsSearchResultInfo firstPageSearchResultInfo = aws.awsGetBrowserNodeSearchResultItemList(curBrowseNodeId, curSearchIndex, strFirstPageNum);
    if (firstPageSearchResultInfo.SearchResultItemList != null)
    {
        foreach (crifanLibAws.awsSearchResultItem eachItem in firstPageSearchResultInfo.SearchResultItemList)
        {
            if (!needContinueSearch)
            {
                break;
            }
            processAwsSearchItem(eachItem);
        }
    }

    //process following page (page 2-10) search item list, if available
    //TryParse (instead of Parse) so an empty or malformed TotalPages can not abort the search
    int awsPageNumLimit = 10;
    int intTotalPages;
    if (int.TryParse(firstPageSearchResultInfo.TotalPages, out intTotalPages))
    {
        int maxPageNum = Math.Min(intTotalPages, awsPageNumLimit);
        for (int curPageNum = 2; curPageNum <= maxPageNum; curPageNum++)
        {
            if (!needContinueSearch)
            {
                break;
            }

            crifanLibAws.awsSearchResultInfo curPageItemList = aws.awsGetBrowserNodeSearchResultItemList(curBrowseNodeId, curSearchIndex, curPageNum.ToString());
            if (curPageItemList.SearchResultItemList == null)
            {
                continue;
            }

            foreach (crifanLibAws.awsSearchResultItem eachItem in curPageItemList.SearchResultItemList)
            {
                if (!needContinueSearch)
                {
                    break;
                }
                processAwsSearchItem(eachItem);
            }
        }
    }
    else if (!string.IsNullOrEmpty(firstPageSearchResultInfo.TotalPages))
    {
        gLogger.Debug(String.Format("ignore invalid TotalPages={0} for BrowseNodeId={1}", firstPageSearchResultInfo.TotalPages, curBrowseNodeId));
    }

    string strEndSearch = String.Format("End of search for BrowseNodeId={0}, SearchIndex={1}", curBrowseNodeId, curSearchIndex);
    string strFormattedEndSearch = crl.formatString(strEndSearch, '=');
    gLogger.Info(strFormattedEndSearch);
    gLogger.Info(strFormattedFullCategoryName);
}
//for some TreeNode, first find all child node, then do search for each node
// Recursively searches a tree node:
//  - a node without children is first (re-)initialized, since it may simply
//    not have been expanded yet;
//  - if it has children afterwards, each child is searched recursively;
//  - if it still has none, the node itself is searched by its BrowseNodeId.
// A null node is ignored. Recursion stops once needContinueSearch is cleared.
private void doSearchForAllChildOfSingleTreeNode(TreeNode curTreeNode)
{
    if (curTreeNode == null)
    {
        return;
    }

    // no direct child -> maybe really none, or not initialized yet -> try to load them
    if (curTreeNode.GetNodeCount(false) <= 0)
    {
        initSingleTreeNode(curTreeNode);
    }

    // count again: the step above may have added children
    bool isLeaf = (curTreeNode.GetNodeCount(false) <= 0);
    if (isLeaf)
    {
        // still no child -> do the real search on this node
        string leafSearchIndex = getSearchIndex(curTreeNode);
        string leafFullCategoryName = getFullCategoryName(curTreeNode);
        crifanLibAws.awsBrowseNode leafBrowseNode = (crifanLibAws.awsBrowseNode)curTreeNode.Tag;
        searchSingleBrowseNodeId(leafBrowseNode.BrowseNodeId, leafSearchIndex, leafFullCategoryName);
        return;
    }

    foreach (TreeNode subTreeNode in curTreeNode.Nodes)
    {
        if (!needContinueSearch)
        {
            break;
        }
        doSearchForAllChildOfSingleTreeNode(subTreeNode);
    }
}
// Runs the search for every tree node in curSelTreeNodeList.
// First logs an overview of all selected categories, then searches each
// selected node (and all of its children) in turn.
// Logs a notice and does nothing else when no category is selected.
private void awsCategorySearch()
{
    if (curSelTreeNodeList.Count <= 0)
    {
        gLogger.Info(crl.formatString("Not select any category, so nothing to search", '#'));
        return;
    }

    // overview of what is going to be searched
    string overviewTitle = String.Format("Do search for total selected {0} categories", curSelTreeNodeList.Count);
    gLogger.Info(crl.formatString(overviewTitle, '#'));
    for (int num = 1; num <= curSelTreeNodeList.Count; num++)
    {
        TreeNode selectedNode = curSelTreeNodeList[num - 1];
        gLogger.Info(String.Format("[{0}] {1}", num, getFullCategoryName(selectedNode)));
    }
    gLogger.Info(crl.formatString("#", '#'));

    // the search itself, one selected category after the other
    for (int num = 1; num <= curSelTreeNodeList.Count; num++)
    {
        TreeNode selectedNode = curSelTreeNodeList[num - 1];
        string categoryTitle = String.Format("Process for selected category [{0}] {1}", num, getFullCategoryName(selectedNode));
        gLogger.Info(crl.formatString(categoryTitle, '#'));
        gLogger.Info(crl.formatString("#", '#'));

        doSearchForAllChildOfSingleTreeNode(selectedNode);
    }
}
// Processes one AWS search result item: looks up its variation list and
// processes every variation ASIN that has not been processed before.
// Each processed ASIN is recorded in gProcessedAsinList. An item whose own
// ASIN is already recorded is skipped entirely, and so is any single
// variation ASIN that was already recorded (e.g. reached earlier through a
// sibling item), so a product is not processed and saved twice.
// Stops early as soon as needContinueSearch is cleared.
public void processAwsSearchItem(crifanLibAws.awsSearchResultItem singleAwsSearchItem)
{
    string asinToHandle = singleAwsSearchItem.Asin;
    if (gProcessedAsinList.Contains(asinToHandle))
    {
        gLogger.Debug(String.Format("omit ASIN={0} for has processed it", asinToHandle));
        return;
    }

    //1. find variation if avaliable
    List<crifanLibAws.awsSearchResultItem> variationItemList = aws.awsGetVariationItemList(asinToHandle);
    if (variationItemList == null)
    {
        gLogger.Debug(String.Format("no variation item list got for ASIN={0}", asinToHandle));
        return;
    }

    //2. real goto process each item
    foreach (crifanLibAws.awsSearchResultItem singleAsin in variationItemList)
    {
        if (!needContinueSearch)
        {
            break;
        }

        string realAsinToHandle = singleAsin.Asin;
        if (gProcessedAsinList.Contains(realAsinToHandle))
        {
            //already handled while processing another search item
            gLogger.Debug(String.Format("omit ASIN={0} for has processed it", realAsinToHandle));
            continue;
        }

        //process each ASIN (product)
        processAmazonItem(realAsinToHandle);
        gProcessedAsinList.Add(realAsinToHandle);
    }
}
// Checks one AWS item against the rules entered in the UI, in this order:
//   1. weight        <= txbMaxWeightPounds
//   2. dimensions    <= txbDimensionLength / Width / Height (compared in cm)
//   3. offer count   >= txbMinBuyerNum
//   4. total unit number (delegated to checkTotalUnitNumber on the product page)
// The first failing rule ends the check; later rules (and the AWS/web requests
// they need) are not evaluated.
// itemAttributes: attributes of the item, as returned by the AWS API
// invalidReason:  reason text of the failing rule; "valid item" while no rule
//                 has failed (rule 4 stores whatever checkTotalUnitNumber reports)
// Returns true when all rules pass.
// The rule_* fields are refreshed from the text boxes as each rule is reached;
// a text box that does not hold a number makes the corresponding Parse throw.
private bool awsItemIsValid(crifanLibAws.awsItemAttributes itemAttributes, out string invalidReason)
{
    invalidReason = "valid item";

    //1. check weight
    //the value is divided by 100, i.e. it is treated as hundredths of a pound
    float fWeightPound = awsParseHundredths(itemAttributes.itemDimensions.WeightPound);
    rule_maxWeightPounds = float.Parse(txbMaxWeightPounds.Text);
    if (!(fWeightPound <= rule_maxWeightPounds))
    {
        invalidReason = String.Format("Weight is {0} pounds, more than weight limit: {1} pounds", fWeightPound, rule_maxWeightPounds);
        return false;
    }

    //2. check dimension
    //prefer the item dimension; use the package dimension when the item one is missing
    //(null or empty - an empty value would otherwise silently count as 0)
    string strLengthHundredthsInch = string.IsNullOrEmpty(itemAttributes.itemDimensions.LengthHundredthsInch) ?
        itemAttributes.packageDimensions.LengthHundredthsInch :
        itemAttributes.itemDimensions.LengthHundredthsInch;
    string strWidthHundredthsInch = string.IsNullOrEmpty(itemAttributes.itemDimensions.WidthHundredthsInch) ?
        itemAttributes.packageDimensions.WidthHundredthsInch :
        itemAttributes.itemDimensions.WidthHundredthsInch;
    string strHeightHundredthsInch = string.IsNullOrEmpty(itemAttributes.itemDimensions.HeightHundredthsInch) ?
        itemAttributes.packageDimensions.HeightHundredthsInch :
        itemAttributes.itemDimensions.HeightHundredthsInch;

    float fLengthCm = crl.inchToCm(awsParseHundredths(strLengthHundredthsInch));
    float fWidthCm = crl.inchToCm(awsParseHundredths(strWidthHundredthsInch));
    float fHeightCm = crl.inchToCm(awsParseHundredths(strHeightHundredthsInch));

    rule_dimensionMaxLengthCm = float.Parse(txbDimensionLength.Text);
    rule_dimensionMaxWidthCm = float.Parse(txbDimensionWidth.Text);
    rule_dimensionMaxHeightCm = float.Parse(txbDimensionHeight.Text);

    bool bDimensionValid =
        (fLengthCm <= rule_dimensionMaxLengthCm) &&
        (fWidthCm <= rule_dimensionMaxWidthCm) &&
        (fHeightCm <= rule_dimensionMaxHeightCm);
    if (!bDimensionValid)
    {
        invalidReason = String.Format("Dimension: {0}cm x {1}cm x {2}cm invalid for exceed dimension limit: {3}cm x {4}cm x {5}cm",
            fLengthCm, fWidthCm, fHeightCm,
            rule_dimensionMaxLengthCm, rule_dimensionMaxWidthCm, rule_dimensionMaxHeightCm);
        return false;
    }

    //3. check buyer number
    //what awsGetOffersInfo gives is NOT the unit number, but the OFFER number
    //eg: B0009IQXFO, http://www.amazon.com/gp/offer-listing/B0009IQXFO shows 18 offers, and here get:
    //  TotalCollectible "0", TotalNew "18", TotalOfferPages "1", TotalOffers "1",
    //  TotalRefurbished "0", TotalUsed "0"
    crifanLibAws.awsOffersInfo offersInfo = aws.awsGetOffersInfo(itemAttributes.Asin);

    //the four totals are only summed up when all of them are present, otherwise the total is 0
    int totalOfferNum = 0;
    bool bAllOfferTotalsPresent =
        (!string.IsNullOrEmpty(offersInfo.TotalNew)) &&
        (!string.IsNullOrEmpty(offersInfo.TotalUsed)) &&
        (!string.IsNullOrEmpty(offersInfo.TotalCollectible)) &&
        (!string.IsNullOrEmpty(offersInfo.TotalRefurbished));
    if (bAllOfferTotalsPresent)
    {
        totalOfferNum =
            Int32.Parse(offersInfo.TotalNew) +
            Int32.Parse(offersInfo.TotalUsed) +
            Int32.Parse(offersInfo.TotalCollectible) +
            Int32.Parse(offersInfo.TotalRefurbished);
    }

    rule_minimalBuyerNumber = Int32.Parse(txbMinBuyerNum.Text);
    if (!(totalOfferNum >= rule_minimalBuyerNumber))
    {
        invalidReason = String.Format("buyer number {0} less than minimal limit {1}", totalOfferNum, rule_minimalBuyerNumber);
        return false;
    }

    //4. check total unit number
    //invalidReason is taken over from checkTotalUnitNumber as is
    string productUrl = amazonLib.generateProductUrlFromAsin(itemAttributes.Asin);
    return checkTotalUnitNumber(productUrl, out invalidReason);
}

// Converts a numeric string holding hundredths of a unit into whole units.
// A null or empty string counts as 0. The string comes from the AWS response
// (machine-readable), so it is parsed with the invariant culture instead of
// the culture of the current user.
private static float awsParseHundredths(string strHundredths)
{
    if (string.IsNullOrEmpty(strHundredths))
    {
        return 0.0F;
    }

    return float.Parse(strHundredths, System.Globalization.CultureInfo.InvariantCulture) / 100.0F;
}
/// <summary>
/// Handles a single Amazon product: obtains its attributes through
/// aws.awsGetItemAttributes, validates them with awsItemIsValid, logs the
/// verdict and, for a valid item, hands it to awsFindAndSaveItem.
/// </summary>
/// <param name="itemAsin">ASIN of the product to process.</param>
public void processAmazonItem(string itemAsin)
{
    gLogger.Trace("Processing amazon product ASIN=" + itemAsin);

    //get item info
    crifanLibAws.awsItemAttributes itemAttributes = aws.awsGetItemAttributes(itemAsin);

    //then check is valid or not
    string invalidReason = "";
    bool bIsValid = awsItemIsValid(itemAttributes, out invalidReason);

    // Both verdicts use the same banner layout; only the wording differs.
    // The running item counter is advanced exactly once per processed item.
    string bannerFormat = bIsValid ? "[{0}] Valid: {1}" : "[{0}] Invalid: {1}";
    string banner = String.Format(bannerFormat, gCurItemNum++, itemAttributes.Asin);
    gLogger.Info(crl.formatString(banner, '-', 120));

    if (!bIsValid)
    {
        gLogger.Info("InvalidReason=" + invalidReason);
        return;
    }

    awsFindAndSaveItem(itemAttributes.Asin, itemAttributes);
}
/// <summary>
/// Extracts the remaining product details with awsGetAllProductInfo and, only
/// when that extraction reports success, passes the result to saveProductInfo.
/// </summary>
/// <param name="itemAsin">ASIN of the product.</param>
/// <param name="itemAttributes">Attributes already fetched for the product.</param>
private void awsFindAndSaveItem(string itemAsin, crifanLibAws.awsItemAttributes itemAttributes)
{
    AmazonProductInfo collectedInfo;
    bool extractedOk = awsGetAllProductInfo(itemAsin, itemAttributes, out collectedInfo);
    if (!extractedOk)
    {
        // nothing to save for an item whose info could not be fully extracted
        return;
    }

    saveProductInfo(collectedInfo);
}
/// <summary>
/// Downloads up to <paramref name="imgFullnameArrLength"/> images of one product into
/// "[output folder]/[image folder]/[ASIN]" and reports the URLs that were processed.
/// </summary>
/// <param name="itemAsin">ASIN of the product; used as the name of the picture sub folder.</param>
/// <param name="imageUrlList">Candidate image URLs; null is treated as an empty list.</param>
/// <param name="imgFullnameArrLength">Maximum number of images to handle.</param>
/// <param name="savedImageUrlList">
/// Receives the URL of every image that was processed, in input order. A URL is
/// recorded even if its download reported an error (the failure is only logged).
/// </param>
private void awsDownloadPictures(string itemAsin, List<string> imageUrlList, int imgFullnameArrLength, out string[] savedImageUrlList)
{
    //create folder
    string outputImageFolder = Path.Combine(txbOutputFolder.Text, defaultOutputImageFolderName);
    string picFolderFullPath = Path.Combine(outputImageFolder, itemAsin);
    if (!Directory.Exists(picFolderFullPath))
    {
        Directory.CreateDirectory(picFolderFullPath);
    }

    // never handle more images than available, nor more than the caller can store
    int availableCount = (imageUrlList == null) ? 0 : imageUrlList.Count;
    int maxImageCount = Math.Max(0, Math.Min(availableCount, imgFullnameArrLength));
    savedImageUrlList = new string[maxImageCount];

    for (int idx = 0; idx < maxImageCount; idx++)
    {
        string imageUrl = imageUrlList[idx];
        gLogger.Info(String.Format("[Image{0}]\t\t\t{1}", idx + 1, imageUrl));

        string picFilename = crl.extractFilenameFromUrl(imageUrl);
        string picFullFilename = Path.Combine(picFolderFullPath, picFilename);
        gLogger.Debug(String.Format("Downloading {0}", imageUrl));
        gLogger.Debug(String.Format("to {0}", picFullFilename));

        string errorStr = "";
        crl.downloadFile(imageUrl, picFullFilename, out errorStr, updateProgress);
        // NOTE(review): assumes downloadFile leaves errorStr empty on success - confirm.
        if (!string.IsNullOrEmpty(errorStr))
        {
            // previously the error text was dropped silently
            gLogger.Warn(String.Format("Download {0} failed: {1}", imageUrl, errorStr));
        }

        savedImageUrlList[idx] = imageUrl;
    }
}
/// <summary>
/// Fills productInfo.description and productInfo.bulletArr from whatever is available:
/// 1. description and bullets: keep the description, copy the first bullets;
/// 2. bullets only: copy the first bullets and join ALL bullets into the description;
/// 3. description only: keep it and use its sentences (split on '.') as bullets;
/// 4. neither: empty description, emptied bullet array.
/// At most productInfo.bulletArr.Length bullets are stored.
/// </summary>
/// <param name="productInfo">Record to fill; its bulletArr must already be allocated.</param>
/// <param name="description">Plain-text description, may be null or empty.</param>
/// <param name="bulletList">Feature bullets, may be null or empty.</param>
private void checkDescriptionAndBullets(ref AmazonProductInfo productInfo, string description, List<string> bulletList)
{
    //makesure bulletList is not null
    if (bulletList == null)
    {
        bulletList = new List<string>();
    }

    bool hasDescription = !string.IsNullOrEmpty(description);
    bool hasBullets = bulletList.Count > 0;
    int maxBulletNum = productInfo.bulletArr.Length;

    if (hasBullets)
    {
        // maybe more, maybe less bullets than slots -> only the first ones are kept
        int copyNum = Math.Min(bulletList.Count, maxBulletNum);
        for (int idx = 0; idx < copyNum; idx++)
        {
            productInfo.bulletArr[idx] = bulletList[idx];
        }

        if (hasDescription)
        {
            productInfo.description = description;
        }
        else
        {
            // no description: all bullets (not only the stored ones) become the description
            StringBuilder joinedBullets = new StringBuilder();
            foreach (string bulletStr in bulletList)
            {
                joinedBullets.Append(bulletStr).Append(Environment.NewLine);
            }
            productInfo.description = joinedBullets.ToString();
        }
    }
    else if (hasDescription)
    {
        productInfo.description = description;

        // no bullets: derive them from the sentences of the description.
        // Blank fragments (e.g. from "..." or the text after the final '.') are
        // skipped and surrounding whitespace is trimmed, so they no longer
        // occupy one of the few bullet slots.
        string[] sentences = description.Split(new char[] { '.' }, StringSplitOptions.RemoveEmptyEntries);
        int bulletIdx = 0;
        foreach (string sentence in sentences)
        {
            if (bulletIdx >= maxBulletNum)
            {
                break;
            }

            string trimmedSentence = sentence.Trim();
            if (trimmedSentence.Length == 0)
            {
                continue;
            }

            productInfo.bulletArr[bulletIdx] = trimmedSentence;
            bulletIdx++;
        }
    }
    else
    {
        //no description, no bullet: something wrong, or just leave it
        productInfo.description = string.Empty;
        crl.emptyStringArray(productInfo.bulletArr);
    }
}
/// <summary>
/// Collects everything recorded for one product: title, description, bullets, image
/// URLs (the pictures are downloaded as a side effect), cheapest seller price, whether
/// one seller is Amazon.com, keyword fields, review number and the best-seller flag.
/// Also refreshes rule_maxDescriptionLen and rule_maxLenEachBullet from the UI.
/// </summary>
/// <param name="itemAsin">ASIN of the product.</param>
/// <param name="itemAttributes">Attributes already fetched; Title, Asin and FeatureList are read.</param>
/// <param name="productInfo">Receives the collected data; only partially filled when false is returned.</param>
/// <returns>
/// true when all info was found; false when no seller info, no valid cheapest price
/// or no best seller rank info could be found (the item is then to be omitted).
/// </returns>
private bool awsGetAllProductInfo(string itemAsin, crifanLibAws.awsItemAttributes itemAttributes, out AmazonProductInfo productInfo)
{
    gLogger.Debug("Extracting info for: " + itemAsin);

    //init
    string productUrl = amazonLib.generateProductUrlFromAsin(itemAsin);
    productInfo = new AmazonProductInfo();
    productInfo.url = productUrl;
    productInfo.cheapestPrice = float.MaxValue; // sentinel: "no price found yet"
    productInfo.isOneSellerIsAmazon = false;
    //must init, otherwise, when only got 4 bullets for 5 slots, the last slot is null -> later access will throw
    productInfo.bulletArr = new string[5];
    crl.emptyStringArray(productInfo.bulletArr);
    productInfo.imgUrlArr = new string[5];
    crl.emptyStringArray(productInfo.imgUrlArr);
    productInfo.keywordFieldArr = new string[3];
    crl.emptyStringArray(productInfo.keywordFieldArr);

    //1. title
    productInfo.title = itemAttributes.Title; //"Frigidaire FRA052XT7 5,000-BTU Mini Window Air Conditioner"
    gLogger.Info("[Title]\t\t\t" + productInfo.title);

    //2. description
    crifanLibAws.awsEditorialReview editorialReview = aws.awsGetEditorialReview(itemAttributes.Asin);
    // a missing review content must not crash the tag stripping / trimming
    string description = crl.htmlRemoveTag(editorialReview.Content ?? string.Empty);
    description = (description ?? string.Empty).Trim();

    //3. bullets
    List<string> bulletList = itemAttributes.FeatureList;
    //check for bullets and description
    checkDescriptionAndBullets(ref productInfo, description, bulletList);

    //check max length for whole description
    rule_maxDescriptionLen = Int32.Parse(txbMaxDescriptionLen.Text);
    if (productInfo.description.Length > rule_maxDescriptionLen)
    {
        productInfo.description = productInfo.description.Substring(0, rule_maxDescriptionLen);
    }

    //check max length for each bullet, and count the non-empty ones
    rule_maxLenEachBullet = Int32.Parse(txbEachBulletMaxLen.Text);
    int iRealBulletNum = 0;
    for (int idx = 0; idx < productInfo.bulletArr.Length; idx++)
    {
        string curBullet = productInfo.bulletArr[idx];
        if (string.IsNullOrEmpty(curBullet))
        {
            // null/empty slot: nothing to truncate, not a real bullet
            continue;
        }

        if (curBullet.Length > rule_maxLenEachBullet)
        {
            curBullet = curBullet.Substring(0, rule_maxLenEachBullet);
            productInfo.bulletArr[idx] = curBullet;
        }

        if (curBullet.Length > 0)
        {
            ++iRealBulletNum;
        }
    }

    //output description (only a short preview)
    int iDescToShowLen = Math.Min(40, productInfo.description.Length);
    string strShowDes = productInfo.description.Substring(0, iDescToShowLen) + " ......";
    gLogger.Info("[Description]\t\t" + strShowDes);

    //output bullets
    gLogger.Info("[BulletList]\t\t\tTotal " + iRealBulletNum.ToString() + " bullets");

    //4. download images
    //(1) get images
    List<string> imageUrlList = new List<string>();
    crifanLibAws.awsImages imagesInfo = aws.awsGetImages(itemAsin);
    if (imagesInfo.LargeImageList != null)
    {
        foreach (crifanLibAws.awsImageItem singleImageItem in imagesInfo.LargeImageList)
        {
            string largeImageUrl = singleImageItem.Url;
            if (!imageUrlList.Contains(largeImageUrl))
            {
                //makesure not duplicated
                imageUrlList.Add(largeImageUrl);
            }
        }
    }
    //here use AWS only can find Primary -> LargeImage
    //remaining custom images can not find
    //so need continue find more custom images
    string customImageUrl = amazonLib.generateCustomImageUrlFromAsin(itemAsin);
    List<string> customImageUrlList = amazonLib.extractCustomImageUrlList(customImageUrl);
    if (customImageUrlList != null)
    {
        foreach (string eachCustomImageUrl in customImageUrlList)
        {
            // same de-duplication as for the large images, so one picture
            // does not use up two of the limited image slots
            if (!imageUrlList.Contains(eachCustomImageUrl))
            {
                imageUrlList.Add(eachCustomImageUrl);
            }
        }
    }

    //(2) download images
    string[] savedImageUrlList = null;
    awsDownloadPictures(itemAsin, imageUrlList, productInfo.imgUrlArr.Length, out savedImageUrlList);
    if (savedImageUrlList != null)
    {
        //here, already: savedImageUrlList.Length <= productInfo.imgUrlArr.Length
        for (int idx = 0; idx < savedImageUrlList.Length; idx++)
        {
            productInfo.imgUrlArr[idx] = savedImageUrlList[idx];
        }
    }

    //5. extract product seller info: price and name
    //awsGetOfferFullInfo ONLY gives 2 offers; for more offers the API gives a MoreOffersUrl,
    //which is just http://www.amazon.com/gp/offer-listing/<ASIN> plus tracking parameters,
    //so the offer listing url is generated from the ASIN and scraped from the web page instead
    string offerListingUrl = amazonLib.generateOfferListingUrl(itemAsin); //"http://www.amazon.com/gp/offer-listing/B003F4TH6G"
    List<crifanLibAmazon.productSellerInfo> allSellerInfoList = new List<crifanLibAmazon.productSellerInfo>();
    if (!amazonLib.extractAllSellerInfo(offerListingUrl, out allSellerInfoList))
    {
        gLogger.Info(String.Format("Omit this {0} for not found seller info for {1} ", productUrl, offerListingUrl));
        return false;
    }

    if ((allSellerInfoList == null) || (allSellerInfoList.Count <= 0))
    {
        gLogger.Info(String.Format("Omit this {0} for found seller info but is invalid", productUrl));
        return false;
    }

    foreach (crifanLibAmazon.productSellerInfo eachSellerInfo in allSellerInfoList)
    {
        //(1) calc cheapest price
        if (eachSellerInfo.price < productInfo.cheapestPrice)
        {
            productInfo.cheapestPrice = eachSellerInfo.price;
        }

        //(2) find whether one of the sellers is Amazon
        //here means: one of the seller's name is: Amazon.com
        // static Equals: a seller without a name simply does not match
        if (string.Equals(eachSellerInfo.name, "Amazon.com", StringComparison.CurrentCultureIgnoreCase))
        {
            productInfo.isOneSellerIsAmazon = true;
        }
    }

    // still the sentinel -> no seller had a usable price
    if (productInfo.cheapestPrice.CompareTo(float.MaxValue) == 0)
    {
        gLogger.Info(String.Format("Omit this {0} for not find valid cheapest price", productUrl));
        return false;
    }

    gLogger.Info("[CheapestPrice]\t\t" + productInfo.cheapestPrice);
    gLogger.Info("[OneOfSellerIsAmazon]\t" + productInfo.isOneSellerIsAmazon);

    //6. 3 keyword Field
    productInfo.keywordFieldArr = amazonLib.extractProductKeywordField(productInfo.title, productInfo.keywordFieldArr.Length, rule_maxSingleKeywordFieldLen);
    gLogger.Debug("Keyword Field List:");
    if (productInfo.keywordFieldArr != null)
    {
        for (int idx = 0; idx < productInfo.keywordFieldArr.Length; idx++)
        {
            String keywordField = productInfo.keywordFieldArr[idx];
            gLogger.Debug(String.Format("[{0}]={1}", idx, keywordField));
        }
    }

    //7. product review
    string productHtml = crl.getUrlRespHtml_multiTry(productUrl);
    productInfo.reviewNumber = amazonLib.extractProductReviewNumber(productUrl, productHtml);
    gLogger.Info("[ReviewNumber]\t\t" + productInfo.reviewNumber);

    //8. product best seller rank number list
    List<crifanLibAmazon.productBestRank> bestSellerRankList = amazonLib.extractProductBestSellerRankList(productUrl);
    if ((bestSellerRankList == null) || (bestSellerRankList.Count <= 0))
    {
        // do NOT touch the list here: it may be null (this used to throw NullReferenceException)
        string listState = (bestSellerRankList == null) ? "null" : "empty";
        gLogger.Debug("bestSellerRankList is null or count not > 0 : " + listState);
        gLogger.Info(String.Format("Omit this {0} for not found valid best seller rank info", productUrl));
        return false;
    }

    productInfo.isBestSeller = true;
    gLogger.Info("[IsBestSeller]\t\t" + productInfo.isBestSeller);
    return true;
}
/// <summary>
/// TextChanged handler of the log box: moves the caret behind the last
/// character and scrolls to it, so the newest log text stays visible.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void rtbLog_TextChanged(object sender, EventArgs e)
{
    int endOfLog = rtbLog.Text.Length;

    //Set the current caret position at the end, then scroll to it
    rtbLog.SelectionStart = endOfLog;
    rtbLog.ScrollToCaret();
}
/// <summary>
/// Tells whether a category tree node already has sub nodes, which this form
/// uses as the sign that the node was initialized before.
/// </summary>
/// <param name="curSelectedCategoryNode">Node to inspect; may be null.</param>
/// <returns>true when the node is not null and its whole subtree contains at least one node.</returns>
private bool categoryTreeNodeHasInitialized(TreeNode curSelectedCategoryNode)
{
    if (curSelectedCategoryNode == null)
    {
        return false;
    }

    // Inspect the node that was passed in. The previous code null-checked the
    // parameter but then counted the children of trvCategoryTree.SelectedNode,
    // which is wrong (or throws) whenever the two are not the same node.
    int subNodeNum = curSelectedCategoryNode.GetNodeCount(true);
    return subNodeNum > 0;
}
/// <summary>
/// DoubleClick handler of the category tree: a selected node that was not
/// initialized yet gets initialized via initSingleTreeNode and is then expanded.
/// Nothing happens without a selection or for an already initialized node.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void trvCategoryTree_DoubleClick(object sender, EventArgs e)
{
    if (trvCategoryTree.SelectedNode == null)
    {
        return;
    }

    if (categoryTreeNodeHasInitialized(trvCategoryTree.SelectedNode))
    {
        // already initialized: deliberately left alone
        //trvCategoryTree.SelectedNode.Toggle();
        return;
    }

    initSingleTreeNode(trvCategoryTree.SelectedNode);
    trvCategoryTree.SelectedNode.Expand();
}
/// <summary>
/// Shows the current category selection in txbCurFullCategoryName: a fixed
/// notice when nothing is selected, otherwise a header line with the count
/// followed by one numbered line (1-based) per selected category.
/// </summary>
private void updateSelectionNotice()
{
    if (curSelTreeNodeList.Count == 0)
    {
        txbCurFullCategoryName.Text = "Not select any category";
        return;
    }

    // Build the whole notice first and assign it once, instead of reading and
    // re-writing the text box content for every selected category.
    StringBuilder notice = new StringBuilder();
    notice.Append(String.Format("Total select {0} categories:", curSelTreeNodeList.Count));
    for (int idx = 0; idx < curSelTreeNodeList.Count; idx++)
    {
        TreeNode eachSelectedTreeNode = curSelTreeNodeList[idx];
        string fullCategoryName = getFullCategoryName(eachSelectedTreeNode);
        notice.Append(Environment.NewLine);
        notice.Append(String.Format("[{0}] {1}", idx + 1, fullCategoryName));
    }

    txbCurFullCategoryName.Text = notice.ToString();
}
// AfterSelect handler of the category tree.
// sender / e: standard event arguments, not used here.
private void trvCategoryTree_AfterSelect(object sender, TreeViewEventArgs e)
{
// refresh the notice that lists the currently selected categories
updateSelectionNotice();
}
/// <summary>
/// ItemClicked handler of the selection context menu: adds the tree's currently
/// selected node to, or removes it from, the list of selected categories
/// (highlighting / un-highlighting it accordingly) and refreshes the notice.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Tells which menu item was clicked.</param>
private void cmsSelection_ItemClicked(object sender, ToolStripItemClickedEventArgs e)
{
    TreeNode curSelTreeNode = trvCategoryTree.SelectedNode;

    // The menu can be opened while no node is selected (the right-click handler
    // assigns whatever GetNodeAt returns). Without this guard a null entry was
    // added to the selection list and handed to the highlight helper.
    if (curSelTreeNode != null)
    {
        if (e.ClickedItem == tsmiAddToSelection)
        {
            if (!curSelTreeNodeList.Contains(curSelTreeNode))
            {
                // add to selection
                curSelTreeNodeList.Add(curSelTreeNode);
                //highlight node
                crl.highlightNode(trvCategoryTree, curSelTreeNode);
            }
        }
        else if (e.ClickedItem == tsmiRemoveFromSelection)
        {
            if (curSelTreeNodeList.Contains(curSelTreeNode))
            {
                //remove selection
                curSelTreeNodeList.Remove(curSelTreeNode);
                //unhighlight node
                crl.unHighlightNode(trvCategoryTree, curSelTreeNode);
            }
        }
    }

    updateSelectionNotice();
}
/// <summary>
/// MouseUp handler of the category tree: on a right click, makes the node that
/// GetNodeAt reports for the mouse position the tree's selected node.
/// Other mouse buttons are ignored.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Mouse button and position of the event.</param>
private void trvCategoryTree_MouseUp(object sender, MouseEventArgs e)
{
    if (e.Button != MouseButtons.Right)
    {
        return;
    }

    // Select the clicked node
    TreeNode nodeUnderCursor = trvCategoryTree.GetNodeAt(e.X, e.Y);
    trvCategoryTree.SelectedNode = nodeUnderCursor;
}
/****************************** AWS ********************************/
}
}【总结】
转载请注明:在路上 » 【代码分享】C#代码:ScrapeAmazonProduct – 抓取Amazon产品数据(主要从AWS API抓取,其次再从网页中抓取)