13.4. 去除HTML的标签tag:htmlRemoveTag


    /*
     * [Function]
     * remove html tags (and html comments), retain only the text content
     * [Input]
     * html: html string, with tags; may be null or empty
     * 
     * [Output]
     * the InnerText of every top-level node of the parsed html, concatenated
     * in document order; empty string when the input is null or empty
     * 
     * [Note]
     * 1. html comment nodes are removed from the parsed document before the
     *    text is collected
     * 2. the text is returned exactly as InnerText provides it; no additional
     *    entity decoding or whitespace trimming is done here
     */
    public string htmlRemoveTag(string html)
    {
        // nothing to strip: keep the "empty in, empty out" contract
        if (string.IsNullOrEmpty(html))
        {
            return "";
        }

        HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
        htmlDoc.LoadHtml(html);

        // 1. remove all comments
        // the XPath query yields null (rather than an empty list) when there is
        // no comment node, hence the null check before iterating
        HtmlNodeCollection commentNodeList = htmlDoc.DocumentNode.SelectNodes("//comment()");
        if (commentNodeList != null)
        {
            // commentNodeList is a separate collection, so detaching nodes from
            // their parents while iterating over it is safe
            foreach (HtmlNode comment in commentNodeList)
            {
                comment.ParentNode.RemoveChild(comment);
            }
        }

        // 2. collect the text of every remaining top-level node
        // a StringBuilder avoids re-copying the accumulated text on each append
        System.Text.StringBuilder filteredHtml = new System.Text.StringBuilder();
        foreach (HtmlNode node in htmlDoc.DocumentNode.ChildNodes)
        {
            filteredHtml.Append(node.InnerText);
        }

        return filteredHtml.ToString();
    }

    

例 13.4. htmlRemoveTag 的使用范例


            HtmlAgilityPack.HtmlDocument htmlDoc = crl.htmlToHtmlDoc(googleSearchRespHtml);
            HtmlNodeCollection liNodeList = htmlDoc.DocumentNode.SelectNodes("//li[@class='g']");
            foreach (HtmlNode liNode in liNodeList)
            {
                HtmlNode h3ANode = liNode.SelectSingleNode(".//h3[@class='r']/a");
                if (h3ANode != null)
                {
                    googleSearchResultItem singleResultItem = new googleSearchResultItem();

                    //string titleHtml = h3ANode.InnerHtml; //"Amritanandamayi Math to <em>sponsor charity</em> events - Times Of India"
                    string titleHtml = h3ANode.InnerText; //"Amritanandamayi Math to sponsor charity events - Times Of India"
                    string filteredTitle = crl.htmlRemoveTag(titleHtml);