This project is read-only.
HtmlProcessor is the main class for HTML parsing.

Features:
  • HTML parsing;
  • searching DOM elements by tag name, id, class name, etc.;
  • LINQ to DOM.

Example:
string html = "<html><head><title>My page</title></head><body>" +
                    "<div class=\"divClass\" id=\"divId\">Div text.</div></body></html>";
HtmlProcessor proc = new HtmlProcessor(html);
var elements = proc.Elements;
DomElement body = proc.Body;
string source = proc.DocumentSource; // Gets source HTML
string title = proc.Title; // My page
string innerText = proc.InnerText; // Div text.
bool hasFlash = proc.HasFlash; // false
string divClass = proc.GetElementById("divId").Class; // divClass
string divClassByTag = proc.GetElementsByTagName("div").FirstOrDefault().Class; // divClass
string metaDesc = proc.MetaDescription; // null
string metaKeyword = proc.MetaKeywords; // null
List<DomElement> headers = proc.Headers;
List<DataTable> htmlTables = proc.HtmlTables;
List<DomElement> images = proc.Images;
List<DomElement> links = proc.Links;
  
proc = new HtmlProcessor(new Uri("http://microsoft.com/"));
  
// LINQ to DOM example (queries the parsed elements in memory)
var q = from e in proc.Elements
        where e.TagName == "img" && e.Attributes["class"].IsNotNullOrEmpty() == true
        select e.Attributes["class"];

Last edited Mar 11, 2011 at 4:02 PM by akrakovetsky, version 2

Comments

No comments yet.