The UriHtmlExtractor class can be used to download the HTML content of a web page.

Features:
  • POST and GET support;
  • ability to change the User-Agent;
  • proxy support;
  • CSS and HTML standards validation, and checks for the existence of robots.txt and favicon.ico;
  • calculation of Google PageRank.

Minimal example:

UriHtmlExtractor ext = new UriHtmlExtractor(new Uri("http://google.com/"));

Full example:

NameValueCollection postParameters = new NameValueCollection();
postParameters.Add("name", "value");  // adding some POST parameter
postParameters.Add("submit", "true"); // adding some POST parameter
 
// Let's be IE9
string userAgent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)";
 
UriHtmlExtractor ext = new UriHtmlExtractor(new Uri("http://google.com/"),
postParameters, new WebProxy("127.0.0.1", 80), userAgent);
 
string html = ext.DocumentHtml; // Get HTML source string 
string text = ext.DocumentText; // Get the text with HTML tags stripped
 
bool isCssValid = ext.IsCSSValid(); // checks if page is CSS-valid
bool isHtmlValid = ext.IsHtmlMarkupValid(); // checks if page is HTML-valid
bool contactUsPageExists = ext.IsPageExists("contact.php"); // checks if page exists
bool robotsExists = ext.IsRobotsTxtExists(); // checks if robots.txt exists
 
Uri contactUsUri = ext.FindPage("contact"); // search inner page with "contact" slug
if (contactUsUri != null) {
    // do something
}
  
Uri foundUri = ext.FindPage(new string[] { "contact", "feedback" }); // multiple page search
  
if (foundUri != null) {
    // do something
}
  
ext.SaveHtml("C://1.html"); // saves HTML string to file
ext.SaveText("C://1.txt");  // saves text string to file
  
int PR = ext.GooglePageRank; // checks Google PageRank

Last edited Mar 11, 2011 at 2:57 PM by akrakovetsky, version 1

Comments

No comments yet.