- Create new Console project
- Add a reference to HtmlAgilityPack.dll to the project
- Create class with content:
- In the Main function, type the code listed under "Main source" below:
- Completed
- Crawler class source:
/// <summary>
/// Downloads an HTML page with HtmlAgilityPack and exposes it as an
/// <see cref="XDocument"/> so it can be queried with LINQ to XML.
/// </summary>
class Crawler
{
    /// <summary>
    /// Address of the page to download. Must be set before calling
    /// <see cref="GetXDocument"/>.
    /// </summary>
    public string Url
    {
        get;
        set;
    }

    /// <summary>
    /// Creates a crawler with no target; assign <see cref="Url"/> before use.
    /// </summary>
    public Crawler() { }

    /// <summary>
    /// Creates a crawler for the given page address.
    /// </summary>
    /// <param name="Url">Address of the page to download.</param>
    public Crawler(string Url)
    {
        this.Url = Url;
    }

    /// <summary>
    /// Loads the page at <see cref="Url"/> and parses the markup of its
    /// <c>html</c> element into an <see cref="XDocument"/>.
    /// </summary>
    /// <returns>The parsed document, rooted at the page's <c>html</c> element.</returns>
    /// <exception cref="System.InvalidOperationException">
    /// <see cref="Url"/> is null or empty, or the downloaded page has no
    /// <c>html</c> element.
    /// </exception>
    public XDocument GetXDocument()
    {
        // Fail early with a clear message instead of an obscure error from the loader.
        if (string.IsNullOrEmpty(Url))
        {
            throw new System.InvalidOperationException("Url must be set before calling GetXDocument.");
        }

        HtmlAgilityPack.HtmlWeb web = new HtmlAgilityPack.HtmlWeb();
        web.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)";

        HtmlAgilityPack.HtmlDocument page = web.Load(Url);

        // NOTE(review): these options are assigned after Load, exactly as in the
        // original; confirm against the HtmlAgilityPack docs which of them still
        // influence an already-loaded document.
        page.OptionOutputAsXml = true;
        page.OptionAutoCloseOnEnd = true;
        page.OptionDefaultStreamEncoding = System.Text.Encoding.UTF8;

        // SelectSingleNode returns null when nothing matches; guard against that
        // rather than letting a NullReferenceException escape.
        var htmlNode = page.DocumentNode.SelectSingleNode("html");
        if (htmlNode == null)
        {
            throw new System.InvalidOperationException("The page at '" + Url + "' does not contain an html element.");
        }

        return XDocument.Parse(htmlNode.OuterHtml);
    }
}
- Main source
/// <summary>
/// Downloads the news listing page, selects every <c>div</c> whose class is
/// "folder-news" and that has a direct <c>a</c> child, and prints the link,
/// image, title and description of each one to the console.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    string url = "http://vnexpress.net/GL/Xa-hoi/";

    // XHTML namespace in "{namespace}" form; prepended to a local name it
    // yields the expanded name that is passed to the LINQ to XML lookups.
    string xmlns = "{http://www.w3.org/1999/xhtml}";

    Crawler cl = new Crawler(url);
    XDocument xdoc = cl.GetXDocument();

    // The query is written across several lines on purpose: collapsed onto a
    // single line, a "//" comment would swallow everything that follows it.
    var res = from item in xdoc.Descendants(xmlns + "div")
              where item.Attribute("class") != null
                    && item.Attribute("class").Value == "folder-news"
                    && item.Element(xmlns + "a") != null
              select new
              {
                  Link = item.Element(xmlns + "a").Attribute("href").Value,
                  Image = item.Element(xmlns + "a").Element(xmlns + "img").Attribute("src").Value,
                  Title = item.Elements(xmlns + "p").ElementAt(0).Element(xmlns + "a").Value,
                  Desc = item.Elements(xmlns + "p").ElementAt(1).Value
              };

    foreach (var node in res)
    {
        Console.WriteLine(node);
        Console.WriteLine("\n");
    }

    // Keep the console window open until a key is pressed.
    Console.ReadKey();
}
Done. Wish you success!
Where does the main source go? same as the Crawler.cs file?
Trả lờiXóa