用XPATH解析网页并抓取要的内容

HTML解析器有很多种,最常用的是HtmlAgilityPack和SgmlReader(http://sourceforge.net/projects/dekiwiki/files/SgmlReader/)。



这里使用的是HtmlAgilityPack:



下载地址:http://htmlagilitypack.codeplex.com


也可以用nuget


To install HtmlAgilityPack, run the following command in the Package Manager Console


PM> Install-Package HtmlAgilityPack



同时官网提供了一个自动生成xpath路径的工具HAP Explorer。(http://htmlagilitypack.codeplex.com/downloads/get/120936)



关于 XPath 表达式的语法以及相关教程,可参见 W3C 的 XPath 规范或 W3School 的 XPath 教程。



获取HTML的方式有很多种:



1.通过HttpWebRequest类可实现模拟登录并获取页面信息



2.用第三方控件模拟登录


如下:


1.实现模拟登录:



从http://code.google.com/p/autotester/downloads/list 下载AutoTesterLib.dll


在项目里添加AutoTesterLib.dll的引用.



代码如下:



3 分析网页内容

/// <summary>
/// Opens the login page in an AutoTesterLib browser session, types the
/// credentials into the form and clicks the submit button.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Page_Load(object sender, EventArgs e)
{
    const string loginPage = "http://rencai.baidu.com/user/login.jsp";

    TestSession session = new HTMLTestSession();
    session.Browser.Start(loginPage);

    // Controls can be looked up by any attribute, e.g. "name=1;type=password";
    // several attributes are separated with semicolons.
    session.Objects.TextBox("name=username").Input("用户名");
    session.Objects.TextBox("name=password").Input("密码");
    session.Objects.Button("type=submit").Click();
}


使用方法:


首先在项目中引用 HtmlAgilityPack 的 DLL 文件,并在代码文件顶部添加 using HtmlAgilityPack;




根据XPath提取内容的函数: 


/// <summary>
/// Selects every node that matches an XPath expression and concatenates
/// the inner HTML of those nodes.
/// </summary>
/// <param name="content">HTML markup to search.</param>
/// <param name="xpath">XPath expression selecting the nodes to extract.</param>
/// <param name="separ">
/// Separator appended after each matched node's HTML, so a non-empty result
/// also ends with this separator.
/// </param>
/// <returns>
/// The concatenated inner HTML of the matched nodes, or an empty string when
/// <paramref name="content"/> is null/empty or no node matches.
/// </returns>
public static string GetStrByXPath(string content, string xpath, string separ)
{
    // Nothing to parse: report "no match" instead of letting LoadHtml throw on null.
    if (string.IsNullOrEmpty(content))
    {
        return string.Empty;
    }

    var doc1 = new HtmlDocument();
    doc1.LoadHtml(content);

    // HtmlAgilityPack's SelectNodes returns null (not an empty collection)
    // when the expression matches nothing, so it must be checked before looping.
    var repeatNodes = doc1.DocumentNode.SelectNodes(xpath);
    if (repeatNodes == null)
    {
        return string.Empty;
    }

    var text = new System.Text.StringBuilder();
    foreach (var node in repeatNodes)
    {
        // InnerHtml keeps nested markup; use node.InnerText instead to get plain text.
        text.Append(node.InnerHtml).Append(separ);
    }

    return text.ToString();
}


例如用 //div[@itemprop='articleBody'] 这个xpath可以抓取 http://news.sohu.com/20151029/n424555111.shtml 这个新闻的正文内容



一段例子代码

/// <summary>
/// Downloads a news page and extracts its title, publication time and
/// article body using XPath queries plus a few clean-up regular expressions.
/// </summary>
/// <param name="newsUrl">Address of the news page to scrape.</param>
/// <returns>
/// A <see cref="News"/> filled from the page, or <c>null</c> when the page
/// could not be downloaded or an error occurred while extracting its parts.
/// </returns>
public async Task<News> NewsGathering(string newsUrl)
{
    // Fetch the complete page source.
    var strContent = await HttpHelper.GetContentAsync(newsUrl, Encoding.UTF8);
    if (string.IsNullOrEmpty(strContent))
    {
        // An empty download leaves nothing to extract.
        return null;
    }

    try
    {
        // Title and publication time.
        var title = StrHelperUtil.GetStrByXPath(strContent, "//h1[@class='art_title_h1']", "");
        var pubTime = DateTime.Now;
        var strTime = StrHelperUtil.GetStrByXPath(strContent, "//time", "");
        strTime = StrHelperUtil.FormatHTML(strTime).Replace("\n", "").Replace("\\n", "");

        // NOTE(review): only strings longer than 18 characters are parsed, and only
        // their first 17 characters - presumably this matches the site's time format; confirm.
        if (strTime.Length > 18)
        {
            // TryParse resets its out argument to DateTime.MinValue on failure, so parse
            // into a temporary and keep the "now" default unless parsing succeeds.
            DateTime parsedTime;
            if (DateTime.TryParse(strTime.Substring(0, 17), out parsedTime))
            {
                pubTime = parsedTime;
            }
        }

        // Article body.
        var content = StrHelperUtil.GetStrByXPath(strContent, "//section[@data-sudaclick='articleContent']", "");

        // Strip the parts of the body section that are not article text.
        content = Regex.Replace(content, "<h1 class=\"art_title_h1\">[\\s\\S]*?</h1>", "");
        content = Regex.Replace(content, "<time>[\\s\\S]*?</time>", "");
        content = Regex.Replace(content, "<aside>[\\s\\S]*?</aside>", "");
        content = Regex.Replace(content, "<script type=\"comos/ver\">[\\s\\S]*?</script>", "");
        content = Regex.Replace(content, "<section class=\"M_attitude\" data-pl=\"attitude\" data-sudaclick=\"attitude\">[\\s\\S]*?</section>", "");
        // [\s\S] instead of '.' so HTML comments spanning several lines are removed too.
        content = Regex.Replace(content, "<!--[\\s\\S]*?-->", "");

        return new News
        {
            Content = content,
            Title = title,
            PubDate = pubTime,
            From = newsUrl
        };
    }
    catch (Exception ex)
    {
        // Best effort: callers get null for a page that cannot be processed,
        // but the reason is traced rather than silently discarded.
        System.Diagnostics.Trace.TraceWarning("NewsGathering failed for {0}: {1}", newsUrl, ex.Message);
        return null;
    }
}


解释:

1 content = Regex.Replace(content, "<time>[\\s\\S]*?</time>", ""); 这行代码用正则把以 <time> 开头、以 </time> 结尾的字符串(含首尾标签)替换为空字符串,也就是把它删除。

[\\s\\S]*? 是非贪婪(最短)匹配。例如对于
<time>2015-9-10 12:25</time>afkjldkfkljkl<time>2015-10-11 11:25</time>
只会分别匹配 <time>2015-9-10 12:25</time> 和 <time>2015-10-11 11:25</time> 这两段。
如果写成 [\\s\\S]*(去掉后面的 ?),则是贪婪匹配,会一次匹配整段 <time>2015-9-10 12:25</time>afkjldkfkljkl<time>2015-10-11 11:25</time>

附上获取网页内容的类 

里面有些方法是多余的

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Web;

namespace Net.Tools
{
/// <summary>
/// Helpers for downloading web pages with <see cref="HttpWebRequest"/> while
/// sending browser-like request headers.
/// </summary>
public class HttpHelper
{
    /// <summary>User-Agent of a desktop Chrome browser.</summary>
    private const string DesktopUserAgent =
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36";

    /// <summary>User-Agent of an Android handset browser.</summary>
    private const string MobileUserAgent =
        "CoolPad8750_CMCC_TD/1.0 Linux/3.4.5 Android/4.2.1 Release/06.31.2013 Browser/1.0 Profile/MIDP-1.0 Configuration/CLDC-1.0";

    /// <summary>x-requested-with value sent by the cookie-less GET helpers.</summary>
    private const string AndroidBrowserRequestedWith = "com.android.browser";

    /// <summary>x-requested-with value that marks the request as an AJAX call.</summary>
    private const string AjaxRequestedWith = "XMLHttpRequest";

    /// <summary>
    /// Finds the charset declared in a &lt;meta&gt; tag. Only the charset token itself is
    /// captured, so attributes following it can never end up in the encoding name.
    /// </summary>
    private static readonly Regex MetaCharsetPattern = new Regex(
        "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9_.:\\-]+)",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Downloads a page with a desktop browser User-Agent and a fresh cookie container.
    /// </summary>
    /// <param name="strUrl">Address of the page.</param>
    /// <param name="encoder">Encoding used to decode the response body.</param>
    /// <returns>The page text with NUL characters removed, or an empty string on any failure.</returns>
    public static string GetContent(string strUrl, Encoding encoder)
    {
        return FetchOrEmpty(strUrl, encoder, new CookieContainer(), DesktopUserAgent, AndroidBrowserRequestedWith);
    }

    /// <summary>
    /// Asynchronously downloads a page with a desktop browser User-Agent and a fresh cookie container.
    /// </summary>
    /// <param name="strUrl">Address of the page.</param>
    /// <param name="encoder">Encoding used to decode the response body.</param>
    /// <returns>The page text with NUL characters removed, or an empty string on any failure.</returns>
    public static Task<string> GetContentAsync(string strUrl, Encoding encoder)
    {
        return FetchOrEmptyAsync(strUrl, encoder, new CookieContainer(), DesktopUserAgent, AndroidBrowserRequestedWith);
    }

    /// <summary>
    /// Asynchronously downloads a page while identifying as a mobile (Android) browser.
    /// </summary>
    /// <param name="strUrl">Address of the page.</param>
    /// <param name="encoder">Encoding used to decode the response body.</param>
    /// <returns>The page text with NUL characters removed, or an empty string on any failure.</returns>
    public static Task<string> GetContentByMobileAgentAsync(string strUrl, Encoding encoder)
    {
        return FetchOrEmptyAsync(strUrl, encoder, new CookieContainer(), MobileUserAgent, AndroidBrowserRequestedWith);
    }

    /// <summary>
    /// Downloads a page using the caller's cookie container, e.g. one filled by
    /// <see cref="PostLogin"/>, and marks the request as an AJAX call.
    /// </summary>
    /// <param name="strUrl">Address of the page.</param>
    /// <param name="encoder">Encoding used to decode the response body.</param>
    /// <param name="cc">Cookie container attached to the request.</param>
    /// <returns>The page text with NUL characters removed, or an empty string on any failure.</returns>
    public static string GetContent(string strUrl, Encoding encoder, CookieContainer cc)
    {
        return FetchOrEmpty(strUrl, encoder, cc, DesktopUserAgent, AjaxRequestedWith);
    }

    /// <summary>
    /// Downloads a page and, when no encoding is supplied, detects it from the page itself.
    /// Unlike the other GET helpers this method lets request failures propagate to the caller.
    /// </summary>
    /// <param name="strUrl">Address of the page; also sent as the Referer header.</param>
    /// <param name="encoder">
    /// Encoding used to decode the body. Pass <c>null</c> to detect it from the page's
    /// &lt;meta&gt; charset, then the response's character set, and finally fall back to UTF-8.
    /// </param>
    /// <returns>The page text with NUL characters removed.</returns>
    public static string GetContent2(string strUrl, Encoding encoder)
    {
        HttpWebRequest request = CreateGetRequest(strUrl, new CookieContainer(), DesktopUserAgent, AjaxRequestedWith);
        request.Referer = strUrl;

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (encoder != null)
            {
                return ReadBody(response, encoder);
            }

            // No encoding given: buffer the raw bytes first so they can be sniffed
            // for a charset declaration and then decoded with the detected encoding.
            byte[] rawResponse;
            using (Stream body = response.GetResponseStream())
            using (MemoryStream buffer = new MemoryStream())
            {
                body.CopyTo(buffer, 10240);
                rawResponse = buffer.ToArray();
            }

            Encoding detected = DetectEncoding(rawResponse, response.CharacterSet);
            return detected.GetString(rawResponse).Replace("\0", "");
        }
    }

    /// <summary>
    /// Posts form data to a login URL and collects the cookies of the response.
    /// </summary>
    /// <param name="postData">Form body (application/x-www-form-urlencoded); must not be null.</param>
    /// <param name="requestUrlString">Address the form is posted to.</param>
    /// <param name="cookie">
    /// Container that receives the response cookies; a new container is created
    /// and handed back when the caller passes <c>null</c>.
    /// </param>
    /// <returns>The response body decoded as UTF-8.</returns>
    public static string PostLogin(string postData, string requestUrlString, ref CookieContainer cookie)
    {
        // UTF-8 is identical to ASCII for ASCII input but does not turn other characters into '?'.
        byte[] data = Encoding.UTF8.GetBytes(postData);

        HttpWebRequest myRequest = (HttpWebRequest)WebRequest.Create(requestUrlString);
        myRequest.Method = "POST";
        myRequest.ContentType = "application/x-www-form-urlencoded";
        myRequest.ContentLength = data.Length;
        // A container must be attached, otherwise the response's Cookies collection stays empty.
        myRequest.CookieContainer = new CookieContainer();

        using (Stream requestBody = myRequest.GetRequestStream())
        {
            requestBody.Write(data, 0, data.Length);
        }

        using (HttpWebResponse myResponse = (HttpWebResponse)myRequest.GetResponse())
        {
            if (cookie == null)
            {
                cookie = new CookieContainer();
            }
            cookie.Add(myResponse.Cookies);

            using (Stream body = myResponse.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }
    }

    /// <summary>Builds the GET request with the browser-like headers shared by every download helper.</summary>
    private static HttpWebRequest CreateGetRequest(string strUrl, CookieContainer cookies, string userAgent, string requestedWith)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);

        request.Method = "GET";
        request.CookieContainer = cookies;
        request.KeepAlive = true;
        request.ContentType = "text/html";
        request.UserAgent = userAgent;
        request.Headers.Add("x-requested-with", requestedWith);
        request.Headers.Add(HttpRequestHeader.AcceptLanguage, "zh-CN,zh;q=0.8,en;q=0.6,nl;q=0.4,zh-TW;q=0.2");
        request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
        // Compressed responses are inflated transparently before they are read.
        request.AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip;
        // Follow redirects so the result is the page that was redirected to.
        request.AllowAutoRedirect = true;
        request.Headers.Add("Accept-Encoding", "gzip, deflate");

        return request;
    }

    /// <summary>Reads the whole response body with the given encoding and removes NUL characters.</summary>
    private static string ReadBody(WebResponse response, Encoding encoder)
    {
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, encoder))
        {
            return reader.ReadToEnd().Replace("\0", "");
        }
    }

    /// <summary>Performs a blocking GET and converts every failure into an empty string.</summary>
    private static string FetchOrEmpty(string strUrl, Encoding encoder, CookieContainer cookies, string userAgent, string requestedWith)
    {
        try
        {
            HttpWebRequest request = CreateGetRequest(strUrl, cookies, userAgent, requestedWith);
            using (WebResponse response = request.GetResponse())
            {
                return ReadBody(response, encoder);
            }
        }
        catch (Exception ex)
        {
            // Best effort by design: callers test for an empty string, so only trace the cause.
            System.Diagnostics.Trace.TraceWarning("HttpHelper: GET {0} failed: {1}", strUrl, ex.Message);
            return string.Empty;
        }
    }

    /// <summary>
    /// Performs a GET with asynchronous I/O (no thread is parked while waiting for the
    /// network) and converts every failure into an empty string.
    /// </summary>
    private static async Task<string> FetchOrEmptyAsync(string strUrl, Encoding encoder, CookieContainer cookies, string userAgent, string requestedWith)
    {
        try
        {
            HttpWebRequest request = CreateGetRequest(strUrl, cookies, userAgent, requestedWith);
            using (WebResponse response = await request.GetResponseAsync().ConfigureAwait(false))
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, encoder))
            {
                string text = await reader.ReadToEndAsync().ConfigureAwait(false);
                return text.Replace("\0", "");
            }
        }
        catch (Exception ex)
        {
            // Best effort by design: callers test for an empty string, so only trace the cause.
            System.Diagnostics.Trace.TraceWarning("HttpHelper: GET {0} failed: {1}", strUrl, ex.Message);
            return string.Empty;
        }
    }

    /// <summary>
    /// Chooses the encoding of a downloaded page: the charset of its &lt;meta&gt; tag first,
    /// then the charset reported by the response, and UTF-8 when neither is usable.
    /// </summary>
    private static Encoding DetectEncoding(byte[] rawResponse, string headerCharset)
    {
        // The declaration itself is plain ASCII, so an ASCII decode is enough to find it.
        string probe = Encoding.ASCII.GetString(rawResponse, 0, rawResponse.Length);

        Match meta = MetaCharsetPattern.Match(probe);
        Encoding detected = meta.Success ? TryGetEncoding(meta.Groups[1].Value) : null;
        if (detected == null)
        {
            detected = TryGetEncoding(headerCharset);
        }

        return detected ?? Encoding.UTF8;
    }

    /// <summary>Resolves an encoding by name; returns null instead of throwing for empty or unknown names.</summary>
    private static Encoding TryGetEncoding(string name)
    {
        if (string.IsNullOrWhiteSpace(name))
        {
            return null;
        }

        try
        {
            return Encoding.GetEncoding(name.Trim());
        }
        catch (ArgumentException)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }
}
}