用XPATH解析网页并抓取要的内容
HTML解析器有很多种,最常用的是HtmlAgilityPack和SgmlReader(http://sourceforge.net/projects/dekiwiki/files/SgmlReader/)。
这里使用的是HtmlAgilityPack:
下载地址:http://htmlagilitypack.codeplex.com
也可以用nuget
To install HtmlAgilityPack, run the following command in the Package Manager Console
PM> Install-Package HtmlAgilityPack
同时官网提供了一个自动生成xpath路径的工具HAP Explorer。(http://htmlagilitypack.codeplex.com/downloads/get/120936)
关于XPath表达式的语法以及相关教程,可参见W3C的XPath规范(https://www.w3.org/TR/xpath/)。
获取HTML的方式有很多种:
1.通过HttpWebRequest类可实现模拟登录并获取页面信息
2.用第三方控件模拟登录
如下:
1.实现模拟登录:
从http://code.google.com/p/autotester/downloads/list 下载AutoTesterLib.dll
在项目里添加AutoTesterLib.dll的引用.
代码如下:
3 分析网页内容
/// <summary>
/// Page load handler that drives a simulated login: starts the AutoTesterLib
/// browser on the login page, types the credentials and clicks the submit button.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Page_Load(object sender, EventArgs e)
{
    TestSession session = new HTMLTestSession();
    session.Browser.Start("http://rencai.baidu.com/user/login.jsp");

    // Controls are looked up by "property=value" filters; any property can be
    // used (e.g. name=1;type=password) and several filters are joined with ';'.
    session.Objects.TextBox("name=username").Input("用户名");
    session.Objects.TextBox("name=password").Input("密码");

    // Submit the login form.
    session.Objects.Button("type=submit").Click();
}
使用方法:
首先引用HtmlAgilityPack的DLL文件 using HtmlAgilityPack;
根据XPath提取内容的函数:
/// <summary>
/// Extracts the inner HTML of every node selected by an XPath expression and
/// concatenates the results.
/// </summary>
/// <param name="content">HTML markup to search.</param>
/// <param name="xpath">XPath expression selecting the nodes to extract.</param>
/// <param name="separ">Separator appended after each matched node's HTML (including the last one).</param>
/// <returns>
/// The concatenated inner HTML of the matched nodes, or an empty string when
/// <paramref name="content"/> is null/empty or no node matches.
/// </returns>
public static string GetStrByXPath(string content, string xpath, string separ)
{
    if (string.IsNullOrEmpty(content))
    {
        return string.Empty;
    }

    var document = new HtmlDocument();
    document.LoadHtml(content);

    // SelectNodes yields null (not an empty collection) when nothing matches,
    // so it must be checked before enumerating.
    var matches = document.DocumentNode.SelectNodes(xpath);
    if (matches == null)
    {
        return string.Empty;
    }

    // InnerHtml is used (rather than InnerText) so the markup inside each node is kept.
    var result = new System.Text.StringBuilder();
    foreach (var node in matches)
    {
        result.Append(node.InnerHtml).Append(separ);
    }

    return result.ToString();
}
例如用 //div[@itemprop='articleBody'] 这个xpath可以抓取 http://news.sohu.com/20151029/n424555111.shtml 这个新闻的正文内容
一段例子代码
/// <summary>
/// Downloads a news page and extracts its title, publication time and body.
/// </summary>
/// <param name="newsUrl">Address of the news article to scrape.</param>
/// <returns>
/// A <c>News</c> instance populated from the page, or <c>null</c> when the
/// page could not be downloaded or parsed.
/// </returns>
public async Task<News> NewsGathering(string newsUrl)
{
    // Fetch the full page markup.
    var strContent = await HttpHelper.GetContentAsync(newsUrl, Encoding.UTF8);
    if (string.IsNullOrEmpty(strContent))
    {
        // Nothing was downloaded, so there is nothing to parse.
        return null;
    }

    try
    {
        // Title and publication time.
        var title = StrHelperUtil.GetStrByXPath(strContent, "//h1[@class='art_title_h1']", "");
        var strTime = StrHelperUtil.GetStrByXPath(strContent, "//time", "");
        strTime = StrHelperUtil.FormatHTML(strTime).Replace("\n", "").Replace("\\n", "");

        // Fall back to "now" unless a timestamp can actually be parsed.
        // TryParse resets its out argument to DateTime.MinValue on failure,
        // so it must not write directly into the fallback value.
        var pubTime = DateTime.Now;
        if (strTime.Length > 18)
        {
            DateTime parsedTime;
            if (DateTime.TryParse(strTime.Substring(0, 17), out parsedTime))
            {
                pubTime = parsedTime;
            }
        }

        // Article body.
        var content = StrHelperUtil.GetStrByXPath(strContent, "//section[@data-sudaclick='articleContent']", "");

        // Strip fragments that do not belong to the article text.
        content = Regex.Replace(content, "<h1 class=\"art_title_h1\">[\\s\\S]*?</h1>", "");
        content = Regex.Replace(content, "<time>[\\s\\S]*?</time>", "");
        content = Regex.Replace(content, "<aside>[\\s\\S]*?</aside>", "");
        content = Regex.Replace(content, "<script type=\"comos/ver\">[\\s\\S]*?</script>", "");
        content = Regex.Replace(content, "<section class=\"M_attitude\" data-pl=\"attitude\" data-sudaclick=\"attitude\">[\\s\\S]*?</section>", "");
        // [\s\S] instead of '.' so that HTML comments spanning several lines are removed too.
        content = Regex.Replace(content, "<!--[\\s\\S]*?-->", "");

        return new News
        {
            Content = content,
            Title = title,
            PubDate = pubTime,
            From = newsUrl
        };
    }
    catch (Exception)
    {
        // Scraping is best-effort: any extraction failure is reported to the caller as null.
        return null;
    }
}
解释:
1 content = Regex.Replace(content, "<time>[\\s\\S]*?</time>", ""); 这一句用正则把以<time>开头、以</time>结尾的字符串(含首尾标签)替换为空字符串,即将其删除。
2 [\\s\\S]*? 是非贪婪(最短)匹配。例如对于下面的字符串:
<time>2015-9-10 12:25</time>afkjldkfkljkl<time>2015-10-11 11:25</time>
只会匹配 <time>2015-9-10 12:25</time> 和 <time>2015-10-11 11:25</time>
如果是这样 [\\s\\S]* 不要后面的?则是贪婪匹配 则匹配 <time>2015-9-10 12:25</time>afkjldkfkljkl<time>2015-10-11 11:25</time>
附上获取网页内容的类
里面有些方法是多余的
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Web;
namespace Net.Tools
{
    /// <summary>
    /// Helpers for downloading web pages with <see cref="HttpWebRequest"/> while
    /// presenting browser-like request headers.
    /// </summary>
    public class HttpHelper
    {
        /// <summary>User-Agent of a desktop Chrome browser.</summary>
        private const string DesktopUserAgent =
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36";

        /// <summary>User-Agent of an Android handset browser.</summary>
        private const string MobileUserAgent =
            "CoolPad8750_CMCC_TD/1.0 Linux/3.4.5 Android/4.2.1 Release/06.31.2013 Browser/1.0 Profile/MIDP-1.0 Configuration/CLDC-1.0";

        /// <summary>Value of the x-requested-with header sent by the Android stock browser.</summary>
        private const string AndroidRequestedWith = "com.android.browser";

        /// <summary>Value of the x-requested-with header sent by AJAX calls.</summary>
        private const string AjaxRequestedWith = "XMLHttpRequest";

        /// <summary>
        /// Finds the charset declared in a meta tag. Only the charset token itself is
        /// captured, so attributes that follow it inside the same tag are ignored.
        /// </summary>
        private static readonly Regex MetaCharsetPattern = new Regex(
            "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9_.:\\-]+)",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Downloads the content of a web page.
        /// </summary>
        /// <param name="strUrl">Address of the page.</param>
        /// <param name="encoder">Encoding used to decode the response body.</param>
        /// <returns>The page text with NUL characters removed, or an empty string if the request fails.</returns>
        public static string GetContent(string strUrl, Encoding encoder)
        {
            return FetchOrEmpty(strUrl, encoder, new CookieContainer(), DesktopUserAgent, AndroidRequestedWith);
        }

        /// <summary>
        /// Downloads the content of a web page without blocking the calling thread.
        /// </summary>
        /// <param name="strUrl">Address of the page.</param>
        /// <param name="encoder">Encoding used to decode the response body.</param>
        /// <returns>The page text with NUL characters removed, or an empty string if the request fails.</returns>
        public static async Task<string> GetContentAsync(string strUrl, Encoding encoder)
        {
            // The synchronous request is deliberately offloaded to the thread pool:
            // HttpWebRequest.Timeout is only enforced for the synchronous GetResponse call.
            return await Task.Run(
                () => FetchOrEmpty(strUrl, encoder, new CookieContainer(), DesktopUserAgent, AndroidRequestedWith))
                .ConfigureAwait(false);
        }

        /// <summary>
        /// Downloads the content of a web page while identifying as a mobile browser,
        /// without blocking the calling thread.
        /// </summary>
        /// <param name="strUrl">Address of the page.</param>
        /// <param name="encoder">Encoding used to decode the response body.</param>
        /// <returns>The page text with NUL characters removed, or an empty string if the request fails.</returns>
        public static async Task<string> GetContentByMobileAgentAsync(string strUrl, Encoding encoder)
        {
            // See GetContentAsync for why the synchronous call is wrapped in Task.Run.
            return await Task.Run(
                () => FetchOrEmpty(strUrl, encoder, new CookieContainer(), MobileUserAgent, AndroidRequestedWith))
                .ConfigureAwait(false);
        }

        /// <summary>
        /// Downloads the content of a web page using a caller-supplied cookie container,
        /// e.g. one filled by a previous login.
        /// </summary>
        /// <param name="strUrl">Address of the page.</param>
        /// <param name="encoder">Encoding used to decode the response body.</param>
        /// <param name="cc">Cookie container attached to the request.</param>
        /// <returns>The page text with NUL characters removed, or an empty string if the request fails.</returns>
        public static string GetContent(string strUrl, Encoding encoder, CookieContainer cc)
        {
            return FetchOrEmpty(strUrl, encoder, cc, DesktopUserAgent, AjaxRequestedWith);
        }

        /// <summary>
        /// Downloads the content of a web page, optionally detecting its encoding.
        /// Unlike the GetContent overloads, failures are not swallowed: exceptions
        /// raised while creating or executing the request propagate to the caller.
        /// </summary>
        /// <param name="strUrl">Address of the page; also sent as the Referer header.</param>
        /// <param name="encoder">
        /// Encoding used to decode the response body. When null, the encoding is taken
        /// from the page's meta charset, then from the response's character set, and
        /// finally defaults to UTF-8.
        /// </param>
        /// <returns>The page text with NUL characters removed.</returns>
        public static string GetContent2(string strUrl, Encoding encoder)
        {
            HttpWebRequest request = CreateGetRequest(
                strUrl, new CookieContainer(), DesktopUserAgent, AjaxRequestedWith, strUrl);

            string text;
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream body = response.GetResponseStream())
            {
                if (encoder == null)
                {
                    // Buffer the raw bytes first so they can be decoded once the charset is known.
                    byte[] raw;
                    using (MemoryStream buffer = new MemoryStream())
                    {
                        body.CopyTo(buffer, 10240);
                        raw = buffer.ToArray();
                    }

                    text = DetectEncoding(raw, response.CharacterSet).GetString(raw);
                }
                else
                {
                    using (StreamReader reader = new StreamReader(body, encoder))
                    {
                        text = reader.ReadToEnd();
                    }
                }
            }

            return StripNulls(text);
        }

        /// <summary>
        /// Posts form data to a login URL and collects the cookies set by the response.
        /// </summary>
        /// <param name="postData">URL-encoded form body to send.</param>
        /// <param name="requestUrlString">Address the form is posted to.</param>
        /// <param name="cookie">
        /// Container that receives the response cookies; a new container is created
        /// when null is passed in.
        /// </param>
        /// <returns>The response body decoded as UTF-8.</returns>
        public static string PostLogin(string postData, string requestUrlString, ref CookieContainer cookie)
        {
            // UTF-8 produces the same bytes as ASCII for ASCII input, but does not turn
            // other characters into '?'.
            byte[] payload = Encoding.UTF8.GetBytes(postData);

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(requestUrlString);
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            request.ContentLength = payload.Length;
            // A container must be attached for response.Cookies to be populated.
            request.CookieContainer = new CookieContainer();

            using (Stream requestBody = request.GetRequestStream())
            {
                requestBody.Write(payload, 0, payload.Length);
            }

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                if (cookie == null)
                {
                    cookie = new CookieContainer();
                }

                cookie.Add(response.Cookies);

                using (Stream body = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
                {
                    return reader.ReadToEnd();
                }
            }
        }

        /// <summary>Builds a GET request carrying the browser-like headers shared by all download methods.</summary>
        private static HttpWebRequest CreateGetRequest(
            string url, CookieContainer cookies, string userAgent, string requestedWith, string referer)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            // HTTP method tokens are case-sensitive, so use the canonical spelling.
            request.Method = "GET";
            request.CookieContainer = cookies;
            request.KeepAlive = true;
            request.ContentType = "text/html";
            request.UserAgent = userAgent;
            if (referer != null)
            {
                request.Referer = referer;
            }

            request.Headers.Add("x-requested-with", requestedWith);
            request.Headers.Add(HttpRequestHeader.AcceptLanguage, "zh-CN,zh;q=0.8,en;q=0.6,nl;q=0.4,zh-TW;q=0.2");
            request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
            request.AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip;
            // Follow redirects; the returned content is that of the final page.
            request.AllowAutoRedirect = true;
            request.Headers.Add("Accept-Encoding", "gzip, deflate");
            return request;
        }

        /// <summary>Performs a GET request and returns the decoded body, or an empty string on any failure.</summary>
        private static string FetchOrEmpty(
            string url, Encoding encoder, CookieContainer cookies, string userAgent, string requestedWith)
        {
            try
            {
                HttpWebRequest request = CreateGetRequest(url, cookies, userAgent, requestedWith, null);
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream body = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(body, encoder))
                {
                    return StripNulls(reader.ReadToEnd());
                }
            }
            catch (Exception ex)
            {
                // Best-effort contract: callers receive an empty string instead of an
                // exception, but the failure is traced rather than silently discarded.
                System.Diagnostics.Trace.TraceWarning("HttpHelper: request to '{0}' failed: {1}", url, ex.Message);
                return string.Empty;
            }
        }

        /// <summary>Chooses the encoding of a downloaded page: meta charset, then header charset, then UTF-8.</summary>
        private static Encoding DetectEncoding(byte[] raw, string headerCharset)
        {
            // A provisional decode is sufficient here because the meta tag itself is ASCII.
            string provisional = Encoding.Default.GetString(raw, 0, raw.Length);
            Match meta = MetaCharsetPattern.Match(provisional);

            Encoding detected = meta.Success ? TryGetEncoding(meta.Groups[1].Value) : null;
            if (detected == null)
            {
                detected = TryGetEncoding(headerCharset);
            }

            return detected ?? Encoding.UTF8;
        }

        /// <summary>Resolves an encoding by name, returning null when the name is blank or not recognised.</summary>
        private static Encoding TryGetEncoding(string name)
        {
            if (string.IsNullOrWhiteSpace(name))
            {
                return null;
            }

            try
            {
                return Encoding.GetEncoding(name.Trim().Trim('"', '\''));
            }
            catch (ArgumentException)
            {
                return null;
            }
        }

        /// <summary>Removes NUL characters, which act as string terminators for some consumers.</summary>
        private static string StripNulls(string text)
        {
            return text.Replace("\0", "");
        }
    }
}