先看效果:

(效果图:取得搜索引擎综合新闻并分页)

      左边为GOOGLE的关于“上海”的新闻,右边为Baidu中关于“上海”的新闻。由于GOOGLE的新闻有图片及相关新闻摘要,因此,左边十条,右边列出八十条与之对应。

      目前已初具雏形,使用 C# 编写。

贴出部分核心代码:

------------------------------------------------------

(以下仅做研究之用,勿用于其他目的。)

------------------------------------------------------

 

// INews.cs
using System;
using System.Collections.Generic;
using System.Text;namespace NewsFromSearchingEngine
{
/// <summary>
/// Paging and error state shared by news sources.
/// </summary>
public interface INews
{
    /// <summary>Gets or sets the current page number.</summary>
    int PageNo { get; set; }

    /// <summary>Gets or sets the number of news items shown per page.</summary>
    int PerPageCount { get; set; }

    /// <summary>Gets or sets the sequence number of the first news item.</summary>
    int NewsStartNo { get; set; }

    /// <summary>Gets or sets whether an error has occurred.</summary>
    bool IsError { get; set; }
}
}// INewsFromSearching.cs
using System;
using System.Collections.Generic;
using System.Text;namespace NewsFromSearchingEngine
{
/// <summary>
/// Configuration and status of a news source that is scraped from a
/// search-engine result page using start/end marker strings.
/// </summary>
public interface INewsFromSearching : INews
{
    // ----- Source of the page -----

    /// <summary>Gets or sets the URL the news page is fetched from.</summary>
    string FromUrl { get; set; }

    /// <summary>Gets or sets the search keywords.</summary>
    string SearchKeywords { get; set; }

    // ----- Fetch status -----

    /// <summary>Gets or sets whether fetching the news failed.</summary>
    bool IsErrorForGetNews { get; set; }

    /// <summary>Gets or sets the message shown when fetching the news fails.</summary>
    string ErrorMsgForGetNews { get; set; }

    // ----- Marker strings used to cut the page apart -----

    /// <summary>Gets or sets the marker at which the page head to be removed starts.</summary>
    string StartFilterHtmlCode { get; set; }

    /// <summary>Gets or sets the marker at which the page head to be removed ends.</summary>
    string EndFilterHtmlCode { get; set; }

    /// <summary>Gets or sets the marker at which the CSS code starts.</summary>
    string StartFilterCssCode { get; set; }

    /// <summary>Gets or sets the marker at which the CSS code ends.</summary>
    string EndFilterCssCode { get; set; }

    /// <summary>Gets or sets the marker at which the page footer starts.</summary>
    string StartFilterFooterCode { get; set; }

    /// <summary>Gets or sets the marker at which the page footer ends.</summary>
    string EndFilterFooterCode { get; set; }

    /// <summary>Gets or sets whether the footer end marker itself is removed too.</summary>
    bool IsTrimFooterEnd { get; set; }
}
}// NewsFromASearching.cs
using System;
using System.Collections.Generic;
using System.Text;namespace NewsFromSearchingEngine
{
/// <summary>
/// Base class for fetching news from a search engine.
/// Every member is virtual and has a placeholder implementation that simply
/// returns its own signature as text; derived classes override the members
/// they support.
/// </summary>
public abstract class NewsFromASearching
{
    /// <summary>
    /// Fetches news from the given URL using the given encoding.
    /// </summary>
    /// <param name="url">URL to fetch the news from.</param>
    /// <param name="enc">Text encoding of the response.</param>
    /// <returns>The news content (here: the placeholder signature text).</returns>
    public virtual string GetNews(string url, Encoding enc)
    {
        return "GetNews(string url, Encoding enc)";
    }

    /// <summary>
    /// Fetches news from the given URL.
    /// </summary>
    /// <param name="url">URL to fetch the news from.</param>
    /// <returns>The news content (here: the placeholder signature text).</returns>
    public virtual string GetNews(string url)
    {
        return "GetNews(string url)";
    }

    /// <summary>
    /// Fetches news using the given encoding; the URL is supplied by other means.
    /// </summary>
    /// <param name="enc">Text encoding of the response.</param>
    /// <returns>The news content (here: the placeholder signature text).</returns>
    public virtual string GetNews(Encoding enc)
    {
        return "GetNews(Encoding enc)";
    }

    /// <summary>
    /// Filters the HTML of the given news content.
    /// </summary>
    /// <param name="newsContent">News content to filter.</param>
    /// <returns>The filtered HTML (here: the placeholder signature text).</returns>
    public virtual string FilterNews(string newsContent)
    {
        return "FilterNews(string newsContent)";
    }

    /// <summary>
    /// Extracts the CSS-related code from the given news content.
    /// </summary>
    /// <param name="newsContent">News content to read the CSS from.</param>
    /// <returns>The CSS code (here: the placeholder signature text).</returns>
    public virtual string GetCss(string newsContent)
    {
        // Previously returned "FilterNews(string newsContent)", a copy-paste slip:
        // every other placeholder names its own method.
        return "GetCss(string newsContent)";
    }

    /// <summary>
    /// Handles the footer part of the given news content.
    /// </summary>
    /// <param name="newsContent">News content to process.</param>
    /// <returns>The processed content (here: the placeholder signature text).</returns>
    public virtual string FilterFooter(string newsContent)
    {
        return "FilterFooter(string newsContent)";
    }

    /// <summary>
    /// Removes the code between a start marker and an end marker from the content.
    /// </summary>
    /// <param name="newsContent">Content to process.</param>
    /// <param name="startFilterHtmlCode">Start marker.</param>
    /// <param name="endFilterHtmlCode">End marker.</param>
    /// <param name="isTrimEnd">Whether the end marker itself is removed as well.</param>
    /// <returns>The processed content (here: the placeholder signature text).</returns>
    public virtual string FilterStartToEnd(string newsContent, string startFilterHtmlCode, string endFilterHtmlCode, bool isTrimEnd)
    {
        return "FilterStartToEnd(string newsContent, string startFilterHtmlCode, string endFilterHtmlCode, bool isTrimEnd)";
    }
}
}// NewsFromSearchingEngine.cs
using System;
using System.Web.UI;
using System.Collections.Generic;
using System.Text;
using System.Web;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;namespace NewsFromSearchingEngine
{
/// <summary>
/// Downloads a search-engine news page over HTTP, cuts it down to the news
/// fragment with caller-configured start/end marker strings, and renders a
/// simple pager for the hosting ASP.NET page.
/// All marker matching is ordinal (exact characters, no culture rules).
/// </summary>
public class NewsFromSearchingEngine : NewsFromASearching, INewsFromSearching
{
    /// <summary>Upper page bound used by the two-argument AddPager overload.</summary>
    private const int DefaultMaxPageNumber = 999;

    private string _newsFromSiteName = "google";
    private int _perPageCount = 10;
    private int _newsStartNo = 0;
    private string _errorMsgForGetNews = @"<b>Sorry,发生意外错误!</b><br />可能由于网络通讯不畅而暂时无法访问或者参数有误。<br />";
    private bool _isErrorForGetNews = false;
    private string _fromUrl = string.Empty;
    private string _searchKeywords = "上海";
    private string _startFilterHtmlCode = string.Empty;
    private string _endFilterHtmlCode = string.Empty;
    private string _startFilterCssCode = string.Empty;
    private string _endFilterCssCode = string.Empty;
    private string _startFilterFooterCode = string.Empty;
    private string _endFilterFooterCode = string.Empty;
    private bool _isTrimFooterEnd = true;
    private bool _isError = false;
    private int _pageNo = 1;

    /// <summary>
    /// Gets or sets the name of the site the news comes from. GetCss uses it to
    /// pick the stylesheet link: names starting with "google" or "baidu" are known.
    /// </summary>
    public string NewsFromSiteName
    {
        get { return _newsFromSiteName; }
        set { _newsFromSiteName = value; }
    }

    /// <summary>
    /// Gets or sets the number of news items shown per page (default 10).
    /// Not read by this class itself.
    /// </summary>
    public int PerPageCount
    {
        get { return _perPageCount; }
        set { _perPageCount = value; }
    }

    /// <summary>
    /// Gets or sets the sequence number of the first news item (default 0).
    /// Not read by this class itself.
    /// </summary>
    public int NewsStartNo
    {
        get { return _newsStartNo; }
        set { _newsStartNo = value; }
    }

    /// <summary>
    /// Gets or sets the HTML message returned by GetNews when the request fails.
    /// </summary>
    public string ErrorMsgForGetNews
    {
        get { return _errorMsgForGetNews; }
        set { _errorMsgForGetNews = value; }
    }

    /// <summary>
    /// Gets or sets whether the last GetNews call failed. GetNews sets it on
    /// every call; FilterNews returns its input untouched while it is true.
    /// </summary>
    public bool IsErrorForGetNews
    {
        get { return _isErrorForGetNews; }
        set { _isErrorForGetNews = value; }
    }

    /// <summary>
    /// Gets or sets the URL used by <see cref="GetNews(Encoding)"/>.
    /// </summary>
    public string FromUrl
    {
        get { return _fromUrl; }
        set { _fromUrl = value; }
    }

    /// <summary>
    /// Gets or sets the search keywords (default "上海").
    /// Not read by this class itself.
    /// </summary>
    public string SearchKeywords
    {
        get { return _searchKeywords; }
        set { _searchKeywords = value; }
    }

    /// <summary>
    /// Gets or sets the marker at which the page head to be removed starts.
    /// </summary>
    public string StartFilterHtmlCode
    {
        get { return _startFilterHtmlCode; }
        set { _startFilterHtmlCode = value; }
    }

    /// <summary>
    /// Gets or sets the marker at which the page head ends and the actual news
    /// content begins. FilterNews removes everything from
    /// <see cref="StartFilterHtmlCode"/> up to and including this marker (the
    /// CSS is extracted separately and kept), then strips the footer.
    /// </summary>
    public string EndFilterHtmlCode
    {
        get { return _endFilterHtmlCode; }
        set { _endFilterHtmlCode = value; }
    }

    /// <summary>
    /// Gets or sets the marker at which the CSS code starts.
    /// </summary>
    public string StartFilterCssCode
    {
        get { return _startFilterCssCode; }
        set { _startFilterCssCode = value; }
    }

    /// <summary>
    /// Gets or sets the marker at which the CSS code ends.
    /// </summary>
    public string EndFilterCssCode
    {
        get { return _endFilterCssCode; }
        set { _endFilterCssCode = value; }
    }

    /// <summary>
    /// Gets or sets the marker at which the page footer to be removed starts.
    /// </summary>
    public string StartFilterFooterCode
    {
        get { return _startFilterFooterCode; }
        set { _startFilterFooterCode = value; }
    }

    /// <summary>
    /// Gets or sets the marker at which the page footer to be removed ends.
    /// </summary>
    public string EndFilterFooterCode
    {
        get { return _endFilterFooterCode; }
        set { _endFilterFooterCode = value; }
    }

    /// <summary>
    /// Gets or sets whether FilterFooter also removes the footer end marker
    /// itself (default true).
    /// </summary>
    public bool IsTrimFooterEnd
    {
        get { return _isTrimFooterEnd; }
        set { _isTrimFooterEnd = value; }
    }

    /// <summary>
    /// Gets or sets whether the last FilterNews call failed. FilterNews sets it
    /// to true on failure and back to false on success.
    /// </summary>
    public bool IsError
    {
        get { return _isError; }
        set { _isError = value; }
    }

    /// <summary>
    /// Gets or sets the current page number (default 1); AddPager starts its
    /// page list at this number.
    /// </summary>
    public int PageNo
    {
        get { return _pageNo; }
        set { _pageNo = value; }
    }

    /// <summary>
    /// Fetches the page at <paramref name="url"/> and decodes it with
    /// <paramref name="enc"/>. Sets <see cref="IsErrorForGetNews"/> on every call.
    /// </summary>
    /// <param name="url">URL to fetch the news from.</param>
    /// <param name="enc">Text encoding of the response.</param>
    /// <returns>
    /// The response text, or, when a <see cref="WebException"/> occurs,
    /// <see cref="ErrorMsgForGetNews"/> followed by an HTML comment naming the URL.
    /// </returns>
    public override string GetNews(string url, Encoding enc)
    {
        WebRequest request = WebRequest.Create(url);
        request.ContentType = "application/x-www-form-urlencoded";
        request.Method = "GET";

        try
        {
            // using-blocks release the connection and streams even if reading throws.
            using (WebResponse response = request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, enc))
            {
                string content = reader.ReadToEnd();
                _isErrorForGetNews = false;
                return content;
            }
        }
        catch (WebException)
        {
            StringBuilder sbError = new StringBuilder();
            sbError.Append(_errorMsgForGetNews);
            sbError.Append(@"<!--");
            sbError.Append(@"所请求的网址是:<br />");
            // Encoded so that a URL containing "-->" cannot close the comment
            // and inject markup into the page.
            sbError.Append(HttpUtility.HtmlEncode(url));
            sbError.Append(@"-->");

            _isErrorForGetNews = true;
            return sbError.ToString();
        }
    }

    /// <summary>
    /// Fetches the page at <paramref name="url"/>, decoding it with
    /// <see cref="Encoding.Default"/>.
    /// </summary>
    /// <param name="url">URL to fetch the news from.</param>
    /// <returns>See <see cref="GetNews(string, Encoding)"/>.</returns>
    public override string GetNews(string url)
    {
        return GetNews(url, Encoding.Default);
    }

    /// <summary>
    /// Fetches the page at <see cref="FromUrl"/> using the given encoding.
    /// </summary>
    /// <param name="enc">Text encoding of the response.</param>
    /// <returns>
    /// An empty string when <see cref="FromUrl"/> is null or empty; otherwise
    /// see <see cref="GetNews(string, Encoding)"/>.
    /// </returns>
    public override string GetNews(Encoding enc)
    {
        if (string.IsNullOrEmpty(this.FromUrl))
        {
            return string.Empty;
        }
        return GetNews(this.FromUrl, enc);
    }

    /// <summary>
    /// Reduces a fetched page to the news fragment: removes the head (from
    /// <see cref="StartFilterHtmlCode"/> through <see cref="EndFilterHtmlCode"/>),
    /// prepends a line break and the CSS from <see cref="GetCss(string, bool)"/>,
    /// and finally strips the footer via <see cref="FilterFooter"/>.
    /// </summary>
    /// <param name="newsContent">Page content as returned by GetNews.</param>
    /// <param name="isLinkCss">
    /// True to prepend a stylesheet link instead of the CSS cut from the page.
    /// </param>
    /// <returns>
    /// The filtered HTML; <paramref name="newsContent"/> unchanged when
    /// <see cref="IsErrorForGetNews"/> is true; or a fixed error message (with
    /// <see cref="IsError"/> set to true) when any step fails, e.g. a marker is missing.
    /// </returns>
    public string FilterNews(string newsContent, bool isLinkCss)
    {
        if (_isErrorForGetNews)
        {
            // The content is already an error message; there is nothing to filter.
            return newsContent;
        }

        try
        {
            string withoutHead = RemoveBetween(newsContent, _startFilterHtmlCode, _endFilterHtmlCode, true);

            StringBuilder sb = new StringBuilder();
            sb.AppendLine();
            sb.Append(GetCss(newsContent, isLinkCss));
            sb.Append(withoutHead);

            string result = FilterFooter(sb.ToString());
            // Reset so a failure from an earlier call does not stick after a success.
            _isError = false;
            return result;
        }
        catch (Exception)
        {
            _isError = true;
            return "<b>抱歉,发生意外错误!</b><br />可能由于网络通讯不畅,暂时无法访问,或参数有误。";
        }
    }

    /// <summary>
    /// Same as <see cref="FilterNews(string, bool)"/> with the CSS cut from the page.
    /// </summary>
    /// <param name="newsContent">Page content as returned by GetNews.</param>
    /// <returns>See <see cref="FilterNews(string, bool)"/>.</returns>
    public override string FilterNews(string newsContent)
    {
        return FilterNews(newsContent, false);
    }

    /// <summary>
    /// Same as <see cref="GetCss(string, bool)"/> with the CSS cut from the page.
    /// </summary>
    /// <param name="newsContent">Page content to read the CSS from.</param>
    /// <returns>See <see cref="GetCss(string, bool)"/>.</returns>
    public override string GetCss(string newsContent)
    {
        return GetCss(newsContent, false);
    }

    /// <summary>
    /// Produces the CSS to put in front of the news fragment.
    /// </summary>
    /// <param name="newsContent">Page content to read the CSS from.</param>
    /// <param name="isLinkCss">
    /// True to return a stylesheet link chosen by <see cref="NewsFromSiteName"/>
    /// ("gNews.css" for google, "bNews.css" for baidu, empty otherwise);
    /// false to cut the CSS out of <paramref name="newsContent"/>.
    /// </param>
    /// <returns>
    /// The link element, or the text from <see cref="StartFilterCssCode"/> up to
    /// and including <see cref="EndFilterCssCode"/>.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// A CSS marker is not found (the end marker is searched from the start marker on).
    /// </exception>
    public string GetCss(string newsContent, bool isLinkCss)
    {
        if (isLinkCss)
        {
            // The CSS is expected to live in a separate file; only link to it.
            if (this.NewsFromSiteName.StartsWith("google", StringComparison.Ordinal))
            {
                return @"<link href=""gNews.css"" rel=""stylesheet"" type=""text/css"" />";
            }
            if (this.NewsFromSiteName.StartsWith("baidu", StringComparison.Ordinal))
            {
                return @"<link href=""bNews.css"" rel=""stylesheet"" type=""text/css"" />";
            }
            return string.Empty;
        }

        int startIndex = IndexOfMarker(newsContent, _startFilterCssCode, 0, "CSS start marker");
        int endIndex = IndexOfMarker(newsContent, _endFilterCssCode, startIndex, "CSS end marker");

        // Keep everything from the start marker up to the end marker, then
        // re-append the end marker so the CSS block is properly closed.
        return newsContent.Substring(startIndex, endIndex - startIndex) + _endFilterCssCode;
    }

    /// <summary>
    /// Removes the footer, i.e. the text from <see cref="StartFilterFooterCode"/>
    /// to <see cref="EndFilterFooterCode"/>. The end marker itself is removed
    /// only when <see cref="IsTrimFooterEnd"/> is true.
    /// </summary>
    /// <param name="newsContent">Content to strip the footer from.</param>
    /// <returns>The content without the footer.</returns>
    /// <exception cref="ArgumentOutOfRangeException">A footer marker is not found.</exception>
    public override string FilterFooter(string newsContent)
    {
        // Honors IsTrimFooterEnd; this used to be hard-coded to true, which made
        // the property a no-op.
        return FilterStartToEnd(newsContent, _startFilterFooterCode, _endFilterFooterCode, _isTrimFooterEnd);
    }

    /// <summary>
    /// Removes the text between a start marker and an end marker from the content.
    /// The start marker is always removed; the end marker is searched from the
    /// position of the start marker onwards.
    /// </summary>
    /// <param name="newsContent">Content to process.</param>
    /// <param name="startFilterHtmlCode">Start marker.</param>
    /// <param name="endFilterHtmlCode">End marker.</param>
    /// <param name="isTrimEnd">Whether the end marker itself is removed as well.</param>
    /// <returns>The content with the marked span removed.</returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// The start marker is not found, or the end marker is not found at or after it.
    /// </exception>
    public override string FilterStartToEnd(string newsContent, string startFilterHtmlCode, string endFilterHtmlCode, bool isTrimEnd)
    {
        return RemoveBetween(newsContent, startFilterHtmlCode, endFilterHtmlCode, isTrimEnd);
    }

    /// <summary>
    /// Builds the pager HTML with a default upper bound of 999 pages.
    /// </summary>
    /// <param name="sessionKeywords">Search keywords put into each link's "kw" parameter.</param>
    /// <param name="perPageNumberDisplay">How many page numbers to list.</param>
    /// <returns>The pager HTML.</returns>
    public string AddPager(string sessionKeywords, int perPageNumberDisplay)
    {
        return AddPager(sessionKeywords, perPageNumberDisplay, DefaultMaxPageNumber);
    }

    /// <summary>
    /// Builds the pager HTML: a "first page" link when past page 1, a "previous
    /// page" link when past page 2, then up to <paramref name="perPageNumberDisplay"/>
    /// page numbers starting at <see cref="PageNo"/> (the current one highlighted,
    /// the others as links), never going beyond <paramref name="maxPageNumber"/>.
    /// Links point at the file name of the current request with "pn" and "kw"
    /// query parameters; requires <c>HttpContext.Current</c> to be available.
    /// </summary>
    /// <param name="sessionKeywords">Search keywords put into each link's "kw" parameter.</param>
    /// <param name="perPageNumberDisplay">How many page numbers to list.</param>
    /// <param name="maxPageNumber">Highest page number that may be listed.</param>
    /// <returns>The pager HTML.</returns>
    public string AddPager(string sessionKeywords, int perPageNumberDisplay, int maxPageNumber)
    {
        string currentUrl = VirtualPathUtility.GetFileName(System.Web.HttpContext.Current.Request.FilePath);

        StringBuilder sbPager = new StringBuilder();
        sbPager.Append(@"<div id=""pageNum"" style=""margin-top:16px;"">");

        if (_pageNo > 1)
        {
            sbPager.Append(BuildPageLink(currentUrl, 1, sessionKeywords, "首页"));
        }
        if (_pageNo > 2)
        {
            sbPager.Append(BuildPageLink(currentUrl, _pageNo - 1, sessionKeywords, "上一页"));
        }

        for (int i = 0; i < perPageNumberDisplay; i++)
        {
            int pageNumber = _pageNo + i;
            if (maxPageNumber < pageNumber)
            {
                break;
            }

            if (pageNumber == _pageNo)
            {
                // The current page is highlighted and not clickable.
                sbPager.Append("<font color=red size+><b>");
                sbPager.Append(pageNumber.ToString() + "</b></font>   ");
            }
            else
            {
                sbPager.Append(BuildPageLink(currentUrl, pageNumber, sessionKeywords, "[" + pageNumber.ToString() + "]"));
            }
        }

        sbPager.Append(@"</div>");
        return sbPager.ToString();
    }

    /// <summary>
    /// Renders one pager link followed by the three-character separator.
    /// </summary>
    private static string BuildPageLink(string pageFile, int pageNumber, string keywords, string linkText)
    {
        string href = pageFile + "?pn=" + pageNumber.ToString() + "&kw=" + keywords;

        // Attribute-encode the whole URL so keywords containing quotes or angle
        // brackets cannot break out of the href attribute.
        // NOTE(review): keywords are placed in the query string as given; if callers
        // pass raw (not URL-encoded) text, it should also be URL-encoded here — confirm.
        return @"<a href=""" + HttpUtility.HtmlAttributeEncode(href) + @""">" + linkText + "</a>   ";
    }

    /// <summary>
    /// Removes the span from the start marker up to the end marker, optionally
    /// including the end marker; throws when either marker is missing.
    /// </summary>
    private static string RemoveBetween(string content, string startMarker, string endMarker, bool includeEndMarker)
    {
        int startIndex = IndexOfMarker(content, startMarker, 0, "start marker");
        // Search from the start marker on: an earlier occurrence of the end
        // marker must not produce a negative or bogus removal length.
        int endIndex = IndexOfMarker(content, endMarker, startIndex, "end marker");

        int removeLength = endIndex - startIndex;
        if (includeEndMarker)
        {
            removeLength += endMarker.Length;
        }
        return content.Remove(startIndex, removeLength);
    }

    /// <summary>
    /// Ordinal search for a marker starting at <paramref name="searchFrom"/>;
    /// throws <see cref="ArgumentOutOfRangeException"/> when it is not found.
    /// </summary>
    private static int IndexOfMarker(string content, string marker, int searchFrom, string markerDescription)
    {
        int index = content.IndexOf(marker, searchFrom, StringComparison.Ordinal);
        if (index < 0)
        {
            throw new ArgumentOutOfRangeException("newsContent", "The " + markerDescription + " was not found in the content.");
        }
        return index;
    }
}
}

 

OK!