前两天朋友叫我模仿一个网站。刚开始时,我一个页面一个页面地查看源码并复制保存,花了很多时间,一个字:“累”。为了减轻工作量,我写了一个网站“克隆工具”,可以一键克隆。比起人工操作,
效率提高了200%以上,精确度也大大提高。下面我将我写的“网站克隆工具”的实现方法分享给大家。
一睹为快,先看看界面:
开发工具:vs2012(winform)
1.新建UrlModel模型
/// <summary>
/// Holds the parsed components of a URL used throughout the clone tool.
/// </summary>
public class UrlModel
{
    /// <summary>Scheme of the URL, e.g. "http" or "https".</summary>
    public string Scheme { get; set; }

    /// <summary>Host name, without any port suffix.</summary>
    public string Host { get; set; }

    /// <summary>TCP port of the site.</summary>
    public int Port { get; set; }

    /// <summary>Site root, e.g. "http://example.com" (port appended when non-default).</summary>
    public string RootPath { get; set; }

    /// <summary>Directory portion of the current page's URL (no trailing slash).</summary>
    public string CurrPath { get; set; }

    /// <summary>Path relative to the site root, or the raw href/src attribute value.</summary>
    public string RelatedPath { get; set; }

    /// <summary>Fully qualified URL of the page or resource.</summary>
    public string AbsoluteUri { get; set; }
}
2.新建UrlParser解析器
/// <summary>
/// Parses an absolute http/https URL string into a <see cref="UrlModel"/>.
/// </summary>
public class UrlParser
{
    // Matches "scheme://host/"; host may carry an explicit ":port".
    // Hoisted to a static readonly field so the pattern is compiled once,
    // not rebuilt on every Parse call.
    private static readonly Regex SchemeHostRegex =
        new Regex("(?<scheme>(http|https))://(?<host>.+?)/", RegexOptions.Singleline);

    /// <summary>
    /// Parses the given URL into its scheme, host, port and path components.
    /// </summary>
    /// <param name="url">Absolute URL beginning with "http:" or "https:".</param>
    /// <returns>The populated <see cref="UrlModel"/>.</returns>
    /// <exception cref="Exception">Thrown when the URL is null/too short, has the wrong scheme, or cannot be parsed.</exception>
    public static UrlModel Parse(string url)
    {
        UrlModel model = new UrlModel();
        if (url == null || url.Length < 8)
            throw new Exception("url参数不正确");
        if (!url.StartsWith("http:", StringComparison.OrdinalIgnoreCase) &&
            !url.StartsWith("https:", StringComparison.OrdinalIgnoreCase))
            throw new Exception("url格式有误");
        // Guarantee a '/' after the host part so the regex below can match
        // bare URLs like "http://example.com".
        if (url.LastIndexOf('/') < 8)
            url = url + "/";
        Match match = SchemeHostRegex.Match(url);
        if (!match.Success)
            throw new Exception("url解析失败!");
        string scheme = match.Groups["scheme"].Value;
        string host = match.Groups["host"].Value;
        // BUG FIX: https previously defaulted to port 80; the scheme's real
        // default port is used both as the fallback and to decide whether
        // RootPath needs an explicit ":port" suffix.
        int defaultPort = "https".Equals(scheme, StringComparison.OrdinalIgnoreCase) ? 443 : 80;
        if (host.Contains(":"))
        {
            string[] parts = host.Split(':');
            if (parts.Length == 2)
            {
                model.Host = parts[0];
                model.Port = int.Parse(parts[1]);
            }
        }
        else
        {
            model.Host = host;
            model.Port = defaultPort;
        }
        // Everything from the first '/' after the host is the related path.
        int index = url.IndexOf('/', 8);
        model.RelatedPath = url.Substring(index);
        model.AbsoluteUri = url;
        model.Scheme = scheme;
        model.CurrPath = url.Substring(0, url.LastIndexOf('/'));
        if (model.Port == defaultPort)
        {
            model.RootPath = string.Format("{0}://{1}", model.Scheme, model.Host);
        }
        else
        {
            // BUG FIX: original format string was "{0}://{1}:{2" (missing
            // closing brace), which made string.Format throw a
            // FormatException for every URL with a non-default port.
            model.RootPath = string.Format("{0}://{1}:{2}", model.Scheme, model.Host, model.Port);
        }
        return model;
    }
}
3.网页处理服务工具
/// <summary>
/// 网页处理服务工具
/// </summary>
public class WebPageService
{
private static string[] excludekeys = { "http:", "https:", "//", "#", "javascript:", "?", "tel:", "mailto:" };
/// <summary>
/// 获取所有html元素的href属性值,只获取站点本地的链接,站外的不获取
/// </summary>
/// <param name="html">页面的html源码</param>
/// <returns></returns>
public static List<UrlModel> GetLocalHrefs(string url,string html)
{
if (string.IsNullOrEmpty(html))
return new List<UrlModel>();
Dictionary<string, UrlModel> urls = GetHrefs(url,html);
List<UrlModel> newUrls = new List<UrlModel>();
if (null != urls)
{
foreach (string key in urls.Keys)
{
string newkey = key.ToLower();
bool iscontained = false;
foreach (var exkey in excludekeys)
{
if (newkey.IndexOf(exkey) == 0)
{
iscontained = true;
break;
}
}
if (!iscontained) {
//只获取本地路径
newUrls.Add(urls[key]);
}
}
}
return newUrls;
}
/// <summary>
/// 获取所有html元素的src属性值,只获取站点本地的链接,站外的不获取
/// </summary>
/// <param name="html">页面的html源码</param>
/// <returns></returns>
public static List<UrlModel> GetLocalSrcs(string url,string html)
{
if (string.IsNullOrEmpty(html))
return new List<UrlModel>();
Dictionary<string, UrlModel> urls = GetSrc(url, html);
List<UrlModel> newUrls = new List<UrlModel>();
if (null != urls)
{
foreach (string key in urls.Keys)
{
string newkey = key.ToLower();
bool iscontained = false;
foreach (var exkey in excludekeys)
{
if (newkey.IndexOf(exkey) == 0)
{
iscontained = true;
break;
}
}
if (!iscontained)
{
//只获取本地路径
newUrls.Add(urls[key]);
}
}
}
return newUrls;
}
private static Dictionary<string, UrlModel> GetHrefs(string url,string html)
{
if (string.IsNullOrEmpty(html))
return null;
UrlModel currUrl = UrlParser.Parse(url);
Dictionary<string, UrlModel> urls = new Dictionary<string, UrlModel>();
Regex reg = new Regex("href=\"(?<Url>.+?)\"", RegexOptions.IgnoreCase);
if (currUrl != null)
{
AddUrlModel(html, currUrl, urls, reg);
}
return urls;
}
private static Dictionary<string, UrlModel> GetSrc(string url,string html)
{
if (string.IsNullOrEmpty(html))
return null;
UrlModel currUrl = UrlParser.Parse(url);
Dictionary<string, UrlModel> urls = new Dictionary<string, UrlModel>();
Regex reg = new Regex("(src=\"(?<Url>.+?)\"|url\\((?<Url>.+?)\\))", RegexOptions.IgnoreCase);
if (currUrl != null)
{
AddUrlModel(html, currUrl, urls, reg);
}
return urls;
}
private static void AddUrlModel(string html, UrlModel currUrl, Dictionary<string, UrlModel> urls, Regex reg)
{
if (reg.IsMatch(html))
{
MatchCollection matchs = reg.Matches(html);
foreach (Match item in matchs)
{
try
{
string strUrl = item.Groups["Url"].Value;
UrlModel model = new UrlModel();
model.RelatedPath = strUrl;
model.CurrPath = currUrl.CurrPath;
model.RootPath = currUrl.RootPath;
model.Scheme = currUrl.Scheme;
model.Port = currUrl.Port;
model.Host = currUrl.Host;
if (strUrl.StartsWith("/"))
{
//绝对目录情况下
model.AbsoluteUri = string.Format("{0}{1}", model.RootPath, model.RelatedPath);
}
else
{
//相对目录情况下
string currPath = model.CurrPath;
int depth = 0;
string path = model.RelatedPath;
if (path.StartsWith(".."))
{
try
{
while (path.StartsWith(".."))
{
depth++;
path = path.Substring(3);
currPath = currPath.Substring(0, currPath.LastIndexOf("/"));
}
model.AbsoluteUri = string.Format("{0}/{1}", currPath, path);
}
catch
{
}
}
else
{
model.AbsoluteUri = string.Format("{0}/{1}", currPath, path);
}
}
strUrl = strUrl.Trim().ToLower();
urls.Add(strUrl, model);
}
catch
{
}
}
}
}
}
4.新建网站克隆接口
/// <summary>
/// Contract for a worker that clones (downloads) a web site.
/// </summary>
interface IWebCloneWorker
{
/// <summary>Starts the clone operation.</summary>
void Start();
/// <summary>Requests cancellation of a running clone operation.</summary>
void Cancel();
}
5.新建实现
/// <summary>
/// Clones a web site: collects page and resource URLs up to a configured
/// depth, then downloads each one via a <see cref="BackgroundWorker"/>,
/// raising progress/completion events along the way.
/// </summary>
public class WebCloneWorker : IWebCloneWorker
{
    // Crawl depth (0 = home page, 1 = + category pages, 2 = + detail pages).
    // NOTE(review): public, static and mutable — shared by every worker instance.
    public static int depth = 0;

    /// <summary>Address of the site to clone.</summary>
    public string Url { get; set; }

    /// <summary>Local directory the cloned files are saved under.</summary>
    public string SavePath { get; set; }

    private BackgroundWorker backgroundWorker1 = null;

    // Life-cycle callbacks surfaced to the UI.
    public event UrlChangedEventHandler UrlChanged;
    public event FileSavedSuccessEventHandler FileSavedSuccess;
    public event FileSavedFailEventHandler FileSavedFail;
    public event DownloadCompletedEventHandler DownloadCompleted;
    public event CollectingUrlEventHandler CollectingUrl;
    public event CollectedUrlEventHandler CollectedUrl;
    public event ProgressChangedEventHandler ProgressChanged;

    // All collected page/resource URLs, keyed by their related path.
    private Dictionary<string, UrlModel> _Hrefs = new Dictionary<string, UrlModel>();

    /// <summary>
    /// All collected page/resource URLs, keyed by their related path.
    /// </summary>
    public Dictionary<string, UrlModel> Hrefs
    {
        get { return _Hrefs; }
        set { _Hrefs = value; }
    }

    // Encoding name used for page requests; defaults to UTF-8.
    private string _Encoding = "utf-8";

    /// <summary>Encoding name used for page requests; defaults to UTF-8.</summary>
    public string Encoding
    {
        get { return _Encoding; }
        set { _Encoding = value; }
    }

    public WebCloneWorker() { }

    /// <summary>
    /// Creates a worker for the given site and target directory and wires up
    /// the background worker that performs the downloads.
    /// </summary>
    /// <param name="url">Site address to clone.</param>
    /// <param name="path">Directory to save the cloned files in.</param>
    /// <exception cref="Exception">Thrown when the URL or save path is empty.</exception>
    public WebCloneWorker(string url, string path)
    {
        this.Url = url;
        this.SavePath = path;
        if (string.IsNullOrEmpty(this.Url))
            throw new Exception("请输入网址");
        if (string.IsNullOrEmpty(this.SavePath))
            throw new Exception("请选择要保存的目录");
        backgroundWorker1 = new BackgroundWorker();
        // Enable progress reporting and cooperative cancellation.
        backgroundWorker1.WorkerReportsProgress = true;
        backgroundWorker1.WorkerSupportsCancellation = true;
        // Background thread body.
        backgroundWorker1.DoWork += backgroundWorker1_DoWork;
        // UI-thread callback that performs the actual file save.
        backgroundWorker1.ProgressChanged += backgroundWorker1_ProgressChanged;
        // Completion callback.
        backgroundWorker1.RunWorkerCompleted += backgroundWorker1_RunWorkerCompleted;
    }

    // Raises DownloadCompleted unless the run was cancelled.
    void backgroundWorker1_RunWorkerCompleted(object sender, RunWorkerCompletedEventArgs e)
    {
        if (e.Cancelled)
        {
            return;
        }
        if (this.DownloadCompleted != null)
        {
            DownloadCompletedEventArgs eventArgs = new DownloadCompletedEventArgs(e.Result, e.Error, e.Cancelled);
            this.DownloadCompleted(this, eventArgs);
        }
    }

    // Saves one collected URL to disk, mirroring the site's directory
    // structure under SavePath, and raises the per-file events.
    void backgroundWorker1_ProgressChanged(object sender, ProgressChangedEventArgs e)
    {
        // Progress callback.
        if (this.ProgressChanged != null)
            this.ProgressChanged(this, e);
        UrlModel model = (UrlModel)e.UserState;
        if (this.UrlChanged != null)
        {
            // Current-URL-changed callback.
            UrlChangedEventArgs eventArgs = new UrlChangedEventArgs(model);
            this.UrlChanged(this, eventArgs);
        }
        try
        {
            string dir = this.SavePath;
            string url = model.AbsoluteUri;
            // Path portion of the URL: everything from the first '/' after the host.
            string AbsolutePath = url.Substring(url.IndexOf('/', 8));
            string fileName = "";
            int queryIndex = AbsolutePath.IndexOf('?');
            if (queryIndex > 0)
            {
                // BUG FIX: the original sliced AbsolutePath with an index taken
                // from model.RelatedPath ("model.RelatedPath.IndexOf('?')").
                // For relative links RelatedPath differs from AbsolutePath, so
                // the cut landed at the wrong position or threw out-of-range.
                string path = AbsolutePath.Substring(0, queryIndex);
                fileName = System.IO.Path.GetFileName(path);
            }
            else
            {
                fileName = System.IO.Path.GetFileName(AbsolutePath);
            }
            // Directory URLs (no file name / no extension) fall back to the default document.
            if (string.IsNullOrEmpty(fileName) || fileName.IndexOf(".") < 0)
            {
                fileName = "index.html";
                if (!AbsolutePath.EndsWith("/"))
                    AbsolutePath = AbsolutePath + "/";
            }
            fileName = System.Web.HttpUtility.UrlDecode(fileName);
            string localPath = string.Format("{0}{1}", dir, System.IO.Path.GetDirectoryName(AbsolutePath));
            if (!System.IO.Directory.Exists(localPath))
            {
                System.IO.Directory.CreateDirectory(localPath);
            }
            // Skip files that were already downloaded.
            string path2 = Path.Combine(localPath, fileName);
            if (File.Exists(path2))
            {
                return;
            }
            // Download the page / image / resource file.
            HttpTool.DownFile(url, localPath, fileName);
            // Saved-successfully callback.
            if (this.FileSavedSuccess != null)
            {
                FileSavedSuccessEventArgs eventArgs = new FileSavedSuccessEventArgs(model);
                this.FileSavedSuccess(this, eventArgs);
            }
        }
        catch (Exception ex)
        {
            // Save-failed callback.
            if (this.FileSavedFail != null)
            {
                FileSavedFailEventArgs eventArgs = new FileSavedFailEventArgs(ex);
                this.FileSavedFail(this, eventArgs);
            }
        }
    }

    // Background thread: collect all URLs, then report them one at a time so
    // the ProgressChanged handler downloads them on the UI thread.
    void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
    {
        // Collect resources.
        GetResource();
        int index = 1;
        if (this.Hrefs.Keys.Count > 0)
        {
            foreach (var k in this.Hrefs.Keys)
            {
                // Honor cancellation between items.
                if (backgroundWorker1.CancellationPending)
                {
                    e.Cancel = true;
                    return;
                }
                backgroundWorker1.ReportProgress(index, this.Hrefs[k]);
                index++;
                // Throttle 200 ms so the UI thread can keep up.
                Thread.Sleep(200);
            }
        }
    }

    /// <summary>Starts the clone; no-op when a run is already in progress.</summary>
    public void Start()
    {
        if (this.backgroundWorker1.IsBusy)
            return;
        this.backgroundWorker1.RunWorkerAsync();
    }

    /// <summary>Requests cancellation of a running clone; no-op when one is already pending.</summary>
    public void Cancel()
    {
        if (this.backgroundWorker1.CancellationPending)
            return;
        this.backgroundWorker1.CancelAsync();
    }

    // Downloads the start page, collects its URLs, then raises CollectedUrl.
    private void GetResource()
    {
        string url = this.Url;
        string referer = this.Url;
        string msg = "";
        string html = HttpTool.HttpGet(url, referer, this.Encoding, out msg);
        // Collect page links.
        GetHrefs(0, url, html);
        // Collection finished.
        if (null != CollectedUrl)
        {
            UrlModel urlModel = new UrlModel();
            CollectedUrlEventArgs eventArgs = new CollectedUrlEventArgs(urlModel);
            this.CollectedUrl(this, eventArgs);
        }
    }

    // Adds one URL to Hrefs (duplicates silently ignored, as before) and
    // raises CollectingUrl. Returns false when cancellation was requested.
    // Extracted from three identical copy-paste sites in GetHrefs.
    private bool TryCollect(UrlModel urlModel)
    {
        if (backgroundWorker1.CancellationPending)
            return false;
        try
        {
            this.Hrefs.Add(urlModel.RelatedPath, urlModel);
            // Collecting callback (only raised when the URL was newly added).
            if (null != CollectingUrl)
            {
                CollectingUrlEventArgs eventArgs = new CollectingUrlEventArgs(urlModel);
                this.CollectingUrl(this, eventArgs);
            }
        }
        catch
        {
            // Duplicate RelatedPath key: already collected — best-effort ignore.
        }
        return true;
    }

    // Recursively collects page links (href) and resource links (src / css
    // url()) starting from the given page, down to the configured depth.
    private void GetHrefs(int level, string url, string html)
    {
        // Record the current page itself.
        UrlModel currUrl = UrlParser.Parse(url);
        if (!TryCollect(currUrl))
            return;
        // Page links (href attributes).
        List<UrlModel> list1 = WebPageService.GetLocalHrefs(url, html);
        // Resource files (src attributes / css url()).
        List<UrlModel> listSrcs = WebPageService.GetLocalSrcs(url, html);
        // Resources of the current page.
        if (listSrcs != null)
        {
            for (int i = 0; i < listSrcs.Count; i++)
            {
                if (!TryCollect(listSrcs[i]))
                    return;
            }
        }
        // Child pages and their resources.
        if (list1 != null)
        {
            for (int i = 0; i < list1.Count; i++)
            {
                UrlModel urlModel = list1[i];
                if (!TryCollect(urlModel))
                    return;
                // Fetch the linked page so its resource files can be collected.
                string msg = "";
                html = HttpTool.HttpGet(urlModel.AbsoluteUri, urlModel.AbsoluteUri, this.Encoding, out msg);
                listSrcs = WebPageService.GetLocalSrcs(urlModel.AbsoluteUri, html);
                if (listSrcs != null)
                {
                    for (int j = 0; j < listSrcs.Count; j++)
                    {
                        if (!TryCollect(listSrcs[j]))
                            return;
                        // Yield briefly between resources.
                        Thread.Sleep(20);
                    }
                }
                // Yield briefly between pages.
                Thread.Sleep(20);
                // BUG FIX: the original did "if (level >= depth) return;" here,
                // which exited the whole method after the FIRST link once the
                // depth limit was reached — silently dropping every remaining
                // sibling link. Reaching the depth limit should only stop the
                // recursion, not the iteration over siblings.
                if (level < depth)
                {
                    GetHrefs(level + 1, urlModel.AbsoluteUri, html);
                }
            }
        }
    }
}
6.代码有点多,各位有需要的还是下载源码查看并运行吧。