蜘蛛爬虫

原创

mb61b856e04bb98 2021-12-15 17:52:54 博主文章分类：C# ©著作权

文章标签 html xml 递归调用 文章分类 代码人生

©著作权归作者所有：来自51CTO博客作者mb61b856e04bb98的原创作品，请联系作者获取转载授权，否则将追究法律责任

using System;

using System.Collections.Generic;

using System.IO;

using System.Net;

using System.Text.RegularExpressions;

namespace ConsoleApplication1
{
    /// <summary>
    /// Minimal console web crawler ("spider"): downloads a start page, extracts the
    /// absolute URLs it contains, and keeps following newly discovered URLs.
    /// </summary>
    class Program
    {
        // Absolute http/https URLs found in page source.
        // NOTE(review): only the tail of this pattern (" ./?%&=]*)?") survived in the
        // published listing; the head is reconstructed - confirm against the
        // author's original if it is available.
        private static readonly Regex UrlPattern =
            new Regex(@"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?", RegexOptions.Compiled);

        // Links that point at static resources rather than pages. This is a
        // substring test (not anchored to the end of the URL), as in the original
        // filter; the dots are escaped so ".png" no longer matches "<any char>png".
        private static readonly Regex SkippedExtensionPattern =
            new Regex(@"\.(png|jpg|gif|bmp|js|css|xls|doc|pdf|chw|exe|mp3|mp4|avi|swf|xml)", RegexOptions.Compiled);

        /// <summary>
        /// Entry point. Uses the first command-line argument as the start URL, or
        /// reads one line from standard input when no argument is given.
        /// </summary>
        /// <param name="args">Optional: args[0] is the start URL.</param>
        static void Main(string[] args)
        {
            // URLs that still need to be parsed
            List<string> list = new List<string>();

            // URLs that have already been parsed
            List<string> listCount = new List<string>();

            // NOTE(review): the start URL literal is missing from the published
            // listing, so it is taken from the caller instead of being hard-coded.
            string seed = (args != null && args.Length > 0) ? args[0] : Console.ReadLine();
            if (string.IsNullOrWhiteSpace(seed))
            {
                Console.Error.WriteLine("No start URL supplied.");
                return;
            }

            list.Add(seed.Trim());
            ReadHtml(list, listCount);
        }

        /// <summary>
        /// Crawls every URL in <paramref name="list"/> that is not yet in
        /// <paramref name="listCount"/>, prints each link found, and then crawls the
        /// links discovered on those pages, level by level, until no new links remain.
        /// </summary>
        /// <remarks>
        /// The crawl is not bounded by depth or page count. A page that cannot be
        /// downloaded or parsed is reported on standard error and skipped; it stays in
        /// <paramref name="listCount"/> so it is not retried.
        /// </remarks>
        /// <param name="list">URLs to parse. Not modified.</param>
        /// <param name="listCount">
        /// URLs already parsed. Every URL this method attempts is appended to it.
        /// </param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="list"/> or <paramref name="listCount"/> is null.
        /// </exception>
        public static void ReadHtml(List<string> list, List<string> listCount)
        {
            if (list == null)
            {
                throw new ArgumentNullException("list");
            }

            if (listCount == null)
            {
                throw new ArgumentNullException("listCount");
            }

            // Iterating level by level (instead of recursing once per URL) keeps the
            // call stack flat and guarantees that links from every page are followed.
            List<string> pending = list;
            while (pending.Count > 0)
            {
                List<string> discovered = new List<string>();

                foreach (string url in pending)
                {
                    // Skip anything that has already been parsed.
                    if (listCount.Contains(url))
                    {
                        continue;
                    }

                    // Mark as parsed before downloading so a failing URL is not retried.
                    listCount.Add(url);

                    try
                    {
                        string html = DownloadPage(url);
                        CollectLinks(html, url, discovered);
                        Console.WriteLine("----------------------解析完一个页面!--------------------");
                    }
                    catch (Exception ex)
                    {
                        // Best effort: one bad page must not stop the crawl, but the
                        // failure is reported instead of being silently discarded.
                        Console.Error.WriteLine(url + ": " + ex.Message);
                    }
                }

                pending = discovered;
            }
        }

        /// <summary>
        /// Downloads <paramref name="url"/> and returns the response body as text.
        /// </summary>
        /// <remarks>
        /// The body is always decoded as GB2312, so pages served in another encoding
        /// will not decode correctly.
        /// NOTE(review): on .NET Core / .NET 5+ the GB2312 code page may need
        /// CodePagesEncodingProvider to be registered - confirm for the target runtime.
        /// </remarks>
        private static string DownloadPage(string url)
        {
            WebRequest request = WebRequest.Create(url);

            // Dispose response, stream and reader so the connection is released.
            using (WebResponse response = request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, System.Text.Encoding.GetEncoding("GB2312")))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Prints every link in <paramref name="html"/> that is worth following and
        /// appends it to <paramref name="found"/>. Links that contain the URL of the
        /// page itself, or that point at a static resource, are skipped.
        /// </summary>
        private static void CollectLinks(string html, string pageUrl, List<string> found)
        {
            foreach (Match match in UrlPattern.Matches(html))
            {
                string link = match.Value;

                // Plain substring comparison: the page URL is data, not a regex.
                if (link.IndexOf(pageUrl, StringComparison.Ordinal) >= 0)
                {
                    continue;
                }

                if (SkippedExtensionPattern.IsMatch(link))
                {
                    continue;
                }

                Console.WriteLine(link);
                found.Add(link);
            }
        }
    }
}

上一篇：javascript计算器小程序

下一篇：Program Single-cell...

提问和评论都可以，用心的回复会被更多人看到评论

发布评论

相关文章

官方博客	全部文章	热门标签	班级博客
了解我们	网站地图	意见反馈

鸿蒙开发者社区	51CTO学堂
51CTO	软考资讯