I wrote a small crawler to back up all of my published blog posts.

It fetches the details of every published article and stores them in a database.

In C#, the AngleSharp library lets you query HTML elements with CSS selectors and then work on the results just like any other LINQ sequence, so pulling the content you need out of the page is very convenient.
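
As a minimal, self-contained sketch (it uses the same pre-1.0 AngleSharp API as the program below, where HtmlParser.Parse takes an HTML string; newer AngleSharp versions expose ParseDocument under AngleSharp.Html.Parser instead), a query looks roughly like this:

using System;
using System.Linq;
using AngleSharp.Parser.Html;

class AngleSharpDemo
{
    static void Main()
    {
        // A tiny hard-coded page stands in for a downloaded blog page.
        var html = "<ul><li class='post'>First post</li><li class='post'>Second post</li></ul>";
        var document = new HtmlParser().Parse(html);

        // CSS selector + LINQ: take the text of every .post element.
        var titles = document.QuerySelectorAll(".post")
            .Select(e => e.TextContent)
            .ToList();

        titles.ForEach(Console.WriteLine); // First post, Second post
    }
}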

The actual code, written quickly and without much polish:

using AngleSharp.Parser.Html;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;

namespace Crawler
{
    class Program
    {
        static void Main(string[] args)
        {
            Console.WriteLine("Hello World!");
            // Step 1 (run once): crawl the paged article list and store every entry.
            //for (int i = 1; i <= 10; i++)
            //{
            //    fuac("https://blog.cs.net/qq_32688731/article/list/"+i);
            //    Console.WriteLine(count);
            //}

            // Step 2: walk the saved list and download the full content of each article.
            //bds284289328_dbEntities1 db = new bds284289328_dbEntities1();
            //db.CS_Article.ToList().ForEach(r =>
            //{
            //    ff(r.Link, r.Id);
            //    System.Threading.Thread.Sleep(500);
            //});

            // Quick one-off test of f() against another page.
            f("https://user.qzone.qq.com/1439084907", -1);
            
            Console.ReadLine();
        }

        // Running total of processed items, printed as simple progress output.
        static int count = 0;

        // Scratch helper: downloads a page and selects its <body>; the result is not stored yet.
        static void f(string url, int id)
        {
            using (HttpClient http = new HttpClient())
            {
                var htmlString = http.GetStringAsync(url).Result;
                HtmlParser htmlParser = new HtmlParser();
                var data = htmlParser.Parse(htmlString)
                    .QuerySelectorAll("body")
                    .ToList();
            }
        }
        // Downloads a single article page and stores its category, body and details HTML
        // as a CS_Details row linked back to the CS_Article record with the given id.
        static void ff(string url, int id)
        {
            using (HttpClient http = new HttpClient())
            {
                var htmlString = http.GetStringAsync(url).Result;
                HtmlParser htmlParser = new HtmlParser();
                var data = htmlParser.Parse(htmlString)
                    .QuerySelectorAll("#main")
                    .Select(t => new details_itme()
                    {
                        // href of the category link, or "-1" when the article has no category.
                        ArticleType = t.QuerySelectorAll(".subItem_t a").Length == 0 ? "-1" : t.QuerySelectorAll(".subItem_t a").FirstOrDefault().GetAttribute("href"),

                        // Article body and details blocks, with newlines stripped.
                        ArticleContent = t.QuerySelectorAll(".article_content").FirstOrDefault().InnerHtml.Trim().Replace("\n", ""),
                        ArticleDetails = t.QuerySelectorAll("#article_details").FirstOrDefault().InnerHtml.Trim().Replace("\n", ""),
                    })
                    .ToList();
                bds284289328_dbEntities1 db = new bds284289328_dbEntities1();
                foreach (var item in data)
                {
                    CS_Details cS_Details = new CS_Details();
                    // The category id is the last URL segment,
                    // e.g. https://blog.cs.net/qq_32688731/article/category/6568994 -> 6568994
                    string temp = item.ArticleType.Substring(item.ArticleType.LastIndexOf("/") + 1);
                    cS_Details.ArticleType = int.Parse(temp);
                    cS_Details.ArticleContent = item.ArticleContent;
                    cS_Details.ArticleDetails = item.ArticleDetails;

                    cS_Details.ArticleListId = id;

                    db.CS_Details.Add(cS_Details);
                    count++;
                    Console.WriteLine(count);
                }
                db.SaveChanges();
            }
        }


        // Crawls one page of the article list and stores every entry as a CS_Article row.
        static void fuac(string url)
        {
            using (HttpClient http = new HttpClient())
            {
                var htmlString = http.GetStringAsync(url).Result;
                HtmlParser htmlParser = new HtmlParser();
                var data = htmlParser.Parse(htmlString)
                    .QuerySelectorAll(".list_item")
                    .Select(t => new list_item()
                    {
                        // Original posts carry the .ico_type_Original badge; everything else is treated as a repost.
                        article_type = t.QuerySelectorAll(".ico_type_Original").FirstOrDefault() != null ? 1 : 0,


                        article_link = t.QuerySelectorAll(".link_title a").FirstOrDefault().GetAttribute("href"),
                        article_title = t.QuerySelectorAll(".link_title").FirstOrDefault().TextContent,
                        article_description = t.QuerySelectorAll(".article_description").FirstOrDefault().TextContent,

                        article_postdate = t.QuerySelectorAll(".link_postdate").FirstOrDefault().TextContent,
                        article_view = t.QuerySelectorAll(".link_view").FirstOrDefault().TextContent,
                        article_comments = t.QuerySelectorAll(".link_comments").FirstOrDefault().TextContent,
                    })
                    .ToList();
                bds284289328_dbEntities1 db = new bds284289328_dbEntities1();
                foreach (var item in data)
                {
                    CS_Article cS_Article = new CS_Article();
                    cS_Article.Type = item.article_type;

                    cS_Article.Link = item.article_link.Trim().Replace("\n", "");
                    cS_Article.Title = item.article_title.Trim().Replace("\n", "");
                    cS_Article.Description = item.article_description.Trim().Replace("\n", "");

                    cS_Article.Postdate = Convert.ToDateTime(item.article_postdate.Trim().Replace("\n", ""));
                    // Strip the "阅读(...)" / "评论(...)" wrappers to get the bare numbers.
                    cS_Article.ViewCount = Convert.ToInt32(item.article_view.Trim().Replace("\n", "").Replace("阅读(", "").Replace(")", ""));
                    cS_Article.Comments = Convert.ToInt32(item.article_comments.Trim().Replace("\n", "").Replace("评论(", "").Replace(")", ""));
                    db.CS_Article.Add(cS_Article);
                    System.Threading.Thread.Sleep(100);
                    count++;
                }
                db.SaveChanges();
            }
        }
    }

    class list_item
    {
        // Article type: 1 = original, 0 = repost
        public int article_type { get; set; }

        // Article link
        public string article_link { get; set; }

        // Article title
        public string article_title { get; set; }

        // Article summary
        public string article_description { get; set; }

        // Publish date
        public string article_postdate { get; set; }

        // View count
        public string article_view { get; set; }

        // Comment count
        public string article_comments { get; set; }
    }

    class details_itme
    {
        // Article category (href of the category link, or "-1")
        public string ArticleType { get; set; }

        // Article body HTML
        public string ArticleContent { get; set; }

        // Article details block HTML
        public string ArticleDetails { get; set; }

        // Foreign key back to the article list (not populated in the code above)
        public string ArticleListId { get; set; }
    }
}
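
The post doesn't show the Entity Framework context or the two mapped tables the crawler writes to (bds284289328_dbEntities1, CS_Article, CS_Details). A rough sketch of what they would need to look like, with the member names taken from the usage above and every type and mapping detail being an assumption, is:

using System;
using System.Data.Entity;

// Hypothetical sketch of the entities the crawler writes to; only the member
// names are known from the code above, everything else is guessed.
public class CS_Article
{
    public int Id { get; set; }
    public int Type { get; set; }             // 1 = original, 0 = repost
    public string Link { get; set; }
    public string Title { get; set; }
    public string Description { get; set; }
    public DateTime Postdate { get; set; }
    public int ViewCount { get; set; }
    public int Comments { get; set; }
}

public class CS_Details
{
    public int Id { get; set; }
    public int ArticleType { get; set; }      // category id parsed from the URL
    public string ArticleContent { get; set; }
    public string ArticleDetails { get; set; }
    public int ArticleListId { get; set; }    // FK back to CS_Article.Id
}

// The generated context would expose one DbSet per table.
public class bds284289328_dbEntities1 : DbContext
{
    public DbSet<CS_Article> CS_Article { get; set; }
    public DbSet<CS_Details> CS_Details { get; set; }
}

The original project used a database-first model generated from an existing database, so the real classes will differ in detail; this is only meant to show the shape the code above relies on.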
