因为需要每天查找漏洞信息,以判断架构和应用服务是否有新漏洞被发现(若有,则需修复升级),所以写了一个脚本:按关键字到漏洞库平台爬取数据,并生成日志文件。在这三个平台中,只有美国国家信息安全漏洞库的地址会时不时出现超时;若出现超时,可多试两次。三个平台检索出的漏洞大致相同。写得不好,仅供参考。

Python 版本:3.7
依赖:使用 pip 安装 requests 即可(pip install requests)

#coding=utf-8
import requests as r
import re
import time
import datetime

#Scraper for the national information security vulnerability platform (www.cnnvd.org.cn)
class gjxxaqpt:
    """Keyword search against the vulnerability platform at www.cnnvd.org.cn.

    ``write_file`` is the entry point. Despite its name it writes nothing:
    it returns the formatted result lines and leaves the output to the caller.
    """

    # Browser-like User-Agent sent when fetching a vulnerability detail page.
    _HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
    # Seconds to wait for the server; without a timeout requests may block
    # forever. The value itself is an arbitrary but generous choice.
    _TIMEOUT = 30

    def get_404(self,url,keyword):
        """POST a keyword search and parse the first page of results.

        Args:
            url: address of the search endpoint.
            keyword: term submitted in the ``qcvCname`` form field.

        Returns:
            A list of 5-tuples of strings, one per matched ``<li>`` entry:
            (title, detail-page path, vulnerability id, severity, date).
            The list is empty when nothing in the page matches.

        Raises:
            requests.RequestException: the request failed or timed out.
        """
        # qcvCname is the search term; pageno=1 requests only the first page.
        data = {"qcvCname":keyword,"pageno":1}
        result = r.post(url,data=data,timeout=self._TIMEOUT).text
        # The pattern is tied to the exact whitespace of the served HTML.
        return re.findall('<li  style=".*?class="a_title2" >\r\n                               \t\t  (.*?)</a>.*?<p><a href="(.*?)" target="_blank">(.*?)</a>.*?<img title="(.*?)" src=".*?<br/ >(.*?)\r\n\t\t\t\t\t\t   </div>.*?</li>',result,re.S)

    def get_404_mes(self,url):
        """Fetch one detail page and return its description paragraph.

        Args:
            url: absolute address of the vulnerability detail page.

        Returns:
            The first paragraph captured from the ``d_ldjj`` block, or ``""``
            when the request fails or the page does not match the pattern.
        """
        try:
            result = r.get(url,headers=self._HEADERS,timeout=self._TIMEOUT).text
        except r.RequestException:
            print("连接超时"+url)
            return ""
        filter_result = re.findall('<div class="d_ldjj">.*?</p><p style="text-indent:2em">\n(.*?)\r\n\t\t\t</p>',result,re.S)
        # re.findall returns an empty list (never None) when nothing matches.
        return filter_result[0] if filter_result else ""

    def write_file(self,keyword,date_time):
        """Search every keyword and return one formatted line per recent entry.

        Args:
            keyword: an iterable of search terms; a single string is also
                accepted and treated as a one-element list.
            date_time: substring (the caller passes the current year) that
                must occur in an entry's date for the entry to be kept.

        Returns:
            A list of " | "-separated strings: title, id, severity, date,
            detail URL and description. An entry whose description could not
            be fetched is still returned, with an empty description.
        """
        # A bare string would otherwise be iterated character by character.
        keywords = [keyword] if isinstance(keyword,str) else keyword
        # Search endpoint that receives the POST.
        url = "http://www.cnnvd.org.cn/web/vulnerability/queryLds.tag"
        # Prefix that turns a relative detail path into an absolute URL.
        url_domain = "http://www.cnnvd.org.cn"
        mes_list = []
        for word in keywords:
            try:
                entries = self.get_404(url,word)
            except r.RequestException:
                # Best effort: report the failed keyword and try the next one.
                print("timeout:"+url+","+word)
                continue
            for res in entries:
                # Keep only entries whose date contains the requested year.
                if date_time not in res[4]:
                    continue
                mes_url = url_domain + res[1]
                message = self.get_404_mes(mes_url)
                mes_list.append(" | ".join([
                    res[0],
                    "漏洞编号:" + res[2],
                    "等级:" + res[3],
                    "时间:" + res[4],
                    "详情地址:" + mes_url,
                    "漏洞简介:" + message,
                ]))
        return mes_list

#Chinese CVE vulnerability database - SCAP Chinese community (cve.scap.org.cn)
class cve_scap:
    """Keyword search against the CVE listing at cve.scap.org.cn.

    ``write_file`` is the entry point. Despite its name it writes nothing:
    it returns the formatted result lines and leaves the output to the caller.
    """

    # Browser-like User-Agent sent with every request made by this class.
    _HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
    # Seconds to wait for the server; without a timeout requests may block
    # forever. The value itself is an arbitrary but generous choice.
    _TIMEOUT = 30

    def get_cve_404(self,url,keyword):
        """GET the keyword search page and parse its result table.

        Args:
            url: address of the listing page.
            keyword: term sent as the ``keyword`` query parameter.

        Returns:
            A list of 4-tuples of strings, one per matched table row:
            (href of the detail link, vulnerability id, date, severity).
            The list is empty when nothing in the page matches.

        Raises:
            requests.RequestException: the request failed or timed out.
        """
        # search_type=t_keyword selects the keyword search.
        data = {"search_type":"t_keyword","keyword":keyword}
        result = r.get(url,params=data,headers=self._HEADERS,timeout=self._TIMEOUT).text
        # The pattern is tied to the exact whitespace of the served HTML.
        return re.findall("<td class='hidden-xs'>.*?<a href=(.*?)>\n                            (.*?)\n                        </a>.*?<td class='hidden-xs hidden-sm'>(.*?)</td>.*?title='(.*?)' class='grade",result,re.S)

    def get_cve_404_mes(self,url):
        """Fetch one detail page and return its description paragraph.

        Args:
            url: absolute address of the vulnerability detail page.

        Returns:
            The first captured description paragraph, or ``""`` when the
            request fails or the page does not match the pattern.
        """
        try:
            result = r.get(url,headers=self._HEADERS,timeout=self._TIMEOUT).text
        except r.RequestException:
            print("timeout: " + url)
            return ""
        filter_result = re.findall("pad30T pad30B mrg0B' style='word-wrap: break-word;'>\n                        (.*?)</p>",result,re.S)
        # re.findall returns an empty list (never None) when nothing matches.
        return filter_result[0] if filter_result else ""

    def write_file(self,keylist,date_time):
        """Search every keyword and return one formatted line per recent entry.

        Args:
            keylist: iterable of search terms.
            date_time: substring (the caller passes the current year) that
                must occur in an entry's date for the entry to be kept.

        Returns:
            A list of " | "-separated strings: id, severity, date, detail URL
            and description (with newlines removed). An entry whose
            description could not be fetched is still returned, with an
            empty description.
        """
        # Listing page that receives the search query.
        url = "http://cve.scap.org.cn/vulns/1"
        # Prefix that turns a relative detail path into an absolute URL.
        url_domain = "http://cve.scap.org.cn"
        mes_list = []
        for keyword in keylist:
            try:
                html_filter = self.get_cve_404(url,keyword)
            except r.RequestException:
                # Best effort: report the failed keyword and try the next one.
                print("timeout:" + url + "," + keyword)
                continue
            for res in html_filter:
                # Keep only entries whose date contains the requested year.
                if date_time not in res[2]:
                    continue
                # The href is captured together with its surrounding quotes.
                mes_url = url_domain + res[0].strip('"')
                message = self.get_cve_404_mes(mes_url)
                mes_list.append(" | ".join([
                    "漏洞编号:" + res[1],
                    "等级:" + res[3],
                    "时间:" + res[2],
                    "详情地址:" + mes_url,
                    "漏洞简介:" + message.replace("\n",""),
                ]))
        return mes_list

#US National Vulnerability Database (nvd.nist.gov)
class nvd_nist:
    """Keyword search against the vulnerability search page at nvd.nist.gov.

    ``write_file`` is the entry point. Despite its name it writes nothing:
    it returns the formatted result lines and leaves the output to the caller.
    """

    # Browser-like User-Agent sent with the search request.
    _HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
    # Seconds to wait for the server; without a timeout requests may block
    # forever. The value itself is an arbitrary but generous choice.
    _TIMEOUT = 30

    def get_nvd_404(self,url,keyword):
        """GET the search results page and parse its vulnerability rows.

        Args:
            url: address of the search results page.
            keyword: term sent as the ``query`` parameter.

        Returns:
            A list of 4-tuples of strings, one per matched ``vuln-row``:
            (detail-page path, vulnerability id, summary, published time).
            The list is empty when nothing in the page matches.

        Raises:
            requests.RequestException: the request failed or timed out.
        """
        data = {"form_type":"Basic","results_type":"overview","query":keyword,"search_type":"all"}
        # NOTE(review): certificate verification is switched off below and the
        # resulting urllib3 warning is silenced, so the response is not
        # authenticated. Kept as in the original; confirm it is still needed.
        r.packages.urllib3.disable_warnings()
        result = r.get(url,params=data,headers=self._HEADERS,verify=False,timeout=self._TIMEOUT).text
        return re.findall("<tr data-testid=\"vuln-row.*?<a href=\"(.*?)\" id=\".*?data-testid=\"vuln-detail-link-[0-9]{1,2}\">(.*?)</a></strong><br/>.*?<p data-testid='vuln-summary-[0-9]{1,2}'>(.*?)</p>.*?<span data-testid='vuln-published-on-[0-9]{1,2}'>(.*?)</span>",result,re.S)

    @staticmethod
    def _parse_published(eng_time):
        """Convert a published-time string such as
        ``"January 05, 2024; 3:15:07 PM -0500"`` into a naive datetime.

        Everything after the AM/PM marker is ignored. Returns ``None`` when
        the string has no marker or matches neither accepted format.
        """
        for marker in ("AM","PM"):
            if marker in eng_time:
                clock = eng_time.split(marker)[0]
                break
        else:
            return None
        candidates = (
            # Preferred: 12-hour clock with the marker, so PM times are right.
            (clock + marker,'%B %d, %Y; %I:%M:%S %p'),
            # Fallback: the earlier 24-hour reading, for hours %I rejects.
            (clock,'%B %d, %Y; %H:%M:%S '),
        )
        for text,fmt in candidates:
            try:
                return datetime.datetime.strptime(text,fmt)
            except ValueError:
                continue
        return None

    def write_file(self,keylist,date_time):
        """Search every keyword and return one formatted line per recent entry.

        Args:
            keylist: iterable of search terms.
            date_time: substring (the caller passes the current year) that
                must occur in the converted ``YYYY-MM-DD HH:MM:SS`` time for
                the entry to be kept.

        Returns:
            A list of " | "-separated strings: id, converted time, detail URL
            and summary. Entries whose time cannot be parsed are skipped.
        """
        # Search results page that receives the query.
        url = "https://nvd.nist.gov/vuln/search/results"
        # Prefix that turns a relative detail path into an absolute URL.
        url_dom = "https://nvd.nist.gov"
        mes_list = []
        for keyword in keylist:
            try:
                filter_html = self.get_nvd_404(url,keyword)
            except r.RequestException:
                # Best effort: report the failed keyword and try the next one.
                print("timeout:" + url + "," + keyword)
                continue
            for res in filter_html:
                published = self._parse_published(res[3])
                if published is None:
                    # Skip only this entry; the remaining rows are still used.
                    print("时间判断有误")
                    continue
                time_format = str(published)
                if date_time in time_format:
                    url_domain = url_dom + res[0]
                    mes_list.append(" | ".join([
                        "漏洞编号:" + res[1],
                        "时间:" + time_format,
                        "详情地址:" + url_domain,
                        "漏洞简介:" + res[2],
                    ]))
        return mes_list

if __name__ == "__main__":
    # Keywords searched on every platform.
    keylist=['nginx','openssl','openssh']
    # Current year as a 4-digit string; the scrapers keep only entries
    # whose date contains it.
    date_time = time.strftime("%Y",time.localtime())

    # (section header written to the log, scraper class), in output order.
    sources = (
        ("#国家信息漏洞库:\n",gjxxaqpt),
        ("#cve中文漏洞信息库:\n",cve_scap),
        ("#美国国家信息安全漏洞库:\n",nvd_nist),
    )

    # The context manager closes the log even if one of the scrapers raises.
    with open("404_message.log","w+",encoding='utf-8') as files:
        for header,scraper in sources:
            files.write(header)
            for i in scraper().write_file(keylist,date_time):
                files.write(i+"\n")
            # Blank line separating the sections.
            files.write("\n")