论文链接爬取

首先是获取网页的html代码,然后通过正则表达式获取论文的下载链接

 /**
  * Downloads the page at {@code requesturl} over HTTPS and returns its body as a single string.
  *
  * <p>The body is decoded as UTF-8 and read line by line; line terminators are dropped, so the
  * result is the plain concatenation of all lines.
  *
  * <p>NOTE: the connection is cast to {@link HttpsURLConnection}, so a non-HTTPS URL fails with a
  * {@link ClassCastException}.
  *
  * @param requesturl absolute {@code https://} URL of the page to fetch
  * @return the text that could be read; an empty string when the URL is malformed or the request
  *         fails before any line was read (never {@code null})
  */
 public static String getHTMLText(String requesturl){
        // Created up front so that a failure inside the try block can no longer cause an NPE on return.
        StringBuilder buffer = new StringBuilder();
        HttpsURLConnection httpsURLConnection = null;
        try {
            URL url = new URL(requesturl);
            httpsURLConnection = (HttpsURLConnection) url.openConnection();
            httpsURLConnection.setDoInput(true);
            httpsURLConnection.setRequestMethod("GET");
            // try-with-resources closes the reader (and the wrapped stream) even when reading fails.
            try (BufferedReader bufferedReader = new BufferedReader(
                    new InputStreamReader(httpsURLConnection.getInputStream(), "utf-8"))) {
                String line;
                while ((line = bufferedReader.readLine()) != null) {
                    buffer.append(line);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (httpsURLConnection != null) {
                httpsURLConnection.disconnect();
            }
        }

        return buffer.toString();
    }
/**
 * Collects the PDF links found on the WACV 2021 open-access index page.
 *
 * @return a map with two parallel lists: key {@code "name"} holds the file names (the part of
 *         each link after {@code papers/} or {@code supplemental/}), key {@code "url"} holds the
 *         absolute download links
 */
public static Map<String,List<String>> getPapers(){
        String html = GetDataUtil.getHTMLText("https://openaccess.thecvf.com/WACV2021");
        Pattern hrefPattern = Pattern.compile("href=\"([\\w\\s./:]+?)\"", Pattern.CASE_INSENSITIVE);

        List<String> links = new ArrayList<>();
        List<String> fileNames = new ArrayList<>();
        for (Matcher hrefs = hrefPattern.matcher(html); hrefs.find(); ) {
            // Group 1 is the attribute value without the surrounding quotes.
            String href = hrefs.group(1);
            if (!href.contains(".pdf")) {
                continue;
            }
            links.add("https://openaccess.thecvf.com/" + href);

            // Strip everything up to and including the directory marker and its trailing slash.
            String fileName = href;
            int papersAt = href.indexOf("papers");
            if (papersAt >= 0) {
                fileName = href.substring(papersAt + 7);
            } else {
                int supplementalAt = href.indexOf("supplemental");
                if (supplementalAt >= 0) {
                    fileName = href.substring(supplementalAt + 13);
                }
            }
            fileNames.add(fileName);
        }

        Map<String,List<String>> urlAndName = new HashMap<>();
        urlAndName.put("name", fileNames);
        urlAndName.put("url", links);
        return urlAndName;
    }
论文爬取

这里主要进行的是获取pdf论文

转成txt文本为下一步做准备

获取pdf

/**
 * Opens an HTTPS GET request to {@code requesturl} and returns the response body stream.
 *
 * <p>Connect and read timeouts are both 10 seconds. This largely duplicates the connection set-up
 * of {@code getHTMLText}; the two were never merged.
 *
 * @param requesturl absolute {@code https://} URL to download
 * @return the open response stream, or {@code null} when the URL is malformed or the request fails
 */
public static InputStream getInputStream(String requesturl){
        try {
            HttpsURLConnection connection = (HttpsURLConnection) new URL(requesturl).openConnection();
            connection.setConnectTimeout(10000);
            connection.setReadTimeout(10000);
            connection.setDoInput(true);
            connection.setRequestMethod("GET");
            return connection.getInputStream();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

/**
 * Writes the content of {@code inputStream} to the file at {@code path}.
 *
 * <p>An already existing file is never overwritten. The stream is always closed by this method,
 * whether or not anything was written. If copying fails part-way, the incomplete file is removed
 * so that a later call does not mistake it for a finished download.
 *
 * @param inputStream source data; may be {@code null} (e.g. a failed download), in which case
 *                    nothing is written
 * @param path        destination file path
 */
public static void saveAsPdf(InputStream inputStream,String path){
        File file = new File(path);
        if (file.exists()) {
            System.out.println("文件已存在");
            // Still release the caller's stream so the underlying connection is not leaked.
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            return;
        }
        if (inputStream == null) {
            System.out.println("saveAsPdf: no input stream, nothing written to " + path);
            return;
        }

        boolean complete = false;
        try (InputStream in = inputStream; FileOutputStream out = new FileOutputStream(file)) {
            // Copy in chunks instead of one byte per read/write call.
            byte[] chunk = new byte[8192];
            int read;
            while ((read = in.read(chunk)) != -1) {
                out.write(chunk, 0, read);
            }
            complete = true;
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (!complete && file.exists()) {
            // Do not leave a truncated file behind.
            file.delete();
        }
    }

转为txt

  /**
   * Extracts the text of a PDF file into a UTF-8 encoded {@code .txt} file.
   *
   * <p>The output name is derived from {@code file} by replacing its last four characters with
   * {@code .txt}. When {@code file} parses as a URL, only the file-name part is kept and the text
   * file is written to {@code F:\CVF\PDF\}; otherwise it is written next to the PDF.
   *
   * <p>NOTE(review): in both cases the document itself is loaded with {@code new File(file)},
   * i.e. from the local file system.
   *
   * @param file path of the PDF file to convert
   * @throws IllegalArgumentException if {@code file} is too short to derive an output name from
   * @throws Exception if the PDF cannot be loaded or the text file cannot be written
   */
    public static void readPdf(String file) throws Exception {
        // Encoding of the generated text file
        final String encoding = "UTF-8";

        // Name of the text file to generate
        String textFile = null;
        try {
            // First treat the argument as a URL; fall back to a plain path if it is not one.
            URL url = new URL(file);
            String fileName = url.getFile();
            // Name the text file after the original PDF
            if (fileName.length() > 4) {
                File outputFile = new File(fileName.substring(0, fileName.length() - 4) + ".txt");
                textFile = "F:\\CVF\\PDF\\" + outputFile.getName();
            }
        } catch (MalformedURLException e) {
            if (file.length() > 4) {
                textFile = file.substring(0, file.length() - 4) + ".txt";
            }
        }
        if (textFile == null) {
            // Previously this surfaced as an NPE from the FileOutputStream constructor.
            throw new IllegalArgumentException("Cannot derive a .txt file name from: " + file);
        }

        // try-with-resources closes the writer and the document exactly once, also on failure.
        try (PDDocument document = PDDocument.load(new File(file));
             Writer output = new OutputStreamWriter(new FileOutputStream(textFile), encoding)) {
            PDFTextStripper stripper = new PDFTextStripper();
            // Keep the text in the order it is stored in the PDF (no sorting by position)
            stripper.setSortByPosition(false);
            // Extract every page
            stripper.setStartPage(1);
            stripper.setEndPage(Integer.MAX_VALUE);
            stripper.writeText(document, output);
        }

        // Reported only after the writer has been flushed and closed.
        System.out.println(textFile + " 输出成功!");
    }
论文内关键词提取

这里用到了GitHub上一个开源项目

项目链接:RAKE-MASTER

Java运行powershell类

/**
 * Small helper for launching external commands from Java.
 *
 * <p>Child processes write their standard output and standard error straight to this JVM's
 * console. Nothing is left in an unread pipe, so a chatty child cannot block on a full pipe buffer.
 */
public class RunPowerShell {

    /**
     * Starts the given command and returns immediately without waiting for it to finish.
     * A failure to start is reported on standard output and otherwise ignored.
     *
     * @param arstringCommand program name followed by its arguments
     */
    public void execCommand(String[] arstringCommand) {
        // Echo the command line that is about to be started.
        for (String part : arstringCommand) {
            System.out.print(part + " ");
        }
        try {
            newBuilder(arstringCommand).start();
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }
    }

    /**
     * Runs the given command line and blocks until the process has exited.
     * The line is split on whitespace, so individual arguments must not contain spaces.
     * A failure to start is reported on standard output and otherwise ignored.
     *
     * @param arstringCommand whitespace-separated command line
     */
    public void execCommand(String arstringCommand) {
        try {
            String[] tokens = arstringCommand.trim().split("\\s+");
            Process process = newBuilder(tokens).start();
            process.waitFor();
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller instead of silently clearing it.
            Thread.currentThread().interrupt();
            System.out.println(e.getMessage());
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }
    }

    /** Builds a process whose stdout and stderr are forwarded to this JVM's own streams. */
    private static ProcessBuilder newBuilder(String[] command) {
        return new ProcessBuilder(command)
                .redirectOutput(ProcessBuilder.Redirect.INHERIT)
                .redirectError(ProcessBuilder.Redirect.INHERIT);
    }

    /** Windows-only demo: opens Calculator, then Notepad. */
    public void cmd(){
        // Open Calculator. "/c" makes cmd exit once "start" has launched the program;
        // "/k" would keep the shell alive, and the waited call below would never return.
        String[] arstringCommand = new String[] {
                "cmd",
                "/c",
                "start", // cmd shell command
                "calc"
        };
        execCommand(arstringCommand);
        // Open Notepad
        String cmd = "cmd /c start notepad";
        execCommand(cmd);
    }

    public static void main(String[] args){
        new RunPowerShell().cmd();
    }

}

运行rake.py

// Example call that runs rake.py. The three operands are placeholders for the input text file,
// the word-list file (presumably the stop-word list, confirm against rake.py) and the output path.
new RunPowerShell().execCommand(" python rake.py "+InputPath+" "+分词文件路径+" -o "+输出路径);
数据处理

循环运行完rake.py后会生成众多txt文件,这一步做的便是读取这些文件然后处理写入数据库

pojo类

paper

package com.keyword.alice.pojo;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

/**
 * Plain data holder for one paper record.
 *
 * <p>The Lombok annotations generate the accessors, {@code equals}/{@code hashCode}/{@code toString},
 * an all-arguments constructor (parameters in field declaration order) and a no-arguments constructor.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor

public class Paper {
    // Title of the paper
    private String papertitle;
    // Author list, kept as a single string
    private String authors;
    // Abstract text; capitalised presumably because `abstract` is a reserved word in Java
    private String Abstract;
    // Link to the paper's PDF
    private String pdf;
    // Publication year, kept as a string
    private String year;
    // Presumably the conference the paper belongs to (e.g. WACV/CVPR/ICCV) - confirm against the data
    private String type;
}

上个整体一点的代码得了

/**
 * Runs the whole keyword pipeline for one paper: downloads its PDF, converts it to text, runs
 * rake.py on the text, reads the generated keyword file and stores every keyword whose value is
 * at least 8.
 *
 * <p>NOTE(review): the rake.py input and output locations are hard-coded to {@code F:\CVF\PDF\}
 * and {@code F:\CVF\TXT\}, while the PDF is saved under {@code path}; this only lines up if
 * {@code path} points at {@code F:\CVF\PDF\} - confirm.
 *
 * @param paper the paper to process; its title is used as the file name and its pdf field as the
 *              download link
 * @throws Exception if the PDF cannot be converted to text
 */
public void JustRun(Paper paper) throws Exception {
        String name=paper.getPapertitle();
        String url=paper.getPdf();
        String year=paper.getYear();
        String type=paper.getType();
        List<String> wordList=new ArrayList<>();
        List<String> urlList=new ArrayList<>();

        // Spaces or slashes in a scraped title would break the file names built below.
        // Explicit null checks replace the former empty catch block that hid an NPE.
        if (name != null) {
            name=name.replace("/","_").replace(" ","_");
        }
        if (url != null) {
            url=url.replace("/../../","/");
            System.out.println(url);
        }

        GetDataUtil.saveAsPdf(GetDataUtil.getInputStream(url),path+name+".pdf");
        PdfToTxt.readPdf(path+name+".pdf");
        name=name+".txt";
        new RunPowerShell().execCommand(" python rake.py F:\\CVF\\PDF\\"+name+" stopwords.txt -o F:\\CVF\\TXT\\KeyWords_"+name);

        // Read the keyword file line by line; the reader is closed even if reading fails.
        try (BufferedReader bf = new BufferedReader(new FileReader("F:\\CVF\\TXT\\KeyWords_"+name))) {
            String str;
            while ((str = bf.readLine()) != null) {
                wordList.add(str);
                // Every keyword line is paired with the paper's link.
                urlList.add(url);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        List<KeyWords> resList= ToKeyWordsObj.ToObjKeyWords(wordList,urlList,year,type);
        // Drop keywords whose value is below 8.
        resList.removeIf(keyWords -> keyWords.getValue() < 8);
        for (KeyWords keyWords : resList) {
            keyMapper.addkeyWord(keyWords);
        }
    }
动态sql语句

其实到这里就已经基本做完了项目了,剩下的就是CRUD了,唯一有难度的便是多条件查询

papers是数据表名

<script> select * from papers where 1=1 
            <if test='传入的参数!=null'>and 数据库字段名 like concat('%',#{传入的参数},'%')</if>

这里其实是用的mybatis的注解

图表联动

这一段是图表联动,可以直接使用,但是注意变量名称还有元素的绑定

    // Clicking a row of the keyword table highlights that word in the chart for one second.
    // Clicking the same word twice in a row switches the highlight off again immediately.
    table.on('row(keyW)', function(obj){
        var previous = temp;          // word selected by the previous click, if any
        var current = obj.data.name;  // word in the clicked row

        if (previous !== undefined) {
            mycharts.dispatchAction({
                type: 'downplay',
                name: previous
            });
        }
        temp = current;
        array.push(current);
        mycharts.dispatchAction({
            type: 'highlight',
            name: current
        });
        // Compare with the previous click. The old counter was reset on every call,
        // so it only ever compared the first two clicks.
        if (previous === current) {
            mycharts.dispatchAction({
                type: 'downplay',
                name: current
            });
        }
        // Capture the clicked word so a later click cannot redirect this timer to another word.
        setTimeout(function(){
            mycharts.dispatchAction({
                type: 'downplay',
                name: current
            });
        }, 1000);
    });



    // Chart event handler: marks the table row whose position equals the data index
    // of the chart item that triggered the event with a yellow background.
    function eConsole(param) {
        var wanted = param.dataIndex;
        var rows = $("tbody tr");
        for (var k = 0; k < rows.length; k++) {
            // Position of this row among its siblings
            var rowIndex = rows.eq(k).index();
            if (rowIndex == wanted) {
                rows.eq(rowIndex).css("background-color", "yellow");
            }
        }
    }
    // Chart event handler (mouse leaving an item): clears the background colour of the
    // table row whose position equals the data index of the chart item.
    function zConsole(param) {
        var wanted = param.dataIndex;
        var rows = $("tbody tr");
        for (var k = 0; k < rows.length; k++) {
            // Position of this row among its siblings
            var rowIndex = rows.eq(k).index();
            if (rowIndex == wanted) {
                rows.eq(rowIndex).css("background-color", "");
            }
        }
    }

    // Row marking follows the mouse: mark on enter, clear on leave.
    mycharts.on("mouseover", eConsole);
    mycharts.on("mouseout", zConsole);
    mycharts.hideLoading();

    // Clicking a word reloads the 'test' table with the papers linked to that word.
    mycharts.on('click', function(params){
        table.reload('test', {
            url: 'getUrlList',
            where: {
                word: params.name
            },
            cols: [
                [
                    {field: 'type', width: 80, title: '会议'},
                    {field: 'year', width: 80, title: '年份'},
                    {field: 'url', title: '论文链接', sort: true}
                ]
            ]
        });
    });

完整页面代码

<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>检索</title>
    <meta name="renderer" content="webkit">
    <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
    <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">
    <link rel="stylesheet" type="text/css" href="https://www.layuicdn.com/layui/css/layui.css" />
    <script src="https://cdn.bootcdn.net/ajax/libs/jquery/3.6.0/jquery.js"></script>
    <script src="../echarts.js"></script>
    <script src="../echarts-wordcloud.js"></script>
    <!-- 注意:如果你直接复制所有代码到本地,上述css路径需要改成你本地的 -->
    <style>
        .layui-input-block {
            margin-left: 80px;
            min-height: 36px;
        }
    </style>
</head>
<body>

<div class="layui-row">
    <div class="layui-col-md9">
        <div class="grid-demo grid-demo-bg1"> <div id="main" style="width: 1200px;height: 800px; "></div></div>
    </div>
    <div class="layui-col-md3">
        <div class="grid-demo"><table class="layui-hide" id="keyW" lay-filter="keyW"></table></div>
    </div>
</div>
<table class="layui-hide" id="test" lay-filter="test"></table>

<script type="text/html" id="toolbarDemo">
        <div class="layui-form-item">
            <label class="layui-form-label">关键词</label>
            <div class="layui-input-block">
                <input type="text"  id="keyWord" name="keyWord" autocomplete="off" placeholder="请输入关键词,最多五个关键词,通过,和.分割(‘,’为或 '.'为和)" class="layui-input">
            </div>
        </div>
        <div class="layui-form-item">
            <label class="layui-form-label">检索</label>
            <div class="layui-input-inline">
                <select name="year" id="year">
                    <option value="">请选择年份</option>
                    <option value="">所有</option>
                    <option value="2021" selected="">2021</option>
                    <option value="2020">2020</option>
                    <option value="2019">2019</option>
                    <option value="2018">2018</option>
                    <option value="2017">2017</option>
                    <option value="2016">2016</option>
                    <option value="2015">2015</option>
                    <option value="2014">2014</option>
                    <option value="2013">2013</option>
                </select>
            </div>
            <div class="layui-input-inline">
                <select name="type" id="type">
                    <option value="">请选会议类型</option>
                    <option value="">所有</option>
                    <option value="WACV">WACV</option>
                    <option value="CVPR">CVPR</option>
                    <option value="ICCV">ICCV</option>
                </select>
            </div>
            <div class="layui-input-inline">
                <div class="layui-input-inline">
                    <input type="text" name="author" id="author" placeholder="请输入作者" autocomplete="off" class="layui-input">
                </div>
            </div>
            <div class="layui-input-inline">
                <button class="layui-btn layui-btn-sm" onclick="change()" >搜索</button>
                <button class="layui-btn layui-btn-sm" lay-event="getCheckData">获取选中行数据</button>
<!--                layui-btn-sm-->
            </div>
        </div>
</script>

<script type="text/html" id="barDemo">
    <a class="layui-btn layui-btn-xs" lay-event="edit">查看本行</a>
    <a class="layui-btn layui-btn-danger layui-btn-xs" lay-event="del">删除</a>
</script>


<script src="https://www.layuicdn.com/layui/layui.js"></script>
<!-- 注意:如果你直接复制所有代码到本地,上述 JS 路径需要改成你本地的 -->

<script>
    // Word-cloud chart rendered into the #main container.
    var mycharts = echarts.init(document.getElementById("main"));
    // Keyword entries returned by the getList endpoint.
    var jsonlist = [];
    var option;
    // Declared explicitly; the former bare assignment created an implicit global.
    var image1 = "";
    // Image passed to the series as its maskImage.
    var maskResource = new Image();
    $.ajax({
        url:"getList",
        dataType:"json",
        type:"GET",
        // The former value was the string "false", which is truthy, so the request was
        // already asynchronous. The boolean states that explicitly.
        async:true,
        success:function(data){
            jsonlist=data;
            option ={
                // Title, centred
                title:{
                    text: 'CVF',
                    left:'center',
                },
                // Show a tooltip for the hovered word
                tooltip:{
                    show:true
                },

                series:[
                    {
                        maskImage:maskResource,
                        // Series type: word cloud
                        type: 'wordCloud',
                        // Font size range
                        sizeRange:[5,80],

                        // Data loaded above
                        data:jsonlist,

                        rotationRange:[-45,90],
                        textStyle: {
                            normal:{
                                // Random near-black colour: every channel is between 0 and 10
                                color:function () {
                                    return 'rgb(' + [
                                        Math.round(Math.random() * 10),
                                        Math.round(Math.random() * 10),
                                        Math.round(Math.random() * 10)
                                    ].join(',')+')';

                                }
                            },

                        },
                        emphasis: {  // Appearance of a highlighted word
                            shadowBlur: 100000,
                            shadowColor: '#333',
                            backgroundColor:'#D92932',
                            show:true,
                            label: {
                                show: true,
                                fontSize: '40',
                                fontWeight: 'bold'
                            },
                            textStyle: {
                                fontSize:50
                            }

                        },
                        hoverAnimation:true,
                    }
                ],
            };
            // The chart is drawn only after the mask image has been fetched and decoded.
            $.ajax({
                url:"../image.json",
                dataType:"json",
                type:"GET",
                async:true,
                success:function(data){
                    image1=data;
                    // Register the handler before assigning src so the load event cannot be missed.
                    maskResource.onload = function(){
                        mycharts.setOption(option)
                    };
                    maskResource.src=image1;
                },
            })
        },
    })

    // Shared handle to layui's table module; code further down the page uses it as well.
    var table = layui.table;
    layui.use('table', function(){
        table = layui.table;

        // Main paper table
        table.render({
            elem: '#test',
            url: 'getPaper',
            toolbar: '#toolbarDemo', // header toolbar, bound to the template above
            // Icons on the right of the header toolbar; the last entry is a custom one
            defaultToolbar: ['filter', 'exports', 'print', {
                title: '提示',
                layEvent: 'LAYTABLE_TIPS',
                icon: 'layui-icon-tips'
            }],
            title: '用户数据表',
            cols: [
                [
                    {type: 'checkbox', fixed: 'left'},
                    {field: 'papertitle', title: '论文题目', width: 500, edit: 'text'},
                    {field: 'authors', title: '作者', width: 500, edit: 'text', sort: true},
                    {field: 'abstract', title: '摘要', edit: 'text', width: 100},
                    {field: 'pdf', title: '论文链接', edit: 'text'},
                    {field: 'year', title: '年份', edit: 'text', width: 80, sort: true},
                    {field: 'type', title: '会议类型', edit: 'text', width: 120},
                    {fixed: 'right', title: '操作', toolbar: '#barDemo', width: 150}
                ]
            ],
            page: true
        });

        // Header toolbar events
        table.on('toolbar(test)', function(obj){
            var selection = table.checkStatus(obj.config.id);
            switch (obj.event) {
                case 'getCheckData':
                    layer.alert(JSON.stringify(selection.data));
                    break;
                case 'getCheckLength':
                    layer.msg('选中了:' + selection.data.length + ' 个');
                    break;
                case 'isAll':
                    layer.msg(selection.isAll ? '全选' : '未全选');
                    break;
                // Custom icon on the right of the header toolbar
                case 'LAYTABLE_TIPS':
                    layer.alert('这是工具栏右侧自定义的一个图标按钮');
                    break;
            }
        });

        // Per-row tool events
        table.on('tool(test)', function(obj){
            switch (obj.event) {
                case 'del':
                    layer.confirm('真的删除行么', function(index){
                        obj.del();
                        layer.close(index);
                    });
                    break;
                case 'edit':
                    // Open the paper link in a new tab
                    window.open(obj.data.pdf, '_blank');
                    break;
            }
        });
    });
    // Keyword table (word and its frequency), loaded from the getList20 endpoint.
    layui.use('table', function() {
        var table3 = layui.table;

        table3.render({
            elem: '#keyW',
            url: 'getList20',
            title: '数据表',
            cols: [
                [
                    // The stray leading comma that used to precede the first column
                    // left a hole (an undefined entry) in this array.
                    {field: 'name', title: '词', sort: true},
                    {field: 'value', title: '词频', width: 80, sort: true}
                ]
            ],
            page: true
        });
    })
    // Word selected by the most recent row click, and the history of all clicked words.
    var temp;
    var array = [];
    // Clicking a row of the keyword table highlights that word in the chart for one second.
    // Clicking the same word twice in a row switches the highlight off again immediately.
    table.on('row(keyW)', function(obj){
        var previous = temp;          // word selected by the previous click, if any
        var current = obj.data.name;  // word in the clicked row

        if (previous !== undefined) {
            mycharts.dispatchAction({
                type: 'downplay',
                name: previous
            });
        }
        temp = current;
        array.push(current);
        mycharts.dispatchAction({
            type: 'highlight',
            name: current
        });
        // Compare with the previous click. The old counter was reset on every call,
        // so it only ever compared the first two clicks.
        if (previous === current) {
            mycharts.dispatchAction({
                type: 'downplay',
                name: current
            });
        }
        // Capture the clicked word so a later click cannot redirect this timer to another word.
        setTimeout(function(){
            mycharts.dispatchAction({
                type: 'downplay',
                name: current
            });
        }, 1000);
    });



    // Chart event handler: marks the table row whose position equals the data index
    // of the chart item that triggered the event with a yellow background.
    function eConsole(param) {
        var wanted = param.dataIndex;
        var rows = $("tbody tr");
        for (var k = 0; k < rows.length; k++) {
            // Position of this row among its siblings
            var rowIndex = rows.eq(k).index();
            if (rowIndex == wanted) {
                rows.eq(rowIndex).css("background-color", "yellow");
            }
        }
    }
    // Chart event handler (mouse leaving an item): clears the background colour of the
    // table row whose position equals the data index of the chart item.
    function zConsole(param) {
        var wanted = param.dataIndex;
        var rows = $("tbody tr");
        for (var k = 0; k < rows.length; k++) {
            // Position of this row among its siblings
            var rowIndex = rows.eq(k).index();
            if (rowIndex == wanted) {
                rows.eq(rowIndex).css("background-color", "");
            }
        }
    }

    // Row marking follows the mouse: mark on enter, clear on leave.
    mycharts.on("mouseover", eConsole);
    mycharts.on("mouseout", zConsole);
    mycharts.hideLoading();

    // Clicking a word reloads the 'test' table with the papers linked to that word.
    mycharts.on('click', function(params){
        table.reload('test', {
            url: 'getUrlList',
            where: {
                word: params.name
            },
            cols: [
                [
                    {field: 'type', width: 80, title: '会议'},
                    {field: 'year', width: 80, title: '年份'},
                    {field: 'url', title: '论文链接', sort: true}
                ]
            ]
        });
    });
    // Search button handler: reloads the paper table using the current values of the
    // keyword, year, conference type and author inputs as query parameters.
    function change(){
        // Current value of the form control with the given id
        function fieldValue(id) {
            return document.getElementById(id).value;
        }

        var query = {
            keyWord: fieldValue('keyWord'),
            year: fieldValue('year'),
            type: fieldValue('type'),
            author: fieldValue('author')
        };
        var paperCols = [
            [
                {type: 'checkbox', fixed: 'left'},
                {field: 'papertitle', title: '论文题目', width: 500, edit: 'text'},
                {field: 'authors', title: '作者', width: 500, edit: 'text', sort: true},
                {field: 'abstract', title: '摘要', edit: 'text', width: 100},
                {field: 'pdf', title: '论文链接', edit: 'text'},
                {field: 'year', title: '年份', edit: 'text', width: 80, sort: true},
                {field: 'type', title: '会议类型', edit: 'text', width: 120},
                {fixed: 'right', title: '操作', toolbar: '#barDemo', width: 150}
            ]
        ];

        table.reload('test', {
            url: 'getPaper',
            where: query,
            cols: paperCols
        });
    }
</script>

</body>
</html>