Upload the log file

 

Create the data directory and upload the nginx access.log file into /export/servers/es/esdatas:

mkdir -p /export/servers/es/esdatas

Create the Logstash configuration file

cd /export/servers/es/logstash-6.7.0/config
vim nginxlog.conf
input {
  file {
    path => "/export/servers/es/esdatas/access.log"
    type => "access.log"
    start_position => "beginning"
  }
}
filter {
  grok {
    match => {
      "message" => "%{IPORHOST:clientip} \- \- \[%{HTTPDATE:time_local}\] \"(?:%{WORD:method} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|%{DATA:rawrequest})\" %{NUMBER:status} %{NUMBER:body_bytes_sent} %{QS:http_referer}"
    }
  }
}
output {
  stdout { codec => rubydebug }
  elasticsearch {
    action => "index"
    hosts  => ["node01:9200"]
    index  => "nginxes"
  }
}

Start Logstash and check the data in ES

 

Run the following commands to start Logstash:
cd /export/servers/es/logstash-6.7.0
bin/logstash -f /export/servers/es/logstash-6.7.0/config/nginxlog.conf
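
If you would rather check the result from code than from a browser plugin, the sketch below queries the nginxes index and prints a few documents. It is a minimal example assuming the same cluster name (myes) and transport port (9300) used by getEsClient() later in this document; the class name NginxEsCheck is only illustrative.

import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.transport.client.PreBuiltTransportClient;

import java.net.InetAddress;
import java.net.UnknownHostException;

public class NginxEsCheck {
    public static void main(String[] args) throws UnknownHostException {
        // Assumed cluster settings: the same cluster.name and transport port as the code later in this document
        Settings settings = Settings.builder().put("cluster.name", "myes").build();
        TransportClient client = new PreBuiltTransportClient(settings)
                .addTransportAddress(new TransportAddress(InetAddress.getByName("node01"), 9300));

        // Fetch a handful of documents from the nginxes index that Logstash writes to
        SearchResponse response = client.prepareSearch("nginxes")
                .setQuery(QueryBuilders.matchAllQuery())
                .setSize(5)
                .get();

        System.out.println("total hits: " + response.getHits().getTotalHits());
        for (SearchHit hit : response.getHits()) {
            System.out.println(hit.getSourceAsString());
        }
        client.close();
    }
}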

 

Integrating ES with HBase to implement a secondary index

 

Requirement: store massive amounts of data while still being able to query that data within seconds.

In production an article is split into a title and a body, and the body is usually large. We therefore typically store the title in ES and the body in HBase (HBase is built for massive data storage): ES's inverted index is used to find the document ids that match a keyword, and those ids are then used to look up the full body in HBase, as sketched below.
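
A condensed sketch of that two-step lookup follows. It reuses the getEsClient, getByKeyWord and getTable helpers implemented in the AppMain class later in this section, so it is an outline of the flow rather than a standalone program; the keyword 机器人 and the f1/content column come from that code.

// Step 1: the inverted index in ES turns a keyword into a list of document ids
TransportClient esClient = getEsClient();
List<String> ids = getByKeyWord(esClient, "机器人");

// Step 2: each id is also the HBase row key, so the full body is read from hbase_es_article
Table table = getTable();
for (String id : ids) {
    Result result = table.get(new Get(Bytes.toBytes(id)));
    String body = Bytes.toString(result.getValue("f1".getBytes(), "content".getBytes()));
    System.out.println(body);
}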

 

Storage design

 

 

Analysis of which fields need an index. Article data: (id, title, author, describe, content)

Field      Index needed?                                  Store needed?
Id         indexed by default                             stored by default
Title      yes                                            yes
Author     depends on the requirement                     depends on the requirement
Describe   yes                                            yes
Content    depends (needed for high-precision search)     depends on the requirement
Time       yes                                            yes

Index design

PUT /articles
{  
    "settings":{  
         "number_of_shards":3,  
         "number_of_replicas":1,
         "analysis" : {
            "analyzer" : {
                "ik" : {
                    "tokenizer" : "ik_max_word"
                }
            }
        }
    }, 
    "mappings":{  
         "article":{  
             "dynamic":"strict",
             "_source": {
               "includes": [
                  "id","title","from","readCounts","times"
                ],
               "excludes": [
                  "content"
               ]
             },
             "properties":{  
                 "id":{"type": "keyword", "store": true},  
                 "title":{"type": "text","store": true,"index" : true,"analyzer": "ik_max_word"}, 
                 "from":{"type": "keyword","store": true}, 
                 "readCounts":{"type": "integer","store": true},  
                 "content":{"type": "text","store": false,"index": false},
                 "times": {"type": "keyword", "index": false}
             }  
         }  
    }  
}
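
The PUT above is normally issued from Kibana or curl. If you prefer to create the index from Java, the following is a minimal sketch using the TransportClient admin API; the class name CreateArticlesIndex is illustrative, and the inline JSON abbreviates the mapping to three fields purely for readability.

import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.transport.client.PreBuiltTransportClient;

import java.net.InetAddress;
import java.net.UnknownHostException;

public class CreateArticlesIndex {
    public static void main(String[] args) throws UnknownHostException {
        Settings settings = Settings.builder().put("cluster.name", "myes").build();
        TransportClient client = new PreBuiltTransportClient(settings)
                .addTransportAddress(new TransportAddress(InetAddress.getByName("node01"), 9300));

        // Abbreviated version of the index definition shown above (settings plus a few mapped fields)
        String indexJson = "{"
                + "\"settings\":{\"number_of_shards\":3,\"number_of_replicas\":1},"
                + "\"mappings\":{\"article\":{\"properties\":{"
                + "\"id\":{\"type\":\"keyword\",\"store\":true},"
                + "\"title\":{\"type\":\"text\",\"store\":true,\"analyzer\":\"ik_max_word\"},"
                + "\"content\":{\"type\":\"text\",\"store\":false,\"index\":false}"
                + "}}}}";

        client.admin().indices()
                .prepareCreate("articles")
                .setSource(indexJson, XContentType.JSON)
                .get();
        client.close();
    }
}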

Import the jar dependencies

<dependencies>
    <!-- parse Excel files -->
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml-schemas</artifactId>
        <version>3.8</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml</artifactId>
        <version>3.8</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi</artifactId>
        <version>3.8</version>
    </dependency>
    <dependency>
        <groupId>org.elasticsearch.client</groupId>
        <artifactId>transport</artifactId>
        <version>6.7.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.logging.log4j</groupId>
        <artifactId>log4j-core</artifactId>
        <version>2.9.1</version>
    </dependency>
    <dependency>
        <groupId>com.google.code.gson</groupId>
        <artifactId>gson</artifactId>
        <version>2.8.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.5</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-client -->
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>2.0.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-server</artifactId>
        <version>2.0.0</version>
    </dependency>
</dependencies>
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.0</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
                <encoding>UTF-8</encoding>
                <!--    <verbal>true</verbal>-->
            </configuration>
        </plugin>
    </plugins>
</build>

 

Code development

Define the Article entity class
public class Article {
    private String id;
    private String title;
    private String from;
    private String times;
    private String readCounts;
    private String content;

    public Article() {
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public Article(String id, String title, String from, String times, String readCounts, String content) {
        this.id = id;
        this.title = title;
        this.from = from;
        this.times = times;

        this.readCounts = readCounts;
        this.content = content;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getFrom() {
        return from;
    }

    public void setFrom(String from) {
        this.from = from;
    }

    public String getTimes() {
        return times;
    }

    public void setTimes(String times) {
        this.times = times;
    }

    public String getReadCounts() {
        return readCounts;
    }

    public void setReadCounts(String readCounts) {
        this.readCounts = readCounts;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }
}

Define the Excel parsing utility class

public class ExcelUtil {

    // Quick manual test: parse the Excel file and collect its rows as Article objects
    public static void main(String[] args) throws IOException {
        List<Article> exceInfo = getExceInfo();
    }

    public static List<Article> getExceInfo() throws IOException {
        FileInputStream fileInputStream = new FileInputStream("F:\\传智播客大数据离线阶段课程资料\\ELK文档资料教案\\excel数据集\\baijia.xlsx");
        // Workbook object used to parse the xlsx file
        XSSFWorkbook xssfSheets = new XSSFWorkbook(fileInputStream);
        // Get the first sheet of the workbook
        XSSFSheet sheetAt = xssfSheets.getSheetAt(0);
        // 0-based index of the last row, i.e. how many data rows the sheet contains
        int lastRowNum = sheetAt.getLastRowNum();
        List<Article> articleList = new ArrayList<Article>();
        // Start at row 1 to skip the header; use <= so the last row is not dropped
        for (int i = 1; i <= lastRowNum; i++) {
            Article article = new Article();
            // Fetch the row, then read its cells one by one
            XSSFRow row = sheetAt.getRow(i);
            XSSFCell title = row.getCell(0);
            XSSFCell from = row.getCell(1);
            XSSFCell time = row.getCell(2);
            XSSFCell readCount = row.getCell(3);
            XSSFCell content = row.getCell(4);
            article.setId(i + "");
            article.setTitle(title.toString());
            article.setContent(content.toString());
            article.setFrom(from.toString());
            article.setReadCounts(readCount.toString());
            article.setTimes(time.toString());
            articleList.add(article);
        }
        fileInputStream.close();
        return articleList;
    }
}

Define the main method

public class AppMain {
    private static final String tableName = "hbase_es_article";
    private static final String familyName = "f1";
    private static final String title = "title";
    private static final String from = "from";
    private static final String times = "times";
    private static final String readCounts = "readCounts";
    private static final String content = "content";


    public static void main(String[] args) throws IOException {
        // Parse the Excel file with the POI helper defined above
        List<Article> exceInfo = ExcelUtil.getExceInfo();

       /* // Save the parsed articles into ES
        TransportClient client = getEsClient();
        save2Es(exceInfo, client);
        Table table = getTable();
        // Loop over the articles and batch them into a List<Put> for HBase
        saveToHbase(exceInfo, table);*/

      /* // Search ES by keyword and get back the matching document ids
        TransportClient esClient = getEsClient();
        List<String> getAllKeyWord = getByKeyWord(esClient, "机器人");*/
        // With a document id in hand (1216 here), fetch the article detail from HBase
        Table table = getTable();
        Get get = new Get("1216".getBytes());
        Result result = table.get(get);
        Cell[] cells = result.rawCells();
        for (Cell cell : cells) {
            // Cell has no getValue() in the HBase 2.0 client; CellUtil.cloneValue copies the cell value
            byte[] value = CellUtil.cloneValue(cell);
            System.out.println(Bytes.toString(value));
            // Wrap the article content into an Article object and return it to the front end
        }
    }

    private static List<String> getByKeyWord(TransportClient esClient, String keyWord) {
        ArrayList<String> strings = new ArrayList<String>();
        SearchResponse searchResponse = esClient.prepareSearch("articles").setTypes("article").setQuery(QueryBuilders.termQuery("title", keyWord)).get();
        SearchHits hits = searchResponse.getHits();
        for (SearchHit hit : hits) {
            // Collect the document id assigned in ES (the same id is used as the HBase row key)
            String id = hit.getId();
            strings.add(id);
        }
        return strings;
    }

    private static void saveToHbase(List<Article> exceInfo, Table table) throws IOException {
        System.out.println(exceInfo.size());
        long startTime = System.currentTimeMillis();
        List<Put> putList = new ArrayList<Put>();
        for (Article article : exceInfo) {
            System.out.println(article.getTitle());
            Put put = new Put(Bytes.toBytes(article.getId()));
            // Skip rows without a title; compare with isEmpty() rather than != ""
            if (article.getTitle() != null && !article.getTitle().isEmpty()) {
                put.addColumn(familyName.getBytes(), title.getBytes(), article.getTitle().getBytes());
                put.addColumn(familyName.getBytes(), from.getBytes(), article.getFrom().getBytes());
                put.addColumn(familyName.getBytes(), times.getBytes(), article.getTimes().getBytes());
                put.addColumn(familyName.getBytes(), readCounts.getBytes(), article.getReadCounts().getBytes());
                put.addColumn(familyName.getBytes(), content.getBytes(), article.getContent().getBytes());
                putList.add(put);
            }
        }
        // One batched put, then report how many seconds the write took
        table.put(putList);
        long endTime = System.currentTimeMillis();
        System.out.println((endTime - startTime) / 1000);
        table.close();
    }

    private static Table getTable() throws IOException {
        // Get an HBase connection and make sure the target table exists
        Configuration configuration = HBaseConfiguration.create();
        configuration.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");
        Connection connection = ConnectionFactory.createConnection(configuration);
        Admin admin = connection.getAdmin();
        // Describe the table: a single column family f1
        HTableDescriptor hTableDescriptor = new HTableDescriptor(TableName.valueOf(tableName));
        HColumnDescriptor f1 = new HColumnDescriptor(familyName);
        hTableDescriptor.addFamily(f1);
        if (!admin.tableExists(TableName.valueOf(tableName))) {
            admin.createTable(hTableDescriptor);
        }
        return connection.getTable(TableName.valueOf(tableName));
    }

    private static void save2Es(List<Article> exceInfo, TransportClient client) {
        // Save the articles into ES with a single bulk request
        BulkRequestBuilder bulk = client.prepareBulk();
        // Loop over the articles and add one IndexRequestBuilder per article
        for (Article article : exceInfo) {
            IndexRequestBuilder indexRequestBuilder = client.prepareIndex("articles", "article", article.getId());
            Gson gson = new Gson();
            String jsonStr = gson.toJson(article);
            indexRequestBuilder.setSource(jsonStr, XContentType.JSON);
            bulk.add(indexRequestBuilder);
        }
        // Executing the bulk request is what actually writes the data to ES
        BulkResponse bulkItemResponses = bulk.get();
        client.close();
    }

    private static TransportClient getEsClient() throws UnknownHostException {
        Settings settings = Settings.builder().put("cluster.name", "myes").build();
        TransportClient client = new PreBuiltTransportClient(settings)
                .addTransportAddress(new TransportAddress(InetAddress.getByName("node01"),9300))
                .addTransportAddress(new TransportAddress(InetAddress.getByName("node02"),9300));
        return client;
    }

}
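
In the main method above the ingest steps are commented out and only the HBase lookup for row 1216 runs. A typical end-to-end run first loads the Excel data into both stores and then performs the two-step query; the method below is a sketch of that order (runPipeline is not part of the original code, it simply chains the methods already defined in AppMain and could be added to that class).

    public static void runPipeline() throws IOException {
        // 1. Parse the Excel file into Article objects
        List<Article> articles = ExcelUtil.getExceInfo();

        // 2. Bulk-index the articles into ES (save2Es closes its client when it is done)
        save2Es(articles, getEsClient());

        // 3. Write the full articles, including the body, into HBase (saveToHbase closes its table)
        saveToHbase(articles, getTable());

        // 4. Query: ES returns the matching ids, HBase returns the bodies
        TransportClient queryClient = getEsClient();
        Table queryTable = getTable();
        for (String id : getByKeyWord(queryClient, "机器人")) {
            Result result = queryTable.get(new Get(Bytes.toBytes(id)));
            System.out.println(Bytes.toString(result.getValue(familyName.getBytes(), content.getBytes())));
        }
        queryTable.close();
        queryClient.close();
    }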