java 全文检索原理 java实现全文检索

转载

架构师之光 2023-07-17 16:37:15

文章标签 java 全文检索原理 lucene 搜索索引全文索引 文章分类 Java 后端开发

java之全文索引搜索lucene之增删改查文档

在java web开发中经常需要跟“搜索”打交道，大多数应用如果对于搜索的精确匹配度要求不高的话，一般都采用模糊查询的方式，但是很明显，这对于用户来说，体验性却不会很好，如你搜索“lucene”，如果是模糊查询，那就是“like %lucene%” 的搜索，那么排的靠前的结果估计是lucene的官网，百度百科，各种介绍，最后才是案例之类的，如果你是搜索“spring lucene 案例”，那么很有可能一上来就是一个整合的demo，不信的话你可以上百度搜索，有条件的话，google也行

关于lucene的一些介绍以及简单入门就不说了，接下来将花2-3篇博文来记录一些我学习lucene的笔记。本文将介绍一下我对于lucene在使用时的理解以及对于记录（或者叫文档）的增删改查时索引的变化。

下面这张图是我对Lucene全文索引过程的大致理解：

java 全文检索原理 java实现全文检索_java 全文检索原理

接下来，就要上真正的代码了：用于验证我上面所讲的图。下面介绍一下lucene对文档进行增删改的过程，对于查的介绍，在后文进行详细的介绍。

package com.steadyjack.lucene02;

import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.Before;
import org.junit.Test;

/**
 * Description:添加删除修改文档
 * Date:2017年2月22日 下午8:59:12
 * @author debug-steadyjack
 */
public class TestIndexing {
	
	private String ids[]={"1","2","3"};
	private String citys[]={"qingdao","nanjing","shanghai"};
	private String descs[]={
			"Qingdao is a beautiful city.",
			"Nanjing is a city of culture.",
			"Shanghai is a bustling city."
	};
	
	private Directory dir;
	
	/**
	 * Description:相当于启动时-构建索引器（索引文件目录）
	 * Date:2017年2月22日 下午8:59:17
	 * Author:debug-steadyjack
	 * @throws Exception
	 */
	@Before
	public void before()throws Exception{
		dir=FSDirectory.open(Paths.get("D:\\lucene-exercise\\lucene2"));
		IndexWriter writer=getWriter();
		
		for(int i=0;i<ids.length;i++){
			Document doc=new Document();
			doc.add(new StringField("id", ids[i], Field.Store.YES));
			doc.add(new StringField("city", citys[i], Field.Store.YES));
			doc.add(new StringField("desc", descs[i], Field.Store.NO));
			//将文档的用于索引的部分写入 索引器
			writer.addDocument(doc);
		}
		writer.close();
	}
	
	/**
	 * Description:获得写索引器
	 * Date:2017年2月22日 下午9:00:30
	 * Author:debug-steadyjack
	 * @return
	 * @throws Exception
	 */
	public IndexWriter getWriter()throws Exception{
		//标准分词器
		Analyzer analyzer=new StandardAnalyzer();
		IndexWriterConfig iwc=new IndexWriterConfig(analyzer);
		IndexWriter writer=new IndexWriter(dir, iwc);
		return writer;
	}
	
	/**
	 * Description:
	 * Date:2017年2月22日 下午9:00:55
	 * Author:debug-steadyjack
	 * @throws Exception
	 */
	@Test
	public void testIndexWriter()throws Exception{
		IndexWriter writer=getWriter();
		System.out.println("写入了 "+writer.numDocs()+" 个文档");
		writer.close();
	}
	
	/**
	 * Description:测试读取索引所在的文件目录共有多少个文档（即用于建立索引的文档有多少个）
	 * Date:2017年2月22日 下午9:01:34
	 * Author:debug-steadyjack
	 * @throws Exception
	 */
	@Test
	public void testIndexReader()throws Exception{
		IndexReader reader=DirectoryReader.open(dir);
		System.out.println("读入的最大文档: "+reader.maxDoc());
		System.out.println("读入的实际文档: "+reader.numDocs());
		reader.close();
	}
	
	/**
	 * Description:删除文档-不合并
	 * Date:2017年2月22日 下午9:03:10
	 * Author:debug-steadyjack
	 * @throws Exception
	 */
	@Test
	public void testDeleteBeforeMerge()throws Exception{
		IndexWriter writer=getWriter();
		System.out.println("删除之前 "+writer.numDocs());
		//Term其实也可以理解为关键字：在这里表示根据id来删除文档
		writer.deleteDocuments(new Term("id","1"));

		writer.commit();
		System.out.println("删除不合并后的最大文档: "+writer.maxDoc());
		System.out.println("删除不合并后的实际文档: "+writer.numDocs());
		writer.close();
	}
	
	/**
	 * Description:删除后合并
	 * Date:2017年2月22日 下午9:04:50
	 * Author:debug-steadyjack
	 * @throws Exception
	 */
	@Test
	public void testDeleteAfterMerge()throws Exception{
		IndexWriter writer=getWriter();
		System.out.println("删除后实际文档：  "+writer.numDocs());
		writer.deleteDocuments(new Term("id","1"));
		//强制删除
		writer.forceMergeDeletes();
		writer.commit();
		System.out.println("删除后最大文档: "+writer.maxDoc());
		System.out.println("删除后实际文档: "+writer.numDocs());
		writer.close();
	}
		
	
	/**
	 * Description:更新文档
	 * Date:2017年2月22日 下午9:06:09
	 * Author:debug-steadyjack
	 * @throws Exception
	 */
	@Test
	public void testUpdate()throws Exception{
		IndexWriter writer=getWriter();
		Document doc=new Document();
		//StringField:String类型-直接存储，不分词，,TextField:文档类型-一般不存储，用于分词
		doc.add(new StringField("id", "1", Field.Store.YES));
		doc.add(new StringField("city","qingdao",Field.Store.YES));
		doc.add(new TextField("desc", "dsss is a city.", Field.Store.NO));
		//更新-也是根据关键字来更新
		writer.updateDocument(new Term("id","1"), doc);
		writer.close();
	}
	
}

当然啦，上面的代码，只需要你建立一个简单的java project即可，然后建立lib文件夹，放入相关的jar，add to buildpath...即可，jar包可以来这里下载：lucene应用所需要的jar包，后文介绍的博文也是需要这些jar的。

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。