java模糊查询、自动补全的实现
- 1使用场景
- 2 maven依赖
- 3 拼音的工具类
- 4 模糊搜索具体的实现
- 5 模糊搜索字段的含义和用法
- 6 调用
- 7 工具类提供
- 8 注意事项
1使用场景
在平时的开发过程中,我们可能会遇到需要使用到模糊搜索的地方,类似这样的场景:
那么我们该怎么实现呢?
2 maven依赖
引用模糊搜索jar包和拼音的jar包
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>3.6.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>3.6.0</version>
</dependency>
<dependency>
<groupId>net.sourceforge.pinyin4j</groupId>
<artifactId>pinyin4j</artifactId>
<version>2.5.0</version>
</dependency>
3 拼音的工具类
开发将汉字转换为拼音首字母和拼音全拼的功能 如:北京->bj 、北京->beijing
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
/**
 * Static helpers built on pinyin4j that turn Chinese text into pinyin,
 * e.g. full spelling ("beijing") or first letters ("bj").
 */
public class PinyinUtils {

    /**
     * Converts a string to full pinyin.
     *
     * <p>Characters in the range U+4E00..U+9FA5 are replaced by the first
     * reading pinyin4j returns (lowercase, no tone, "v" for u-umlaut, see
     * {@link #getDefaultOutputFormat()}). Every other character is copied
     * unchanged. A character in that range for which pinyin4j has no reading
     * is also copied unchanged instead of causing a NullPointerException.
     *
     * @param src text to convert; must not be null
     * @return the converted text
     * @throws NullPointerException if {@code src} is null
     */
    public static String getPinYin(String src) {
        char[] chars = src.toCharArray();
        HanyuPinyinOutputFormat format = getDefaultOutputFormat();
        StringBuilder result = new StringBuilder(chars.length * 2);
        try {
            for (char ch : chars) {
                // Same range the original regex [\u4E00-\u9FA5] tested, without a regex per char.
                boolean chinese = ch >= 0x4E00 && ch <= 0x9FA5;
                String[] readings = chinese ? PinyinHelper.toHanyuPinyinStringArray(ch, format) : null;
                if (readings != null && readings.length > 0) {
                    // Polyphonic characters: only the first reading is used.
                    result.append(readings[0]);
                } else {
                    result.append(ch);
                }
            }
        } catch (BadHanyuPinyinOutputFormatCombination e) {
            // Not expected with the fixed default format; the text converted so far is returned.
            e.printStackTrace();
        }
        return result.toString();
    }

    /**
     * Builds a string from the first letter of each character's pinyin.
     *
     * <p>For each character the first letter of the first reading returned by
     * pinyin4j is appended; characters without a reading are appended as-is.
     *
     * @param str text to convert; must not be null
     * @return the initials string, e.g. "bj" for "北京"
     * @throws NullPointerException if {@code str} is null
     */
    public static String getPinYinHeadChar(String str) {
        StringBuilder initials = new StringBuilder(str.length());
        for (int j = 0; j < str.length(); j++) {
            char word = str.charAt(j);
            String[] pinyinArray = PinyinHelper.toHanyuPinyinStringArray(word);
            if (pinyinArray != null && pinyinArray.length > 0 && !pinyinArray[0].isEmpty()) {
                initials.append(pinyinArray[0].charAt(0));
            } else {
                initials.append(word);
            }
        }
        return initials.toString();
    }

    /**
     * Converts the Chinese characters of a string to full pinyin and drops
     * everything else.
     *
     * <p>Despite its name this method produces pinyin from Chinese text.
     * Unlike {@link #getPinYin(String)}, a character for which pinyin4j
     * returns no reading (non-Chinese characters, presumably) is skipped
     * rather than copied.
     *
     * @param src text to convert; must not be null
     * @return concatenated first readings of the convertible characters
     * @throws NullPointerException if {@code src} is null
     */
    public static String getChineseByPinYin(String src) {
        char[] chars = src.toCharArray();
        // The format is loop-invariant, so it is built once.
        HanyuPinyinOutputFormat format = getDefaultOutputFormat();
        StringBuilder sb = new StringBuilder(chars.length * 2);
        for (char ch : chars) {
            try {
                String[] pinYin = PinyinHelper.toHanyuPinyinStringArray(ch, format);
                if (pinYin != null && pinYin.length > 0) {
                    sb.append(pinYin[0]);
                }
            } catch (BadHanyuPinyinOutputFormatCombination e) {
                // Not expected with the fixed default format; the character is skipped.
                e.printStackTrace();
            }
        }
        return sb.toString();
    }

    /**
     * Creates the output format shared by the conversions in this class:
     * lowercase letters, no tone marks or numbers, and "v" for u-umlaut.
     *
     * @return a new format instance
     */
    public static HanyuPinyinOutputFormat getDefaultOutputFormat() {
        HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
        format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
        format.setVCharType(HanyuPinyinVCharType.WITH_V);
        return format;
    }

    /** Small demo: prints the full pinyin and the initials of "北京". */
    public static void main(String [] args) {
        String pinyin = getPinYin("北京");
        String pinyinhead = getPinYinHeadChar("北京");
        System.out.println(pinyin + " ; " + pinyinhead);
    }
}
4 模糊搜索具体的实现
index函数负责加载并索引模糊搜索所需的数据,这里需要改成加载我们自己的数据。
search函数为模糊搜索的实现,直接调用该函数就可以获取我们想要的内容
话不多说 直接上代码
import java.io.IOException;
import java.io.Reader;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import com.renren.toro.dao.AppletNewsDao;
import com.renren.toro.model.AppletNews;
import com.renren.toro.service.SearcherNewService;
import com.renren.toro.util.ObjectUtil;
import com.renren.toro.util.PinyinUtils;
import com.renren.toro.util.SearchTokenizer;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
@Service
public class SearcherNewServiceImpl implements SearcherNewService {
private static final Logger LOGGER = Logger.getLogger("search");
@Autowired
private AppletNewsDao appletNewsDao;
private static final String [] QUERY_FIELD = { "name" , "pinyin" , "pinyinHead", "id", "update_date", "show_date", "sticky_status", "sticky_text", "country_name", "label_name"}; // 需要参与模糊搜索的字段和最后需要显示的字段 如本次需求需要模糊搜索的字段为name、pinyin、pinyinHead 剩余字段不参与模糊搜索,仅为需要返回给前端显示的字段
private static IndexSearcher indexSearcher = null;
private static IndexReader reader = null;
private static final String REGEX_NO = "^[0-9]\\w*$";
private static final String REGEX_CHAR = "^[a-zA-Z]*";
private static final int RESULT_COUNT = 100000;
private static Directory ramdDrectory = new RAMDirectory();
private final Lock writerLock = new ReentrantLock();
private volatile IndexWriter writer = null;
private Analyzer analyzer = new Analyzer(){
@Override
public TokenStream tokenStream(
String fileName,
Reader reader) {
return new SearchTokenizer(reader);
}
};
public IndexWriter getIndexWriter(Directory dir, IndexWriterConfig config) {
if (null == dir) {
throw new IllegalArgumentException("Directory can not be null.");
}
if (null == config) {
throw new IllegalArgumentException("IndexWriterConfig can not be null.");
}
try {
if (null == writer) {
if (IndexWriter.isLocked(dir)) {
//throw new LockObtainFailedException("Directory of index had been locked.");
IndexWriter.unlock(dir);
}
writer = new IndexWriter(dir, config);
}
} catch (IOException e) {
e.printStackTrace();
} finally {
}
return writer;
}
@Override
public void index() throws CorruptIndexException,
LockObtainFailedException, IOException {
LOGGER.info(" init search method index() ");
List<Map<String, Object>> list = loadResources();
if (list == null || list.isEmpty()) return ;
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
try {
writerLock.lock();
getIndexWriter(ramdDrectory, config);
writer.deleteAll();
Document doc = null;
String pinyin = null;
String pinyinHead = null;
for (Map<String, Object> appleNews : list) {
//根据name生成对应的全拼
pinyin = PinyinUtils.getChineseByPinYin(appleNews.get("name").toString()).toLowerCase();
//根据name生成对应的拼音首字母
pinyinHead = PinyinUtils.getPinYinHeadChar(appleNews.get("name").toString()).toLowerCase();
//为每个字段赋值,根据自己需求展示对应字段 与上面数组对应即可, Field.Store和Field.Index具体的含义见下面解释
doc = new Document();
doc.add(new Field(QUERY_FIELD[0], appleNews.get("name").toString(), Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field(QUERY_FIELD[1], pinyin, Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field(QUERY_FIELD[2], pinyinHead, Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field(QUERY_FIELD[3], String.valueOf(appleNews.get("id")), Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field(QUERY_FIELD[4], appleNews.get("updateDate").toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field(QUERY_FIELD[5], appleNews.get("showDate").toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field(QUERY_FIELD[6], appleNews.get("stickyStatus").toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
if(!ObjectUtil.isEmpty(appleNews, "stickyText")){
doc.add(new Field(QUERY_FIELD[7], appleNews.get("stickyText").toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
}else{
doc.add(new Field(QUERY_FIELD[7], "", Field.Store.YES, Field.Index.NOT_ANALYZED));
}
if(!ObjectUtil.isEmpty(appleNews, "countryName")){
doc.add(new Field(QUERY_FIELD[8], appleNews.get("countryName").toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
}else{
doc.add(new Field(QUERY_FIELD[8], "", Field.Store.YES, Field.Index.NOT_ANALYZED));
}
if(!ObjectUtil.isEmpty(appleNews, "labelName")){
doc.add(new Field(QUERY_FIELD[9], appleNews.get("labelName").toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
}else{
doc.add(new Field(QUERY_FIELD[9], "", Field.Store.YES, Field.Index.NOT_ANALYZED));
}
writer.addDocument(doc);
}
} catch (Exception e) {
e.printStackTrace();
} finally {
writer.close();
writer = null;
writerLock.unlock();
}
}
@Override
public Object search(String queryWord)
throws Exception {
JSONArray appletNewsList = new JSONArray();
indexSearcher = getIndexSearcher(reader);
if (indexSearcher == null) {
return appletNewsList;
}
Query query = null;
PhraseQuery phrase = null;
PrefixQuery prefix = null;
BooleanQuery blquery = null;
QueryParser parser = null;
MultiFieldQueryParser multiParser = null;
TermQuery term = null;
String[] multiQueryField = {QUERY_FIELD[0]};
if (queryWord.matches(REGEX_NO)) {
queryWord = queryWord.toLowerCase();
// code搜索
phrase = new PhraseQuery();
phrase.setSlop(0);
for (int i = 0; i < queryWord.length(); i++) {
phrase.add(new Term(QUERY_FIELD[2], Character.toString(queryWord.charAt(i))));
}
query = phrase;
} else if (queryWord.matches(REGEX_CHAR)) {
// 拼音搜索
prefix = new PrefixQuery(new Term(QUERY_FIELD[1], queryWord.toLowerCase()));
query = new WildcardQuery(new Term(QUERY_FIELD[2], queryWord.toLowerCase() + "*"));
term = new TermQuery(new Term(QUERY_FIELD[0], queryWord.toLowerCase()));
blquery = new BooleanQuery();
blquery.add(prefix, Occur.SHOULD);
blquery.add(query, Occur.SHOULD);
blquery.add(term, Occur.SHOULD);
query = blquery;
} else {
multiParser = new MultiFieldQueryParser(Version.LUCENE_36, multiQueryField, analyzer);
parser = multiParser;
parser.setDefaultOperator(QueryParser.Operator.AND);
query = parser.parse(QueryParser.escape(queryWord));
}
LOGGER.info("query param is : " + query.toString());
// start time
TopScoreDocCollector collector = TopScoreDocCollector.create(RESULT_COUNT, false);
long start = new Date().getTime();
indexSearcher.search(query, collector);
ScoreDoc[] hits = collector.topDocs().scoreDocs;
JSONObject appletNews = null;
for (ScoreDoc scoreDoc : hits) {
Document doc = indexSearcher.doc(scoreDoc.doc);
appletNews = new JSONObject();
appletNews.put(QUERY_FIELD[0], doc.get(QUERY_FIELD[0]));
appletNews.put(QUERY_FIELD[1], doc.get(QUERY_FIELD[1]));
appletNews.put(QUERY_FIELD[2], doc.get(QUERY_FIELD[2]));
appletNews.put(QUERY_FIELD[3], doc.get(QUERY_FIELD[3]));
appletNews.put(QUERY_FIELD[4], doc.get(QUERY_FIELD[4]));
appletNews.put(QUERY_FIELD[5], doc.get(QUERY_FIELD[5]));
appletNews.put(QUERY_FIELD[6], doc.get(QUERY_FIELD[6]));
appletNews.put(QUERY_FIELD[7], doc.get(QUERY_FIELD[7]));
appletNews.put(QUERY_FIELD[8], doc.get(QUERY_FIELD[8]));
appletNews.put(QUERY_FIELD[9], doc.get(QUERY_FIELD[9]));
appletNewsList.add(appletNews);
}
// end time
long end = new Date().getTime();
LOGGER.info(
"\nFound " + collector.getTotalHits() + " document(s) (in "
+ (end - start) + " millindexSearchereconds) that matched query '"
+ queryWord + "':"
);
return appletNewsList;
}
/**
* 获取索引
* @param reader
* @return
*/
private IndexSearcher getIndexSearcher(
IndexReader reader){
try {
if (reader == null) {
reader = IndexReader.open(ramdDrectory);
} else {
//如果当前reader在打开期间index发生改变,则打开并返回一个新的IndexReader,否则返回null
IndexReader ir = IndexReader.openIfChanged(reader);
if (ir != null) {
reader.close();
reader = ir;
}
}
return new IndexSearcher(reader);
}catch(Exception e) {
e.printStackTrace();
}
return null; //发生异常则返回null
}
@Override
public void loadFundInfo() {}
public List<Map<String, Object>> loadResources() {
List<Map<String, Object>> fundInfoList = appletNewsDao.newSelectAll();
return fundInfoList;
}
}
5 模糊搜索字段的含义和用法
对照该用法对自己的参数进行设置
Field.Store.YES:存储字段值(未分词前的字段值)
Field.Store.NO:不存储,存储与索引没有关系
Field.Store.COMPRESS:压缩存储,用于长文本或二进制,但性能受损(注意:该选项在Lucene 2.9中已被废弃,并从3.0起被移除,本文使用的3.6.0中不可用)
Field.Index.ANALYZED:分词建索引
Field.Index.ANALYZED_NO_NORMS:分词建索引,但不存储norms(索引时的加权和长度归一化信息),每个文档的该字段可节省一个byte,代价是该字段不再支持索引时加权和长度归一化
Field.Index.NOT_ANALYZED:不分词且索引
Field.Index.NOT_ANALYZED_NO_NORMS:不分词建索引,且不存储norms,每个文档的该字段可节省一个byte
TermVector表示文档的条目(由一个Document和Field定位)和它们在当前文档中所出现的次数
Field.TermVector.YES:为每个文档(Document)存储该字段的TermVector
Field.TermVector.NO:不存储TermVector
Field.TermVector.WITH_POSITIONS:存储位置
Field.TermVector.WITH_OFFSETS:存储偏移量
Field.TermVector.WITH_POSITIONS_OFFSETS:存储位置和偏移量
6 调用
其实原理就是在项目启动的过程中将数据添加到内存中,那么我们开始设置启动加载
加载过程:
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationEvent;
import org.springframework.context.ApplicationListener;
import org.springframework.stereotype.Component;
import com.renren.toro.service.SearcherNewService;
/**
 * Builds the search index once, on the first application event received
 * after this listener is registered.
 */
@Component
public class StartUpInit implements ApplicationListener<ApplicationEvent>{
    private static final Logger logger = LoggerFactory.getLogger(StartUpInit.class);

    @Autowired
    private SearcherNewService searcherNewService;

    /** Set on the first event so that later events do not rebuild the index. */
    private static boolean isStart = false;

    /**
     * Triggers {@code searcherNewService.index()} on the first event only.
     *
     * <p>The flag is set before indexing starts, so a failed first attempt is
     * not retried by later events. The failure is written to the log.
     *
     * @param event any application event; its content is not inspected
     */
    @Override
    public void onApplicationEvent(ApplicationEvent event) {
        if (isStart) {
            return;
        }
        isStart = true;
        try {
            logger.info(" init search data ");
            searcherNewService.index();
        } catch (Exception e1) {
            logger.error("Failed to build the search index during startup", e1);
        }
    }
}
调用过程:
JSONArray letterList = (JSONArray) searcherNewService.search(search);
至此我们就完成了模糊搜索的全部内容,在实现的过程中根据自己的实际需求改动即可。更深层次的研究大家可以看看官方文档和lucene包的源码
7 工具类提供
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.io.IOException;
import java.io.Reader;
/**
 * Tokenizer that emits one token per accepted character.
 *
 * <p>A character is accepted when it is a digit, a letter, or has a code
 * point of 19968 (U+4E00) or above; everything else is skipped. Letters are
 * lower-cased.
 *
 * Created by Administrator on 2019/2/26.
 */
public final class SearchTokenizer extends Tokenizer {
    private final TermAttribute termAtt = addAttribute(TermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    /** Number of chars consumed from the input so far. */
    private int pos;

    public SearchTokenizer(Reader input){
        super(input);
    }

    /**
     * Advances to the next accepted character and exposes it as a
     * one-character term with its start/end offsets.
     *
     * @return {@code false} once the input is exhausted
     */
    @Override
    public final boolean incrementToken() throws IOException {
        clearAttributes();
        while (true) {
            int c = input.read();
            if (c == -1) {
                return false;
            }
            // Digits, letters, and anything from U+4E00 upwards.
            if (Character.isDigit(c) || Character.isLetter(c) || (c >= 19968 && c <= 171941)) {
                char ch = (char) c;
                if (Character.isLetter(c)) {
                    // Character.toLowerCase ignores the default locale, so the indexed
                    // token does not change with the JVM locale (String.toLowerCase()
                    // maps 'I' to dotless 'ı' under Turkish/Azeri locales).
                    ch = Character.toLowerCase(ch);
                }
                termAtt.setTermBuffer(String.valueOf(ch));
                termAtt.setTermLength(1);
                int start = pos++;
                offsetAtt.setOffset(correctOffset(start), correctOffset(pos));
                return true;
            }
            pos += Character.charCount(c);
        }
    }

    /** Sets the final offset to the total number of chars consumed. */
    @Override
    public final void end() throws IOException {
        super.end();
        int finalOffset = correctOffset(pos);
        offsetAtt.setOffset(finalOffset, finalOffset);
    }

    /** Restarts offset counting from zero. */
    @Override
    public final void reset() throws IOException {
        pos = 0;
    }
}
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
import java.util.Map;
/**
 * Map-related helper methods.
 *
 * Created by Administrator on 2019/2/26.
 */
@Component
public class ObjectUtil {

    /**
     * Tells whether the value stored under {@code key} is missing or empty.
     *
     * <p>The value is judged by its {@code toString()} text, so the result is
     * mainly meaningful for string-like values.
     *
     * @param map map to look in; may be null
     * @param key key to look up
     * @return {@code true} if the map is null, the value is null, or the
     *         value's text is empty; {@code false} otherwise
     */
    public static boolean isEmpty(Map<String, Object> map, String key){
        if (map == null) {
            return true;
        }
        Object raw = map.get(key);
        if (raw == null) {
            return true;
        }
        return StringUtils.isEmpty(raw.toString());
    }
}
8 注意事项
需要注意的是如果模糊查询的数据发生变化,需要调用index函数或者重启项目来重新将数据索引读入到缓存中。
如果数据更新比较频繁,建议在增删改接口的末尾调用index函数,重新将索引读入到缓存中。