java模糊查询、自动补全的实现

  • 1 使用场景
  • 2 maven依赖
  • 3 拼音的工具类
  • 4 模糊搜索具体的实现
  • 5 模糊搜索字段的含义和用法
  • 6 调用
  • 7 工具类提供
  • 8 注意事项


1 使用场景

在平时的开发过程中,我们可能会遇到需要使用到模糊搜索的地方,类似这样的场景:

java 模糊匹配不区分大小写 java拼音模糊匹配字段_java

java 模糊匹配不区分大小写 java拼音模糊匹配字段_模糊搜索_02


java 模糊匹配不区分大小写 java拼音模糊匹配字段_lucene使用_03

java 模糊匹配不区分大小写 java拼音模糊匹配字段_lucene使用_04


那么我们该怎么实现呢?

2 maven依赖

引用模糊搜索jar包和拼音的jar包

<!-- Lucene core: provides the IndexWriter / IndexSearcher / query classes imported by the search service below. -->
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-core</artifactId>
    <version>3.6.0</version>
</dependency>

<!-- Lucene highlighter, kept at the same version as lucene-core.
     NOTE(review): none of the code shown in this article imports it; confirm it is needed. -->
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-highlighter</artifactId>
    <version>3.6.0</version>
</dependency>

<!-- pinyin4j: Chinese character to pinyin conversion, used by PinyinUtils. -->
<dependency>
  	<groupId>net.sourceforge.pinyin4j</groupId>
 	<artifactId>pinyin4j</artifactId>
  	<version>2.5.0</version>
</dependency>

3 拼音的工具类

开发将汉字转换为拼音首字母和拼音全拼的功能 如:北京->bj 、北京->beijing

import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

/**
 * Static helpers that convert Chinese text to pinyin using pinyin4j.
 *
 * <p>For characters with several readings only the first reading returned by
 * pinyin4j is used.
 */
public class PinyinUtils {

    /** First code unit of the CJK range treated as "Chinese" by {@link #getPinYin(String)}. */
    private static final int CJK_FIRST = 0x4E00;

    /** Last code unit of the CJK range treated as "Chinese" by {@link #getPinYin(String)}. */
    private static final int CJK_LAST = 0x9FA5;

    /**
     * Converts every Chinese character of {@code src} to its full lower-case,
     * tone-less pinyin and copies every other character unchanged,
     * e.g. "北京" becomes "beijing".
     *
     * @param src text to convert; {@code null} is treated as empty
     * @return the converted text, never {@code null}
     */
    public static String getPinYin(String src) {
        if (src == null) {
            return "";
        }
        HanyuPinyinOutputFormat format = getDefaultOutputFormat();
        StringBuilder result = new StringBuilder(src.length() * 2);
        try {
            for (int i = 0; i < src.length(); i++) {
                char ch = src.charAt(i);
                if (ch >= CJK_FIRST && ch <= CJK_LAST) {
                    String[] readings = PinyinHelper.toHanyuPinyinStringArray(ch, format);
                    // pinyin4j returns null when it has no reading for the character;
                    // keep the character itself instead of failing with an NPE.
                    if (readings != null && readings.length > 0) {
                        result.append(readings[0]);
                    } else {
                        result.append(ch);
                    }
                } else {
                    result.append(ch);
                }
            }
        } catch (BadHanyuPinyinOutputFormatCombination e) {
            // Best effort, as before: report the problem and return what was converted so far.
            e.printStackTrace();
        }
        return result.toString();
    }

    /**
     * Replaces every character that has a pinyin reading with the first letter
     * of that reading and copies every other character unchanged,
     * e.g. "北京" becomes "bj".
     *
     * @param str text to convert; {@code null} is treated as empty
     * @return the converted text, never {@code null}
     */
    public static String getPinYinHeadChar(String str) {
        if (str == null) {
            return "";
        }
        StringBuilder result = new StringBuilder(str.length());
        for (int i = 0; i < str.length(); i++) {
            char ch = str.charAt(i);
            String[] readings = PinyinHelper.toHanyuPinyinStringArray(ch);
            if (readings != null && readings.length > 0 && !readings[0].isEmpty()) {
                result.append(readings[0].charAt(0));
            } else {
                result.append(ch);
            }
        }
        return result.toString();
    }

    /**
     * Concatenates the full pinyin of every character of {@code src} that has a
     * pinyin reading.
     *
     * <p>Despite its name this method converts Chinese to pinyin, not the other
     * way round. Unlike {@link #getPinYin(String)}, characters without a reading
     * (letters, digits, punctuation) are dropped from the result.
     *
     * @param src text to convert; {@code null} is treated as empty
     * @return the concatenated pinyin, never {@code null}
     */
    public static String getChineseByPinYin(String src) {
        if (src == null) {
            return "";
        }
        // The format is the same for every character, so build it once.
        HanyuPinyinOutputFormat format = getDefaultOutputFormat();
        StringBuilder result = new StringBuilder(src.length() * 2);
        for (int i = 0; i < src.length(); i++) {
            try {
                String[] readings = PinyinHelper.toHanyuPinyinStringArray(src.charAt(i), format);
                if (readings != null && readings.length > 0) {
                    result.append(readings[0]);
                }
            } catch (BadHanyuPinyinOutputFormatCombination e) {
                // Best effort, as before: report the problem and continue with the next character.
                e.printStackTrace();
            }
        }
        return result.toString();
    }

    /**
     * Builds the output format shared by the conversions above: lower case,
     * no tone marks or numbers, and "v" in place of "ü".
     *
     * @return a new format instance
     */
    public static HanyuPinyinOutputFormat getDefaultOutputFormat() {
        HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
        format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
        format.setVCharType(HanyuPinyinVCharType.WITH_V);
        return format;
    }

    /** Small manual demo: prints the full pinyin and the initials of "北京". */
    public static void main(String[] args) {
        String pinyin = getPinYin("北京");
        String pinyinhead = getPinYinHeadChar("北京");
        System.out.println(pinyin + " ; " + pinyinhead);
    }
}

4 模糊搜索具体的实现

index函数为模糊搜索加载的内容,这里我们改成自己的数据。
search函数为模糊搜索的实现,直接调用该函数就可以获取我们想要的内容
话不多说 直接上代码

import java.io.IOException;
import java.io.Reader;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import com.renren.toro.dao.AppletNewsDao;
import com.renren.toro.model.AppletNews;
import com.renren.toro.service.SearcherNewService;
import com.renren.toro.util.ObjectUtil;
import com.renren.toro.util.PinyinUtils;
import com.renren.toro.util.SearchTokenizer;

import net.sf.json.JSONArray;
import net.sf.json.JSONObject;

@Service
public class SearcherNewServiceImpl implements SearcherNewService {
	
	private static final Logger LOGGER = Logger.getLogger("search");
	
	@Autowired
	private AppletNewsDao appletNewsDao;
	
	private static final String [] QUERY_FIELD = { "name" , "pinyin" , "pinyinHead", "id", "update_date", "show_date", "sticky_status", "sticky_text", "country_name", "label_name"}; // 需要参与模糊搜索的字段和最后需要显示的字段 如本次需求需要模糊搜索的字段为name、pinyin、pinyinHead 剩余字段不参与模糊搜索,仅为需要返回给前端显示的字段
	
	private static IndexSearcher indexSearcher = null;
	
	private static IndexReader reader = null;
	
    private static final String REGEX_NO = "^[0-9]\\w*$";
    
    private static final String REGEX_CHAR = "^[a-zA-Z]*";

    private static final int RESULT_COUNT = 100000;
    
    private static Directory ramdDrectory = new RAMDirectory();
    
    private final Lock writerLock = new ReentrantLock();
    
    private volatile IndexWriter writer = null;
	
    private Analyzer analyzer = new Analyzer(){
		@Override
		public TokenStream tokenStream(
				String fileName,
				Reader reader) {
			return new SearchTokenizer(reader);
		}
    };
    
    public IndexWriter getIndexWriter(Directory dir, IndexWriterConfig config) {
    	if (null == dir) {
    		throw new IllegalArgumentException("Directory can not be null."); 
    	}
    	if (null == config) {
    		throw new IllegalArgumentException("IndexWriterConfig can not be null.");
    	}
    	try {
			if (null == writer) {
				if (IndexWriter.isLocked(dir)) {
					//throw new LockObtainFailedException("Directory of index had been locked.");
					IndexWriter.unlock(dir);
				}
				writer = new IndexWriter(dir, config);
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
		}
    	return writer;
    }
    
	@Override
	public void index() throws CorruptIndexException,
			LockObtainFailedException, IOException {
		LOGGER.info(" init search method index() ");
		List<Map<String, Object>> list = loadResources();
		if (list == null || list.isEmpty()) return ;
		
		IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        try {
        	writerLock.lock();
			getIndexWriter(ramdDrectory, config);
			writer.deleteAll();
			
			Document doc = null;
			String pinyin = null;
			String pinyinHead = null;
			for (Map<String, Object> appleNews : list) {
			    //根据name生成对应的全拼
				pinyin = PinyinUtils.getChineseByPinYin(appleNews.get("name").toString()).toLowerCase();
				//根据name生成对应的拼音首字母
				pinyinHead = PinyinUtils.getPinYinHeadChar(appleNews.get("name").toString()).toLowerCase();
				//为每个字段赋值,根据自己需求展示对应字段 与上面数组对应即可, Field.Store和Field.Index具体的含义见下面解释
				doc = new Document();
				doc.add(new Field(QUERY_FIELD[0], appleNews.get("name").toString(), Field.Store.YES, Field.Index.ANALYZED));
				doc.add(new Field(QUERY_FIELD[1], pinyin, Field.Store.YES, Field.Index.NOT_ANALYZED));
			    doc.add(new Field(QUERY_FIELD[2], pinyinHead, Field.Store.YES, Field.Index.NOT_ANALYZED));
			    doc.add(new Field(QUERY_FIELD[3], String.valueOf(appleNews.get("id")), Field.Store.YES, Field.Index.NOT_ANALYZED));
			    doc.add(new Field(QUERY_FIELD[4], appleNews.get("updateDate").toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
			    doc.add(new Field(QUERY_FIELD[5], appleNews.get("showDate").toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
			    doc.add(new Field(QUERY_FIELD[6], appleNews.get("stickyStatus").toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
			    if(!ObjectUtil.isEmpty(appleNews, "stickyText")){
			    	doc.add(new Field(QUERY_FIELD[7], appleNews.get("stickyText").toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
			    }else{
			    	doc.add(new Field(QUERY_FIELD[7], "", Field.Store.YES, Field.Index.NOT_ANALYZED));
			    }
			    if(!ObjectUtil.isEmpty(appleNews, "countryName")){
			    	doc.add(new Field(QUERY_FIELD[8], appleNews.get("countryName").toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
			    }else{
			    	doc.add(new Field(QUERY_FIELD[8], "", Field.Store.YES, Field.Index.NOT_ANALYZED));
			    }
			    if(!ObjectUtil.isEmpty(appleNews, "labelName")){
			    	doc.add(new Field(QUERY_FIELD[9], appleNews.get("labelName").toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
			    }else{
			    	doc.add(new Field(QUERY_FIELD[9], "", Field.Store.YES, Field.Index.NOT_ANALYZED));
			    }
			    writer.addDocument(doc);
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			writer.close();
			writer = null;
			writerLock.unlock();
		}
	}

	@Override
	public Object search(String queryWord)
			throws Exception {
		JSONArray appletNewsList = new JSONArray();
		indexSearcher = getIndexSearcher(reader);
    	if (indexSearcher == null) {
    		return appletNewsList;
    	}
		Query query = null;
		PhraseQuery phrase = null;
		PrefixQuery prefix = null;
		BooleanQuery blquery = null;
        QueryParser parser = null;
        MultiFieldQueryParser multiParser = null;
        TermQuery term = null;
        String[] multiQueryField = {QUERY_FIELD[0]};
		if (queryWord.matches(REGEX_NO)) {
			queryWord = queryWord.toLowerCase();
			// code搜索
			phrase = new PhraseQuery();
			phrase.setSlop(0);
			for (int i = 0; i < queryWord.length(); i++) {
				phrase.add(new Term(QUERY_FIELD[2], Character.toString(queryWord.charAt(i))));
			}
			query = phrase;
		} else if (queryWord.matches(REGEX_CHAR)) {
			// 拼音搜索
			prefix = new PrefixQuery(new Term(QUERY_FIELD[1], queryWord.toLowerCase()));
			query = new WildcardQuery(new Term(QUERY_FIELD[2], queryWord.toLowerCase() + "*"));
			term = new TermQuery(new Term(QUERY_FIELD[0], queryWord.toLowerCase()));
			blquery = new BooleanQuery();
			blquery.add(prefix, Occur.SHOULD);
			blquery.add(query, Occur.SHOULD);
			blquery.add(term, Occur.SHOULD);
			query = blquery;
		} else {
			multiParser = new MultiFieldQueryParser(Version.LUCENE_36, multiQueryField, analyzer);
			parser = multiParser;
			parser.setDefaultOperator(QueryParser.Operator.AND);
			query = parser.parse(QueryParser.escape(queryWord));
		}
		LOGGER.info("query param is : " + query.toString());
		// start time
		TopScoreDocCollector collector = TopScoreDocCollector.create(RESULT_COUNT, false);
        long start = new Date().getTime();
        indexSearcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        JSONObject appletNews = null;
        for (ScoreDoc scoreDoc : hits) {
            Document doc = indexSearcher.doc(scoreDoc.doc);
            appletNews = new JSONObject();
            appletNews.put(QUERY_FIELD[0], doc.get(QUERY_FIELD[0]));
            appletNews.put(QUERY_FIELD[1], doc.get(QUERY_FIELD[1]));
            appletNews.put(QUERY_FIELD[2], doc.get(QUERY_FIELD[2]));
            appletNews.put(QUERY_FIELD[3], doc.get(QUERY_FIELD[3]));
            appletNews.put(QUERY_FIELD[4], doc.get(QUERY_FIELD[4]));
            appletNews.put(QUERY_FIELD[5], doc.get(QUERY_FIELD[5]));
            appletNews.put(QUERY_FIELD[6], doc.get(QUERY_FIELD[6]));
            appletNews.put(QUERY_FIELD[7], doc.get(QUERY_FIELD[7]));
            appletNews.put(QUERY_FIELD[8], doc.get(QUERY_FIELD[8]));
            appletNews.put(QUERY_FIELD[9], doc.get(QUERY_FIELD[9]));
            appletNewsList.add(appletNews);
        }
        // end time
        long end = new Date().getTime();
        LOGGER.info(
				"\nFound " + collector.getTotalHits() + " document(s) (in "
						+ (end - start) + " millindexSearchereconds) that matched query '"
						+ queryWord + "':"
		);
		return appletNewsList;
	}
	
	/**
	 * 获取索引
	 * @param reader
	 * @return
	 */
    private IndexSearcher getIndexSearcher(
    						IndexReader reader){
        try {
            if (reader == null) {
                reader = IndexReader.open(ramdDrectory);
            } else {
                //如果当前reader在打开期间index发生改变,则打开并返回一个新的IndexReader,否则返回null
                IndexReader ir = IndexReader.openIfChanged(reader);
                if (ir != null) {
                    reader.close();
                    reader = ir;
                }
            }
            return new IndexSearcher(reader);
        }catch(Exception e) {
            e.printStackTrace();
        }
        return null; //发生异常则返回null
    }

	@Override
	public void loadFundInfo() {}
	
	public List<Map<String, Object>> loadResources() {
		List<Map<String, Object>> fundInfoList = appletNewsDao.newSelectAll();
		return fundInfoList;
	}

}

5 模糊搜索字段的含义和用法

对照该用法对自己的参数进行设置

Field.Store.YES:存储字段值(未分词前的字段值) 
Field.Store.NO:不存储,存储与索引没有关系 
Field.Store.COMPRESS:压缩存储,用于长文本或二进制,但性能受损(注意:该选项自Lucene 3.0起已被移除,本文使用的3.6.0中不可用) 
Field.Index.ANALYZED:分词建索引 
Field.Index.ANALYZED_NO_NORMS:分词建索引,但不存储norms(索引期的boost和字段长度归一化因子),每个文档的该字段可以节省一个byte的空间 
Field.Index.NOT_ANALYZED:不分词且索引 
Field.Index.NOT_ANALYZED_NO_NORMS:不分词建索引,同样不存储norms,每个文档的该字段可以节省一个byte的空间 
TermVector表示文档的条目(由一个Document和Field定位)和它们在当前文档中所出现的次数 
Field.TermVector.YES:为每个文档(Document)存储该字段的TermVector 
Field.TermVector.NO:不存储TermVector 
Field.TermVector.WITH_POSITIONS:存储位置 
Field.TermVector.WITH_OFFSETS:存储偏移量 
Field.TermVector.WITH_POSITIONS_OFFSETS:存储位置和偏移量

6 调用

其实原理就是在项目启动的过程中将数据添加到内存中,那么我们开始设置启动加载
加载过程:

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationEvent;
import org.springframework.context.ApplicationListener;
import org.springframework.stereotype.Component;

import com.renren.toro.service.SearcherNewService;

/**
 * Builds the search index once, on the first application event received
 * after the bean has been created.
 */
@Component
public class StartUpInit implements ApplicationListener<ApplicationEvent> {

    private static final Logger logger = LoggerFactory.getLogger(StartUpInit.class);

    @Autowired
    private SearcherNewService searcherNewService;

    /** Set as soon as indexing has been attempted, so later events are ignored. */
    private static boolean isStart = false;

    /**
     * Triggers {@code searcherNewService.index()} on the first event only.
     * A failure is logged and not retried on later events.
     *
     * @param event the received event; its type and content are not inspected
     */
    @Override
    public void onApplicationEvent(ApplicationEvent event) {
        if (isStart) {
            return;
        }
        isStart = true;
        try {
            logger.info(" init search data ");
            searcherNewService.index();
        } catch (Exception e) {
            logger.error("failed to build the search index at startup", e);
        }
    }
}

调用过程:

JSONArray letterList = (JSONArray) searcherNewService.search(search);

至此我们就完成了模糊搜索的全部内容,在实现的过程中根据自己的实际需求改动即可。更深层次的研究大家可以看看官方文档和lucene包的源码

7 工具类提供

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

import java.io.IOException;
import java.io.Reader;

/**
 * Tokenizer that emits one token per digit or letter read from the input
 * (letters are lower-cased) and skips every other character.
 */
public final class SearchTokenizer extends Tokenizer {

    private final TermAttribute termAtt = addAttribute(TermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    /** Number of characters consumed from the input so far. */
    private int pos;

    public SearchTokenizer(Reader input) {
        super(input);
    }

    /**
     * Advances to the next digit or letter and publishes it as a one-character term.
     *
     * @return {@code true} if a token was produced, {@code false} at end of input
     */
    @Override
    public final boolean incrementToken() throws IOException {
        clearAttributes();

        int current;
        while ((current = input.read()) != -1) {
            boolean indexable = Character.isDigit(current)
                    || Character.isLetter(current)
                    || (current >= 19968 && current <= 171941);
            if (!indexable) {
                // Not part of any token: only account for it in the offset.
                pos += Character.charCount(current);
                continue;
            }

            String text = String.valueOf((char) current);
            if (Character.isLetter(current)) {
                text = text.toLowerCase();
            }
            termAtt.setTermBuffer(text);
            termAtt.setTermLength(1);

            int startOffset = correctOffset(pos);
            pos++;
            int endOffset = correctOffset(pos);
            offsetAtt.setOffset(startOffset, endOffset);
            return true;
        }
        return false;
    }

    /** Sets both offsets to the corrected final position. */
    @Override
    public final void end() throws IOException {
        super.end();
        int last = correctOffset(pos);
        offsetAtt.setOffset(last, last);
    }

    /** Restarts offset counting from zero. */
    @Override
    public final void reset() throws IOException {
        pos = 0;
    }

}
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;

import java.util.Map;

/**
 * Helpers for inspecting map values.
 */
@Component
public class ObjectUtil {
    /**
     * Tells whether the value stored under {@code key} is missing or empty.
     *
     * <p>The value is compared by its {@code toString()} form, so the result is
     * only meaningful for values whose string form represents their content.
     *
     * @param map the map to look in, may be {@code null}
     * @param key the key to look up
     * @return {@code true} if {@code map} is {@code null}, the value is
     *         {@code null}, or its string form is empty; {@code false} otherwise
     */
    public static boolean isEmpty(Map<String, Object> map, String key) {
        if (map == null) {
            return true;
        }
        Object raw = map.get(key);
        if (raw == null) {
            return true;
        }
        return StringUtils.isEmpty(raw.toString());
    }
}

8 注意事项

需要注意的是如果模糊查询的数据发生变化,需要调用index函数或者重启项目来重新将数据索引读入到缓存中。
如果数据更新比较频繁,建议在增删改接口的末尾调用index函数,重新将索引读入到缓存中。