1.ik 热词及近义词 远程字典的获取方式

简单看下源码,这里需要注意的
1.每次轮询校验的时候设置了请求头 “If-Modified-Since”,“If-None-Match”
2.用 “Etag”和 “Last-Modified” 来确定文件是否发生变化
3.词库有更新的时候调用了 Dictionary.getSingleton().reLoadMainDict();, reLoadMainDict里调用了 loadRemoteExtDict() 来加载远程字典 然后 getRemoteWords 和 getRemoteWordsUnprivileged 来获取词条,获取词条的请求头并没有加上面两个属性

package org.wltea.analyzer.dic;
public class Monitor implements Runnable {
	.....
	....
	/**
	 * 监控流程:
	 *  ①向词库服务器发送Head请求
	 *  ②从响应中获取Last-Modify、ETags字段值,判断是否变化
	 *  ③如果未变化,休眠1min,返回第①步
	 * 	④如果有变化,重新加载词典
	 *  ⑤休眠1min,返回第①步
	 */

	public void runUnprivileged() {

		//超时设置
		RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10*1000)
				.setConnectTimeout(10*1000).setSocketTimeout(15*1000).build();

		HttpHead head = new HttpHead(location);
		head.setConfig(rc);

		//设置请求头
		if (last_modified != null) {
			head.setHeader("If-Modified-Since", last_modified);
		}
		if (eTags != null) {
			head.setHeader("If-None-Match", eTags);
		}

		CloseableHttpResponse response = null;
		try {

			response = httpclient.execute(head);

			//返回200 才做操作
			if(response.getStatusLine().getStatusCode()==200){

				if (((response.getLastHeader("Last-Modified")!=null) && !response.getLastHeader("Last-Modified").getValue().equalsIgnoreCase(last_modified))
						||((response.getLastHeader("ETag")!=null) && !response.getLastHeader("ETag").getValue().equalsIgnoreCase(eTags))) {

					// 远程词库有更新,需要重新加载词典,并修改last_modified,eTags
					 Dictionary.getSingleton().reLoadMainDict();
					last_modified = response.getLastHeader("Last-Modified")==null?null:response.getLastHeader("Last-Modified").getValue();
					eTags = response.getLastHeader("ETag")==null?null:response.getLastHeader("ETag").getValue();
				}
			}else if (response.getStatusLine().getStatusCode()==304) {
				//没有修改,不做操作
				//noop
			}else{
				logger.info("remote_ext_dict {} return bad code {}" , location , response.getStatusLine().getStatusCode() );
			}

		} catch (Exception e) {
			logger.error("remote_ext_dict {} error!",e , location);
		}finally{
			try {
				if (response != null) {
					response.close();
				}
			} catch (IOException e) {
				logger.error(e.getMessage(), e);
			}
		}
	}
}
....
....

/**
 * 词典管理类,单子模式
 */
public class Dictionary {
...
...
	void reLoadMainDict() {
		logger.info("start to reload ik dict.");
		// 新开一个实例加载词典,减少加载过程对当前词典使用的影响
		Dictionary tmpDict = new Dictionary(configuration);
		tmpDict.configuration = getSingleton().configuration;
		tmpDict.loadMainDict();
		tmpDict.loadStopWordDict();
		_MainDict = tmpDict._MainDict;
		_StopWords = tmpDict._StopWords;
		logger.info("reload ik dict finished.");
	}
	/**
	 * 加载主词典及扩展词典
	 */
	private void loadMainDict() {
		// 建立一个主词典实例
		_MainDict = new DictSegment((char) 0);

		// 读取主词典文件
		Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_MAIN);
		loadDictFile(_MainDict, file, false, "Main Dict");
		// 加载扩展词典
		this.loadExtDict();
		// 加载远程自定义词库
		this.loadRemoteExtDict();
	}


	/**
	 * 加载远程扩展词典到主词库表
	 */
	private void loadRemoteExtDict() {
		List<String> remoteExtDictFiles = getRemoteExtDictionarys();
		for (String location : remoteExtDictFiles) {
			logger.info("[Dict Loading] " + location);
			List<String> lists = getRemoteWords(location);
			// 如果找不到扩展的字典,则忽略
			if (lists == null) {
				logger.error("[Dict Loading] " + location + " load failed");
				continue;
			}
			for (String theWord : lists) {
				if (theWord != null && !"".equals(theWord.trim())) {
					// 加载扩展词典数据到主内存词典中
					logger.info(theWord);
					_MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
				}
			}
		}

	}
	private static List<String> getRemoteWords(String location) {
		SpecialPermission.check();
		return AccessController.doPrivileged((PrivilegedAction<List<String>>) () -> {
			return getRemoteWordsUnprivileged(location);
		});
	}
   }
	
	/**
	 * 从远程服务器上下载自定义词条
	 */
	private static List<String> getRemoteWordsUnprivileged(String location) {

		List<String> buffer = new ArrayList<String>();
		RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10 * 1000).setConnectTimeout(10 * 1000)
				.setSocketTimeout(60 * 1000).build();
		CloseableHttpClient httpclient = HttpClients.createDefault();
		CloseableHttpResponse response;
		BufferedReader in;
		HttpGet get = new HttpGet(location);
		get.setConfig(rc);
		try {
			response = httpclient.execute(get);
			if (response.getStatusLine().getStatusCode() == 200) {

				String charset = "UTF-8";
				// 获取编码,默认为utf-8
				HttpEntity entity = response.getEntity();
				if(entity!=null){
					Header contentType = entity.getContentType();
					if(contentType!=null&&contentType.getValue()!=null){
						String typeValue = contentType.getValue();
						if(typeValue!=null&&typeValue.contains("charset=")){
							charset = typeValue.substring(typeValue.lastIndexOf("=") + 1);
						}
					}

					if (entity.getContentLength() > 0 || entity.isChunked()) {
						in = new BufferedReader(new InputStreamReader(entity.getContent(), charset));
						String line;
						while ((line = in.readLine()) != null) {
							buffer.add(line);
						}
						in.close();
						response.close();
						return buffer;
					}
			}
			}
			response.close();
		} catch (IllegalStateException | IOException e) {
			logger.error("getRemoteWords {} error", e, location);
		}
		return buffer;
	}

......

近义词也是差不多的就不多看了,简单贴一点
git 地址 https://github.com/bells/elasticsearch-analysis-dynamic-synonym 不过 这里需要配一下analysis

"analysis": {
	"analyzer": {
		"my_ik_max_word": {
				"tokenizer": "ik_max_word",
				"filter": ["remote_synonym"]
		}
	},
	"filter": {
			"remote_synonym": {
				"type": "dynamic_synonym",
				"synonyms_path": "http://xxxx/${type}/remote_dic.txt",
				"interval": 30
			}
		},
}
public class Monitor implements Runnable {

        private SynonymFile synonymFile;

        Monitor(SynonymFile synonymFile) {
            this.synonymFile = synonymFile;
        }

        @Override
        public void run() {
            if (synonymFile.isNeedReloadSynonymMap()) {
                synonymMap = synonymFile.reloadSynonymMap();
                for (AbsSynonymFilter dynamicSynonymFilter : dynamicSynonymFilters.keySet()) {
                    dynamicSynonymFilter.update(synonymMap);
                    logger.debug("success reload synonym");
                }
            }
        }
    }


public class RemoteSynonymFile implements SynonymFile {
	...
	...
	@Override
    public boolean isNeedReloadSynonymMap() {
        RequestConfig rc = RequestConfig.custom()
                .setConnectionRequestTimeout(10 * 1000)
                .setConnectTimeout(10 * 1000).setSocketTimeout(15 * 1000)
                .build();
        HttpHead head = AccessController.doPrivileged((PrivilegedAction<HttpHead>) () -> new HttpHead(location));
        head.setConfig(rc);

        // 设置请求头
        if (lastModified != null) {
            head.setHeader("If-Modified-Since", lastModified);
        }
        if (eTags != null) {
            head.setHeader("If-None-Match", eTags);
        }

        CloseableHttpResponse response = null;
        try {
            response = executeHttpRequest(head);
            if (response.getStatusLine().getStatusCode() == 200) { // 返回200 才做操作
                if (!response.getLastHeader(LAST_MODIFIED_HEADER).getValue()
                        .equalsIgnoreCase(lastModified)
                        || !response.getLastHeader(ETAG_HEADER).getValue()
                        .equalsIgnoreCase(eTags)) {

                    lastModified = response.getLastHeader(LAST_MODIFIED_HEADER) == null ? null
                            : response.getLastHeader(LAST_MODIFIED_HEADER)
                            .getValue();
                    eTags = response.getLastHeader(ETAG_HEADER) == null ? null
                            : response.getLastHeader(ETAG_HEADER).getValue();
                    return true;
                }
            } else if (response.getStatusLine().getStatusCode() == 304) {
                return false;
            } else {
                logger.info("remote synonym {} return bad code {}", location,
                        response.getStatusLine().getStatusCode());
            }
        } finally {
            try {
                if (response != null) {
                    response.close();
                }
            } catch (IOException e) {
                logger.error("failed to close http response", e);
            }
        }
        return false;
    }
	...
	...
 	/**
     * Download custom terms from a remote server
     */
    public Reader getReader() {
        Reader reader;
        RequestConfig rc = RequestConfig.custom()
                .setConnectionRequestTimeout(10 * 1000)
                .setConnectTimeout(10 * 1000).setSocketTimeout(60 * 1000)
                .build();
        CloseableHttpResponse response = null;
        BufferedReader br = null;
        HttpGet get = new HttpGet(location);
        get.setConfig(rc);
        try {
            response = executeHttpRequest(get);
            if (response.getStatusLine().getStatusCode() == 200) {
                String charset = "UTF-8"; // 获取编码,默认为utf-8
                if (response.getEntity().getContentType().getValue()
                        .contains("charset=")) {
                    String contentType = response.getEntity().getContentType()
                            .getValue();
                    charset = contentType.substring(contentType
                            .lastIndexOf('=') + 1);
                }

                br = new BufferedReader(new InputStreamReader(response
                        .getEntity().getContent(), charset));
                StringBuilder sb = new StringBuilder();
                String line;
                while ((line = br.readLine()) != null) {
                    logger.debug("reload remote synonym: {}", line);
                    sb.append(line)
                            .append(System.getProperty("line.separator"));
                }
                reader = new StringReader(sb.toString());
            } else reader = new StringReader("");
        } catch (Exception e) {
            logger.error("get remote synonym reader {} error!", location, e);
//            throw new IllegalArgumentException(
//                    "Exception while reading remote synonyms file", e);
            // Fix #54 Returns blank if synonym file has be deleted.
            reader = new StringReader("");
        } finally {
            try {
                if (br != null) {
                    br.close();
                }
            } catch (IOException e) {
                logger.error("failed to close bufferedReader", e);
            }
            try {
                if (response != null) {
                    response.close();
                }
            } catch (IOException e) {
                logger.error("failed to close http response", e);
            }
        }
        return reader;
    }
 }
2.实现

思路挺简单的
就是存一个最后修改时间
最后修改时间变了 证明新增了
存一个重构分词时间,
最后修改时间大于重构分词时间 ,就需要重构下分词

/**词条类**/
@Data
@TableName("ext_dict")
public class ExtDict extends BaseEntity {

    /**
     * id
     */
    @TableId(value = "id", type = IdType.AUTO)
    private Integer id;

    /**
     * 扩展词
     */
    @NotNull(message = "热词不能为空", groups = { AddGroup.class})
    private String word;

    /**
     * 类型 0 热词 1近义词 2.停用词
     */
    private Integer type;

    /**
     * 近义词
     */
    private String synonym;

}

/** ctroller类**/

	 /**
     * 获取远程字典
     * @param type 字典类型, 0 热词 1 近义词 2 禁用词
     * @param request
     * @param response
     */
    @GetMapping("/{type}/remote_dic.txt")
    public void  getRemotDic(@PathVariable("type") int type,HttpServletRequest request,HttpServletResponse response) {
        ArrayList<String> headerNames = Collections.list(request.getHeaderNames());
        response.setContentType("text/plain");
        response.setCharacterEncoding("utf-8");
        String lastModified = RedisUtils.getCacheObject(REMOTE_DIC_LAST_MODIFY+ type +":").toString();
        if(StringUtils.isEmpty(lastModified)){
            lastModified=extDictService.queryLastModified(type)+"";
            RedisUtils.setCacheObject(REMOTE_DIC_LAST_MODIFY+type+":",lastModified);
        }
        response.setHeader("ETag","xxxxxxxxxxxxxxxxdsa");
        response.setDateHeader("Last-Modified",Long.valueOf(lastModified));
        //es轮询校验请求  无需返回字典数据
        if(headerNames.contains("If-None-Match") || headerNames.contains("If-Modified-Since")){
            return;
        }
        //非es校验请求 返回正常内容
        List<ExtDict> list = extDictService.queryListByType(type);
        PrintWriter writer=null;
        try {
            writer = response.getWriter();
            if (type==1){
                list = list.stream().map(exdict -> {
                    exdict.setWord(exdict.getWord() + "=>" + exdict.getSynonym());
                    return exdict;
                }).collect(Collectors.toList());
            }
            for (int i = 0; i < list.size(); i++) {
                writer.write(list.get(i).getWord()+"\n");
            }
            writer.flush();
        } catch (IOException e) {
            e.printStackTrace();
        }finally {
            if (writer!=null){
                writer.close();
            }
        }

        String status = RedisUtils.getCacheObject(REBUILD_ANALYSIS_STATUS);
        String time = RedisUtils.getCacheObject(REBUILD_ANALYSIS_TIME);

        //status 无值 进行初始化操作
        if(status==null){
            RedisUtils.setCacheObject(REBUILD_ANALYSIS_STATUS,ANALYSIS_STATUS_SUCCESS);
            RedisUtils.setCacheObject(REBUILD_ANALYSIS_TIME,System.currentTimeMillis()+"");
            return;
        }
        //进行重建分词
        if(!ANALYSIS_STATUS_UPDATING.equals(status) && StringUtils.compare(lastModified,time)>0){
            elasticSearchService.rebuildAnalysis();
        }
    }



/**Service*/
  /**
     * 重建分词
     * 用于远程扩展词典更新
     */
    @Override
    public void rebuildAnalysis() {
        UpdateByQueryRequest request =new UpdateByQueryRequest(DEFULT_INDEX_NAME);
        request.setConflicts("proceed");
        request.setQuery(QueryBuilders.matchAllQuery());
        request.setRefresh(true);

        restHighLevelClient.updateByQueryAsync(request, RequestOptions.DEFAULT, new 				ActionListener<BulkByScrollResponse>() {
            @Override
            public void onResponse(BulkByScrollResponse bulkByScrollResponse) {
                log.info("------------------重建分词成功");
                RedisUtils.setCacheObject(REBUILD_ANALYSIS_STATUS,ANALYSIS_STATUS_SUCCESS);
                RedisUtils.setCacheObject(REBUILD_ANALYSIS_TIME,System.currentTimeMillis()+"");
            }
            @Override
            public void onFailure(Exception e) {
                log.error("----------- -----重建分词失败",e);
                RedisUtils.setCacheObject(REBUILD_ANALYSIS_STATUS,ANALYSIS_STATUS_FAILED);
            }
        });
        RedisUtils.setCacheObject(REBUILD_ANALYSIS_STATUS,ANALYSIS_STATUS_UPDATING);
    }