package com.iamberry.rst.utils; import org.springframework.core.io.ClassPathResource; import org.springframework.core.io.Resource; import org.springframework.util.Assert; import org.wltea.analyzer.cfg.Configuration; import org.wltea.analyzer.core.IKSegmenter; import org.wltea.analyzer.core.Lexeme; import java.io.*; import java.util.*; /** * @author 献 * @company 深圳爱贝源科技有限公司 * @date 2017/6/19 22:08 * @tel 18271840547 * @website www.iamberry.com */ public class AnalyzerUtil { public static void main(String[] args) throws IOException { init(); dictionary(); splitWord("漏水吗"); } private static Set stopWordSet = new HashSet(); public static void init() throws IOException { String path = (new ClassPathResource("/chinese_stopword.txt").getURL().getPath().substring(1)); //读入停用词文件 BufferedReader StopWordFileBr = new BufferedReader(new InputStreamReader(new FileInputStream(new File(path)),"UTF-8")); //用来存放停用词的集合 //初如化停用词集 String stopWord = null; for(; (stopWord = StopWordFileBr.readLine()) != null;){ stopWordSet.add(stopWord); } } /** * 初始化词典 */ private static Set dictionarySet = new HashSet(); public static void dictionary() throws IOException { String path = (new ClassPathResource("/ext.txt").getURL().getPath().substring(1)); //读入词典 BufferedReader StopWordFileBr = new BufferedReader(new InputStreamReader(new FileInputStream(new File(path)),"UTF-8")); //用来存放词典的集合 //初如化词典 String stopWord = null; for(; (stopWord = StopWordFileBr.readLine()) != null;){ dictionarySet.add(stopWord); } } /** * 分词 * @author 献 * @date 2017/6/26 9:34 * @params 暂无参数 * @return void */ public static Set splitWord(String content) throws IOException { // 配置 Reader reader = new StringReader(content); Configuration config = org.wltea.analyzer.cfg.DefualtConfig.getInstance(); config.setUseSmart(true); // 分词 IKSegmenter ikSegmenter = new IKSegmenter(reader, config); // 分词后的内容 Map words = new HashMap<>(); // 迭代获取分词的内容 Lexeme lexeme = null; while ((lexeme = ikSegmenter.next()) != null) { //去除停用词 if(stopWordSet.contains(lexeme.getLexemeText())) { continue; } if (words.get(lexeme.getLexemeText()) != null) { words.put(lexeme.getLexemeText(), words.get(lexeme.getLexemeText()) + 1); } else { words.put(lexeme.getLexemeText(), 1); } } Set strings = words.keySet(); return strings; } public static Set analyzr(String content)throws IOException { init(); return splitWord(content); } public static Set analyzrEntry(String content)throws IOException { init(); dictionary(); Set sets = splitWord(content); Set setList = sets; /*for(String keyword : sets){ //是否包含在字典里 System.out.println(keyword); if(!dictionarySet.contains(keyword)) { setList.remove(keyword); }` }*/ Iterator iterator = sets.iterator(); while (iterator.hasNext()) { //是否包含在字典里 String stringNext = (String)iterator.next(); if(!dictionarySet.contains(stringNext)) { iterator.remove(); sets.remove(stringNext); } } return setList; } }