package com.iamberry.rst.utils;

import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.cfg.DefaultConfig;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.*;
import java.util.*;
/**
 * @author 献
 * @company 深圳爱贝源科技有限公司
 * @date 2017/6/19 22:08
 * @tel 18271840547
 * @website www.iamberry.com
 */
public class AnalyzerUtil {

    public static void main(String[] args) throws IOException {
        init();
        dictionary();
        splitWord("漏水吗"); // sample input: "does it leak?"
    }
    /** Stop words filtered out of the segmentation result. */
    private static Set<String> stopWordSet = new HashSet<String>();

    /**
     * Initializes the stop-word set from the classpath.
     */
    public static void init() throws IOException {
        // Read via getInputStream() rather than getURL().getPath().substring(1):
        // the path trick only works for exploded files on Windows and breaks inside a jar.
        Resource resource = new ClassPathResource("/chinese_stopword.txt");
        // Read the stop-word file; each line holds one stop word.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(resource.getInputStream(), "UTF-8"))) {
            String stopWord;
            while ((stopWord = reader.readLine()) != null) {
                stopWordSet.add(stopWord);
            }
        }
    }
    /** Extension dictionary used to whitelist tokens in analyzrEntry. */
    private static Set<String> dictionarySet = new HashSet<String>();

    /**
     * Initializes the extension dictionary from the classpath.
     */
    public static void dictionary() throws IOException {
        Resource resource = new ClassPathResource("/ext.txt");
        // Read the dictionary file; each line holds one entry.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(resource.getInputStream(), "UTF-8"))) {
            String word;
            while ((word = reader.readLine()) != null) {
                dictionarySet.add(word);
            }
        }
    }
    /**
     * Segments the given text with IK Analyzer, dropping stop words.
     *
     * @author 献
     * @date 2017/6/26 9:34
     * @param content the text to segment
     * @return the set of distinct tokens
     */
    public static Set<String> splitWord(String content) throws IOException {
        // Configuration: smart mode merges the finest-grained splits into longer words.
        Reader reader = new StringReader(content);
        Configuration config = DefaultConfig.getInstance();
        config.setUseSmart(true);
        // Segmenter
        IKSegmenter ikSegmenter = new IKSegmenter(reader, config);
        // Token -> occurrence count
        Map<String, Integer> words = new HashMap<>();
        // Iterate over the segmented tokens
        Lexeme lexeme;
        while ((lexeme = ikSegmenter.next()) != null) {
            String text = lexeme.getLexemeText();
            // Skip stop words
            if (stopWordSet.contains(text)) {
                continue;
            }
            Integer count = words.get(text);
            words.put(text, count == null ? 1 : count + 1);
        }
        return words.keySet();
    }
    public static Set<String> analyzr(String content) throws IOException {
        init();
        return splitWord(content);
    }
    public static Set<String> analyzrEntry(String content) throws IOException {
        init();
        dictionary();
        Set<String> sets = splitWord(content);
        Iterator<String> iterator = sets.iterator();
        while (iterator.hasNext()) {
            // Keep only tokens that appear in the extension dictionary.
            String next = iterator.next();
            if (!dictionarySet.contains(next)) {
                // Remove through the iterator only; also calling sets.remove() here,
                // as the original did, is redundant and risks breaking the iteration.
                iterator.remove();
            }
        }
        return sets;
    }
}
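For reference, a minimal usage sketch. It assumes `chinese_stopword.txt` and `ext.txt` are on the classpath and the IK Analyzer and Spring jars are on the build path; the demo class name and the println output are illustrative, only the method names and the sample string come from the code above.

```java
import java.io.IOException;
import java.util.Set;

// Hypothetical demo class, not part of the utility above.
public class AnalyzerUtilDemo {
    public static void main(String[] args) throws IOException {
        // Distinct tokens after stop-word filtering.
        Set<String> tokens = AnalyzerUtil.analyzr("漏水吗");
        System.out.println(tokens);

        // Tokens additionally restricted to the extension dictionary in ext.txt.
        Set<String> entries = AnalyzerUtil.analyzrEntry("漏水吗");
        System.out.println(entries);
    }
}
```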