package com.iamberry.rst.utils;

import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.cfg.DefaultConfig;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.*;
import java.util.*;
/**
 * @author 献
 * @company 深圳爱贝源科技有限公司
 * @date 2017/6/19 22:08
 * @tel 18271840547
 * @website www.iamberry.com
 */
public class AnalyzerUtil {

    public static void main(String[] args) throws IOException {
        init();
        dictionary();
        splitWord("漏水吗"); // sample input: "does it leak?"
    }
    /** Stop words filtered out of the segmentation result. */
    private static Set<String> stopWordSet = new HashSet<String>();

    /**
     * Initializes the stop-word set from the classpath.
     */
    public static void init() throws IOException {
        // Read via getInputStream() rather than getURL().getPath().substring(1):
        // the path trick only works for exploded files on Windows and breaks inside a jar.
        Resource resource = new ClassPathResource("/chinese_stopword.txt");
        // Read the stop-word file; each line holds one stop word.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(resource.getInputStream(), "UTF-8"))) {
            String stopWord;
            while ((stopWord = reader.readLine()) != null) {
                stopWordSet.add(stopWord);
            }
        }
    }
    /** Extension dictionary used to whitelist tokens in analyzrEntry. */
    private static Set<String> dictionarySet = new HashSet<String>();

    /**
     * Initializes the extension dictionary from the classpath.
     */
    public static void dictionary() throws IOException {
        Resource resource = new ClassPathResource("/ext.txt");
        // Read the dictionary file; each line holds one entry.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(resource.getInputStream(), "UTF-8"))) {
            String word;
            while ((word = reader.readLine()) != null) {
                dictionarySet.add(word);
            }
        }
    }
    /**
     * Segments the given text with IK Analyzer, dropping stop words.
     *
     * @author 献
     * @date 2017/6/26 9:34
     * @param content the text to segment
     * @return the set of distinct tokens
     */
    public static Set<String> splitWord(String content) throws IOException {
        // Configuration: smart mode merges the finest-grained splits into longer words.
        Reader reader = new StringReader(content);
        Configuration config = DefaultConfig.getInstance();
        config.setUseSmart(true);
        // Segmenter
        IKSegmenter ikSegmenter = new IKSegmenter(reader, config);
        // Token -> occurrence count
        Map<String, Integer> words = new HashMap<>();
        // Iterate over the segmented tokens
        Lexeme lexeme;
        while ((lexeme = ikSegmenter.next()) != null) {
            String text = lexeme.getLexemeText();
            // Skip stop words
            if (stopWordSet.contains(text)) {
                continue;
            }
            Integer count = words.get(text);
            words.put(text, count == null ? 1 : count + 1);
        }
        return words.keySet();
    }
    public static Set<String> analyzr(String content) throws IOException {
        init();
        return splitWord(content);
    }
    public static Set<String> analyzrEntry(String content) throws IOException {
        init();
        dictionary();
        Set<String> sets = splitWord(content);
        Iterator<String> iterator = sets.iterator();
        while (iterator.hasNext()) {
            // Keep only tokens that appear in the extension dictionary.
            String next = iterator.next();
            if (!dictionarySet.contains(next)) {
                // Remove through the iterator only; also calling sets.remove() here,
                // as the original did, is redundant and risks breaking the iteration.
                iterator.remove();
            }
        }
        return sets;
    }
}
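For reference, a minimal usage sketch. It assumes `chinese_stopword.txt` and `ext.txt` are on the classpath and the IK Analyzer and Spring jars are on the build path; the demo class name and the println output are illustrative, only the method names and the sample string come from the code above.

```java
import java.io.IOException;
import java.util.Set;

// Hypothetical demo class, not part of the utility above.
public class AnalyzerUtilDemo {
    public static void main(String[] args) throws IOException {
        // Distinct tokens after stop-word filtering.
        Set<String> tokens = AnalyzerUtil.analyzr("漏水吗");
        System.out.println(tokens);

        // Tokens additionally restricted to the extension dictionary in ext.txt.
        Set<String> entries = AnalyzerUtil.analyzrEntry("漏水吗");
        System.out.println(entries);
    }
}
```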