// AnalyzerUtil.java
  1. package com.iamberry.rst.utils;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;

import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;
import org.springframework.util.Assert;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
/**
 * @author 献
 * @company 深圳爱贝源科技有限公司 (Shenzhen iamberry Technology Co., Ltd.)
 * @date 2017/6/19 22:08
 * @tel 18271840547
 * @website www.iamberry.com
 */
  17. public class AnalyzerUtil {
  18. public static void main(String[] args) throws IOException {
  19. init();
  20. dictionary();
  21. splitWord("漏水吗");
  22. }
  23. private static Set<String> stopWordSet = new HashSet<String>();
  24. public static void init() throws IOException {
  25. String path = (new ClassPathResource("/chinese_stopword.txt").getURL().getPath().substring(1));
  26. //读入停用词文件
  27. BufferedReader StopWordFileBr = new BufferedReader(new InputStreamReader(new FileInputStream(new File(path)),"UTF-8"));
  28. //用来存放停用词的集合
  29. //初如化停用词集
  30. String stopWord = null;
  31. for(; (stopWord = StopWordFileBr.readLine()) != null;){
  32. stopWordSet.add(stopWord);
  33. }
  34. }
  35. /**
  36. * 初始化词典
  37. */
  38. private static Set<String> dictionarySet = new HashSet<String>();
  39. public static void dictionary() throws IOException {
  40. String path = (new ClassPathResource("/ext.txt").getURL().getPath().substring(1));
  41. //读入词典
  42. BufferedReader StopWordFileBr = new BufferedReader(new InputStreamReader(new FileInputStream(new File(path)),"UTF-8"));
  43. //用来存放词典的集合
  44. //初如化词典
  45. String stopWord = null;
  46. for(; (stopWord = StopWordFileBr.readLine()) != null;){
  47. dictionarySet.add(stopWord);
  48. }
  49. }
  50. /**
  51. * 分词
  52. * @author 献
  53. * @date 2017/6/26 9:34
  54. * @params 暂无参数
  55. * @return void
  56. */
  57. public static Set<String> splitWord(String content) throws IOException {
  58. // 配置
  59. Reader reader = new StringReader(content);
  60. Configuration config = org.wltea.analyzer.cfg.DefualtConfig.getInstance();
  61. config.setUseSmart(true);
  62. // 分词
  63. IKSegmenter ikSegmenter = new IKSegmenter(reader, config);
  64. // 分词后的内容
  65. Map<String, Integer> words = new HashMap<>();
  66. // 迭代获取分词的内容
  67. Lexeme lexeme = null;
  68. while ((lexeme = ikSegmenter.next()) != null) {
  69. //去除停用词
  70. if(stopWordSet.contains(lexeme.getLexemeText())) {
  71. continue;
  72. }
  73. if (words.get(lexeme.getLexemeText()) != null) {
  74. words.put(lexeme.getLexemeText(), words.get(lexeme.getLexemeText()) + 1);
  75. } else {
  76. words.put(lexeme.getLexemeText(), 1);
  77. }
  78. }
  79. Set<String> strings = words.keySet();
  80. return strings;
  81. }
  82. public static Set<String> analyzr(String content)throws IOException {
  83. init();
  84. return splitWord(content);
  85. }
  86. public static Set<String> analyzrEntry(String content)throws IOException {
  87. init();
  88. dictionary();
  89. Set<String> sets = splitWord(content);
  90. Set<String> setList = sets;
  91. /*for(String keyword : sets){
  92. //是否包含在字典里
  93. System.out.println(keyword);
  94. if(!dictionarySet.contains(keyword)) {
  95. setList.remove(keyword);
  96. }`
  97. }*/
  98. Iterator<String> iterator = sets.iterator();
  99. while (iterator.hasNext()) {
  100. //是否包含在字典里
  101. String stringNext = (String)iterator.next();
  102. if(!dictionarySet.contains(stringNext)) {
  103. iterator.remove();
  104. sets.remove(stringNext);
  105. }
  106. }
  107. return setList;
  108. }
  109. }