Looly 6 年 前
コミット
bea37293ad

+ 1 - 0
CHANGELOG.md

@@ -11,6 +11,7 @@
 * 【extra】        Sftp得put方法增加进度支持(issue#518@Github)
 * 【core】        ArrayUtil增加distinct方法
 * 【http】         去除log模块依赖,Cookie中去除日志提示,body方法传入JSON对象废弃,未来移除json模块依赖
+* 【extra】        添加MyNLP支持(issue#519@Github)
 
 ### Bug修复
 

+ 6 - 0
hutool-extra/pom.xml

@@ -200,5 +200,11 @@
 			<version>1.2</version>
 			<optional>true</optional>
 		</dependency>
+		<dependency>
+			<groupId>com.mayabot.mynlp</groupId>
+			<artifactId>mynlp-segment</artifactId>
+			<version>3.0.0</version>
+			<optional>true</optional>
+		</dependency>
 	</dependencies>
 </project>

+ 6 - 0
hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/TokenizerFactory.java

@@ -10,6 +10,7 @@ import cn.hutool.extra.tokenizer.engine.ikanalyzer.IKAnalyzerEngine;
 import cn.hutool.extra.tokenizer.engine.jcseg.JcsegEngine;
 import cn.hutool.extra.tokenizer.engine.jieba.JiebaEngine;
 import cn.hutool.extra.tokenizer.engine.mmseg.MmsegEngine;
+import cn.hutool.extra.tokenizer.engine.mynlp.MynlpEngine;
 import cn.hutool.extra.tokenizer.engine.word.WordEngine;
 import cn.hutool.log.StaticLog;
 
@@ -77,6 +78,11 @@ public class TokenizerFactory {
 		} catch (NoClassDefFoundError e) {
 			// ignore
 		}
+		try {
+			return new MynlpEngine();
+		} catch (NoClassDefFoundError e) {
+			// ignore
+		}
 		throw new TokenizerException("No tokenizer found ! Please add some tokenizer jar to your project !");
 	}
 }

+ 44 - 0
hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpEngine.java

@@ -0,0 +1,44 @@
+package cn.hutool.extra.tokenizer.engine.mynlp;
+
+import com.mayabot.nlp.segment.Lexer;
+import com.mayabot.nlp.segment.Lexers;
+import com.mayabot.nlp.segment.Sentence;
+
+import cn.hutool.core.util.StrUtil;
+import cn.hutool.extra.tokenizer.Result;
+import cn.hutool.extra.tokenizer.TokenizerEngine;
+
+/**
+ * Tokenizer engine implementation backed by MYNLP, a Chinese NLP toolkit<br>
+ * Project home: https://github.com/mayabot/mynlp/
+ * 
+ * @author looly
+ *
+ */
+public class MynlpEngine implements TokenizerEngine {
+
+	private Lexer lexer; // lexer that every parse call delegates to
+	
+	/**
+	 * Constructs an engine that uses the lexer returned by {@code Lexers.core()}
+	 */
+	public MynlpEngine() {
+		this.lexer = Lexers.core();
+	}
+	
+	/**
+	 * Constructs an engine that uses the supplied lexer
+	 * 
+	 * @param lexer the {@link Lexer} to scan text with; stored as given, without a null check
+	 */
+	public MynlpEngine(Lexer lexer) {
+		this.lexer = lexer;
+	}
+
+	@Override
+	public Result parse(CharSequence text) {
+		final Sentence sentence = this.lexer.scan(StrUtil.str(text)); // scan the text after passing it through StrUtil.str
+		return new MynlpResult(sentence); // wrap the scanned sentence in this package's Result implementation
+	}
+
+}

+ 50 - 0
hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpResult.java

@@ -0,0 +1,50 @@
+package cn.hutool.extra.tokenizer.engine.mynlp;
+
+import java.util.Iterator;
+
+import com.mayabot.nlp.segment.Sentence;
+import com.mayabot.nlp.segment.WordTerm;
+
+import cn.hutool.extra.tokenizer.Result;
+import cn.hutool.extra.tokenizer.Word;
+
+/**
+ * Tokenization result implementation backed by MYNLP, a Chinese NLP toolkit<br>
+ * Project home: https://github.com/mayabot/mynlp/
+ * 
+ * @author looly
+ *
+ */
+public class MynlpResult implements Result {
+	
+	private Iterator<WordTerm> result; // iterator obtained from the sentence passed to the constructor
+
+	/**
+	 * Constructs a result that iterates over the terms of the given sentence
+	 * 
+	 * @param sentence the segmented sentence; its {@code iterator()} is taken once, here
+	 */
+	public MynlpResult(Sentence sentence) {
+		this.result = sentence.iterator();
+	}
+
+	@Override
+	public boolean hasNext() {
+		return result.hasNext();
+	}
+
+	@Override
+	public Word next() {
+		return new MynlpWord(result.next()); // wrap each WordTerm in a new MynlpWord
+	}
+
+	@Override
+	public void remove() {
+		result.remove(); // delegated to the underlying iterator
+	}
+
+	@Override
+	public Iterator<Word> iterator() {
+		return this; // returns this same instance, so iteration state is shared with the caller
+	}
+}

+ 45 - 0
hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpWord.java

@@ -0,0 +1,45 @@
+package cn.hutool.extra.tokenizer.engine.mynlp;
+
+import com.mayabot.nlp.segment.WordTerm;
+
+import cn.hutool.extra.tokenizer.Word;
+
+/**
+ * Wrapper around a single word ({@link WordTerm}) produced by MYNLP segmentation
+ * 
+ * @author looly
+ *
+ */
+public class MynlpWord implements Word {
+	
+	private WordTerm word; // the wrapped MYNLP term
+
+	/**
+	 * Constructs a wrapper around the given term
+	 * 
+	 * @param word the {@link WordTerm} to wrap; stored as given, without a null check
+	 */
+	public MynlpWord(WordTerm word) {
+		this.word = word;
+	}
+
+	@Override
+	public String getText() {
+		return word.getWord();
+	}
+	
+	@Override
+	public int getStartOffset() {
+		return this.word.offset;
+	}
+	
+	@Override
+	public int getEndOffset() {
+		return getStartOffset() + word.word.length(); // start offset plus the length of the term's text
+	}
+
+	@Override
+	public String toString() {
+		return getText();
+	}
+}

+ 8 - 0
hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/package-info.java

@@ -0,0 +1,8 @@
+/**
+ * MYNLP 中文NLP工具包分词实现<br>
+ * 项目地址:https://github.com/mayabot/mynlp/
+ * 
+ * @author Looly
+ * @since 4.6.5
+ */
+package cn.hutool.extra.tokenizer.engine.mynlp;

+ 12 - 0
hutool-extra/src/test/java/cn/hutool/extra/tokenizer/TokenizerUtilTest.java

@@ -3,6 +3,7 @@ package cn.hutool.extra.tokenizer;
 import java.util.Iterator;
 
 import org.junit.Assert;
+import org.junit.Ignore;
 import org.junit.Test;
 
 import cn.hutool.core.collection.CollUtil;
@@ -12,6 +13,7 @@ import cn.hutool.extra.tokenizer.engine.ikanalyzer.IKAnalyzerEngine;
 import cn.hutool.extra.tokenizer.engine.jcseg.JcsegEngine;
 import cn.hutool.extra.tokenizer.engine.jieba.JiebaEngine;
 import cn.hutool.extra.tokenizer.engine.mmseg.MmsegEngine;
+import cn.hutool.extra.tokenizer.engine.mynlp.MynlpEngine;
 import cn.hutool.extra.tokenizer.engine.word.WordEngine;
 
 /**
@@ -86,6 +88,16 @@ public class TokenizerUtilTest {
 		Assert.assertEquals("这两个 方法 的 区别 在于 返回值", resultStr);
 	}
 	
+	@Test
+	@Ignore
+	public void mynlpTest() {
+		// This unit test requires JDK 8, so it is ignored by default
+		TokenizerEngine engine = new MynlpEngine();
+		Result result = engine.parse(text);
+		String resultStr = CollUtil.join((Iterator<Word>)result, " ");
+		Assert.assertEquals("这 两个 方法 的 区别 在于 返回 值", resultStr);
+	}
+	
 	private void checkResult(Result result) {
 		String resultStr = CollUtil.join((Iterator<Word>)result, " ");
 		Assert.assertEquals("这 两个 方法 的 区别 在于 返回 值", resultStr);

+ 1 - 0
pom.xml

@@ -89,6 +89,7 @@
 				<configuration>
 					<source>${compile.version}</source>
 					<target>${compile.version}</target>
+					<verbose>true</verbose>
 				</configuration>
 			</plugin>
 			<!-- Javadoc -->