Chinese Word Segmentation under Lucene in C#


Building on the basic Chinese word-segmentation method implemented earlier, I wrapped it in a ChineseAnalyzer class that inherits from Lucene's Analyzer; the Analyzer itself needs little explanation.

using System;
using System.Collections.Generic;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;

namespace Lucene.Fanswo
{
    /// <summary>
    /// Analyzer that tokenizes Chinese text with ChineseTokenizer.
    /// </summary>
    public class ChineseAnalyzer : Analyzer
    {
        //private System.Collections.Hashtable stopSet;

        public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new System.String[]
        {
            "a", "an", "and", "are", "as", "at", "be", "but", "by",
            "for", "if", "in", "into", "is", "it",
            "no", "not", "of", "on", "or", "s", "such",
            "t", "that", "the", "their", "then", "there", "these",
            "they", "this", "to", "was", "will", "with", "我们"
        };

        /// <summary>Constructs a ChineseTokenizer filtered by a StandardFilter,
        /// a LowerCaseFilter and a StopFilter.
        /// </summary>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new ChineseTokenizer(reader);
            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new StopFilter(result, CHINESE_ENGLISH_STOP_WORDS);
            return result;
        }
    }
}
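To show where the analyzer plugs in, here is a minimal indexing sketch. It assumes the Lucene.Net 1.9/2.0-era API that the rest of this post uses; the index directory "chinese_index", the field name "content", and the sample text are arbitrary choices and not part of the original code.

using System;
using Lucene.Net.Documents;
using Lucene.Net.Index;

namespace MyLuceneTest
{
    class IndexDemo
    {
        static void BuildIndex()
        {
            // Create (or overwrite) an index on disk, analyzing tokenized fields
            // with the ChineseAnalyzer defined above.
            IndexWriter writer = new IndexWriter("chinese_index", new Lucene.Fanswo.ChineseAnalyzer(), true);

            Document doc = new Document();
            // "content" is a hypothetical field name; TOKENIZED runs the text through the analyzer.
            doc.Add(new Field("content", "中华人民共和国在1949年建立", Field.Store.YES, Field.Index.TOKENIZED));
            writer.AddDocument(doc);

            writer.Optimize();
            writer.Close();
        }
    }
}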

The implementation of the ChineseTokenizer class: it matches characters forward against a dictionary tree and returns the Token stream that Lucene defines.

using System;
using System.Collections.Generic;
using System.Text;
using Lucene.Net.Analysis;
using System.Collections;
using System.Text.RegularExpressions;
using System.IO;

namespace Lucene.Fanswo
{
    class ChineseTokenizer : Tokenizer
    {
        private int offset = 0, bufferIndex = 0, dataLen = 0; // offset, current character position, text length
        private int start; // start position

        /// <summary>
        /// The text being tokenized
        /// </summary>
        private string text;

        /// <summary>
        /// Time spent on segmentation
        /// </summary>
        public double TextSeg_Span = 0;

        /// <summary>Constructs a tokenizer for this Reader.</summary>
        public ChineseTokenizer(System.IO.TextReader reader)
        {
            this.input = reader;
            text = input.ReadToEnd();
            dataLen = text.Length;
        }

        /// <summary>Performs segmentation; returns the next token in the stream,
        /// or null when the stream is exhausted.
        /// </summary>
        public override Token Next()
        {
            Token token = null;
            WordTree tree = new WordTree();
            // Load the dictionary
            tree.LoadDict();
            // The dictionary, organized as a tree of nested hashtables
            Hashtable t_chartable = WordTree.chartable;
            string ReWord = "";
            string char_s;
            start = offset;
            bufferIndex = start;

            while (true)
            {
                // Stop once the start position passes the end of the text
                if (start >= dataLen)
                {
                    break;
                }

                // Read one character
                char_s = text.Substring(start, 1);
                if (string.IsNullOrEmpty(char_s.Trim()))
                {
                    start++;
                    continue;
                }

                // The character is not in the current dictionary subtree
                if (!t_chartable.Contains(char_s))
                {
                    if (ReWord == "")
                    {
                        int j = start + 1;
                        switch (tree.GetCharType(char_s))
                        {
                            case 0: // single Chinese character
                                ReWord += char_s;
                                break;
                            case 1: // English word: consume consecutive letters
                                j = start + 1;
                                while (j < dataLen)
                                {
                                    if (tree.GetCharType(text.Substring(j, 1)) != 1)
                                        break;
                                    j++;
                                }
                                ReWord += text.Substring(start, j - start);
                                break;
                            case 2: // number: consume consecutive digits
                                j = start + 1;
                                while (j < dataLen)
                                {
                                    if (tree.GetCharType(text.Substring(j, 1)) != 2)
                                        break;
                                    j++;
                                }
                                ReWord += text.Substring(start, j - start);
                                break;
                            default: // any other character
                                ReWord += char_s;
                                break;
                        }
                        offset = j; // start position for the next token
                    }
                    else
                    {
                        offset = start; // start position for the next token
                    }

                    // Return the token
                    return new Token(ReWord, bufferIndex, bufferIndex + ReWord.Length - 1);
                }

                // The character is in the dictionary: append it and descend into its subtree
                ReWord += char_s;
                t_chartable = (Hashtable)t_chartable[char_s];
                // Move on to the next character
                start++;
                if (start == dataLen)
                {
                    offset = dataLen;
                    return new Token(ReWord, bufferIndex, bufferIndex + ReWord.Length - 1);
                }
            }
            return token;
        }
    }
}
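The WordTree class used above is not included in the post. The sketch below shows one way it could look, assuming a plain-text dictionary file (here called dict.txt, one word per line, a hypothetical name) and the nested-Hashtable layout the tokenizer expects, where chartable["中"]["国"] is the subtree reached after matching "中国".

using System;
using System.Collections;
using System.IO;

namespace Lucene.Fanswo
{
    // Hypothetical sketch of the WordTree class referenced by ChineseTokenizer;
    // the original post does not show it.
    class WordTree
    {
        public static Hashtable chartable = new Hashtable();

        // Read the dictionary file and build the character tree of nested hashtables
        public void LoadDict()
        {
            if (chartable.Count > 0) return; // already loaded

            foreach (string word in File.ReadAllLines("dict.txt"))
            {
                Hashtable node = chartable;
                foreach (char c in word.Trim())
                {
                    string key = c.ToString();
                    if (!node.Contains(key))
                        node[key] = new Hashtable();
                    node = (Hashtable)node[key];
                }
            }
        }

        // 0 = Chinese character, 1 = English letter, 2 = digit, 3 = anything else
        public int GetCharType(string s)
        {
            char c = s[0];
            if (c >= 0x4E00 && c <= 0x9FA5) return 0;
            if (char.IsLetter(c)) return 1;
            if (char.IsDigit(c)) return 2;
            return 3;
        }
    }
}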

The test code:

using System;
using System.Collections.Generic;
using System.Text;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using SimpleAnalyzer = Lucene.Net.Analysis.SimpleAnalyzer;
using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer;
using Token = Lucene.Net.Analysis.Token;
using TokenStream = Lucene.Net.Analysis.TokenStream;

namespace MyLuceneTest
{
    class Program
    {
        [STAThread]
        public static void Main(System.String[] args)
        {
            try
            {
                Test("中华人民共和国在1949年建立,从此开始了新中国的伟大篇章。长春市长春节致词", true);
            }
            catch (System.Exception e)
            {
                System.Console.Out.WriteLine(" caught a " + e.GetType() + " with message: " + e.Message + e.ToString());
            }
        }

        internal static void Test(System.String text, bool verbose)
        {
            System.Console.Out.WriteLine(" Tokenizing string: " + text);
            Test(new System.IO.StringReader(text), verbose, text.Length);
        }

        internal static void Test(System.IO.TextReader reader, bool verbose, long bytes)
        {
            //Analyzer analyzer = new StandardAnalyzer();
            Analyzer analyzer = new Lucene.Fanswo.ChineseAnalyzer();
            TokenStream stream = analyzer.TokenStream(null, reader);

            System.DateTime start = System.DateTime.Now;
            int count = 0;
            for (Token t = stream.Next(); t != null; t = stream.Next())
            {
                if (verbose)
                {
                    System.Console.Out.WriteLine("Token=" + t.ToString());
                }
                count++;
            }
            System.DateTime end = System.DateTime.Now;

            // Convert the elapsed time to milliseconds before reporting
            double time = (end - start).TotalMilliseconds;
            System.Console.Out.WriteLine(time + " milliseconds to extract " + count + " tokens");
            System.Console.Out.WriteLine((time * 1000.0) / count + " microseconds/token");
            System.Console.Out.WriteLine((bytes * 1000.0 * 60.0 * 60.0) / (time * 1000000.0) + " megabytes/hour");
        }
    }
}

Test result: complete!

