Chinese Word Segmentation under Lucene in C#


Building on the basic Chinese word-segmentation method implemented earlier, I wrapped it in a ChineseAnalyzer class that inherits from Lucene's Analyzer; the Analyzer itself needs little explanation.

using System;
using System.Collections.Generic;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;

namespace Lucene.Fanswo
{
    /// <summary>
    /// Analyzer that tokenizes Chinese text with ChineseTokenizer.
    /// </summary>
    public class ChineseAnalyzer : Analyzer
    {
        //private System.Collections.Hashtable stopSet;

        public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new System.String[]
        {
            "a", "an", "and", "are", "as", "at", "be", "but", "by",
            "for", "if", "in", "into", "is", "it",
            "no", "not", "of", "on", "or", "s", "such",
            "t", "that", "the", "their", "then", "there", "these",
            "they", "this", "to", "was", "will", "with", "我们"
        };

        /// <summary>Constructs a ChineseTokenizer filtered by a StandardFilter,
        /// a LowerCaseFilter and a StopFilter.
        /// </summary>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new ChineseTokenizer(reader);
            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new StopFilter(result, CHINESE_ENGLISH_STOP_WORDS);
            return result;
        }
    }
}
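To show where the analyzer plugs in, here is a minimal indexing sketch. It assumes the Lucene.Net 1.9/2.0-era API that the rest of this post uses; the index directory "chinese_index", the field name "content", and the sample text are arbitrary choices and not part of the original code.

using System;
using Lucene.Net.Documents;
using Lucene.Net.Index;

namespace MyLuceneTest
{
    class IndexDemo
    {
        static void BuildIndex()
        {
            // Create (or overwrite) an index on disk, analyzing tokenized fields
            // with the ChineseAnalyzer defined above.
            IndexWriter writer = new IndexWriter("chinese_index", new Lucene.Fanswo.ChineseAnalyzer(), true);

            Document doc = new Document();
            // "content" is a hypothetical field name; TOKENIZED runs the text through the analyzer.
            doc.Add(new Field("content", "中华人民共和国在1949年建立", Field.Store.YES, Field.Index.TOKENIZED));
            writer.AddDocument(doc);

            writer.Optimize();
            writer.Close();
        }
    }
}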

The implementation of the ChineseTokenizer class: it matches characters forward against a dictionary tree and returns the Token stream that Lucene defines.

using System;
using System.Collections.Generic;
using System.Text;
using Lucene.Net.Analysis;
using System.Collections;
using System.Text.RegularExpressions;
using System.IO;

namespace Lucene.Fanswo
{
    class ChineseTokenizer : Tokenizer
    {
        private int offset = 0, bufferIndex = 0, dataLen = 0; // offset, current character position, text length
        private int start; // start position

        /// <summary>
        /// The text being tokenized
        /// </summary>
        private string text;

        /// <summary>
        /// Time spent on segmentation
        /// </summary>
        public double TextSeg_Span = 0;

        /// <summary>Constructs a tokenizer for this Reader.</summary>
        public ChineseTokenizer(System.IO.TextReader reader)
        {
            this.input = reader;
            text = input.ReadToEnd();
            dataLen = text.Length;
        }

        /// <summary>Performs segmentation; returns the next token in the stream,
        /// or null when the stream is exhausted.
        /// </summary>
        public override Token Next()
        {
            Token token = null;
            WordTree tree = new WordTree();
            // Load the dictionary
            tree.LoadDict();
            // The dictionary, organized as a tree of nested hashtables
            Hashtable t_chartable = WordTree.chartable;
            string ReWord = "";
            string char_s;
            start = offset;
            bufferIndex = start;

            while (true)
            {
                // Stop once the start position passes the end of the text
                if (start >= dataLen)
                {
                    break;
                }

                // Read one character
                char_s = text.Substring(start, 1);
                if (string.IsNullOrEmpty(char_s.Trim()))
                {
                    start++;
                    continue;
                }

                // The character is not in the current dictionary subtree
                if (!t_chartable.Contains(char_s))
                {
                    if (ReWord == "")
                    {
                        int j = start + 1;
                        switch (tree.GetCharType(char_s))
                        {
                            case 0: // single Chinese character
                                ReWord += char_s;
                                break;
                            case 1: // English word: consume consecutive letters
                                j = start + 1;
                                while (j < dataLen)
                                {
                                    if (tree.GetCharType(text.Substring(j, 1)) != 1)
                                        break;
                                    j++;
                                }
                                ReWord += text.Substring(start, j - start);
                                break;
                            case 2: // number: consume consecutive digits
                                j = start + 1;
                                while (j < dataLen)
                                {
                                    if (tree.GetCharType(text.Substring(j, 1)) != 2)
                                        break;
                                    j++;
                                }
                                ReWord += text.Substring(start, j - start);
                                break;
                            default: // any other character
                                ReWord += char_s;
                                break;
                        }
                        offset = j; // start position for the next token
                    }
                    else
                    {
                        offset = start; // start position for the next token
                    }

                    // Return the token
                    return new Token(ReWord, bufferIndex, bufferIndex + ReWord.Length - 1);
                }

                // The character is in the dictionary: append it and descend into its subtree
                ReWord += char_s;
                t_chartable = (Hashtable)t_chartable[char_s];
                // Move on to the next character
                start++;
                if (start == dataLen)
                {
                    offset = dataLen;
                    return new Token(ReWord, bufferIndex, bufferIndex + ReWord.Length - 1);
                }
            }
            return token;
        }
    }
}
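The WordTree class used above is not included in the post. The sketch below shows one way it could look, assuming a plain-text dictionary file (here called dict.txt, one word per line, a hypothetical name) and the nested-Hashtable layout the tokenizer expects, where chartable["中"]["国"] is the subtree reached after matching "中国".

using System;
using System.Collections;
using System.IO;

namespace Lucene.Fanswo
{
    // Hypothetical sketch of the WordTree class referenced by ChineseTokenizer;
    // the original post does not show it.
    class WordTree
    {
        public static Hashtable chartable = new Hashtable();

        // Read the dictionary file and build the character tree of nested hashtables
        public void LoadDict()
        {
            if (chartable.Count > 0) return; // already loaded

            foreach (string word in File.ReadAllLines("dict.txt"))
            {
                Hashtable node = chartable;
                foreach (char c in word.Trim())
                {
                    string key = c.ToString();
                    if (!node.Contains(key))
                        node[key] = new Hashtable();
                    node = (Hashtable)node[key];
                }
            }
        }

        // 0 = Chinese character, 1 = English letter, 2 = digit, 3 = anything else
        public int GetCharType(string s)
        {
            char c = s[0];
            if (c >= 0x4E00 && c <= 0x9FA5) return 0;
            if (char.IsLetter(c)) return 1;
            if (char.IsDigit(c)) return 2;
            return 3;
        }
    }
}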

The test code:

using System;
using System.Collections.Generic;
using System.Text;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using SimpleAnalyzer = Lucene.Net.Analysis.SimpleAnalyzer;
using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer;
using Token = Lucene.Net.Analysis.Token;
using TokenStream = Lucene.Net.Analysis.TokenStream;

namespace MyLuceneTest
{
    class Program
    {
        [STAThread]
        public static void Main(System.String[] args)
        {
            try
            {
                Test("中华人民共和国在1949年建立,从此开始了新中国的伟大篇章。长春市长春节致词", true);
            }
            catch (System.Exception e)
            {
                System.Console.Out.WriteLine(" caught a " + e.GetType() + " with message: " + e.Message + e.ToString());
            }
        }

        internal static void Test(System.String text, bool verbose)
        {
            System.Console.Out.WriteLine(" Tokenizing string: " + text);
            Test(new System.IO.StringReader(text), verbose, text.Length);
        }

        internal static void Test(System.IO.TextReader reader, bool verbose, long bytes)
        {
            //Analyzer analyzer = new StandardAnalyzer();
            Analyzer analyzer = new Lucene.Fanswo.ChineseAnalyzer();
            TokenStream stream = analyzer.TokenStream(null, reader);

            System.DateTime start = System.DateTime.Now;
            int count = 0;
            for (Token t = stream.Next(); t != null; t = stream.Next())
            {
                if (verbose)
                {
                    System.Console.Out.WriteLine("Token=" + t.ToString());
                }
                count++;
            }
            System.DateTime end = System.DateTime.Now;

            // Convert the elapsed time to milliseconds before reporting
            double time = (end - start).TotalMilliseconds;
            System.Console.Out.WriteLine(time + " milliseconds to extract " + count + " tokens");
            System.Console.Out.WriteLine((time * 1000.0) / count + " microseconds/token");
            System.Console.Out.WriteLine((bytes * 1000.0 * 60.0 * 60.0) / (time * 1000000.0) + " megabytes/hour");
        }
    }
}

Test result: complete!

