今天搞了一个关于Lucene的例子,权当入门教程。网上有很多资料,但是要么不全、要么不好用,所以这里把全部代码以及依赖的包贴上来了。
功能包括:创建索引、检索索引、高亮显示查询结果。分词使用的庖丁解牛。
使用前先下载相关的LuceneCore jar包、LuceneHighLighter jar包、庖丁解牛分词jar包和庖丁解牛词典,并设定环境变量PAODING_DIC_HOME指向词典所在目录。
前两个可以到官方网站找,庖丁去http://code.google.com/p/paoding/downloads/list下载。
Lucene庖丁整合方式1:
1、将paoding-analysis.jar拷贝到项目的WEB-INF/lib目录;2、接着需要设置环境变量PAODING_DIC_HOME,变量名:PAODING_DIC_HOME 变量值:E:/paoding/dic 3、第三步将E:/paoding/src目录下的paoding-dic-home.properties属性文件拷贝到项目的src目录下,添加2行
paoding.dic.home.config-first=this
paoding.dic.home=E:/paoding/dic
Lucene庖丁整合方式2:
修改E:/paoding/src/paoding-dic-home.properties,增加一行
paoding.dic.home=classpath:dic
然后运行ant重新生成一个庖丁jar,拷贝到lib下就OK了。
第一种方式便于更新字典,第二种便于移植。本例使用第二种方法整合。
关于庖丁环境的设置可以参考net/paoding/analysis/Constants.java。
使用时注意LuceneCore和LuceneHighLighter的版本配置。我开始使用lucene-core-2.3.2.jar+Highlighter 2.4,后台报错,明显的版本问题。现在使用的是Lucene 2.3.2 + Highlighter 2.2.0。
主要代码实现:
CreateIndex:创建索引文件
Java代码 package demo; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.Date; import net.paoding.analysis.analyzer.PaodingAnalyzer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; /** * 建立索引 * */ public class CreateIndex { public void createIndex() throws Exception { /* 指明要索引文件夹的位置,这里是C盘的S文件夹下 */ File surceFileDir = new File("D://save//source"); /* 这里放索引文件的位置 */ File indexFileDir = new File("D://save"); //Analyzer luceneAnalyzer = new StandardAnalyzer(); Analyzer luceneAnalyzer = new PaodingAnalyzer();//使用庖丁解牛分词法 IndexWriter indexWriter = new IndexWriter(indexFileDir, luceneAnalyzer, true);///参数isEmpty是false表示增量索引 File[] sourceFextFiles = surceFileDir.listFiles(); long startTime = new Date().getTime(); // 增加document到索引去 for (int i = 0; i < sourceFextFiles.length; i++) { if (sourceFextFiles[i].isFile() && sourceFextFiles[i].getName().endsWith(".txt")) { System.out.println("File " + sourceFextFiles[i].getCanonicalPath() + "正在被索引...."); String temp = FileReaderAll(sourceFextFiles[i].getCanonicalPath(), "GBK"); System.out.println(temp); Document document = new Document(); Field FieldPath = new Field("path", sourceFextFiles[i].getPath(), Field.Store.YES, Field.Index.NO); Field FieldBody = new Field("body", temp, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS); Field FieldTitle = new Field("title", temp, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS); document.add(FieldPath); document.add(FieldBody);document.add(FieldTitle); indexWriter.addDocument(document); } } // optimize()方法是对索引进行优化 indexWriter.optimize(); indexWriter.close(); // 测试一下索引的时间 long endTime = new Date().getTime(); System.out.println("这花费了" + (endTime - startTime) + " 毫秒来把文档增加到索引里面去!" 
+ indexFileDir.getPath()); } public static String FileReaderAll(String FileName, String charset) throws IOException { BufferedReader reader = new BufferedReader(new InputStreamReader( new FileInputStream(FileName), charset)); String line = new String(); String temp = new String(); while ((line = reader.readLine()) != null) { temp += line; } reader.close(); return temp; } /** * @param args */ public static void main(String[] args) { try { new CreateIndex().createIndex(); } catch (Exception e) { e.printStackTrace(); } } } package demo; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.Date; import net.paoding.analysis.analyzer.PaodingAnalyzer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; /** * 建立索引 * */ public class CreateIndex { public void createIndex() throws Exception { /* 指明要索引文件夹的位置,这里是C盘的S文件夹下 */ File surceFileDir = new File("D://save//source"); /* 这里放索引文件的位置 */ File indexFileDir = new File("D://save"); //Analyzer luceneAnalyzer = new StandardAnalyzer(); Analyzer luceneAnalyzer = new PaodingAnalyzer();//使用庖丁解牛分词法 IndexWriter indexWriter = new IndexWriter(indexFileDir, luceneAnalyzer, true);///参数isEmpty是false表示增量索引 File[] sourceFextFiles = surceFileDir.listFiles(); long startTime = new Date().getTime(); // 增加document到索引去 for (int i = 0; i < sourceFextFiles.length; i++) { if (sourceFextFiles[i].isFile() && sourceFextFiles[i].getName().endsWith(".txt")) { System.out.println("File " + sourceFextFiles[i].getCanonicalPath() + "正在被索引...."); String temp = FileReaderAll(sourceFextFiles[i].getCanonicalPath(), "GBK"); System.out.println(temp); Document document = new Document(); Field FieldPath = new Field("path", sourceFextFiles[i].getPath(), Field.Store.YES, Field.Index.NO); Field FieldBody = new Field("body", temp, Field.Store.YES, 
Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS); Field FieldTitle = new Field("title", temp, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS); document.add(FieldPath); document.add(FieldBody);document.add(FieldTitle); indexWriter.addDocument(document); } } // optimize()方法是对索引进行优化 indexWriter.optimize(); indexWriter.close(); // 测试一下索引的时间 long endTime = new Date().getTime(); System.out.println("这花费了" + (endTime - startTime) + " 毫秒来把文档增加到索引里面去!" + indexFileDir.getPath()); } public static String FileReaderAll(String FileName, String charset) throws IOException { BufferedReader reader = new BufferedReader(new InputStreamReader( new FileInputStream(FileName), charset)); String line = new String(); String temp = new String(); while ((line = reader.readLine()) != null) { temp += line; } reader.close(); return temp; } /** * @param args */ public static void main(String[] args) { try { new CreateIndex().createIndex(); } catch (Exception e) { e.printStackTrace(); } } }
QueryHighLighter:检索关键字并高亮显示
Java代码 package demo; import java.io.StringReader; import net.paoding.analysis.analyzer.PaodingAnalyzer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.Document; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocCollector; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.SimpleFragmenter; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import test.TestLuceneHighlighter2; /** * 高亮显示检索结果 * Lucene 2.3.2 + Highlighter 2.2.0 的分页+高亮显示代码例子.<br> * Lucene和Highlighter不是最新版本可以升级。 */ public class QueryHighLighter { private static final String FIELD_TITLE = "title"; private static final String FIELD_BODY = "body"; public synchronized Analyzer getAnalyzer() { return new PaodingAnalyzer();// 此处使用"庖丁解牛"分词法,另外一种是中科院分词法 } public String test(String queryString, int begin, int number) { StringBuffer sb = new StringBuffer(); IndexSearcher isearcher = null; try { isearcher = new IndexSearcher("D://save"); /* 下面这个表示要同时搜索这两个域,而且只要一个域里面有满足我们搜索的内容就行 */ BooleanClause.Occur[] clauses = { BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD }; TopDocCollector collector = new TopDocCollector(10); /*Query query = MultiFieldQueryParser.parse(queryString, new String[] { FIELD_TITLE, FIELD_BODY }, clauses, getAnalyzer());*/ QueryParser queryParse = new QueryParser(FIELD_TITLE, getAnalyzer()); Query query = queryParse.parse(queryString); isearcher.search(query, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; // 用这个进行高亮显示,默认是<b>..</b> // 用这个指定<read>..</read> SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<b><font color='red'>", "</font></b>"); // 构造高亮 // 指定高亮的格式 // 
指定查询评分 Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query)); // 这个一般等于你要返回的,高亮的数据长度 // 如果太小,则只有数据的开始部分被解析并高亮,且返回的数据也少 // 太大,有时太浪费了。 highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE)); for (int i = begin; i < hits.length && i < begin + number; i++) { Document doc = isearcher.doc(hits[i].doc); String value = doc.get(FIELD_TITLE); String value2 = doc.get(FIELD_BODY); // 有三个参数 // 分析器 // 要解析的字段名 // 要解析的数据 //System.out.println(highlighter.getBestFragment(getAnalyzer(), // FIELD_TITLE, doc.get(FIELD_TITLE))); if (value != null) { TokenStream tokenStream = getAnalyzer().tokenStream(FIELD_TITLE, new StringReader(value)); String str = highlighter.getBestFragment(tokenStream, value); sb.append("<li><li>").append(str).append("<br/>"); System.out.println(str); } } } catch (Exception e) { e.printStackTrace(); } finally { if (isearcher != null) { try { isearcher.close(); } catch (Exception e) { e.printStackTrace(); } } } return sb.toString(); } public static void main(String[] args){ TestLuceneHighlighter2 t = new TestLuceneHighlighter2(); String queryString = "中华人民共和国"; int begin = 0; int number = 10; t.test(queryString, begin, number); } } package demo; import java.io.StringReader; import net.paoding.analysis.analyzer.PaodingAnalyzer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.Document; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocCollector; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.SimpleFragmenter; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import test.TestLuceneHighlighter2; /** * 高亮显示检索结果 * Lucene 
2.3.2 + Highlighter 2.2.0 的分页+高亮显示代码例子.<br> * Lucene和Highlighter不是最新版本可以升级。 */ public class QueryHighLighter { private static final String FIELD_TITLE = "title"; private static final String FIELD_BODY = "body"; public synchronized Analyzer getAnalyzer() { return new PaodingAnalyzer();// 此处使用"庖丁解牛"分词法,另外一种是中科院分词法 } public String test(String queryString, int begin, int number) { StringBuffer sb = new StringBuffer(); IndexSearcher isearcher = null; try { isearcher = new IndexSearcher("D://save"); /* 下面这个表示要同时搜索这两个域,而且只要一个域里面有满足我们搜索的内容就行 */ BooleanClause.Occur[] clauses = { BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD }; TopDocCollector collector = new TopDocCollector(10); /*Query query = MultiFieldQueryParser.parse(queryString, new String[] { FIELD_TITLE, FIELD_BODY }, clauses, getAnalyzer());*/ QueryParser queryParse = new QueryParser(FIELD_TITLE, getAnalyzer()); Query query = queryParse.parse(queryString); isearcher.search(query, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; // 用这个进行高亮显示,默认是<b>..</b> // 用这个指定<read>..</read> SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<b><font color='red'>", "</font></b>"); // 构造高亮 // 指定高亮的格式 // 指定查询评分 Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query)); // 这个一般等于你要返回的,高亮的数据长度 // 如果太小,则只有数据的开始部分被解析并高亮,且返回的数据也少 // 太大,有时太浪费了。 highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE)); for (int i = begin; i < hits.length && i < begin + number; i++) { Document doc = isearcher.doc(hits[i].doc); String value = doc.get(FIELD_TITLE); String value2 = doc.get(FIELD_BODY); // 有三个参数 // 分析器 // 要解析的字段名 // 要解析的数据 //System.out.println(highlighter.getBestFragment(getAnalyzer(), // FIELD_TITLE, doc.get(FIELD_TITLE))); if (value != null) { TokenStream tokenStream = getAnalyzer().tokenStream(FIELD_TITLE, new StringReader(value)); String str = highlighter.getBestFragment(tokenStream, value); sb.append("<li><li>").append(str).append("<br/>"); 
System.out.println(str); } } } catch (Exception e) { e.printStackTrace(); } finally { if (isearcher != null) { try { isearcher.close(); } catch (Exception e) { e.printStackTrace(); } } } return sb.toString(); } public static void main(String[] args){ TestLuceneHighlighter2 t = new TestLuceneHighlighter2(); String queryString = "中华人民共和国"; int begin = 0; int number = 10; t.test(queryString, begin, number); } }
附加上传net/paoding/analysis/Constants.java便于理解参数设置:
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

/**
 * Names and default values of the Paoding analyzer configuration properties.
 * <p>
 * In the Paoding distribution this class is declared in package
 * {@code net.paoding.analysis}; the package statement is left out of this
 * listing so that it compiles on its own.
 *
 * @author Zhiliang Wang [qieqie.wang@gmail.com]
 *
 * @since 2.0.0
 */
public class Constants {

    /**
     * Which "dictionary home" setting takes precedence.
     * <p>
     * "system-env" (or any other value that is not "this") means the
     * dictionary home is looked up first in the environment variable
     * PAODING_DIC_HOME; "this" means the {@code paoding.dic.home} entry of the
     * configuration file is looked up first.<br>
     * The lower-priority source is consulted only when the higher-priority one
     * is not configured. By default the environment variable has priority over
     * the paoding-analysis.properties file.
     */
    public static final String DIC_HOME_CONFIG_FIRST = "paoding.dic.home.config-first";
    public static final String DIC_HOME_CONFIG_FIRST_DEFAULT = "system-env";

    /**
     * Name of the environment variable holding the dictionary home directory.
     */
    public static final String ENV_PAODING_DIC_HOME = "PAODING_DIC_HOME";

    // -------------------------------------------------------------
    /**
     * Dictionary home directory.
     * <p>
     * The default is null so that, when neither the environment variable nor
     * the configuration file sets {@code paoding.dic.home}, PaodingMaker can
     * try to discover a "dic" directory under the current working directory or
     * on the class path.
     */
    public static final String DIC_HOME = "paoding.dic.home";
    public static final String DIC_HOME_DEFAULT = null;

    // -------------------------------------------------------------
    // character encoding of the dictionary files
    public static final String DIC_CHARSET = "paoding.dic.charset";
    public static final String DIC_CHARSET_DEFAULT = "UTF-8";

    // -------------------------------------------------------------
    // dictionaries whose name starts with this prefix are skipped
    public static final String DIC_SKIP_PREFIX = "paoding.dic.skip.prefix";
    public static final String DIC_SKIP_PREFIX_DEFAULT = "x-";

    // -------------------------------------------------------------
    // Chinese/CJK characters that will not be emitted as tokens
    public static final String DIC_NOISE_CHARACTOR = "paoding.dic.noise-charactor";
    public static final String DIC_NOISE_CHARACTOR_DEFAULT = "x-noise-charactor";

    // -------------------------------------------------------------
    // Chinese/CJK words that will not be emitted as tokens
    public static final String DIC_NOISE_WORD = "paoding.dic.noise-word";
    public static final String DIC_NOISE_WORD_DEFAULT = "x-noise-word";

    // -------------------------------------------------------------
    // unit words, like "ge", "zhi", ...
    public static final String DIC_UNIT = "paoding.dic.unit";
    public static final String DIC_UNIT_DEFAULT = "x-unit";

    // -------------------------------------------------------------
    // family names, like "Wang", "Zhang", ...
    public static final String DIC_CONFUCIAN_FAMILY_NAME = "paoding.dic.confucian-family-name";
    public static final String DIC_CONFUCIAN_FAMILY_NAME_DEFAULT = "x-confucian-family-name";

    // -------------------------------------------------------------
    public static final String DIC_FOR_COMBINATORICS = "paoding.dic.for-combinatorics";
    public static final String DIC_FOR_COMBINATORICS_DEFAULT = "x-for-combinatorics";

    // -------------------------------------------------------------
    // NOTE(review): presumably the dictionary change-detection interval, in
    // seconds; the unit is not stated here -- confirm against PaodingMaker.
    public static final String DIC_DETECTOR_INTERVAL = "paoding.dic.detector.interval";
    public static final String DIC_DETECTOR_INTERVAL_DEFAULT = "60";

    // -------------------------------------------------------------
    // analyzer mode, like "default", "max", ...
    public static final String ANALYZER_MODE = "paoding.analyzer.mode";
    /** Default value of {@link #ANALYZER_MODE}. */
    public static final String ANALYZER_MODE_DEFAULT = "most-words";
    /**
     * Misspelled name of {@link #ANALYZER_MODE_DEFAULT}, kept so that existing
     * callers keep compiling.
     *
     * @deprecated use {@link #ANALYZER_MODE_DEFAULT}
     */
    public static final String ANALYZER_MOE_DEFAULT = ANALYZER_MODE_DEFAULT;

    // -------------------------------------------------------------
    public static final String ANALYZER_DICTIONARIES_COMPILER = "paoding.analyzer.dictionaries.compiler";
    public static final String ANALYZER_DICTIONARIES_COMPILER_DEFAULT = null;

    // -------------------------------------------------------------
    // Property name -> default value. HashMap is required here because some
    // defaults are null.
    private static final Map/* <String, String> */ map = new HashMap();

    static {
        map.put(DIC_HOME_CONFIG_FIRST, DIC_HOME_CONFIG_FIRST_DEFAULT);
        map.put(DIC_HOME, DIC_HOME_DEFAULT);
        map.put(DIC_CHARSET, DIC_CHARSET_DEFAULT);
        map.put(DIC_SKIP_PREFIX, DIC_SKIP_PREFIX_DEFAULT);
        map.put(DIC_NOISE_CHARACTOR, DIC_NOISE_CHARACTOR_DEFAULT);
        map.put(DIC_NOISE_WORD, DIC_NOISE_WORD_DEFAULT);
        map.put(DIC_UNIT, DIC_UNIT_DEFAULT);
        map.put(DIC_CONFUCIAN_FAMILY_NAME, DIC_CONFUCIAN_FAMILY_NAME_DEFAULT);
        map.put(DIC_FOR_COMBINATORICS, DIC_FOR_COMBINATORICS_DEFAULT);
        map.put(DIC_DETECTOR_INTERVAL, DIC_DETECTOR_INTERVAL_DEFAULT);
        map.put(ANALYZER_MODE, ANALYZER_MODE_DEFAULT);
        map.put(ANALYZER_DICTIONARIES_COMPILER, ANALYZER_DICTIONARIES_COMPILER_DEFAULT);
    }

    //
    public static final String KNIFE_CLASS = "paoding.knife.class.";

    /**
     * Looks up a configuration property, falling back to its built-in default.
     *
     * @param p    properties to read from; must not be null
     * @param name property name, normally one of the constants of this class
     * @return the value found in {@code p}, otherwise the built-in default for
     *         {@code name}, which is null when the property has no default or
     *         {@code name} is unknown
     */
    public static String getProperty(Properties p, String name) {
        return p.getProperty(name, (String) map.get(name));
    }
}