1、准备工作
下载lucene 3.6.1 : http://lucene.apache.org/
下载中文分词IK Analyzer: http://code.google.com/p/ik-analyzer/downloads/list (注意下载的是IK Analyzer 2012_u5_source.zip,其他版本有bug)
下载solr 3.6.1: http://lucene.apache.org/solr/(编译IK Analyzer时需引用包)
OK,将lucene 、solr 相关包(lucene-core-3.6.1.jar、lucene-highlighter-3.6.1.jar、lucene-analyzers-3.6.1.jar、apache-solr-core-3.6.1.jar、apache-solr-solrj-3.6.1.jar)拷贝到项目lib下,IK源码置于项目src下。
2、从Oracle数据库中取数据创建索引(使用IK分词)
003 |
import org.apache.lucene.index.IndexWriter;
|
004 |
import org.apache.lucene.index.IndexWriterConfig;
|
005 |
import org.apache.lucene.index.CorruptIndexException;
|
006 |
import org.apache.lucene.store.FSDirectory;
|
007 |
import org.apache.lucene.store.Directory;
|
008 |
import org.apache.lucene.analysis.Analyzer;
|
009 |
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
010 |
import org.apache.lucene.util.Version;
|
011 |
import org.apache.lucene.document.Document;
|
012 |
import org.apache.lucene.document.Field;
|
013 |
import org.wltea.analyzer.lucene.IKAnalyzer;
|
015 |
import java.sql.Connection;
|
017 |
import java.io.IOException;
|
018 |
import java.util.ArrayList;
|
019 |
import java.util.Date;
|
021 |
import modules.gk.Gk_info;
|
022 |
import modules.gk.Gk_infoSub;
|
023 |
import web.sys.Globals;
|
024 |
import web.db.DBConnector;
|
025 |
import web.db.ObjectCtl;
|
026 |
import web.util.StringUtil;
|
028 |
public class LuceneIndex {
|
029 |
IndexWriter writer = null ;
|
030 |
FSDirectory dir = null ;
|
031 |
boolean create = true ;
|
034 |
long a1 = System.currentTimeMillis();
|
035 |
System.out.println( "[Lucene 开始执行:" + new Date() + "]" );
|
036 |
Connection con = DBConnector.getconecttion();
|
038 |
final File docDir = new File(Globals.SYS_COM_CONFIG.get( "sys.index.path" ).toString());
|
039 |
if (!docDir.exists()) {
|
042 |
String cr = Globals.SYS_COM_CONFIG.get( "sys.index.create" ).toString();
|
043 |
if ( "false" .equals(cr.toLowerCase())) {
|
046 |
Directory dir = FSDirectory.open(docDir);
|
048 |
Analyzer analyzer = new IKAnalyzer( true );
|
049 |
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, analyzer);
|
053 |
iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
|
056 |
iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
|
058 |
IndexWriter writer = new IndexWriter(dir, iwc);
|
059 |
String sql = "SELECT indexno,title,describes,pdate,keywords FROM TABLEA WHERE STATE=1 AND SSTAG<>1 " ;
|
060 |
int rowCount = ObjectCtl.getRowCount(con, sql);
|
061 |
int pageSize = StringUtil.StringToInt(Globals.SYS_COM_CONFIG.get( "sys.index.size" ).toString());
|
062 |
int pages = (rowCount - 1 ) / pageSize + 1 ;
|
063 |
ArrayList list = null ;
|
064 |
Gk_infoSub gk = null ;
|
065 |
for ( int i = 1 ; i < pages+ 1 ; i++) {
|
066 |
long a = System.currentTimeMillis();
|
067 |
list = ObjectCtl.listPage(con, sql, i, pageSize, new Gk_infoSub());
|
068 |
for ( int j = 0 ; j < list.size(); j++) {
|
069 |
gk = (Gk_infoSub) list.get(j);
|
070 |
Document doc = new Document();
|
071 |
doc.add( new Field( "indexno" , StringUtil.null2String(gk.getIndexno()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
|
072 |
doc.add( new Field( "title" , StringUtil.null2String(gk.getTitle()), Field.Store.YES, Field.Index.ANALYZED));
|
073 |
doc.add( new Field( "describes" , StringUtil.null2String(gk.getDescribes()), Field.Store.YES, Field.Index.ANALYZED));
|
074 |
doc.add( new Field( "pdate" , StringUtil.null2String(gk.getPdate()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
|
075 |
doc.add( new Field( "keywords" , StringUtil.null2String(gk.getKeywords()), Field.Store.YES, Field.Index.ANALYZED));
|
076 |
writer.addDocument(doc);
|
077 |
ObjectCtl.executeUpdateBySql(con, "UPDATE TABLEA SET SSTAG=1 WHERE indexno='" +gk.getIndexno()+ "'" );
|
080 |
long b = System.currentTimeMillis();
|
082 |
System.out.println( "[Lucene " + rowCount + "条," + pages + "页,第" + i + "页花费时间:" + c + "毫秒]" );
|
086 |
} catch (Exception e) {
|
089 |
DBConnector.freecon(con);
|
091 |
if (writer != null ) {
|
094 |
} catch (CorruptIndexException e) {
|
096 |
} catch (IOException e) {
|
100 |
if (dir != null && IndexWriter.isLocked(dir)) {
|
101 |
IndexWriter.unlock(dir);
|
103 |
} catch (IOException e) {
|
108 |
long b1 = System.currentTimeMillis();
|
110 |
System.out.println( "[Lucene 执行完毕,花费时间:" + c1 + "毫秒,完成时间:" + new Date() + "]" );
|
3、单字段查询以及多字段分页查询高亮显示
003 |
import org.apache.lucene.store.FSDirectory;
|
004 |
import org.apache.lucene.store.Directory;
|
005 |
import org.apache.lucene.search.*;
|
006 |
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
|
007 |
import org.apache.lucene.search.highlight.Highlighter;
|
008 |
import org.apache.lucene.search.highlight.SimpleFragmenter;
|
009 |
import org.apache.lucene.search.highlight.QueryScorer;
|
010 |
import org.apache.lucene.queryParser.QueryParser;
|
011 |
import org.apache.lucene.queryParser.MultiFieldQueryParser;
|
012 |
import org.apache.lucene.analysis.TokenStream;
|
013 |
import org.apache.lucene.analysis.Analyzer;
|
014 |
import org.apache.lucene.analysis.KeywordAnalyzer;
|
015 |
import org.apache.lucene.document.Document;
|
016 |
import org.apache.lucene.index.IndexReader;
|
017 |
import org.apache.lucene.index.Term;
|
018 |
import org.apache.lucene.util.Version;
|
019 |
import modules.gk.Gk_infoSub;
|
021 |
import java.util.ArrayList;
|
023 |
import java.io.StringReader;
|
024 |
import java.lang.reflect.Constructor;
|
026 |
import web.util.StringUtil;
|
027 |
import web.sys.Globals;
|
028 |
import org.wltea.analyzer.lucene.IKAnalyzer;
|
030 |
public class LuceneQuery {
|
031 |
private static String indexPath;
|
032 |
private int rowCount;
|
034 |
private int currentPage;
|
035 |
private int pageSize;
|
037 |
public LuceneQuery() {
|
038 |
this .indexPath = Globals.SYS_COM_CONFIG.get( "sys.index.path" ).toString();
|
041 |
public int getRowCount() {
|
045 |
public int getPages() {
|
049 |
public int getPageSize() {
|
053 |
public int getCurrentPage() {
|
060 |
public ArrayList queryIndexTitle(String keyWord, int curpage, int pageSize) {
|
061 |
ArrayList list = new ArrayList();
|
069 |
this .pageSize = pageSize;
|
070 |
this .currentPage = curpage;
|
071 |
int start = (curpage - 1 ) * pageSize;
|
072 |
Directory dir = FSDirectory.open( new File(indexPath));
|
073 |
IndexReader reader = IndexReader.open(dir);
|
074 |
IndexSearcher searcher = new IndexSearcher(reader);
|
075 |
Analyzer analyzer = new IKAnalyzer( true );
|
076 |
QueryParser queryParser = new QueryParser(Version.LUCENE_36, "title" , analyzer);
|
077 |
queryParser.setDefaultOperator(QueryParser.AND_OPERATOR);
|
078 |
Query query = queryParser.parse(keyWord);
|
079 |
int hm = start + pageSize;
|
080 |
TopScoreDocCollector res = TopScoreDocCollector.create(hm, false );
|
081 |
searcher.search(query, res);
|
083 |
SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter( "<span style='color:red'>" , "</span>" );
|
084 |
Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
|
085 |
this .rowCount = res.getTotalHits();
|
086 |
this .pages = (rowCount - 1 ) / pageSize + 1 ;
|
087 |
TopDocs tds = res.topDocs(start, pageSize);
|
088 |
ScoreDoc[] sd = tds.scoreDocs;
|
089 |
for ( int i = 0 ; i < sd.length; i++) {
|
090 |
Document hitDoc = reader.document(sd[i].doc);
|
091 |
list.add(createObj(hitDoc, analyzer, highlighter));
|
094 |
} catch (Exception e) {
|
104 |
public ArrayList queryIndexFields(String allkeyword, String onekeyword, String nokeyword, int curpage, int pageSize) {
|
105 |
ArrayList list = new ArrayList();
|
113 |
this .pageSize = pageSize;
|
114 |
this .currentPage = curpage;
|
115 |
int start = (curpage - 1 ) * pageSize;
|
116 |
Directory dir = FSDirectory.open( new File(indexPath));
|
117 |
IndexReader reader = IndexReader.open(dir);
|
118 |
IndexSearcher searcher = new IndexSearcher(reader);
|
119 |
BooleanQuery bQuery = new BooleanQuery();
|
120 |
if (! "" .equals(allkeyword)) {
|
121 |
KeywordAnalyzer analyzer = new KeywordAnalyzer();
|
122 |
BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};
|
123 |
Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, allkeyword, new String[]{ "title" , "describes" , "keywords" }, flags, analyzer);
|
124 |
bQuery.add(query, BooleanClause.Occur.MUST);
|
126 |
if (! "" .equals(onekeyword)) {
|
127 |
Analyzer analyzer = new IKAnalyzer( true );
|
128 |
BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};
|
129 |
Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, onekeyword, new String[]{ "title" , "describes" , "keywords" }, flags, analyzer);
|
130 |
bQuery.add(query, BooleanClause.Occur.MUST);
|
132 |
if (! "" .equals(nokeyword)) {
|
133 |
Analyzer analyzer = new IKAnalyzer( true );
|
134 |
BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};
|
135 |
Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, nokeyword, new String[]{ "title" , "describes" , "keywords" }, flags, analyzer);
|
136 |
bQuery.add(query, BooleanClause.Occur.MUST_NOT);
|
139 |
int hm = start + pageSize;
|
140 |
TopScoreDocCollector res = TopScoreDocCollector.create(hm, false );
|
141 |
searcher.search(bQuery, res);
|
142 |
SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter( "<span style='color:red'>" , "</span>" );
|
143 |
Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(bQuery));
|
144 |
this .rowCount = res.getTotalHits();
|
145 |
this .pages = (rowCount - 1 ) / pageSize + 1 ;
|
146 |
System.out.println( "rowCount:" + rowCount);
|
147 |
TopDocs tds = res.topDocs(start, pageSize);
|
148 |
ScoreDoc[] sd = tds.scoreDocs;
|
149 |
Analyzer analyzer = new IKAnalyzer();
|
150 |
for ( int i = 0 ; i < sd.length; i++) {
|
151 |
Document hitDoc = reader.document(sd[i].doc);
|
152 |
list.add(createObj(hitDoc, analyzer, highlighter));
|
155 |
} catch (Exception e) {
|
167 |
private synchronized static Object createObj(Document doc, Analyzer analyzer, Highlighter highlighter) {
|
169 |
Gk_infoSub gk = new Gk_infoSub();
|
173 |
gk.setIndexno(StringUtil.null2String(doc.get( "indexno" )));
|
174 |
gk.setPdate(StringUtil.null2String(doc.get( "pdate" )));
|
175 |
String title = StringUtil.null2String(doc.get( "title" ));
|
177 |
if (! "" .equals(title)) {
|
178 |
highlighter.setTextFragmenter( new SimpleFragmenter(title.length()));
|
179 |
TokenStream tk = analyzer.tokenStream( "title" , new StringReader(title));
|
180 |
String htext = StringUtil.null2String(highlighter.getBestFragment(tk, title));
|
181 |
if (! "" .equals(htext)) {
|
185 |
String keywords = StringUtil.null2String(doc.get( "keywords" ));
|
186 |
gk.setKeywords(keywords);
|
187 |
if (! "" .equals(keywords)) {
|
188 |
highlighter.setTextFragmenter( new SimpleFragmenter(keywords.length()));
|
189 |
TokenStream tk = analyzer.tokenStream( "keywords" , new StringReader(keywords));
|
190 |
String htext = StringUtil.null2String(highlighter.getBestFragment(tk, keywords));
|
191 |
if (! "" .equals(htext)) {
|
192 |
gk.setKeywords(htext);
|
195 |
String describes = StringUtil.null2String(doc.get( "describes" ));
|
196 |
gk.setDescribes(describes);
|
197 |
if (! "" .equals(describes)) {
|
198 |
highlighter.setTextFragmenter( new SimpleFragmenter(describes.length()));
|
199 |
TokenStream tk = analyzer.tokenStream( "keywords" , new StringReader(describes));
|
200 |
String htext = StringUtil.null2String(highlighter.getBestFragment(tk, describes));
|
201 |
if (! "" .equals(htext)) {
|
202 |
gk.setDescribes(htext);
|
209 |
catch (Exception e) {
|
220 |
private synchronized static Object createObj(Document doc) {
|
222 |
Gk_infoSub gk = new Gk_infoSub();
|
226 |
gk.setIndexno(StringUtil.null2String(doc.get( "indexno" )));
|
227 |
gk.setPdate(StringUtil.null2String(doc.get( "pdate" )));
|
228 |
gk.setTitle(StringUtil.null2String(doc.get( "title" )));
|
229 |
gk.setKeywords(StringUtil.null2String(doc.get( "keywords" )));
|
230 |
gk.setDescribes(StringUtil.null2String(doc.get( "describes" )));
|
234 |
catch (Exception e) {
|
单字段查询:
01 |
long a = System.currentTimeMillis();
|
03 |
int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get( "curpage" )));
|
04 |
int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get( "pagesize" )));
|
05 |
String title = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get( "title" )));
|
06 |
LuceneQuery lu = new LuceneQuery();
|
07 |
form.addResult( "list" , lu.queryIndexTitle(title, curpage, pagesize));
|
08 |
form.addResult( "curPage" , lu.getCurrentPage());
|
09 |
form.addResult( "pageSize" , lu.getPageSize());
|
10 |
form.addResult( "rowCount" , lu.getRowCount());
|
11 |
form.addResult( "pageCount" , lu.getPages());
|
12 |
} catch (Exception e) {
|
15 |
long b = System.currentTimeMillis();
|
17 |
System.out.println( "[搜索信息花费时间:" + c + "毫秒]" );
|
多字段查询:
01 |
long a = System.currentTimeMillis();
|
03 |
int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get( "curpage" )));
|
04 |
int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get( "pagesize" )));
|
05 |
String allkeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get( "allkeyword" )));
|
06 |
String onekeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get( "onekeyword" )));
|
07 |
String nokeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get( "nokeyword" )));
|
08 |
LuceneQuery lu = new LuceneQuery();
|
09 |
form.addResult( "list" , lu.queryIndexFields(allkeyword,onekeyword,nokeyword, curpage, pagesize));
|
10 |
form.addResult( "curPage" , lu.getCurrentPage());
|
11 |
form.addResult( "pageSize" , lu.getPageSize());
|
12 |
form.addResult( "rowCount" , lu.getRowCount());
|
13 |
form.addResult( "pageCount" , lu.getPages());
|
14 |
} catch (Exception e) {
|
17 |
long b = System.currentTimeMillis();
|
19 |
System.out.println( "[高级检索花费时间:" + c + "毫秒]" );
|
4、Lucene通配符查询
1 |
BooleanQuery bQuery = new BooleanQuery();
|
2 |
if (! "" .equals(title)) {
|
3 |
WildcardQuery w1 = new WildcardQuery( new Term( "title" , title+ "*" ));
|
5 |
bQuery.add(w1, BooleanClause.Occur.MUST);
|
7 |
int hm = start + pageSize;
|
8 |
TopScoreDocCollector res = TopScoreDocCollector.create(hm, false );
|
9 |
searcher.search(bQuery, res); |
5、Lucene嵌套查询
实现SQL:(unitid like 'unitid%' and idml like 'id2%') or (tounitid like 'unitid%' and tomlid like 'id2%' and tostate=1)
01 |
BooleanQuery bQuery = new BooleanQuery();
|
02 |
BooleanQuery b1 = new BooleanQuery();
|
03 |
WildcardQuery w1 = new WildcardQuery( new Term( "unitid" , unitid + "*" ));
|
04 |
WildcardQuery w2 = new WildcardQuery( new Term( "idml" , id2 + "*" ));
|
05 |
b1.add(w1, BooleanClause.Occur.MUST);
|
06 |
b1.add(w2, BooleanClause.Occur.MUST);
|
07 |
bQuery.add(b1, BooleanClause.Occur.SHOULD);
|
08 |
BooleanQuery b2 = new BooleanQuery();
|
09 |
WildcardQuery w3 = new WildcardQuery( new Term( "tounitid" , unitid + "*" ));
|
10 |
WildcardQuery w4 = new WildcardQuery( new Term( "tomlid" , id2 + "*" ));
|
11 |
WildcardQuery w5 = new WildcardQuery( new Term( "tostate" , "1" ));
|
12 |
b2.add(w3, BooleanClause.Occur.MUST);
|
13 |
b2.add(w4, BooleanClause.Occur.MUST);
|
14 |
b2.add(w5, BooleanClause.Occur.MUST);
|
15 |
bQuery.add(b2, BooleanClause.Occur.SHOULD);
|
6、Lucene先根据时间排序后分页
01 |
int hm = start + pageSize;
|
02 |
Sort sort = new Sort( new SortField( "pdate" , SortField.STRING, true ));
|
03 |
TopScoreDocCollector res = TopScoreDocCollector.create(pageSize, false );
|
04 |
searcher.search(bQuery, res); |
05 |
this .rowCount = res.getTotalHits();
|
06 |
this .pages = (rowCount - 1 ) / pageSize + 1 ;
|
07 |
TopDocs tds =searcher.search(bQuery,rowCount,sort);
|
08 |
ScoreDoc[] sd = tds.scoreDocs; |
09 |
System.out.println( "rowCount:" + rowCount);
|
11 |
for (ScoreDoc scoreDoc : sd) {
|
19 |
Document doc = searcher.doc(scoreDoc.doc);
|
20 |
list.add(createObj(doc));
|
这个效率不高,正常的做法是在创建索引的时候进行排序,之后使用分页方法,不要这样进行2次查询。
分享到:
相关推荐
lucene.NET 中文分词 高亮 lucene.NET 中文分词 高亮 lucene.NET 中文分词 高亮 lucene.NET 中文分词 高亮
lucene3.6 搜索例子
(4) 中文分词效果 19 4.2 分词原理 21 (1) TokenStream 21 (2) Tokenizer 22 (3) TokenFilter 23 4.3 分词属性 23 (1) 分词属性查看 24 (2) 分词属性对比 25 4.4 自定义分词器 26 (1) 自定义Stop分词器 26 (2) 实现...
在网上找了实例,但是发现不能使用,只能建立索引。...lucene3.6版本,能够建立索引,能搜索。IndexWriter、IndexSearcher。其中包C下的helloword实例能用,其余的全是网上不能用的。直接下载 可以运行
这个是使用lucene实现全文检索的jar包 包含IkAnalyer jar这个分词器 使用非常方便
lucene3.6 模仿百度自动补全(lucene3.6 模仿百度自动补全(lucene3.6 模仿百度自动补全(lucene3.6 模仿百度自动补全(lucene3.6 模仿百度自动补全(lucene3.6 模仿百度自动补全(lucene3.6 模仿百度自动补全
lucene 3.6 的入门例子 代码简洁 注释清晰 是入门之必备 附带了ik中文分词器 支持 停用词 扩展词等
修复IKAnalyzer2012存在的无法添加扩展的中文停用词的bug。详见:http://blog.csdn.net/kmguo/article/details/8779522
利用IKAnalyzer结合LUCENE.4.9进行中文分词的高亮显示。
NULL 博文链接:https://yuan-bin1990.iteye.com/blog/1700272
来自“猎图网 www.richmap.cn”基于IKAnalyzer分词算法的准商业化Lucene中文分词器。 1. 正向全切分算法,42万汉字字符/每秒的处理能力(IBM ThinkPad 酷睿I 1.6G 1G内存 WinXP) 2. 对数量词、地名、路名的...
SSH + Lucene + 分页 + 排序 + 高亮 模拟简单新闻网站搜索引擎--NewsWithSearch.part3 SSH + Lucene + 分页 + 排序 + 高亮 模拟简单新闻网站搜索引擎--NewsWithSearch.part2 SSH + Lucene + 分页 + 排序 + 高亮 ...
说明: 例子是根据lucene3.6写的,也可以说是直接copy别人的。 包括参考文章的代码,以及修改部分之后的代码
lucene3.6的src包,可以用于附加上去看相应的源代码
SSH + Lucene + 分页 + 排序 + 高亮 模拟简单新闻网站搜索引擎--NewsWithSearch.part3 SSH + Lucene + 分页 + 排序 + 高亮 模拟简单新闻网站搜索引擎--NewsWithSearch.part2 SSH + Lucene + 分页 + 排序 + 高亮 ...
本文档详细介绍了lucene3.6中的索引,以及每个部分对应于硬盘下的文件夹里的哪个文件。这个根据本人多年学术及编程经验总结的
Lucene与中文分词技术的研究及应用Lucene与中文分词技术的研究及应用Lucene与中文分词技术的研究及应用
超全的lucene3.6学习记录,实现了3中不同方法的检索,而且都很简单,容易上手,附带工程原文件
SSH + Lucene + 分页 + 排序 + 高亮 模拟简单新闻网站搜索引擎--NewsWithSearch.part3 SSH + Lucene + 分页 + 排序 + 高亮 模拟简单新闻网站搜索引擎--NewsWithSearch.part2 SSH + Lucene + 分页 + 排序 + 高亮 ...
SSH + Lucene + 分页 + 排序 + 高亮 模拟简单新闻网站搜索引擎--NewsWithSearch.part3 SSH + Lucene + 分页 + 排序 + 高亮 模拟简单新闻网站搜索引擎--NewsWithSearch.part2 SSH + Lucene + 分页 + 排序 + 高亮 ...