Lucene是Apache软件基金会Jakarta项目组的一个子项目,是一个开放源代码的全文检索引擎工具包及架构,提供了完整的查询引擎和索引引擎,实现了一些通用的分词算法,并预留了很多词法分析器接口。本文以myrss.easyjf.com网站系统中使用Lucene实现全文检索的代码为例,简单演示Lucene在实际项目中的应用。
使用lucene实现全文检索,主要有下面三个步骤:
1、建立索引库:根据网站新闻信息库中的已有的数据资料建立lucene索引文件。
2、通过索引库搜索:有了索引后,即可使用标准的词法分析器或直接的词法分析器实现进行全文检索。
3、维护索引库:网站新闻信息库中的信息会不断地变动,包括新增、修改及删除等,这些信息的变动都需要进一步反映到Lucene索引文件中。
下面是myrss.easyjf.com相关代码!
一、索引管理(建立及维护)
索引管理类myrssindexmanage主要实现根据网站信息库中的数据建立索引,维护索引等。由于索引的过程需要消耗一定的时间,因此,索引管理类实现runnable接口,使得我们可以在程序中开新线程来运行。
package com.easyjf.lucene;
import java.util.date;
import java.util.list;
import org.apache.lucene.analysis.standard.standardanalyzer;
import org.apache.lucene.document.document;
import org.apache.lucene.document.field;
import org.apache.lucene.index.indexreader;
import org.apache.lucene.index.indexwriter;
import org.apache.lucene.queryparser.multifieldqueryparser;
import org.apache.lucene.queryparser.queryparser;
import org.apache.lucene.search.hits;
import org.apache.lucene.search.indexsearcher;
import org.apache.lucene.search.query;
import org.apache.lucene.search.searcher;
import com.easyjf.dbo.easyjdb;
import com.easyjf.news.business.newsdir;
import com.easyjf.news.business.newsdoc;
import com.easyjf.news.business.newsutil;
import com.easyjf.web.tools.ipagelist;
public class myrssindexmanage implements runnable {
private string indexdir;
private string indextype="add";
public void run() {
// todo auto-generated method stub
if("add".equals(indextype))
normalindex();
else if ("init".equals(indextype)) reindexall();
}
public void normalindex()
{
try{
date start = new date();
int num=0;
indexwriter writer=new indexwriter(indexdir,new standardanalyzer(),false);
//newsdir dir=newsdir.readbysn();
string scope="(needindex<2) or(needindex is null)";
ipagelist plist=newsutil.pagelist(scope,1,50);
for(int p=0;p
plist=newsutil.pagelist(scope,p,100);
list list=plist.getresult();
for(int i=0;i
newsdoc doc=(newsdoc)list.get(i);
writer.adddocument(newsdoc2lucenedoc(doc));
num++;
}
}
writer.optimize();
writer.close();
easyjdb.getinstance().execute("update newsdoc set needindex=2 where "+scope);
date end = new date();
system.out.print("新增索引"+num+"条信息,一共花:"+(end.gettime() - start.gettime())/60000+"分钟!");
}
catch(exception e)
{
e.printstacktrace();
}
}
public void reindexall()
{
try{
date start = new date();
int num=0;
indexwriter writer=new indexwriter(indexdir,new standardanalyzer(),true);
newsdir dir=newsdir.readbysn("easyjf");
ipagelist plist=newsutil.pagelist(dir,1,50);
for(int p=0;p
plist=newsutil.pagelist(dir,p,100);
list list=plist.getresult();
for(int i=0;i
newsdoc doc=(newsdoc)list.get(i);
writer.adddocument(newsdoc2lucenedoc(doc));
num++;
}
}
writer.optimize();
writer.close();
easyjdb.getinstance().execute("update newsdoc set needindex=2 where dirpath like 'easyjf%'");
date end = new date();
system.out.print("全部重新做了一次索引,一共处理了"+num+"条信息,花:"+(end.gettime() - start.gettime())/60000+"分钟!");
}
catch(exception e)
{
e.printstacktrace();
}
}
private document newsdoc2lucenedoc(newsdoc doc)
{
document ldoc=new document();
ldoc.add(new field("title",doc.gettitle(),field.store.yes,field.index.tokenized));
ldoc.add(new field("content",doc.getcontent(),field.store.yes,field.index.tokenized));
ldoc.add(new field("url",doc.getremark(),field.store.yes,field.index.no));
ldoc.add(new field("cid",doc.getcid(),field.store.yes,field.index.no));
ldoc.add(new field("source",doc.getsource(),field.store.yes,field.index.no));
ldoc.add(new field("inputtime",doc.getinputtime().tostring(),field.store.yes,field.index.no));
return ldoc;
}
public string getindexdir() {
return indexdir;
}
public void setindexdir(string indexdir) {
this.indexdir = indexdir;
}
public string getindextype() {
return indextype;
}
public void setindextype(string indextype) {
this.indextype = indextype;
}
}
二、使用lucene实现全文搜索
下面是myrsssearch类的源码,该类主要实现使用lucene中searcher及queryparser实现从索引库中搜索关键词。
package com.easyjf.lucene;
import java.util.list;
import org.apache.lucene.analysis.standard.standardanalyzer;
import org.apache.lucene.document.document;
import org.apache.lucene.index.indexreader;
import org.apache.lucene.queryparser.multifieldqueryparser;
import org.apache.lucene.queryparser.queryparser;
import org.apache.lucene.search.hits;
import org.apache.lucene.search.indexsearcher;
import org.apache.lucene.search.query;
import org.apache.lucene.search.searcher;
import com.easyjf.search.myrssutil;
import com.easyjf.search.searchcontent;
import com.easyjf.web.tools.ipagelist;
import com.easyjf.web.tools.pagelist;
public class myrsssearch {
private string indexdir;
indexreader ir;
searcher search;
public ipagelist search(string key,int pagesize,int currentpage)
{
ipagelist plist=new pagelist(new hitsquery(dosearch(key)));
plist.dolist(pagesize,currentpage,"","",null);
if(plist!=null)
{
list list=plist.getresult();
if(list!=null){
for(int i=0;i
list.set(i,lucene2searchobj((document)list.get(i),key));
}
}
}
try{
if(search!=null)search.close();
if(ir!=null)ir.close();
}
catch(exception e)
{
e.printstacktrace();
}
return plist;
}
private searchcontent lucene2searchobj(document doc,string key)
{
searchcontent searchobj=new searchcontent();
string title=doc.getfield("title").stringvalue();
searchobj.settitle(title.replaceall(key,""+key+""));
searchobj.settvalue(doc.getfield("cid").stringvalue());
searchobj.seturl(doc.getfield("url").stringvalue());
searchobj.setsource(doc.getfield("source").stringvalue());
searchobj.setlastupdated(doc.getfield("inputtime").stringvalue());
searchobj.setintro(myrssutil.content2intro(doc.getfield("content").stringvalue(),key));
return searchobj;
}
public hits dosearch(string key)
{
hits hits=null;
try{
ir=indexreader.open(indexdir);
search=new indexsearcher(ir);
string fields[]={"title","content"};
queryparser parser=new multifieldqueryparser(fields,new standardanalyzer());
query query=parser.parse(key);
hits=search.search(query);
}
catch(exception e)
{
e.printstacktrace();
}
//system.out.println("搜索结果:"+hits.length());
return hits;
}
public string getindexdir() {
return indexdir;
}
public void setindexdir(string indexdir) {
this.indexdir = indexdir;
}
}
在上面的代码中,search方法返回一个封装了分页查询结果的ipagelist,ipagelist是easyjweb tools业务引擎中的分页引擎,对于ipagelist的使用,请看本人写的这篇文章《easyjweb tools中业务引擎分页的设计实现》:
我们针对Lucene的查询结果hits结构,写了一个查询器hitsquery。代码如下所示:
package com.easyjf.lucene;
import java.util.arraylist;
import java.util.collection;
import java.util.list;
import org.apache.lucene.search.hits;
import com.easyjf.web.tools.iquery;
public class hitsquery implements iquery {
private int begin=0;
private int max=0;
private hits hits;
public hitsquery()
{
}
public hitsquery(hits hits)
{
if(hits!=null)
{
this.hits=hits;
this.max=hits.length();
}
}
public int getrows(string arg0) {
// todo auto-generated method stub
return (hits==null?0:hits.length());
}
public list getresult(string arg0) {
// todo auto-generated method stub
list list=new arraylist();
for(int i=begin;i<(begin+max)&&(i
try{
list.add(hits.doc(i));
}
catch(exception e)
{
e.printstacktrace();
}
}
return list;
}
public void setfirstresult(int begin) {
// todo auto-generated method stub
this.begin=begin;
}
public void setmaxresults(int max) {
// todo auto-generated method stub
this.max=max;
}
public void setparavalues(collection arg0) {
// todo auto-generated method stub
}
public list getresult(string condition, int begin, int max) {
// todo auto-generated method stub
if((begin>=0)&&(begin
return getresult(condition);
}
}
三、web调用
下面我们来看看在web中如何调用商业逻辑层的全文检索功能。下面是处理用户请求的action中关于搜索部分的源码:
package com.easyjf.news.action;
public class searchaction implements iwebaction {
public page dosearch(webform form,module module)throws exception
{
string key=commutil.null2string(form.get("v"));
key=urldecoder.decode(urlencoder.encode(key,"iso8859_1"),"utf-8");
form.set("v",key);
form.addresult("v2",urlencoder.encode(key,"utf-8"));
if(key.getbytes().length>2){
string orderby=commutil.null2string(form.get("order"));
int currentpage=commutil.null2int(form.get("page"));
int pagesize=commutil.null2int(form.get("pagesize"));
if(currentpage<1)currentpage=1;
if(pagesize<1)pagesize=15;
searchengine search=new searchengine(key,orderby,pagesize,currentpage);
search.getlucenesearch().setindexdir(globals.app_base_dir+"/web-inf/index");
search.dosearchbylucene();
ipagelist plist=search.getresult();
if(plist!=null && plist.getrowcount()>0){
form.addresult("list",plist.getresult());
form.addresult("pages",new integer(plist.getpages()));
form.addresult("rows",new integer(plist.getrowcount()));
form.addresult("page",new integer(plist.getcurrentpage()));
form.addresult("gotopagehtml",commutil.showpagehtml(plist.getcurrentpage(),plist.getpages()));
}
else
{
form.addresult("notfound","true");//找不到数据
}
}
else
form.addresult("errmsg","您输入的关键字太短!");
form.addresult("hotsearch",searchengine.gethotsearc
闽公网安备 35060202000074号