高级:lucene全文检索应用示例及代码简析

人气：1477 2008-02-29

lucene是apache软件基金会 jakarta项目组的一个子项目，是一个开放源代码的全文检索引擎工具包及架构，提供了完整的查询引擎和索引引擎，实现了一些通用的分词算法，预留很多词法分析器接口。本文以myrss.easyjf.com网站系统中使用lucene实现全文检索的代码为例，简单演示lucene在实际项目中的应用。
　　使用lucene实现全文检索，主要有下面三个步骤：
　　1、建立索引库：根据网站新闻信息库中的已有的数据资料建立lucene索引文件。
　　2、通过索引库搜索：有了索引后，即可使用标准的词法分析器或直接的词法分析器实现进行全文检索。
　　3、维护索引库：网站新闻信息库中的信息会不断地变动，包括新增、修改及删除等，这些信息的变动都需要进一步反映到lucene索引文件中。
    下面是myrss.easyjf.com相关代码!

一、索引管理(建立及维护)
　　索引管理类myrssindexmanage主要实现根据网站信息库中的数据建立索引，维护索引等。由于索引的过程需要消耗一定的时间，因此，索引管理类实现runnable接口，使得我们可以在程序中开新线程来运行。
package com.easyjf.lucene;
import java.util.date;
import java.util.list;
import org.apache.lucene.analysis.standard.standardanalyzer;
import org.apache.lucene.document.document;
import org.apache.lucene.document.field;
import org.apache.lucene.index.indexreader;
import org.apache.lucene.index.indexwriter;
import org.apache.lucene.queryparser.multifieldqueryparser;
import org.apache.lucene.queryparser.queryparser;
import org.apache.lucene.search.hits;
import org.apache.lucene.search.indexsearcher;
import org.apache.lucene.search.query;
import org.apache.lucene.search.searcher;
import com.easyjf.dbo.easyjdb;
import com.easyjf.news.business.newsdir;
import com.easyjf.news.business.newsdoc;
import com.easyjf.news.business.newsutil;
import com.easyjf.web.tools.ipagelist;
public class myrssindexmanage implements runnable {
private string indexdir;
private string indextype="add";
public void run() {
  // todo auto-generated method stub
  if("add".equals(indextype))
   normalindex();
  else if ("init".equals(indextype)) reindexall();
}
public void normalindex()
{
  try{
   date start = new date();
   int num=0;
   indexwriter writer=new indexwriter(indexdir,new standardanalyzer(),false);
   //newsdir dir=newsdir.readbysn();
   string scope="(needindex<2) or(needindex is null)";
   ipagelist plist=newsutil.pagelist(scope,1,50);
   for(int p=0;p   {
   plist=newsutil.pagelist(scope,p,100);
   list list=plist.getresult();
   for(int i=0;i   {
    newsdoc doc=(newsdoc)list.get(i);
    writer.adddocument(newsdoc2lucenedoc(doc));
    num++;
   }
   }
   writer.optimize();
   writer.close();
   easyjdb.getinstance().execute("update newsdoc set needindex=2 where "+scope);
   date end = new date();
   system.out.print("新增索引"+num+"条信息，一共花："+(end.gettime() - start.gettime())/60000+"分钟!");
   }
   catch(exception e)
   {
    e.printstacktrace();
   }
}
public void reindexall()
{
  try{
   date start = new date();
   int num=0;
   indexwriter writer=new indexwriter(indexdir,new standardanalyzer(),true);
   newsdir dir=newsdir.readbysn("easyjf");
   ipagelist plist=newsutil.pagelist(dir,1,50);
   for(int p=0;p   {
   plist=newsutil.pagelist(dir,p,100);
   list list=plist.getresult();
   for(int i=0;i   {
    newsdoc doc=(newsdoc)list.get(i);
    writer.adddocument(newsdoc2lucenedoc(doc));
    num++;
   }
   }
   writer.optimize();
   writer.close();
   easyjdb.getinstance().execute("update newsdoc set needindex=2 where dirpath like 'easyjf%'");
   date end = new date();
   system.out.print("全部重新做了一次索引，一共处理了"+num+"条信息，花："+(end.gettime() - start.gettime())/60000+"分钟!");
   }
   catch(exception e)
   {
    e.printstacktrace();
   }
}
private document newsdoc2lucenedoc(newsdoc doc)
{
  document ldoc=new document();
  ldoc.add(new field("title",doc.gettitle(),field.store.yes,field.index.tokenized));
  ldoc.add(new field("content",doc.getcontent(),field.store.yes,field.index.tokenized));
  ldoc.add(new field("url",doc.getremark(),field.store.yes,field.index.no));
  ldoc.add(new field("cid",doc.getcid(),field.store.yes,field.index.no));
  ldoc.add(new field("source",doc.getsource(),field.store.yes,field.index.no));
  ldoc.add(new field("inputtime",doc.getinputtime().tostring(),field.store.yes,field.index.no));
  return ldoc;
}
public string getindexdir() {
  return indexdir;
}
public void setindexdir(string indexdir) {
  this.indexdir = indexdir;
}

public string getindextype() {
  return indextype;
}
public void setindextype(string indextype) {
  this.indextype = indextype;
}
}

二、使用lucene实现全文搜索
   下面是myrsssearch类的源码，该类主要实现使用lucene中searcher及queryparser实现从索引库中搜索关键词。
package com.easyjf.lucene;

import java.util.list;
import org.apache.lucene.analysis.standard.standardanalyzer;
import org.apache.lucene.document.document;
import org.apache.lucene.index.indexreader;
import org.apache.lucene.queryparser.multifieldqueryparser;
import org.apache.lucene.queryparser.queryparser;
import org.apache.lucene.search.hits;
import org.apache.lucene.search.indexsearcher;
import org.apache.lucene.search.query;
import org.apache.lucene.search.searcher;

import com.easyjf.search.myrssutil;
import com.easyjf.search.searchcontent;
import com.easyjf.web.tools.ipagelist;
import com.easyjf.web.tools.pagelist;

public class myrsssearch {
    private string indexdir;
    indexreader ir;
    searcher search;
    public ipagelist search(string key,int pagesize,int currentpage)
    {
        ipagelist plist=new pagelist(new hitsquery(dosearch(key)));
        plist.dolist(pagesize,currentpage,"","",null);
        if(plist!=null)
        {
            list list=plist.getresult();
            if(list!=null){
            for(int i=0;i            {
                list.set(i,lucene2searchobj((document)list.get(i),key));
            }
            }
        }
        try{
        if(search!=null)search.close();
        if(ir!=null)ir.close();
        }
        catch(exception e)
        {
            e.printstacktrace();
        }
        return plist;
    }
    private searchcontent lucene2searchobj(document doc,string key)
    {
        searchcontent searchobj=new searchcontent();
        string title=doc.getfield("title").stringvalue();
        searchobj.settitle(title.replaceall(key,""+key+""));
        searchobj.settvalue(doc.getfield("cid").stringvalue());
        searchobj.seturl(doc.getfield("url").stringvalue());
        searchobj.setsource(doc.getfield("source").stringvalue());
        searchobj.setlastupdated(doc.getfield("inputtime").stringvalue());
        searchobj.setintro(myrssutil.content2intro(doc.getfield("content").stringvalue(),key));
        return searchobj;
    }
    public hits dosearch(string key)
    {
        hits hits=null;
        try{
        ir=indexreader.open(indexdir);
        search=new indexsearcher(ir);
        string fields[]={"title","content"};
        queryparser parser=new multifieldqueryparser(fields,new standardanalyzer());
        query query=parser.parse(key);
        hits=search.search(query);
        }
        catch(exception e)
        {
            e.printstacktrace();
        }
        //system.out.println("搜索结果:"+hits.length());
        return hits;
    }

    public string getindexdir() {
        return indexdir;
    }
    public void setindexdir(string indexdir) {
        this.indexdir = indexdir;
    }
}

　　在上面的代码中，search方法返回一个封装了分页查询结果的ipagelist，ipagelist是easyjweb tools业务引擎中的分页引擎，对于ipagelist的使用，请看本人写的这篇文章《easyjweb tools中业务引擎分页的设计实现》：

　　我们针对lucene的查询结果hits结构，写了一个查询器hitsquery。代码如下所示：
package com.easyjf.lucene;
import java.util.arraylist;
import java.util.collection;
import java.util.list;
import org.apache.lucene.search.hits;
import com.easyjf.web.tools.iquery;
public class hitsquery implements iquery {
private int begin=0;
private int max=0;
private hits hits;
public hitsquery()
{

}
public hitsquery(hits hits)
{
  if(hits!=null)
  {
   this.hits=hits;
   this.max=hits.length();
  }
}
public int getrows(string arg0) {
  // todo auto-generated method stub
  return (hits==null?0:hits.length());
}
public list getresult(string arg0) {
  // todo auto-generated method stub
  list list=new arraylist();
  for(int i=begin;i<(begin+max)&&(i  {
   try{
   list.add(hits.doc(i));
   }
   catch(exception e)
   {
    e.printstacktrace();
   }
  }
  return list;
}
public void setfirstresult(int begin) {
  // todo auto-generated method stub
  this.begin=begin;
}
public void setmaxresults(int max) {
  // todo auto-generated method stub
  this.max=max;
}
public void setparavalues(collection arg0) {
  // todo auto-generated method stub

}
public list getresult(string condition, int begin, int max) {
  // todo auto-generated method stub
  if((begin>=0)&&(begin  if(!(max>hits.length()))this.max=max;
  return getresult(condition);
}
}

三、web调用
　　下面我们来看看在web中如何调用商业逻辑层的全文检索功能。下面是处理用户请求的action中关于搜索部分的源码：
package com.easyjf.news.action;
public class searchaction implements iwebaction {
public page dosearch(webform form,module module)throws exception
{
string key=commutil.null2string(form.get("v"));
key=urldecoder.decode(urlencoder.encode(key,"iso8859_1"),"utf-8");
form.set("v",key);
form.addresult("v2",urlencoder.encode(key,"utf-8"));
if(key.getbytes().length>2){
string orderby=commutil.null2string(form.get("order"));
int currentpage=commutil.null2int(form.get("page"));
int pagesize=commutil.null2int(form.get("pagesize"));
if(currentpage<1)currentpage=1;
if(pagesize<1)pagesize=15;
searchengine search=new searchengine(key,orderby,pagesize,currentpage);
search.getlucenesearch().setindexdir(globals.app_base_dir+"/web-inf/index");
search.dosearchbylucene();
ipagelist plist=search.getresult();
if(plist!=null && plist.getrowcount()>0){
  form.addresult("list",plist.getresult());
  form.addresult("pages",new integer(plist.getpages()));
  form.addresult("rows",new integer(plist.getrowcount()));
  form.addresult("page",new integer(plist.getcurrentpage()));
  form.addresult("gotopagehtml",commutil.showpagehtml(plist.getcurrentpage(),plist.getpages()));
  }
else
{
  form.addresult("notfound","true");//找不到数据
}
}
else
  form.addresult("errmsg","您输入的关键字太短!");
form.addresult("hotsearch",searchengine.gethotsearc

技术文档欢迎使用技术文档，我们为你提供从新手到专业开发者的所有资源，你也可以通过它日益精进

高级:lucene全文检索应用示例及代码简析

https访问

7*24小时服务

专业一线支持

7天无理由退款

关于我们

产品与服务

常见问题

技术支持

欢迎登录福佳jsp空间

技术文档 欢迎使用技术文档，我们为你提供从新手到专业开发者的所有资源，你也可以通过它日益精进

高级:lucene全文检索应用示例及代码简析

https访问

7*24小时服务

专业一线支持

7天无理由退款

关于我们

产品与服务

常见问题

技术支持

技术文档欢迎使用技术文档，我们为你提供从新手到专业开发者的所有资源，你也可以通过它日益精进