如何使用 Lucene 对 HTML 文件进行索引

人气：1279 2007-11-17

　　我修改了 Lucene demo 包中的 IndexHTML 类，使其可以被其他 Java 类调用。
　　
　　IndexHTML 类
　　
  import java.io.File;
  import java.util.Arrays;
  import java.util.Date;

  // The Lucene demo classes (HTMLDocument) are required as well.
  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  import org.apache.lucene.demo.HTMLDocument;
  import org.apache.lucene.document.Document;
  import org.apache.lucene.index.IndexReader;
  import org.apache.lucene.index.IndexWriter;
  import org.apache.lucene.index.Term;
  import org.apache.lucene.index.TermEnum;
　　
　　/**
　　
　　* create html file index for searching
　　
　　* @author tyrone
　　
　　*
　　
　　*/public class indexhtml { private string docspath=null;
　　
　　/**
　　
　　* the path for index file;
　　
　　*/ private string indexfilepath=null;
　　
　　/**
　　
　　* true during deletion pass
　　
　　*/　 private boolean deleting = false;
　　
　　/**
　　
　　* existing index
　　
　　*/　 private indexreader reader;
　　
　　/**
　　
　　* new index being built
　　
　　*/　 private indexwriter writer;
　　
　　/**
　　
　　* document id iterator
　　
　　*/　 private termenum uiditer;
　　
　　private void indexdocs(file file)throws exception {
　　
　　if (file.isdirectory())
　　
　　{
　　
　　// if a directory　 string[] files = file.list();
　　
　　// list its files　 arrays.sort(files);
　　
　　// sort the files　 for (int i = 0; i < files.length;
　　
　　i++)　 // recursively index them　　this.indexdocs(new file(file, files[i]));
　　
　　} else if (file.getpath().endswith(".html") || // index .html files　　file.getpath().endswith(".htm") || // index .htm files　　file.getpath().endswith(".txt")) { // index .txt files　　　if (this.uiditer != null) {　　string uid = htmldocument.uid(file);
　　
　　// construct uid for doc
　　
　　while (uiditer.term() != null && uiditer.term().field() == "uid" &&
　　
　　uiditer.term().text().compareto(uid) <0) {
　　
　　if (deleting) {
　　
　　// delete stale docs
　　
　　system.out.println("deleting " +
　　
　　htmldocument.uid2url(uiditer.term().text()));
　　
　　reader.delete(uiditer.term());
　　
　　}
　　
　　uiditer.next();
　　
　　}
　　
　　if (uiditer.term() != null && uiditer.term().field() == "uid" &&
　　
　　uiditer.term().text().compareto(uid) == 0) {
　　
　　uiditer.next();
　　
　　// keep matching docs
　　
　　} else if (!deleting) {
　　
　　// add new docs
　　
　　document doc = htmldocument.document(file);
　　
　　system.out.println("adding " + doc.get("url"));
　　
　　writer.adddocument(doc);
　　
　　}
　　
　　} else { // creating a new index
　　
　　document doc = htmldocument.document(file);
　　
　　system.out.println("adding " + doc.get("url"));
　　
　　writer.adddocument(doc);
　　
　　// add docs unconditionally
　　
　　}
　　
　　}　return;
　　
　　}
　　
　　/**
　　
　　* walk directory hierarchy in uid order, while keeping uid iterator from
　　
　　* existing index in sync.　mismatches indicate one of:
　　
　　* (a) old documents to be deleted;
　　
　　* (b) unchanged documents, to be left alone;
　　
　　* or (c) new documents, to be indexed.
　　
　　*/　 private void indexdocs(file file, string index, boolean create)
　　
　　throws exception {
　　
　　if (!create) {
　　
　　// incrementally update
　　
　　reader = indexreader.open(index);
　　
　　// open existing index
　　
　　uiditer = reader.terms(new term("uid", ""));
　　
　　// init uid iterator
　　
　　this.indexdocs(file);
　　
　　if (deleting) {
　　
　　// delete rest of stale docs
　　
　　while (uiditer.term() != null && uiditer.term().field() == "uid") {
　　
　　system.out.println("deleting " +
　　
　　htmldocument.uid2url(uiditer.term().text()));
　　
　　reader.delete(uiditer.term());
　　
　　uiditer.next();
　　
　　}
　　
　　deleting = false;
　　
　　}
　　
　　uiditer.close();
　　
　　// close uid iterator
　　
　　reader.close();
　　
　　// close existing index
　　
　　} else
　　
　　// don't have exisiting
　　
　　this.indexdocs(file);
　　
　　}
　　
　　/**
　　
　　* if create=true, create a new index, else refresh old index.
　　
　　* @param create
　　
　　*/ public void run(boolean create)
　　
　　{
　　
　　try {
　　
　　string index = "index";
　　
　　file root = null;
　　
　　if (this.indexfilepath!=null)
　　
　　{
　　
　　// index file path
　　
　　index = this.indexfilepath;
　　
　　}
　　
　　if (this.docspath==null){
　　
　　system.out.println("root directory is not set");
　　
　　return;
　　
　　}
　　
　　root = new file(this.docspath);
　　
　　date start = new date();
　　
　　/**
　　
　　* not create then maintenance
　　
　　*/
　　
　　if (!create) {
　　
　　// delete stale docs
　　
　　this.deleting = true;
　　
　　this.indexdocs(root, index, create);
　　
　　}
　　
　　writer = new indexwriter(index, new standardanalyzer(), create);
　　
　　writer.maxfieldlength = 1000000;
　　
　　this.indexdocs(root, index, create);
　　
　　// add new docs
　　
　　system.out.println("optimizing index...");
　　
　　writer.optimize();
　　
　　writer.close();
　　
　　date end = new date();
　　
　　system.out.print(end.gettime() - start.gettime());
　　
　　system.out.println(" total milliseconds");
　　
　　} catch (exception e) {
　　
　　system.out.println(" caught a " + e.getclass() +
　　
　　"/n with message: " + e.getmessage());
　　
　　}
　　
　　return;
　　
　　}
　　
　　/**
　　
　　* @return returns the indexfilepath.
　　
　　*/ public string getindexfilepath() {　return indexfilepath;
　　
　　}
　　
　　/**
　　
　　* @param indexfilepath the indexfilepath to set.
　　
　　*/ public void setindexfilepath(string property1) {　this.indexfilepath = property1;
　　
　　}
　　
　　/**
　　
　　* @return returns the docspath.
　　
　　*/ public string getdocspath() {　return docspath;
　　
　　}
　　
　　/**
　　
　　* @param docspath the docspath to set.
　　
　　*/ public void setdocspath(string property1) {　this.docspath = property1;
　　
　　}
　　
　　/**
　　
　　* test
　　
　　* @param args
　　
　　*/ public static void main(string[] args){　indexhtml ih=new indexhtml();
　　
　　ih.setdocspath("d://myproject//colimas//clms-doc2//html");
　　
　　ih.setindexfilepath("d://myproject//colimas//index");　ih.run(true); }}
　　
　　运行后生成 3 个文件：_3i8.cfs、deletable 和 segments。
　　
　　搜索文件类：
　　
　　/*
　　
　　* created on 2005/07/28
　　
　　*
　　
　　* todo to change the template for this generated file go to
　　
　　* window - preferences - java - code style - code templates
　　
　　*/package com.nova.colimas.search.query;
　　
　　/** * @author tyrone * * todo to change the template for this generated type comment go to
　　
　　* window - preferences - java - code style - code templates
　　
　　*/public class hitshtmldoc {　private string title;
　　
　　priva

技术文档欢迎使用技术文档，我们为你提供从新手到专业开发者的所有资源，你也可以通过它日益精进

如何使用lucene对html文件进行索引

https访问

7*24小时服务

专业一线支持

7天无理由退款

关于我们

产品与服务

常见问题

技术支持

欢迎登录福佳jsp空间

技术文档 欢迎使用技术文档，我们为你提供从新手到专业开发者的所有资源，你也可以通过它日益精进

如何使用lucene对html文件进行索引

https访问

7*24小时服务

专业一线支持

7天无理由退款

关于我们

产品与服务

常见问题

技术支持

技术文档欢迎使用技术文档，我们为你提供从新手到专业开发者的所有资源，你也可以通过它日益精进