我修改了 Lucene demo 包中的 IndexHTML 类，使其可以被其他 Java 类调用。
IndexHTML 类：
import java.io.File;
import java.util.Arrays;
import java.util.Date;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
// Other classes from the Lucene demo package are required as well.
import org.apache.lucene.demo.HTMLDocument;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
/**
* create html file index for searching
* @author tyrone
*
*/public class indexhtml { private string docspath=null;
/**
* the path for index file;
*/ private string indexfilepath=null;
/**
* true during deletion pass
*/ private boolean deleting = false;
/**
* existing index
*/ private indexreader reader;
/**
* new index being built
*/ private indexwriter writer;
/**
* document id iterator
*/ private termenum uiditer;
private void indexdocs(file file)throws exception {
if (file.isdirectory())
{
// if a directory string[] files = file.list();
// list its files arrays.sort(files);
// sort the files for (int i = 0; i < files.length;
i++) // recursively index them this.indexdocs(new file(file, files[i]));
} else if (file.getpath().endswith(".html") || // index .html files file.getpath().endswith(".htm") || // index .htm files file.getpath().endswith(".txt")) { // index .txt files if (this.uiditer != null) { string uid = htmldocument.uid(file);
// construct uid for doc
while (uiditer.term() != null && uiditer.term().field() == "uid" &&
uiditer.term().text().compareto(uid) <0) {
if (deleting) {
// delete stale docs
system.out.println("deleting " +
htmldocument.uid2url(uiditer.term().text()));
reader.delete(uiditer.term());
}
uiditer.next();
}
if (uiditer.term() != null && uiditer.term().field() == "uid" &&
uiditer.term().text().compareto(uid) == 0) {
uiditer.next();
// keep matching docs
} else if (!deleting) {
// add new docs
document doc = htmldocument.document(file);
system.out.println("adding " + doc.get("url"));
writer.adddocument(doc);
}
} else { // creating a new index
document doc = htmldocument.document(file);
system.out.println("adding " + doc.get("url"));
writer.adddocument(doc);
// add docs unconditionally
}
} return;
}
/**
* walk directory hierarchy in uid order, while keeping uid iterator from
* existing index in sync. mismatches indicate one of:
* (a) old documents to be deleted;
* (b) unchanged documents, to be left alone;
* or (c) new documents, to be indexed.
*/ private void indexdocs(file file, string index, boolean create)
throws exception {
if (!create) {
// incrementally update
reader = indexreader.open(index);
// open existing index
uiditer = reader.terms(new term("uid", ""));
// init uid iterator
this.indexdocs(file);
if (deleting) {
// delete rest of stale docs
while (uiditer.term() != null && uiditer.term().field() == "uid") {
system.out.println("deleting " +
htmldocument.uid2url(uiditer.term().text()));
reader.delete(uiditer.term());
uiditer.next();
}
deleting = false;
}
uiditer.close();
// close uid iterator
reader.close();
// close existing index
} else
// don't have exisiting
this.indexdocs(file);
}
/**
* if create=true, create a new index, else refresh old index.
* @param create
*/ public void run(boolean create)
{
try {
string index = "index";
file root = null;
if (this.indexfilepath!=null)
{
// index file path
index = this.indexfilepath;
}
if (this.docspath==null){
system.out.println("root directory is not set");
return;
}
root = new file(this.docspath);
date start = new date();
/**
* not create then maintenance
*/
if (!create) {
// delete stale docs
this.deleting = true;
this.indexdocs(root, index, create);
}
writer = new indexwriter(index, new standardanalyzer(), create);
writer.maxfieldlength = 1000000;
this.indexdocs(root, index, create);
// add new docs
system.out.println("optimizing index...");
writer.optimize();
writer.close();
date end = new date();
system.out.print(end.gettime() - start.gettime());
system.out.println(" total milliseconds");
} catch (exception e) {
system.out.println(" caught a " + e.getclass() +
"/n with message: " + e.getmessage());
}
return;
}
/**
* @return returns the indexfilepath.
*/ public string getindexfilepath() { return indexfilepath;
}
/**
* @param indexfilepath the indexfilepath to set.
*/ public void setindexfilepath(string property1) { this.indexfilepath = property1;
}
/**
* @return returns the docspath.
*/ public string getdocspath() { return docspath;
}
/**
* @param docspath the docspath to set.
*/ public void setdocspath(string property1) { this.docspath = property1;
}
/**
* test
* @param args
*/ public static void main(string[] args){ indexhtml ih=new indexhtml();
ih.setdocspath("d://myproject//colimas//clms-doc2//html");
ih.setindexfilepath("d://myproject//colimas//index"); ih.run(true); }}
运行后生成 3 个文件：_3i8.cfs、deletable 和 segments。
搜索文件类:
/*
* created on 2005/07/28
*
* todo to change the template for this generated file go to
* window - preferences - java - code style - code templates
*/package com.nova.colimas.search.query;
/** * @author tyrone * * todo to change the template for this generated type comment go to
* window - preferences - java - code style - code templates
*/public class hitshtmldoc { private string title;
priva
闽公网安备 35060202000074号