服务热线:13616026886

技术文档 欢迎使用技术文档,我们为你提供从新手到专业开发者的所有资源,你也可以通过它日益精进

位置:首页 > 技术文档 > JAVA > 新手入门 > 基础入门 > 查看文档

如何使用lucene对html文件进行索引


  我修改了 Lucene demo 包中的 IndexHTML 类,使其可以被其他 Java 类调用。
  
  indexhtml类
  
  import java.io.File;
  import java.util.Arrays;
  import java.util.Date;

  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  // HTMLDocument comes from the Lucene demo package, which must also be on the classpath.
  import org.apache.lucene.demo.HTMLDocument;
  import org.apache.lucene.document.Document;
  import org.apache.lucene.index.IndexReader;
  import org.apache.lucene.index.IndexWriter;
  import org.apache.lucene.index.Term;
  import org.apache.lucene.index.TermEnum;
  
  /**
  
  * create html file index for searching
  
  * @author tyrone
  
  *
  
  */public class indexhtml { private string docspath=null;
  
  /**
  
  * the path for index file;
  
  */ private string indexfilepath=null;
  
  /**
  
  * true during deletion pass
  
  */  private boolean deleting = false;
  
  /**
  
  * existing index
  
  */  private indexreader reader;
  
  /**
  
  * new index being built
  
  */  private indexwriter writer;
  
  /**
  
  * document id iterator
  
  */  private termenum uiditer;
  
  private void indexdocs(file file)throws exception {
  
  if (file.isdirectory())
  
  {
  
  // if a directory  string[] files = file.list();
  
  // list its files  arrays.sort(files);
  
  // sort the files  for (int i = 0; i < files.length;
  
  i++)  // recursively index them  this.indexdocs(new file(file, files[i]));
  
  } else if (file.getpath().endswith(".html") || // index .html files  file.getpath().endswith(".htm") || // index .htm files  file.getpath().endswith(".txt")) { // index .txt files   if (this.uiditer != null) {  string uid = htmldocument.uid(file);
  
  // construct uid for doc
  
  while (uiditer.term() != null && uiditer.term().field() == "uid" &&
  
  uiditer.term().text().compareto(uid) <0) {
  
  if (deleting) {
  
  // delete stale docs
  
  system.out.println("deleting " +
  
  htmldocument.uid2url(uiditer.term().text()));
  
  reader.delete(uiditer.term());
  
  }
  
  uiditer.next();
  
  }
  
  if (uiditer.term() != null && uiditer.term().field() == "uid" &&
  
  uiditer.term().text().compareto(uid) == 0) {
  
  uiditer.next();
  
  // keep matching docs
  
  } else if (!deleting) {
  
  // add new docs
  
  document doc = htmldocument.document(file);
  
  system.out.println("adding " + doc.get("url"));
  
  writer.adddocument(doc);
  
  }
  
  } else { // creating a new index
  
  document doc = htmldocument.document(file);
  
  system.out.println("adding " + doc.get("url"));
  
  writer.adddocument(doc);
  
  // add docs unconditionally
  
  }
  
  } return;
  
  }
  
  /**
  
  * walk directory hierarchy in uid order, while keeping uid iterator from
  
  * existing index in sync. mismatches indicate one of:
  
  * (a) old documents to be deleted;
  
  * (b) unchanged documents, to be left alone;
  
  * or (c) new documents, to be indexed.
  
  */  private void indexdocs(file file, string index, boolean create)
  
  throws exception {
  
  if (!create) {
  
  // incrementally update
  
  reader = indexreader.open(index);
  
  // open existing index
  
  uiditer = reader.terms(new term("uid", ""));
  
  // init uid iterator
  
  this.indexdocs(file);
  
  if (deleting) {
  
  // delete rest of stale docs
  
  while (uiditer.term() != null && uiditer.term().field() == "uid") {
  
  system.out.println("deleting " +
  
  htmldocument.uid2url(uiditer.term().text()));
  
  reader.delete(uiditer.term());
  
  uiditer.next();
  
  }
  
  deleting = false;
  
  }
  
  uiditer.close();
  
  // close uid iterator
  
  reader.close();
  
  // close existing index
  
  } else
  
  // don't have exisiting
  
  this.indexdocs(file);
  
  }
  
  /**
  
  * if create=true, create a new index, else refresh old index.
  
  * @param create
  
  */ public void run(boolean create)
  
  {
  
  try {
  
  string index = "index";
  
  file root = null;
  
  if (this.indexfilepath!=null)
  
  {
  
  // index file path
  
  index = this.indexfilepath;
  
  }
  
  if (this.docspath==null){
  
  system.out.println("root directory is not set");
  
  return;
  
  }
  
  root = new file(this.docspath);
  
  date start = new date();
  
  /**
  
  * not create then maintenance
  
  */
  
  if (!create) {
  
  // delete stale docs
  
  this.deleting = true;
  
  this.indexdocs(root, index, create);
  
  }
  
  writer = new indexwriter(index, new standardanalyzer(), create);
  
  writer.maxfieldlength = 1000000;
  
  this.indexdocs(root, index, create);
  
  // add new docs
  
  system.out.println("optimizing index...");
  
  writer.optimize();
  
  writer.close();
  
  date end = new date();
  
  system.out.print(end.gettime() - start.gettime());
  
  system.out.println(" total milliseconds");
  
  } catch (exception e) {
  
  system.out.println(" caught a " + e.getclass() +
  
  "/n with message: " + e.getmessage());
  
  }
  
  return;
  
  }
  
  /**
  
  * @return returns the indexfilepath.
  
  */ public string getindexfilepath() { return indexfilepath;
  
  }
  
  /**
  
  * @param indexfilepath the indexfilepath to set.
  
  */ public void setindexfilepath(string property1) { this.indexfilepath = property1;
  
  }
  
  /**
  
  * @return returns the docspath.
  
  */ public string getdocspath() { return docspath;
  
  }
  
  /**
  
  * @param docspath the docspath to set.
  
  */ public void setdocspath(string property1) { this.docspath = property1;
  
  }
  
  /**
  
  * test
  
  * @param args
  
  */ public static void main(string[] args){ indexhtml ih=new indexhtml();
  
  ih.setdocspath("d://myproject//colimas//clms-doc2//html");
  
  ih.setindexfilepath("d://myproject//colimas//index"); ih.run(true); }}
  
  运行后会生成 3 个文件:_3i8.cfs、deletable 和 segments。
  
  搜索文件类:
  
  /*
  
  * created on 2005/07/28
  
  *
  
  * todo to change the template for this generated file go to
  
  * window - preferences - java - code style - code templates
  
  */package com.nova.colimas.search.query;
  
  /** * @author tyrone * * todo to change the template for this generated type comment go to
  
  * window - preferences - java - code style - code templates
  
  */public class hitshtmldoc { private string title;
  
  priva

扫描关注微信公众号