服务热线:13616026886

技术文档 欢迎使用技术文档,我们为你提供从新手到专业开发者的所有资源,你也可以通过它日益精进

位置:首页 > 技术文档 > JAVA > 新手入门 > 基础入门 > 查看文档

用java+mysql+php轻松构建跨平台的搜索引擎

  此搜索引擎适于在一个中等规模的局域网中使用,由于找到的网页存在数据库中,不仅可以搜索静态的html页面,还可以搜索php、asp等动态页面。对于一个拥有5万个网页的系统(使用pii-400作为服务器),搜索响应时间在2-10秒左右,完全可以满足要求。由于java、mysql、php都是跨平台的软件,所以此搜索引擎不仅可以工作在windows服务器上,而且也可以工作在linux等其他系统中。

  一、建立搜索引擎需要的数据库和数据表。

  首先建立数据库:

  c:/mysql/bin/> mysqladmin -uroot -pmypassword create spider

  然后建立数据库中的表结构

  c:/mysql/bin/> mysql -uroot -pmypassword spider < spider.mysql

  其中spider.mysql为一个文本文件,其内容如下:

# Crawl frontier: one row per URL the spider has discovered.
#   class        - crawl level: 0 for the seed page; links found while parsing
#                  a level-N page are inserted with level N+1
#   issearchlink - set to 1 once the page has been parsed for links
# The primary key on url already indexes that column, so no separate
# "key url (url)" is declared; the unique key on id backs auto_increment.
create table link (
    id           int(10) unsigned    not null auto_increment,
    url          varchar(120)        not null,
    class        tinyint(3) unsigned not null default 0,
    issearchlink tinyint(3) unsigned not null default 0,
    primary key (url),
    unique key id (id),
    key class (class)
);

  # 本局域网的初始主页地址,搜索蜘蛛从此网址开始搜索所有其他网页

insert into link (id, url, class, issearchlink)
values (1, 'http://102.211.69.1/', 0, 0);

  # 数据表 webpagelocal 用来存放下载的所有的网页

# Raw copy of every downloaded page.
# content is mediumtext because the crawler stores pages of just under
# 200000 bytes, which does not fit in a text column (65,535 bytes).
# The primary key on url already indexes that column.
create table webpagelocal (
    id      int(10) unsigned not null auto_increment,
    url     varchar(120)     not null,
    content mediumtext       not null,
    primary key (url),
    unique key id (id)
);

  # 数据表 webpagefindfast

  # 用makefast.php从表webpagelocal中提取512字节的检索信息存放其中

# Compact search table filled by makefast.php: the page title plus a short
# tag-free excerpt of the body. id is copied from webpagelocal.id.
# The primary key on url already indexes that column.
create table webpagefindfast (
    id      int(10) unsigned not null,
    url     varchar(120)     not null,
    title   varchar(64),
    content blob,
    primary key (url),
    key title (title)
);

  二、以下为搜索网页和下载网页至本地数据库的java程序linktodb.java,它也是此搜索引擎的核心和基础

/***************************** linktodb.java ***********************************
*
* 对url中的http链接进行分析,将相对路径转换为绝对路径,排序方式输出结果到数据库
*
* 如果分析得到的url是link表中唯一的,就将其内容下载到表 webpagelocal 中。
*
********************************************************************************/
import java.io.*;
import java.lang.String;
import java.net.*;
import java.sql.*;
import java.text.*;
import java.util.*;

/** Occurrence tally for a single link; a new tally already stands at one. */
class counter {
    private int tally;

    counter() {
        tally = 1;
    }

    /** Records one more occurrence. */
    void increment() {
        tally += 1;
    }

    /** Returns the number of occurrences recorded so far. */
    int read() {
        return tally;
    }
}

public class linktodb {
string urlhost = "";
string urlfile = "";
string urlpath = "";
static string startwith = null;
boolean outsidetag = true; //判断是否在标记之中
static char[] buffer = new char[4096]; // 缓冲区:用于保存从 url 读的数据
inputstreamreader read = null;
bufferedreader reader = null;
urlconnection uc = null;
private url url = null;
private streamtokenizer st;
private treemap counts = new treemap();//以排序方式保存找到的链接

linktodb(string myurl,string startonly){
try {
startwith = startonly;
if(startonly!=null) { if(!myurl.startswith(startonly)) return; }//只搜索此网站
url = new url(myurl);
urlhost = url.gethost();
urlhost = urlhost.touppercase();
urlfile = url.getfile();
int v=urlfile.lastindexof("/");
if(v!=-1) urlpath = urlfile.substring(0,v);
system.out.println("分析文件:"+myurl);
int uclength=200000;
int ucerror=0;
try{
uc = url.openconnection();
uc.setusecaches(false);
uc.connect();
}
catch(ioexception io) { ucerror=1; system.out.println("打不开待分析网页:"+myu
rl); }
if(ucerror!=1){
uclength = uc.getcontentlength();
if (uclength<200000) {
try{ read = new inputstreamreader(url.openstream()); }
catch(ioexception io) {system.out.println("流打开错误:"+myurl);}
}
else system.out.println("文件太大,不分析");
}
if(read!=null){
reader=new bufferedreader(read);
if(reader!=null){
st = new streamtokenizer(reader);
st.resetsyntax(); // 重置语法表
st.wordchars(0,255); // 令牌范围为全部字符
st.ordinarychar('<'); // html标记两边的分割符
st.ordinarychar('>');
}
}
}
catch(malformedurlexception e){ system.out.println("malformed url string!");}
}
void cleanup() {
try { read.close(); }
catch(ioexception e) { system.out.println("流关闭错误"); }
}
void countwords() {
try {
while(st.nexttoken()!=streamtokenizer.tt_eof) {
string s0="";
string s_nocase="";
switch(st.ttype) {
case '<': //入标记字段
outsidetag=false;
continue; //countwords();
case '>': //出标记字段
outsidetag=true;
continue; //countwords();
case streamtokenizer.tt_eol: s0 = new string("eol"); break;
case streamtokenizer.tt_word: if(!outsidetag) s0 = st.sval; /*已经是字符
串*/ break;
default: s0 = "";// s0 = string.valueof((char)st.ttype);/*单一字符*/
}
if(outsidetag) continue;//出了标记区域(<a >)
string s = "";
s_nocase = s0.trim();
s0=s_nocase.touppercase();
if(s0.startswith("a ")||s0.startswith("area ")||s0.startswith("frame ")||s0.s
tartswith("iframe ")){ //以这些开始的都是超级链接
int href_pos = -1;
if(s0.startswith("frame ")||s0.startswith("iframe ")) {
href_pos = s0.indexof("src=");
s0 = s0.substring(href_pos+4).trim();
s_nocase=s_nocase.substring(href_pos+4).trim();
}
else {
href_pos=s0.indexof("href=");
s0=s0.substring(href_pos+5).trim();
s_nocase=s_nocase.substring(href_pos+5).trim();
}
if(href_pos!=-1) {
if(s0.startswith("/""))
{s0=s0.substring(1);s_nocase=s_nocase.substring(1);}
int quote=s0.indexof("/"");
if(quote!=-1)
{s0=s0.substring(0,quote).trim();s_nocase=s_nocase.substring(0,quote).trim
();}
int space=s0.indexof(" ");
if(space!=-1)
{s0=s0.substring(0,space).trim();s_nocase=s_nocase.substring(0,space).trim
();}
if(s0.endswith("/""))
{s0=s0.substring(0,s0.length()-1);s_nocase=s_nocase.substring(0,s_nocase.l
ength()-1);}
if(s0.indexof("'")!=-1||s0.indexof("javascript:")!=-1||s0.indexof("..")!=-1
)
{s0="";s_nocase="";} //有这些符号,认为非合法链接;两点表示上一目录,而我
只想向下级查找
if ( !s0.startswith("ftp://") &&//以下后缀或前缀通常非网页格式
!s0.startswith("ftp://") &&
!s0.startswith("mailto:") &&
!s0.endswith(".swf") &&
!s0.startswith("../")) //因../表示上一目录,通常只需考虑本级和下n级目录
s=s0;
if (!s.startswith("http://")&&!s.equals("")) {s=urlhost+urlpath+"/"+s;s_no
case=urlhost+urlpath+"/"+s_nocase;}
else if(s.startswith("/")) {s=urlhost+s;s_nocase=urlhost+s_nocase;}
if(s.startswith("http://")) {s=s.substring(7);s_nocase=s_nocase.substring(
7);}
int jinhao=s.indexof("#"); //如果含有"#"号,表示有效的链接是此前的部分
if(jinhao!=-1) {s=s.substring(0,jinhao).trim();s_nocase=s_nocase.substring(
0,jinhao).trim();}
int h=-1; //以下将/./转换为/
for(int m=0;m<4;m++){
h=s.indexof("/./");
if(h!=-1) {s=s.substring(0,h)+s.substring(h+2);s_nocase=s_nocase.substring
(0,h)+s_nocase.substring(h+2);}
}
int twoxg=-1; //以下将//转换为/
for(int m=0;m<5;m++){
twoxg=s.indexof("//");
if(twoxg!=-1) {s=s.substring(0,twoxg)+s.substring(twoxg+1);s_nocase=s_noca
se.substring(0,twoxg)+s_nocase.substring(twoxg+1);}
}
int onexg=s.indexof("/");
if(onexg==-1) {s=s+"/";s_nocase+="/";} //将xx.xx.xx.xxx转换为xx.xx.xx.xxx/的
标准形式
if (!s.startswith("http://")) {s="http://"+s;s_nocase="http://"+s_nocase;}
}
}
if(counts.containskey(s_nocase)) ((counter)counts.get(s_nocase)).increment();
else counts.put(s_nocase,new counter());
}
} catch(ioexception e) {
system.out.println("st.nexttoken() unsuccessful");
}
}

collection values() { return counts.values(); }
set keyset() { return counts.keyset(); }
counter getcounter(string s) { return (counter)counts.get(s); }

public static void main(string[] argv) throws filenotfoundexception {
try{
class.forname("org.gjt.mm.mysql.driver").newinstance();
}
catch (exception e) {
system.out.println("加载jdbc驱动程序失败");
e.printstacktrace();
}
try{
connection conn = drivermanager.getconnection(
"jdbc:mysql://localhost/spider?user=root&password=mypassword");
statement stmt = conn.createstatement();
string myurl;
for(int i=1;i<=6;i++){
string query = "select url from link where issearchlink=0 and class="+(i-1)+"
order by url";
resultset rs = stmt.executequery(query);
while (rs.next()) {
myurl = rs.getstring("url");
string startonly = null;
if(argv.length>0) startonly=argv[0];
linktodb wc = new linktodb(myurl,startonly);
if(wc.reader!=null){
stmt.executeupdate("update link set issearchlink=1 where class="+(i-1)+" a
nd url='"+myurl+"'");
wc.countwords();
iterator keys = wc.keyset().iterator();
while(keys.hasnext()) {
string key = (string)keys.next();
system.out.println("分析找到链接:"+key + ": "+ wc.getcounter(key).read());
int errordb=0;
if(startwith==null||(startwith!=null&&key.startswith(startwith))){
try{//-------------------------- 找到的链接插入数据库link -----------------
--
stmt.executeupdate("insert into link(id,url,class) values(0,'"+key+"',"+
i+")");
}
catch(sqlexception ex){
errordb=1;
system.out.println("插入数据错 sqlexception: " + ex.getmessage())
;
}
if(errordb!=1){ //链接不重复就下载网页到webpagelocal
system.out.println("下载网页:"+key);
int length; // 读的字符数
int filelength=200000;
inputstreamreader read=null;
url rurl=null;
urlconnection urlc=null;
string content="";
try{ rurl = new url(key); }
catch(malformedurlexception mu) {
system.out.println("打开下载网页出错:"+mu.getmessage());
}
if(rurl!=null){
int ucerror=0;
try{ urlc = rurl.openconnection(); urlc.connect();}
catch(ioexception io) { ucerror=1; system.out.println("下载网页打不开:"+ke
y); }
if(ucerror==0){
try{
filelength=urlc.getcontentlength();
if (filelength>=200000) system.out.println("网页太大,我不下载了。"+key);
else read = new inputstreamreader(rurl.openstream());
}
catch(ioexception io) {system.out.println("下载网页打不开:"+key);}
}
// 读入 url 并写入数据库
if(read!=null&&filelength<200000){
try{
while((length = read.read(buffer)) != -1) {
string s = new string(buffer, 0, length);
content=content+s;
}
}
catch(ioexception io) {
content="";
system.out.println("不能读入url文件");
}
try{
statement stmt2 = conn.createstatement(resultset.type_scroll_sensitive, res
ultset.concur_updatable);
resultset uprs = stmt2.executequery("select id,url,content from webpageloca
l where 0");//where 0很重要,否则会耗尽内存
uprs.movetoinsertrow();
uprs.updateint("id",0);
uprs.updatestring("url",key);
uprs.updatestring("content",content);
uprs.insertrow();
uprs.beforefirst();
uprs.close();
stmt2.close();
}
catch(sqlexception ex){
system.out.println("插入数据错:" + ex.getmessage());
}
}//if(read!=null&&filelength<200000)
}//
}
}
//------------------------------- 下载网页 ----------------------------------
}//while(keys.hasnext())
wc.cleanup();
}//if(wc.reader!=null)
}//while rs.next
rs.close(); //关闭记录结果
}//end for
stmt.close(); //关闭语句
conn.close(); //关闭连接
}//try
catch(sqlexception ex){ system.out.println("sql异常:" + ex.getmessage()); }
}//main()函数结束
}//类 linktodb 结束

  三、编译和运行此java程序

d:/spider/> set classpath=d:/j/mm.mysql.jdbc2;.
d:/spider/> d:/j/bin/javac linktodb.java
d:/spider/> d:/j/bin/java linktodb

  其中第一行命令是设置mysql的jdbc驱动程序路径。


  四、由于网页中含有大量无用的格式信息,直接用它来搜索要浪费大量的时间,所以需要去掉其中的html格式控制信息,并将太长的网页截短,然后将整理后的用于搜索的信息存到另一个数据表中。由于php4中有一个很方便的函数strip_tags可以去掉其中的html格式标记,所以我们用php来整理。

  makefast.php的内容如下:

<?php
// makefast.php - fills webpagefindfast from webpagelocal.
// For every stored page with n1 < id < n2 it extracts the <title> text and a
// tag-free excerpt of the body and stores both for fast searching.
// Usage: makefast.php?n1=1&n2=10000

// Read the id range from the query string; do not rely on register_globals.
$n1 = isset($_GET["n1"]) ? (int) $_GET["n1"] : 0;
$n2 = isset($_GET["n2"]) ? (int) $_GET["n2"] : PHP_INT_MAX;

try {
    $db = new PDO("mysql:host=localhost;dbname=spider", "root", "mypassword");
    $db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);

    $select = $db->prepare("select id,url,content from webpagelocal where id>? and id<?");
    $select->execute(array($n1, $n2));
    // "replace" keeps the script re-runnable over an id range it has already processed.
    $insert = $db->prepare("replace into webpagefindfast(id,url,title,content) values(?,?,?,?)");

    while ($mt = $select->fetch(PDO::FETCH_NUM)) {
        $page = $mt[2];
        // strpos-style results must be compared with false: offset 0 is a valid hit.
        $postitlel = stripos($page, "<title>");
        $postitler = stripos($page, "</title>");
        $posbody   = stripos($page, "<body");
        $posheadr  = stripos($page, "</head>");

        $title = "";
        if ($postitlel !== false && $postitler !== false && $postitler > $postitlel) {
            $title = substr($page, $postitlel + 7, $postitler - $postitlel - 7);
        }
        $title = substr($title, 0, 64); // the title column is varchar(64)

        // Take the body from the first landmark that is present.
        if ($posbody !== false)        $body = substr($page, $posbody);
        else if ($posheadr !== false)  $body = substr($page, $posheadr + 7);
        else if ($postitler !== false) $body = substr($page, $postitler + 8);
        else if ($postitlel !== false) $body = substr($page, $postitlel);
        else                           $body = $page;

        $bodytext    = strip_tags($body);
        $bodynospace = str_replace(" ", "", $bodytext);
        // Quotes no longer have to be stripped: values are bound, not concatenated.
        $body512 = substr($bodynospace, 0, 511) . " ";

        $insert->execute(array($mt[0], $mt[1], $title, $body512));
        echo $mt[0] . " ";
    }
} catch (PDOException $e) {
    // Do not echo the exception: its trace can contain the connection password.
    die("makefast: database error");
}
?>

  使用方式:

  在浏览器中输入http://mywebsite/spider/makefast.php?n1=1&n2=10000
  五、以上是建立搜索引擎所用到的数据,下面编制用于用户搜索的网页和php脚本文件。首先是用于搜索的表单页面searchform.htm,其内容如下。

<html>
<head>
<title>红蜘蛛搜索引擎-v0.1</title>
<meta http-equiv="content-type" content="text/html; charset=gb2312">
<link rel="stylesheet" href="../all.css" type="text/css">
</head>

<!-- Search form: posts keywords, searchin (content|title) and speed (fast|slow) to search.php -->
<body bgcolor="#eeffee" text="#000000">
<table width="600" border="0" cellspacing="2" cellpadding="2" align="center" bgcolor="#99cc00">
  <tr>
    <td>
      <div align="center"><font color="#ff0000"><b><span class="pt16">红蜘蛛搜索引擎</span>
        <span class="pt12">v0.1</span></b></font></div>
    </td>
  </tr>
</table>
<form name="form1" method="post" action="search.php">
  <table width="600" border="1" cellspacing="1" cellpadding="1" align="center" bordercolor="#99cc00">
    <tr>
      <td>
        <div align="left"><span class="pt15"><font color="#ff0000"><b>关 键 字</b>:</font></span>
          <input type="text" name="keywords" size="40" maxlength="40">
        </div>
      </td>
      <td>
        <div align="left"><span class="pt15"><font color="#ff0000"><b>查找范围</b>:</font></span>
          <select name="searchin">
            <option value="content" selected>网页正文</option>
            <option value="title">网页标题</option>
          </select>
        </div>
      </td>
    </tr>
    <tr>
      <td colspan="2"><span class="pt15"><font color="#ff0000"><b>查找方式</b>:</font></span>
        <select name="speed">
          <option value="fast" selected>快速查找</option>
          <option value="slow">更深查找</option>
        </select>
      </td>
    </tr>
    <tr>
      <td colspan="2">
        <div align="left">
          <input type="submit" name="submit" value="搜索">
        </div>
      </td>
    </tr>
  </table>
</form>
<table width="600" border="0" cellspacing="2" cellpadding="2" align="center">
  <tr>
    <td height="18">
      <p class="pt12"><font color="#ff0000"><b>使用方法</b></font>:仅需在关键字一栏输入查询内容并按回车键(enter)即可。</p>
      <p align="left" class="pt12">如果希望输入多个条件,只需要用空格分隔即可:</p>
      <p class="pt12">例如:要查询同时包含“西昌”和“卫星”的网页,只需输入[西昌 卫星]。</p>
      <p class="pt12">又如:要查询只包含关键字“西昌”而不包含“卫星”的网页,只需要输入[西昌 -卫星]。注意中间的空格不能少。</p>
      <p class="pt12"><font color="#ff0000"><b>查找范围</b></font>:你可以选择从“网页标题”中查找或者从“网页正文”中查找。</p>
      <p class="pt12"><font color="#ff0000"><b>查找方式</b></font>:“快速查找”速度快但找到的网页数可能较少,因为:</p>
      <p class="pt12">“快速查找”只搜索网页正文的前512个字符。</p>
      <p class="pt12">“更深查找”搜索网页正文的前2048个字符。</p>
      <div align="right"><a href="mailto:zdyhlp@263.net"><font color="#ff0000" class="pt13"><b>欢迎提出宝贵意见</b></font></a></div>
    </td>
  </tr>
</table>
</body>
</html>

  search.php根据用户输入的条件完成搜索,并显示找到的网页的链接地址、标题和提要信息。内容如下:

<?php
// search.php - runs a keyword search over the spider tables and lists up to
// 20 hits per page with title, summary and a link to the local mirror.
//
// Request fields: keywords (blank-separated, "-word" excludes a word),
// searchin (content|title), speed (fast|slow), curpos (first id of the page).

/** Returns the named request field as a trimmed string, "" when absent. */
function spider_param($name) {
    return (isset($_REQUEST[$name]) && is_string($_REQUEST[$name])) ? trim($_REQUEST[$name]) : "";
}

/**
 * Escapes text for use in HTML. The five ASCII characters are replaced
 * byte-wise, so the result does not depend on the page encoding and does not
 * fail on a multi-byte character that was cut in half.
 */
function spider_h($text) {
    return str_replace(
        array("&", "<", ">", "\"", "'"),
        array("&amp;", "&lt;", "&gt;", "&quot;", "&#039;"),
        (string) $text);
}

// Read every field explicitly (no register_globals) and map the two selector
// fields onto fixed values, because they end up in column and table names.
$keywords = spider_param("keywords");
$searchin = (spider_param("searchin") == "title") ? "title" : "content";
$speed    = (spider_param("speed") == "slow") ? "slow" : "fast";
$curpos   = (int) spider_param("curpos");
// NOTE(review): the published listing replaces " " by " ", a no-op; presumably
// the first argument was a full-width space - confirm, and save this file in
// the same encoding as the form page.
$keywords = trim(str_replace("　", " ", $keywords));
?>
<title>红蜘蛛正在搜索关键词为[<?php echo spider_h($keywords); ?>]的网页</title>
<link rel="stylesheet" href="../all.css" type="text/css">
<body bgcolor="#eeffee">
<table width="96%" border="0" cellspacing="2" cellpadding="2" align="center" bgcolor="#99cc00">
  <tr>
    <td>
      <div align="center"><font color="#ff0000"><b><span class="pt16">红蜘蛛搜索引擎</span>
        <span class="pt12">v0.1</span></b></font></div>
    </td>
  </tr>
</table>
<form name="form1" method="post" action="search.php">
  <table width="96%" border="1" cellspacing="1" cellpadding="1" align="center" bordercolor="#99cc00">
    <tr>
      <td valign="top"> <font color="#ff0000"><b><span class="pt13">关键字</span></b><span class="pt13">:</span></font>
        <input type="text" name="keywords" value="<?php echo spider_h($keywords); ?>" size="30" maxlength="30">
        <input type="submit" name="submit" value="重新搜索">
      </td>
      <td valign="top"><font color="#ff0000"><b><span class="pt13">查找范围</span></b><span class="pt13">:</span></font>
        <select name="searchin">
          <option value="content" <?php if ($searchin == "content") echo "selected"; ?>>网页正文</option>
          <option value="title" <?php if ($searchin == "title") echo "selected"; ?>>网页标题</option>
        </select>
      </td>
      <td valign="top">
        <div align="left"><font color="#ff0000"><b><span class="pt13">查找方式</span></b><span class="pt13">:</span></font>
          <select name="speed">
            <option value="fast" <?php if ($speed == "fast") echo "selected"; ?>>快速查找</option>
            <option value="slow" <?php if ($speed == "slow") echo "selected"; ?>>更深查找</option>
          </select>
        </div>
      </td>
    </tr>
  </table>
</form>
<?php
if ($keywords == "") { echo "关键字不能为空"; exit(); }

// Title searches always use the fast table; content searches use
// webpagefindfast or webpagefindslow.
// NOTE(review): no "create table webpagefindslow" appears in the schema that
// accompanies this script - confirm the table exists before offering "slow".
$table = ($searchin == "title") ? "webpagefindfast" : "webpagefind" . $speed;

$conditions = array();  // one "col like ?" / "col not like ?" per keyword
$params     = array();  // bound values, in the same order
$words      = array();  // positive keywords, used for highlighting
foreach (preg_split('/ +/', $keywords, -1, PREG_SPLIT_NO_EMPTY) as $key) {
    $negated = (substr($key, 0, 1) == "-");
    $term = $negated ? substr($key, 1) : $key;
    if ($term == "") continue;
    $conditions[] = $searchin . ($negated ? " not like ?" : " like ?");
    // Escape LIKE wildcards so that % and _ in a keyword match literally.
    $params[] = "%" . addcslashes($term, "%_\\") . "%";
    if (!$negated) $words[] = $term;
}
if (count($conditions) == 0) { echo "关键字不能为空"; exit(); }
if ($curpos > 0) $conditions[] = "id>=" . $curpos; // $curpos is an int, safe to inline

// "order by id" makes paging by id deterministic.
$sql = "select id,url,title,content from " . $table
     . " where " . implode(" and ", $conditions)
     . " order by id limit 100";
try {
    $db = new PDO("mysql:host=localhost;dbname=spider", "root", "mypassword");
    $db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
    $stmt = $db->prepare($sql);
    $stmt->execute($params);
    $rows = $stmt->fetchAll(PDO::FETCH_NUM);
} catch (PDOException $e) {
    // Do not echo the exception: its trace can contain the connection password.
    echo "查询失败";
    exit();
}
$rowcount = count($rows);
$findcount = 0;
$pos = 0;
?>
<table border=0 align=center width="96%">
  <tr>
    <th nowrap width="41%">
      <div align="left" class="pt12">共找到关键字为 <font color=red>
        <?php echo spider_h($keywords); ?>
        </font> 的网页共 <font color=red>
        <?php echo $rowcount; ?>
        </font> 个</div>
    </th>
    <td nowrap>
    </td>
  </tr>
  <tr bgcolor="#ff0000">
    <th nowrap colspan="2" height="3"></th>
  </tr>
<?php foreach ($rows as $row) {
    // $pos is set before the limit check on purpose: after the loop it holds
    // the id of the 21st hit, i.e. the first row of the next page.
    $pos = $row[0];
    $findcount++;
    if ($findcount > 20) break; ?>
  <tr>
    <td nowrap colspan="2">
      <?php echo $findcount; ?>
      <a href="<?php echo spider_h($row[1]); ?>" target="_blank">
      <?php if ($row[2] != "") echo spider_h($row[2]);
            else echo spider_h(substr($row[3], 0, 64));
      ?>
      </a></td>
  </tr>
  <tr>
    <td colspan="2"><span class="pt13">摘要:</span>
<?php
    $text = (string) $row[3];
    if ($searchin == "title" || $speed == "fast" || strlen($text) < 1024) {
        $zhaiyao = ($searchin == "title") ? substr($text, 0, 1024) : $text;
    } else {
        // Long text: show 1024 bytes around the first positive keyword.
        $posword1 = (count($words) > 0) ? strpos($text, $words[0]) : false;
        if ($posword1 === false || $posword1 - 512 < 0) {
            $zhaiyao = substr($text, 0, 1024);
        } else {
            // Start at an ASCII byte so a double-byte character is not cut in half.
            $cutpos = 0;
            for ($i = 24; $i < 500; $i++) {
                if (ord(substr($text, $posword1 - $i, 1)) < 128) { $cutpos = $i; break; }
            }
            $zhaiyao = substr($text, $posword1 - $cutpos, 1024);
        }
    }
    $zhaiyao = spider_h($zhaiyao);
    if ($searchin != "title") {
        // Highlight on the escaped text so the markup added here stays intact.
        for ($i = 0; $i < count($words); $i++) {
            $mark = spider_h($words[$i]);
            $zhaiyao = str_replace($mark, "<font color=red>" . $mark . "</font>", $zhaiyao);
        }
    }
    echo $zhaiyao;
?>
    </td>
  </tr>
  <tr>
    <td colspan="2" align="right"><a href="one.php?num=<?php echo (int) $row[0]; ?>" target="_blank">
      <font color="#0033ff" class="pt12">本地镜像</font></a></td>
  </tr>
  <tr bgcolor="#999933">
    <td nowrap colspan="2" height="1"></td>
  </tr>
<?php } ?>
<?php if ($rowcount > 20) { ?>
  <tr>
    <td align="right" colspan="2" height="10">
      <form name="form2" method="post" action="search.php">
        <input type="hidden" name="keywords" value="<?php echo spider_h($keywords); ?>">
        <input type="hidden" name="searchin" value="<?php echo $searchin; ?>">
        <input type="hidden" name="speed" value="<?php echo $speed; ?>">
        <input type="hidden" name="curpos" value="<?php echo (int) $pos; ?>">
        <input type="submit" name="submit" value="下20个网页">
      </form>
    </td>
  </tr>
<?php } ?>
</table>

  one.php用于从本地镜像中显示一个找到的网页。由于网页的原始信息已经在webpagelocal中存储,所以
只需简单的读出,发给用户。

<?php
// one.php - sends the locally mirrored copy of one crawled page.
// Usage: one.php?num=<id of the row in webpagelocal>
//
// NOTE(review): the stored page is echoed as-is, scripts included. Serve the
// mirror from a separate origin if the crawled sites are not trusted.

// Read the id explicitly (no register_globals) and force it to an integer.
$num = isset($_GET["num"]) ? (int) $_GET["num"] : 0;
if ($num <= 0) exit();

try {
    $db = new PDO("mysql:host=localhost;dbname=spider", "root", "mypassword");
    $db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
    $stmt = $db->prepare("select url,content from webpagelocal where id=?");
    $stmt->execute(array($num));
    $mt = $stmt->fetch(PDO::FETCH_NUM);
} catch (PDOException $e) {
    // Do not echo the exception: its trace can contain the connection password.
    exit();
}
if ($mt) echo $mt[1]; // nothing is sent when the id is unknown
?>

扫描关注微信公众号