package com.search.crawler;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.Date;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Small Lucene demo: builds an index over the files below a directory and
 * runs a query against that index, printing the matching file paths.
 *
 * <p>Each indexed document carries two fields: {@code path} (stored and
 * analyzed) and {@code content} (tokenized from the file's text).
 */
public class IndexProcess {

    /** Directory where the generated index is stored. */
    private static final String INDEX_PATH = "src/indexFiles";

    /**
     * Creates a fresh index in {@link #INDEX_PATH} from every readable file
     * found under {@code docsPath}; the writer is opened in "create" mode.
     *
     * <p>NOTE: if {@code docsPath} does not exist or is not readable this
     * method prints a message and terminates the JVM with exit status 1.
     * I/O failures during indexing are printed to stderr and not rethrown.
     *
     * @param docsPath file or directory whose contents should be indexed
     */
    public static void createIndex(String docsPath) {
        File docDir = new File(docsPath);
        if (!docDir.exists() || !docDir.canRead()) {
            System.out.println("Document directory '"
                    + docDir.getAbsolutePath()
                    + "' does not exist or is not readable, please check the path");
            System.exit(1);
        }
        System.out.println("Indexing to directory '" + INDEX_PATH + "'...");

        Directory dir = null;
        IndexWriter writer = null;
        try {
            dir = FSDirectory.open(new File(INDEX_PATH));
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
            writer = new IndexWriter(dir, analyzer, true, new IndexWriter.MaxFieldLength(25000));
            indexDocs(writer, docDir);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close on every path so a failure while indexing does not leave
            // the writer and the directory open.
            closeWriter(writer);
            closeDirectory(dir);
        }
    }

    /**
     * Adds {@code file} to the index; directories are walked recursively.
     * Unreadable entries and files that cannot be opened are skipped.
     *
     * @param writer open index writer that receives the documents
     * @param file   file or directory to index
     * @throws IOException if reading a file or writing to the index fails
     */
    static void indexDocs(IndexWriter writer, File file) throws IOException {
        // do not try to index files that cannot be read
        if (!file.canRead()) {
            return;
        }
        if (file.isDirectory()) {
            File[] children = file.listFiles();
            // listFiles() returns null when the directory cannot be listed
            if (children != null) {
                for (File child : children) {
                    indexDocs(writer, child);
                }
            }
            return;
        }

        FileReader reader;
        try {
            reader = new FileReader(file);
        } catch (FileNotFoundException fnfe) {
            // canRead() passed but the open still failed; report the file
            // and carry on with the remaining ones.
            fnfe.printStackTrace();
            return;
        }
        try {
            // make a new, empty document
            Document doc = new Document();
            // index the path and store it so search results can print it
            doc.add(new Field("path", file.getPath(), Field.Store.YES, Field.Index.ANALYZED));
            // index the content; it is tokenized from the reader, not stored
            doc.add(new Field("content", reader));
            writer.addDocument(doc);
        } finally {
            reader.close();
        }
    }

    /**
     * Searches the index in {@link #INDEX_PATH} and prints up to 1000 hits
     * (score information plus the stored {@code path}), followed by the
     * elapsed time in milliseconds.
     *
     * <p>Query syntax errors and I/O failures are printed to stderr; in both
     * cases no hits are printed.
     *
     * @param key   default field the query is parsed against
     * @param value query string in Lucene query-parser syntax
     */
    static void search(String key, String value) {
        long startMillis = System.currentTimeMillis();
        Directory dir = null;
        IndexSearcher searcher = null;
        try {
            dir = FSDirectory.open(new File(INDEX_PATH));
            searcher = new IndexSearcher(dir, true);
            QueryParser parser =
                    new QueryParser(Version.LUCENE_31, key, new StandardAnalyzer(Version.LUCENE_31));
            Query query = parser.parse(value);
            TopDocs topDocs = searcher.search(query, null, 1000);
            for (ScoreDoc hit : topDocs.scoreDocs) {
                System.out.println(hit + "\t" + searcher.doc(hit.doc).get("path"));
            }
        } catch (ParseException e) {
            // An unparsable query must not reach searcher.search() as null.
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeSearcher(searcher);
            closeDirectory(dir);
        }
        long elapsedMillis = System.currentTimeMillis() - startMillis;
        System.out.println("总共花了" + elapsedMillis + "毫秒时间");
    }

    /** Closes the writer if it was opened, reporting (not rethrowing) failures. */
    private static void closeWriter(IndexWriter writer) {
        if (writer == null) {
            return;
        }
        try {
            writer.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Closes the searcher if it was opened, reporting (not rethrowing) failures. */
    private static void closeSearcher(IndexSearcher searcher) {
        if (searcher == null) {
            return;
        }
        try {
            searcher.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Closes the directory if it was opened, reporting (not rethrowing) failures. */
    private static void closeDirectory(Directory dir) {
        if (dir == null) {
            return;
        }
        try {
            dir.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Indexes {@code src/index} and then searches the {@code content} field
     * for the term {@code revision}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        IndexProcess.createIndex("src/index");
        System.out.println("search starting :");
        IndexProcess.search("content", "revision");
    }
}
分享到:
相关推荐
Lucene3.1使用教程 随着Lucene开发的推进,Lucene3.1推出了,但是目前Lucene3.1的使用文档较少,特收集了《Lucene3.1使用教程》 值得关注的内容有: 1. 性能提升 2. ReusableAnalyzerBase使得更容易让 ...
Lucene3.4开发入门.pdf
C# Lucene.Net .Analysis.Cn.dll和Lucene.Net.dll两个文件
搜索引擎核心技术与实现(基于Lucene和Solr).pdf
本压缩包的主要内容是Lucene分词器的demo版本,可以导入到程序中直接使用,包含Lucene分词使用的pom文件,使用前请注意修改存储地址。
lucene2.9.1-src.tar.gz源码及完整Demo Lucene是一套用于全文检索和搜寻的开源程式库,由Apache软件基金会支持和提供。Lucene提供了一个简单却强大的应用程式接口,能够做全文索引和搜寻,在Java开发环境里Lucene...
Lucene.Net.DemoLib.dll
基于lucene的搜索引擎,相关论文,欢迎下载
lucene3.0-highlighter.jar lucene3.0的高亮jar包,从lucene3.0源码中导出来的
Lucene3.1 jar Lucene3.1 jarLucene3.1 jarLucene3.1 jarLucene3.1 jar
基于lucene的开发JavaEE项目 基于lucene的开发JavaEE项目 基于lucene的开发JavaEE项目 基于lucene的开发JavaEE项目 基于lucene的开发JavaEE项目
基于lucene的web工程,很好,很实用的一部分
Lucene开发详解.pdf,详细介绍Lucene的开发
Lucene.Net.NLS.rar Lucene.net的中文分词器组件 源码
Lucene.Net中文分词组件 Lucene.Net.Analysis.Cn
lucene-demo-3.3.0.jar lucene-grouping-3.3.0.jar lucene-highlighter-3.3.0.jar lucene-icu-3.3.0.jar lucene-instantiated-3.3.0.jar lucene-memory-3.3.0.jar lucene-misc-3.3.0.jar lucene-queries-3.3.0.jar ...
Lucene 全文检索实践.pdf(清晰版本)
基于lucene 的简单搜索引擎.rar
Lucene3.0分词系统.doc
主要是做站内搜索,即对一个系统内的资源进行搜索。如BBS、BLOG中的文章搜索,网上商店中的商品搜索等