TextFileIndexer.java
Here's a simple indexer which indexes text and HTML files on your file system, written by Vladimir Polony. Available for download here: TextFileIndexer.java

package com.lucenetutorial.apps;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.LockObtainFailedException;

/**
 * This terminal application creates an Apache Lucene index in a folder and adds files into this
 * index based on the input of the user.
 *
 * <p>Only files whose names end in {@code .htm}, {@code .html}, {@code .xml} or {@code .txt}
 * (case-insensitive) are indexed; everything else is reported as skipped.
 */
public class TextFileIndexer {

    /** Writer for the index being built; opened by the constructor, closed by closeIndex(). */
    private final IndexWriter writer;

    /** Files collected by listFiles() that are waiting to be added by indexFileOrDirectory(). */
    private final List<File> queue = new ArrayList<File>();

    /**
     * Asks for an index folder, then repeatedly asks for files or folders to add until the user
     * enters {@code q} or standard input ends. The index is always closed before returning, even
     * when reading input fails.
     *
     * @param args ignored
     * @throws IOException if reading standard input or closing the index fails
     */
    public static void main(String[] args) throws IOException {
        System.out.println("Enter the path where the index will be created: ");

        BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
        String indexPath = br.readLine();
        if (indexPath == null) {
            // Standard input ended before a path was given; there is nothing to index into.
            System.out.println("Cannot create index...no index path given");
            System.exit(-1);
        }

        TextFileIndexer indexer = null;
        try {
            indexer = new TextFileIndexer(indexPath);
        } catch (Exception ex) {
            System.out.println("Cannot create index..." + ex.getMessage());
            System.exit(-1);
        }

        try {
            //===================================================
            // read input from user until he enters q for quit
            // (or standard input is exhausted)
            //===================================================
            while (true) {
                System.out.println("Enter the file or folder name to add into the index (q=quit):");
                System.out.println("[Acceptable file types: .xml, .htm, .html, .txt]");

                String input = br.readLine();
                // readLine() returns null at end of input; treat that like an explicit quit
                // instead of dereferencing null.
                if (input == null || input.equalsIgnoreCase("q")) {
                    break;
                }

                try {
                    // try to add file into the index
                    indexer.indexFileOrDirectory(input);
                } catch (Exception e) {
                    // One bad path must not end the session; report it and prompt again.
                    System.out.println("Error indexing " + input + " : " + e.getMessage());
                }
            }
        } finally {
            //===================================================
            // after adding, we always have to call the
            // closeIndex, otherwise the index is not created
            //===================================================
            indexer.closeIndex();
        }
    }

    /**
     * Constructor
     *
     * @param indexDir the name of the folder in which the index should be created
     * @throws CorruptIndexException propagated from the IndexWriter constructor
     * @throws LockObtainFailedException propagated from the IndexWriter constructor
     * @throws IOException propagated from the IndexWriter constructor
     */
    TextFileIndexer(String indexDir)
            throws CorruptIndexException, LockObtainFailedException, IOException {
        // the boolean true parameter means to create a new index everytime,
        // potentially overwriting any existing files there.
        writer = new IndexWriter(indexDir, new StandardAnalyzer(), true);
    }

    /**
     * Indexes a file or directory.
     *
     * <p>Every accepted file becomes one document with a {@code contents} field (read from the
     * file) and a stored, untokenized {@code path} field. A file that cannot be added is reported
     * on standard output and skipped; the remaining files are still processed.
     *
     * @param fileName the name of a text file or a folder we wish to add to the index
     * @throws FileNotFoundException declared for compatibility with existing callers
     * @throws CorruptIndexException declared for compatibility with existing callers
     * @throws IOException if querying the writer's document count fails
     */
    public void indexFileOrDirectory(String fileName)
            throws FileNotFoundException, CorruptIndexException, IOException {
        //===================================================
        // gets the list of files in a folder (if user has submitted
        // the name of a folder) or gets a single file name (if user
        // has submitted only the file name)
        //===================================================
        listFiles(new File(fileName));

        try {
            int docCountBefore = writer.docCount();

            for (File f : queue) {
                addFile(f);
            }

            int docCountAfter = writer.docCount();
            System.out.println("");
            System.out.println("************************");
            System.out.println((docCountAfter - docCountBefore) + " documents added.");
            System.out.println("************************");
        } finally {
            // Always empty the queue so a failure here cannot cause the same files to be
            // indexed again on the next call.
            queue.clear();
        }
    }

    /** Adds one file to the index as a document, reporting (not throwing) any failure. */
    private void addFile(File f) {
        FileReader fr = null;
        try {
            // NOTE: FileReader decodes with the platform default charset.
            fr = new FileReader(f);

            Document doc = new Document();

            //===================================================
            // add contents of file
            //===================================================
            doc.add(new Field("contents", fr));

            //===================================================
            // adding second field which contains the path of the file
            //===================================================
            doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.UN_TOKENIZED));

            writer.addDocument(doc);
            System.out.println("Added: " + f);
        } catch (Exception e) {
            System.out.println("Could not add: " + f + " : " + e.getMessage());
        } finally {
            // fr is still null when the FileReader constructor itself failed.
            if (fr != null) {
                try {
                    fr.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
    }

    /**
     * Collects the indexable files at or below the given path into the queue.
     *
     * <p>Directories are walked recursively. A path that does not exist, or a directory whose
     * contents cannot be listed, is reported on standard output and contributes nothing.
     *
     * @param file the file or directory to inspect
     */
    private void listFiles(File file) {
        if (!file.exists()) {
            System.out.println(file + " does not exist.");
            return; // do not queue a path that is not there
        }

        if (file.isDirectory()) {
            // File.listFiles() returns null when the directory cannot be read.
            File[] children = file.listFiles();
            if (children == null) {
                System.out.println("Could not list " + file);
                return;
            }
            for (File child : children) {
                listFiles(child);
            }
            return;
        }

        String filename = file.getName().toLowerCase();
        //===================================================
        // Only index text files
        //===================================================
        if (filename.endsWith(".htm") || filename.endsWith(".html")
                || filename.endsWith(".xml") || filename.endsWith(".txt")) {
            queue.add(file);
        } else {
            System.out.println("Skipped " + filename);
        }
    }

    /**
     * Close the index. The writer is closed even when optimizing fails.
     *
     * @throws CorruptIndexException propagated from the writer
     * @throws IOException propagated from the writer
     */
    public void closeIndex() throws CorruptIndexException, IOException {
        try {
            writer.optimize(); // optimizing the index before closing
        } finally {
            writer.close(); // closing index
        }
    }
}