Tuesday, January 13, 2015

Lucene search

Maven dependencies:

        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>4.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
            <version>4.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queryparser</artifactId>
            <version>4.0.0</version>
        </dependency>


Create IndexWriterFactory:

public enum IndexWriterFactory {
    WRITER;

    private static IndexWriter indexWriter = null;

    public IndexWriter getIndexWriter() throws IOException {

        ConfigUtil systemConfig = ConfigUtil.getInstance();
        String directoryConfig = systemConfig.getValue("index.directory");
        FSDirectory directory = null;

        boolean isLocked = new File(directoryConfig, IndexWriter.WRITE_LOCK_NAME).exists();
        if (isLocked) {
            return null;
        }
        File indexDirectory = new File(directoryConfig);

        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
        IndexWriterConfig indexConfig = new IndexWriterConfig(Version.LUCENE_40, analyzer);

        try {
            NativeFSLockFactory lock = new NativeFSLockFactory();
            directory = NIOFSDirectory.open(indexDirectory, lock);

            indexWriter = new IndexWriter(directory, indexConfig);
        } catch (IOException e) {
            e.printStackTrace();
            throw (e);
        }
        return indexWriter;
    }

    public void deleteAll() throws IOException {
        indexWriter.deleteAll();
        indexWriter.commit();
    }

    public void close() throws IOException {
        indexWriter.close();
    }
}

Create IndexReaderFactory:

public enum IndexReaderFactory {
    READER;

    private static IndexReader indexReader = null;

    public static IndexReader getIndexReader() throws IOException {
        ConfigUtil config = ConfigUtil.getInstance();
        String directoryConfig = config.getValue("index.directory");
        try {
            File indexDirectory = new File(directoryConfig);
            FSDirectory directory = FSDirectory.open(indexDirectory);
            indexReader = DirectoryReader.open(directory);
        } catch (IOException e) {
            e.printStackTrace();
            throw (e);
        }
        return indexReader;
    }
}

Create your indexing service implementation:

public enum IndexServiceImpl {
    INDEXER;

    public Set<YourModel> indexItemQueue = null;

    private static Logger LOGGER = LoggerFactory.getLogger(IndexAssets.class);
    {
        indexItemQueue = new LinkedHashSet<YourModel>();
    }

    public void addIndexItem(YourModel indexItem) {
        indexItemQueue.add(indexItem);
    }

    public void addIndexItem(List<YourModel> indexItemList) {
        indexItemQueue.addAll(indexItemList);
    }

    public synchronized String reIndex() throws Exception {
        LOGGER.debug("BEGIN: index " + indexItemQueue.size());
        IndexWriter indexWriter = IndexWriterFactory.WRITER.getIndexWriter();

        String status = null;
        int count = 0;
        if (indexWriter != null) {
            YourModel entry = null;
            Iterator<YourModel> iterator = indexItemQueue.iterator();
            try {
                while (iterator.hasNext()) {
                    entry = iterator.next();
                    iterator.remove();
                    Document document = new Document();
                    Term term = null;


                    //add index fields to document
                    document.add(new StringField("field_name", entry.getValue(), Field.Store.YES));


                    //create term to query to check if this index is existed or not
                    term = new Term("field_name", entry.getValue());
                    //
                    // Query for term above
                    //
                    if(term != null){
                        indexWriter.updateDocument(term, document);
                    } else {
                        indexWriter.addDocument(document);
                    }
                }
            } finally {
                indexWriter.close();
            }
            LOGGER.debug("END: index " + count + " records");
            status = "INDEX_FINISHED";
            return status;

        } else {
            status = "INDEX_UNFINISHED";
            return status;
        }
    }


public synchronized void reIndexAll() throws Exception {
        //delete all index data first
        IndexAssets.INDEXER.deleteAll();
       
        //checkout if deleting process done, every 5s
        try {
            IndexReader reader = IndexAssets.getIndexReader();
            int numOfDoc = reader.numDocs();
            while (0 != numOfDoc) {
                Thread.sleep(5000);
                numOfDoc = reader.numDocs();
            }
        } catch (InterruptedException e) {
            LOGGER.error("There is an exeception  when deleting all old index items. Cause:" + e.getMessage());
        } catch (Exception e) {
            LOGGER.error("There is an exeception  when deleting all old index items. Cause:" + e.getMessage());
        }
       
        List<YourModel> list = new ArrayList<YourModel>();
        list.addAll(/*your data here*/);
       
        LOGGER.info("INDEXING: " + list.size() + " records");
       
        IndexAssets.INDEXER.addIndexItem(list);
        IndexAssets.INDEXER.index();
    }


    public String deleteAll() throws Exception {
        String status = null;
        indexItemQueue.clear();
        IndexWriter indexWriter = IndexWriterFactory.WRITER.getIndexWriter();
        if (indexWriter != null) {
            IndexWriterFactory.WRITER.deleteAll();
            indexWriter.close();
            status = "INDEX_FINISHED";
            return status;
        } else {
            status = "INDEX_UNFINISHED";
            return status;
        }
    }
}

Search the index:

public List<String[]> search(String keyword, int pageNum, boolean isSearchExactKeyword) {
        ConfigUtil config = ConfigUtil.getInstance();
        String numberOfProgramPerPageStr = config.getValue("search.numberOfProgramPerPage");
        int numberOfProgramPerPage = Integer.parseInt(numberOfProgramPerPageStr);
        String directoryStr = config.getValue("index.directory");

        // get from config
        int from = (pageNum - 1) * numberOfProgramPerPage;

        List<String[]> results = new ArrayList<String[]>();

        //escape special characters
        keyword = keyword.replaceAll("/^\\s+|\\s$/g", " ").toLowerCase();


        String[] keywords_Array = keyword.split(" ");
        Set<String> stopwordsSet = new HashSet<String>();

        //remove out the stopword from the keyword
        Iterator iter = StandardAnalyzer.STOP_WORDS_SET.iterator();
        while (iter.hasNext()) {
            char[] stopWord = (char[]) iter.next();
            stopwordsSet.add(new String(stopWord));
        }

        String[] stopwordsArray = new String[stopwordsSet.size()];
        stopwordsSet.toArray(stopwordsArray);
        BooleanQuery bq = new BooleanQuery();
        boolean isStopword = false;
        for (String key : keywords_Array) {
            isStopword = false;
            for (String stopword : stopwordsArray) {
                if (key.toLowerCase().equals(stopword)) {
                    isStopword = true;
                    break;
                }
            }
            if (!isStopword) {

                Query query = new TermQuery(new Term("field_name", key));

                if (isSearchExactKeyword ==  true) {
                    bq.add(query, BooleanClause.Occur.MUST);

                } else {// for program search
                    bq.add(query, BooleanClause.Occur.SHOULD);

                }
               
            }
        }
        try {
            IndexReader reader = IndexReaderFactory.INDEXREADER.getIndexReader();
            IndexSearcher searcher = new IndexSearcher(reader);

            TotalHitCountCollector collectorCount = new TotalHitCountCollector();
            searcher.search(bq, collectorCount);
            int count = collectorCount.getTotalHits();

            if (count <= 0) {
                LOGGER.debug("No Result");
                return results;
            }
            TopFieldCollector collector = TopFieldCollector.create(Sort.RELEVANCE, count, true, false, false, true);
            searcher.search(bq, collector);
           
            ScoreDoc[] hits = new ScoreDoc[0];
            if (pageNum == -1) {
                hits = collector.topDocs().scoreDocs;
            } else {
                hits = collector.topDocs(from, numberOfProgramPerPage).scoreDocs;
                // If found nothing, return search from page 0;
                if (hits.length <= 0) {
                    from = 0;
                    hits = collector.topDocs(from, numberOfProgramPerPage).scoreDocs;
                }
            }
      
            int sequenceNum= 0;
            String sequenceNumString = "";

            for (int i = 0; i < hits.length; i++) {
                int docId = hits[i].doc;
                Document doc = null;
                try {
                    doc = searcher.doc(docId);
                } catch (IOException e) {
                    e.printStackTrace();
                }

                //get out the data from document
                String fieldData_1 = doc.get("field_name1");
                String fieldData_2 = doc.get("field_name2");


                sequenceNum++;
                sequenceNumString = String.valueOf(sequenceNum);

                results.add(new String[] { fieldData_1, fieldData_2, sequenceNumString });
            }
            reader.close();


        } catch (IOException e) {
            e.printStackTrace();
        }
        return results;
    } 

No comments:

Post a Comment