The following are example usages of org.apache.lucene.index.IndexWriter#setMergeFactor(). You can also follow the link to view the source code on GitHub, or leave a comment on the right.
/**
 * Creates a worker that writes into its own part-index directory
 * ("part" + id) below the given temporary index directory, configured
 * from the full-indexer's search-module settings.
 *
 * @param id
 *            Unique index ID. Is used to generate unique directory name.
 * @param tempIndexDir
 *            Directory below which this worker's temporary part-index is created.
 * @param fullIndexer
 *            Reference to full-index; supplies merge-factor and RAM-buffer configuration.
 */
public IndexWriterWorker(final int id, final File tempIndexDir, final OlatFullIndexer fullIndexer) {
    this.id = id;
    this.indexPartDir = new File(tempIndexDir, "part" + id);
    this.fullIndexer = fullIndexer;
    try {
        final Directory luceneIndexPartDir = FSDirectory.open(indexPartDir);
        // create=true: start a fresh part-index; no field-length limit.
        indexWriter = new IndexWriter(luceneIndexPartDir, new StandardAnalyzer(Version.LUCENE_30), true, IndexWriter.MaxFieldLength.UNLIMITED);
        indexWriter.setMergeFactor(fullIndexer.getSearchModuleConfig().getIndexerWriterMergeFactor());
        log.info("IndexWriter config MergeFactor=" + indexWriter.getMergeFactor());
        indexWriter.setRAMBufferSizeMB(fullIndexer.getSearchModuleConfig().getIndexerWriterRamBuffer());
        log.info("IndexWriter config RAMBufferSizeMB=" + indexWriter.getRAMBufferSizeMB());
        // Non-compound files index faster (see Lucene "how to make indexing faster").
        indexWriter.setUseCompoundFile(false);
    } catch (final IOException e) {
        // Include the directory and the cause; previously the exception was dropped.
        log.warn("Can not create IndexWriter, indexPartDir=" + indexPartDir, e);
    }
}
/**
 * Appends {@code numDocs} sequentially numbered documents (ids {@code base}
 * .. {@code base + numDocs - 1}) to the existing index in {@code dir}.
 *
 * @param dir     index directory to append to (opened with create=false)
 * @param base    id of the first document to add
 * @param numDocs number of documents to add
 * @param policy  deletion policy handed to the writer
 * @throws IOException if the writer cannot be created or a document cannot be added
 */
public void updateIndex(Directory dir, int base, int numDocs,
    IndexDeletionPolicy policy) throws IOException {
  IndexWriter writer =
      new IndexWriter(dir, false, new StandardAnalyzer(), policy);
  try {
    writer.setMaxBufferedDocs(maxBufferedDocs);
    writer.setMergeFactor(1000);
    for (int i = 0; i < numDocs; i++) {
      addDoc(writer, base + i);
    }
  } finally {
    // Previously leaked (write lock held) if addDoc threw; always close.
    writer.close();
  }
}
/**
 * Appends {@code numDocs} sequentially numbered documents (ids {@code base}
 * .. {@code base + numDocs - 1}) to the existing index in {@code dir}.
 * NOTE(review): this method is a verbatim duplicate of the one above it in
 * this file — two identical signatures in one class will not compile;
 * one copy should be removed.
 *
 * @param dir     index directory to append to (opened with create=false)
 * @param base    id of the first document to add
 * @param numDocs number of documents to add
 * @param policy  deletion policy handed to the writer
 * @throws IOException if the writer cannot be created or a document cannot be added
 */
public void updateIndex(Directory dir, int base, int numDocs,
    IndexDeletionPolicy policy) throws IOException {
  IndexWriter writer =
      new IndexWriter(dir, false, new StandardAnalyzer(), policy);
  try {
    writer.setMaxBufferedDocs(maxBufferedDocs);
    writer.setMergeFactor(1000);
    for (int i = 0; i < numDocs; i++) {
      addDoc(writer, base + i);
    }
  } finally {
    // Previously leaked (write lock held) if addDoc threw; always close.
    writer.close();
  }
}
/**
 * Builds the Lucene entity-fragment index from a tab-separated source file
 * ("entityId\tfragment" per line). Entity names are looked up via
 * {@link EntityFragmentFields} and underscore separators are normalized to
 * spaces before indexing. Lines that do not split into exactly two fields,
 * or whose id is unknown, are skipped (but still counted as read).
 *
 * @throws Exception if the index cannot be created or the source file cannot be read
 */
public void indexforentity() throws Exception
{
    if (EntityFragmentFields.entityId2Name == null) {
        EntityFragmentFields.load();
    }
    long startTime = new Date().getTime();
    // KB index updated to DBpedia2015 (husen 2016-04-08), then DBpedia2016 (husen 2018-8-22).
    File indexDir_en = new File("D:/husen/gAnswer/data/DBpedia2016/lucene/entity_fragment_index");
    File sourceDir_en = new File("D:/husen/gAnswer/data/DBpedia2016/fragments/entity_RDF_fragment/16entity_fragment.txt");
    Analyzer luceneAnalyzer_en = new StandardAnalyzer();
    IndexWriter indexWriter_en = new IndexWriter(indexDir_en, luceneAnalyzer_en, true);
    int mergeFactor = 100000;            // default 10; large value defers merges for bulk indexing
    int maxBufferedDoc = 1000;           // default 10
    int maxMergeDoc = Integer.MAX_VALUE; // effectively unlimited
    indexWriter_en.setMergeFactor(mergeFactor);
    indexWriter_en.setMaxBufferedDocs(maxBufferedDoc);
    indexWriter_en.setMaxMergeDocs(maxMergeDoc);
    BufferedReader br = new BufferedReader(
            new InputStreamReader(new FileInputStream(sourceDir_en), "UTF-8"));
    int count = 0;
    try {
        String line;
        while ((line = br.readLine()) != null) {
            count++; // counts every line read, including skipped ones (original behavior)
            if (count % 100000 == 0) {
                System.out.println(count);
            }
            String temp[] = line.split("\t");
            if (temp.length != 2) {
                continue;
            }
            int entity_id = Integer.parseInt(temp[0]);
            if (!EntityFragmentFields.entityId2Name.containsKey(entity_id)) {
                continue;
            }
            String entity_name = EntityFragmentFields.entityId2Name.get(entity_id);
            String entity_fragment = temp[1];
            // Normalize multi-underscore separators down to single spaces.
            entity_name = entity_name.replace("____", " ");
            entity_name = entity_name.replace("__", " ");
            entity_name = entity_name.replace("_", " ");
            Document document = new Document();
            // Only the name is tokenized/searchable; id and fragment are stored payload.
            document.add(new Field("EntityName", entity_name, Field.Store.YES,
                    Field.Index.TOKENIZED,
                    Field.TermVector.WITH_POSITIONS_OFFSETS));
            document.add(new Field("EntityId", String.valueOf(entity_id),
                    Field.Store.YES, Field.Index.NO));
            document.add(new Field("EntityFragment", entity_fragment,
                    Field.Store.YES, Field.Index.NO));
            indexWriter_en.addDocument(document);
        }
        indexWriter_en.optimize();
    } finally {
        // Previously both leaked on any exception; always release writer and reader.
        indexWriter_en.close();
        br.close();
    }
    // Report build size and elapsed time.
    long endTime = new Date().getTime();
    System.out.println("entity_name index has build ->" + count + " " + "Time:" + (endTime - startTime));
}
/**
 * Builds the Lucene type-fragment index over the keys of the given map.
 * Each type short name is indexed twice: a tokenized, lower-cased form with
 * underscores or camelCase boundaries turned into spaces ("SplittedTypeShortName"),
 * and the original name stored verbatim ("TypeShortName"). Empty keys are skipped.
 *
 * @param typeShortName2IdList map whose keys are the type short names to index
 *                             (the id lists are not used here)
 * @throws Exception if the index cannot be created or written
 */
public static void buildIndex(HashMap<String, ArrayList<Integer>> typeShortName2IdList) throws Exception
{
    long startTime = new Date().getTime();
    File indexDir_li = new File("D:/husen/gAnswer/data/DBpedia2016/lucene/type_fragment_index");
    Analyzer luceneAnalyzer_li = new StandardAnalyzer();
    IndexWriter indexWriter_li = new IndexWriter(indexDir_li, luceneAnalyzer_li, true);
    int mergeFactor = 100000;            // default 10; large value defers merges for bulk indexing
    int maxBufferedDoc = 1000;           // default 10
    int maxMergeDoc = Integer.MAX_VALUE; // effectively unlimited
    indexWriter_li.setMergeFactor(mergeFactor);
    indexWriter_li.setMaxBufferedDocs(maxBufferedDoc);
    indexWriter_li.setMaxMergeDocs(maxMergeDoc);
    int count = 0;
    try {
        Iterator<String> it = typeShortName2IdList.keySet().iterator();
        while (it.hasNext())
        {
            String sn = it.next();
            if (sn.length() == 0) {
                continue;
            }
            count++;
            StringBuilder splittedSn = new StringBuilder();
            if (sn.contains("_"))
            {
                // Underscore-separated name: just swap separators and lower-case.
                String nsn = sn.replace("_", " ");
                splittedSn.append(nsn.toLowerCase());
            }
            else
            {
                // camelCase name: insert a space before every non-lowercase character.
                int last = 0, i = 0;
                for (i = 0; i < sn.length(); i++)
                {
                    if (!(sn.charAt(i) >= 'a' && sn.charAt(i) <= 'z'))
                    {
                        splittedSn.append(sn.substring(last, i).toLowerCase());
                        splittedSn.append(' ');
                        last = i;
                    }
                }
                splittedSn.append(sn.substring(last, i).toLowerCase());
                // Trim leading spaces; the length guard prevents
                // StringIndexOutOfBoundsException on an all-space result.
                while (splittedSn.length() > 0 && splittedSn.charAt(0) == ' ') {
                    splittedSn.deleteCharAt(0);
                }
            }
            System.out.println("SplitttedType: " + splittedSn);
            Document document = new Document();
            // Only the split form is tokenized/searchable; the original name is stored payload.
            document.add(new Field("SplittedTypeShortName", splittedSn.toString(),
                    Field.Store.YES,
                    Field.Index.TOKENIZED,
                    Field.TermVector.WITH_POSITIONS_OFFSETS));
            document.add(new Field("TypeShortName", sn,
                    Field.Store.YES, Field.Index.NO));
            indexWriter_li.addDocument(document);
        }
        indexWriter_li.optimize();
    } finally {
        // Previously leaked on any exception; always release the writer.
        indexWriter_li.close();
    }
    // Report build size and elapsed time.
    long endTime = new Date().getTime();
    System.out.println("TypeShortName index has build ->" + count + " " + "Time:" + (endTime - startTime));
}
/**
 * Create index-writer object. In multi-threaded mode creates an array of index-workers.
 * Start indexing with main-index as root object. Index recursive all elements.
 * At the end optimize and close new index. The new index is stored in [temporary-index-path]/main.
 *
 * @throws InterruptedException if a waiting sleep is interrupted
 */
private void doIndex() throws InterruptedException {
    try {
        final File tempIndexDir = new File(tempIndexPath);
        final Directory indexPath = FSDirectory.open(new File(tempIndexDir, "main"));
        final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        indexWriter = new IndexWriter(indexPath, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
        indexWriter.deleteAll();
        indexWriter.setMergeFactor(INDEX_MERGE_FACTOR); // for better performance
        indexWriter.setRAMBufferSizeMB(ramBufferSizeMB);// for better performance set to 48MB (see lucene docu 'how to make indexing faster")
        log.info("IndexWriter config RAMBufferSizeMB=" + indexWriter.getRAMBufferSizeMB());
        indexWriter.setUseCompoundFile(useCompoundFile); // for better performance (see lucene docu 'how to make indexing faster")
        log.info("IndexWriter config UseCompoundFile=" + indexWriter.getUseCompoundFile());
        // Create IndexWriterWorker
        log.info("Running with " + numberIndexWriter + " IndexerWriterWorker");
        indexWriterWorkers = new IndexWriterWorker[numberIndexWriter];
        final Directory[] partIndexDirs = new Directory[numberIndexWriter];
        for (int i = 0; i < numberIndexWriter; i++) {
            final IndexWriterWorker indexWriterWorker = new IndexWriterWorker(i, tempIndexDir, this);
            indexWriterWorkers[i] = indexWriterWorker;
            indexWriterWorkers[i].start();
            partIndexDirs[i] = indexWriterWorkers[i].getIndexDir();
        }
        final SearchResourceContext searchResourceContext = new SearchResourceContext();
        log.info("doIndex start. OlatFullIndexer with Debug output");
        mainIndexer.doIndex(searchResourceContext, null /* no parent */, this);
        log.info("Wait until every folder indexer is finished");
        DBFactory.getInstance().commitAndCloseSession();
        // check if every folder indexer is finished max waiting-time 10Min (=waitingCount-limit = 60)
        int waitingCount = 0;
        final int MAX_WAITING_COUNT = 60;// = 10Min
        while (FolderIndexerWorkerPool.getInstance().isIndexerRunning() && (waitingCount++ < MAX_WAITING_COUNT)) {
            Thread.sleep(10000);
        }
        if (waitingCount >= MAX_WAITING_COUNT) {
            log.info("Finished with max waiting time!");
        }
        log.info("Set Finish-flag for each indexWriterWorkers");
        // Set Finish-flag
        for (int i = 0; i < numberIndexWriter; i++) {
            indexWriterWorkers[i].finishIndexing();
        }
        log.info("Wait until every indexworker is finished");
        // check if every indexworker is finished max waiting-time 10Min (=waitingCount-limit = 60)
        waitingCount = 0;
        while (!areIndexingDone() && (waitingCount++ < MAX_WAITING_COUNT)) {
            Thread.sleep(10000);
        }
        if (waitingCount >= MAX_WAITING_COUNT) {
            log.info("Finished with max waiting time!");
        }
        // Merge all partIndex
        DBFactory.getInstance().commitAndCloseSession();
        if (partIndexDirs.length > 0) {
            log.info("Start merging part Indexes");
            indexWriter.addIndexesNoOptimize(partIndexDirs);
            log.info("Added all part Indexes");
        }
        fullIndexerStatus.setIndexSize(indexWriter.maxDoc());
        indexWriter.optimize();
        indexWriter.close();
    } catch (final IOException e) {
        // log.warn already records the stack trace; the previous extra
        // e.printStackTrace() bypassed the logging framework and was removed.
        log.warn("Can not create IndexWriter, indexname=" + tempIndexPath, e);
    } finally {
        DBFactory.getInstance().commitAndCloseSession();
        log.debug("doIndex: commit & close session");
    }
}