Listed below are example usages of org.apache.lucene.index.LeafReader#getLiveDocs(), drawn from open-source projects on GitHub.
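All of the examples rely on the same contract: getLiveDocs() returns a Bits instance in which a set bit marks a live (not deleted) document, and returns null when the reader has no deletions at all. As a minimal illustrative sketch of that pattern (not taken from any of the projects below), counting the live documents in a single LeafReader might look like this:

// Illustrative sketch only: count the live documents in one segment.
// (Equivalent to reader.numDocs(); shown here only to demonstrate the liveDocs pattern.)
private static int countLiveDocs(LeafReader reader) {
  Bits liveDocs = reader.getLiveDocs();
  if (liveDocs == null) {
    return reader.maxDoc(); // no deletions: every doc id below maxDoc() is live
  }
  int count = 0;
  for (int docId = 0; docId < reader.maxDoc(); docId++) {
    if (liveDocs.get(docId)) { // set bit => live, clear bit => deleted
      count++;
    }
  }
  return count;
}

The same null-check-then-skip idiom appears in every example that follows.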
private static Document getDocument(LeafReader reader, Term term) throws IOException {
  PostingsEnum docs = reader.postings(term);
  if (docs != null) {
    int docId = docs.nextDoc();
    // PostingsEnum may contain deleted documents, so we have to cope with that
    while (docId != PostingsEnum.NO_MORE_DOCS) {
      // if the document is deleted, skip it and continue with the next one
      Bits liveDocs = reader.getLiveDocs();
      if (liveDocs != null && !liveDocs.get(docId)) {
        docId = docs.nextDoc();
        continue;
      }
      if (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
        throw new IllegalStateException("Multiple Documents for term " + term.text());
      }
      return readDocument(reader, docId, null);
    }
  }
  return null;
}
private void estimateTermVectors(Map<String, Object> result) throws IOException {
  log.info("- estimating term vectors...");
  Map<String, Map<String, Object>> stats = new HashMap<>();
  for (LeafReaderContext leafReaderContext : reader.leaves()) {
    LeafReader leafReader = leafReaderContext.reader();
    Bits liveDocs = leafReader.getLiveDocs();
    for (int docId = 0; docId < leafReader.maxDoc(); docId += samplingStep) {
      if (liveDocs != null && !liveDocs.get(docId)) {
        continue;
      }
      Fields termVectors = leafReader.getTermVectors(docId);
      if (termVectors == null) {
        continue;
      }
      for (String field : termVectors) {
        Terms terms = termVectors.terms(field);
        if (terms == null) {
          continue;
        }
        estimateTermStats(field, terms, stats, true);
      }
    }
  }
  result.put(TERM_VECTORS, stats);
}
private static Document getFirstLiveDoc(Terms terms, LeafReader reader) throws IOException {
  PostingsEnum postingsEnum = null;
  TermsEnum termsEnum = terms.iterator();
  BytesRef text;
  // Deal with the chance that the first bunch of terms are in deleted documents:
  // scan up to 1000 terms looking for one whose first posting is a live document.
  for (int idx = 0; idx < 1000; ++idx) {
    text = termsEnum.next();
    if (text == null) { // Ran off the end of the terms enum without finding any live docs with that field in them.
      return null;
    }
    postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
    final Bits liveDocs = reader.getLiveDocs();
    if (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      if (liveDocs != null && !liveDocs.get(postingsEnum.docID())) {
        continue; // first document for this term is deleted, try the next term
      }
      return reader.document(postingsEnum.docID());
    }
  }
  return null;
}
private boolean innerMoveNext() throws IOException {
  while (tryAdvanceDocIdSetIterator()) {
    LeafReader reader = currentLeaf.reader();
    Bits liveDocs = reader.getLiveDocs();
    int doc;
    while ((doc = currentDocIdSetIt.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      if (docDeleted(liveDocs, doc) || belowMinScore(currentScorer)) {
        continue;
      }
      onDoc(doc);
      return true;
    }
    currentDocIdSetIt = null;
  }
  clearState();
  return false;
}
protected static Document getFirstLiveDoc(Terms terms, LeafReader reader)
    throws IOException {
  TermsEnum termsEnum = terms.iterator();
  if (termsEnum.next() == null) { // Ran off the end of the terms enum without finding any live docs with that field in them.
    return null;
  }
  PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.NONE);
  final Bits liveDocs = reader.getLiveDocs();
  if (postingsEnum.nextDoc() == DocIdSetIterator.NO_MORE_DOCS
      || (liveDocs != null && !liveDocs.get(postingsEnum.docID()))) {
    // no postings at all, or the first document is deleted
    return null;
  }
  return reader.document(postingsEnum.docID());
}
private static void addDocuments(LeafReader reader, Term term, Collection<Document> documents) throws IOException {
  PostingsEnum docs = reader.postings(term);
  if (docs != null) {
    Bits liveDocs = reader.getLiveDocs();
    int docId;
    while ((docId = docs.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
      // Maybe some of the docs have been deleted! Check that too.
      if (liveDocs != null && !liveDocs.get(docId)) {
        continue;
      }
      Document document = readDocument(reader, docId, null);
      documents.add(document);
    }
  }
}
private void estimateStoredFields(Map<String, Object> result) throws IOException {
  log.info("- estimating stored fields...");
  Map<String, Map<String, Object>> stats = new HashMap<>();
  for (LeafReaderContext context : reader.leaves()) {
    LeafReader leafReader = context.reader();
    EstimatingVisitor visitor = new EstimatingVisitor(stats, topN, maxLength, samplingStep);
    Bits liveDocs = leafReader.getLiveDocs();
    if (leafReader instanceof CodecReader) {
      CodecReader codecReader = (CodecReader) leafReader;
      StoredFieldsReader storedFieldsReader = codecReader.getFieldsReader();
      // this instance may be faster for a full sequential pass
      StoredFieldsReader mergeInstance = storedFieldsReader.getMergeInstance();
      for (int docId = 0; docId < leafReader.maxDoc(); docId += samplingStep) {
        if (liveDocs != null && !liveDocs.get(docId)) {
          continue;
        }
        mergeInstance.visitDocument(docId, visitor);
      }
      if (mergeInstance != storedFieldsReader) {
        mergeInstance.close();
      }
    } else {
      for (int docId = 0; docId < leafReader.maxDoc(); docId += samplingStep) {
        if (liveDocs != null && !liveDocs.get(docId)) {
          continue;
        }
        leafReader.document(docId, visitor);
      }
    }
  }
  result.put(STORED_FIELDS, stats);
}
/**
 * Compute basic term vector numbers (document frequency and total term frequency) for the
 * current term, taken directly from the segment-level statistics of the terms enum. This
 * shortcut is only valid when the reader has no deleted documents.
 *
 * @param termsEnum the terms enum positioned on the current term
 * @param r the leaf reader the terms enum was obtained from
 * @return the basic term vector numbers
 * @throws IOException Signals that an I/O exception has occurred.
 */
private static TermvectorNumberBasic computeTermvectorNumberBasic(
    TermsEnum termsEnum, LeafReader r) throws IOException {
  TermvectorNumberBasic result = new TermvectorNumberBasic();
  boolean hasDeletedDocuments = (r.getLiveDocs() != null);
  if (!hasDeletedDocuments) {
    result.valueSum[0] = termsEnum.totalTermFreq();
    result.docNumber = termsEnum.docFreq();
    if (result.valueSum[0] > -1) {
      return result;
    }
  }
  throw new IOException("should not call this");
}
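The shortcut above works because a null getLiveDocs() result guarantees the segment has no deletions, so docFreq() and totalTermFreq() are already exact and there is no need to walk postings. A minimal illustrative sketch of that idea follows; the reader and field parameters are assumptions made for the example, not part of the project code above or below.

// Illustrative sketch only: read exact per-term statistics from a segment with no deletions.
// "reader" is an open LeafReader and "field" an indexed field name; both are assumed here.
static void printExactTermStats(LeafReader reader, String field) throws IOException {
  if (reader.getLiveDocs() != null) {
    return; // segment has deletions: the stored statistics still count deleted documents
  }
  Terms terms = reader.terms(field);
  if (terms == null) {
    return; // field is not indexed in this segment
  }
  TermsEnum termsEnum = terms.iterator();
  BytesRef term;
  while ((term = termsEnum.next()) != null) {
    // With no deletions these values are exact, so no postings walk or liveDocs check is needed.
    System.out.println(term.utf8ToString() + " docFreq=" + termsEnum.docFreq()
        + " totalTermFreq=" + termsEnum.totalTermFreq());
  }
}

The second variant of computeTermvectorNumberBasic below handles the general case, where the counts are restricted to a document subset and the postings must be advanced explicitly.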
/**
 * Compute basic term vector numbers (document frequency and total term frequency) for the
 * current term, restricted to the given set of documents. Uses the segment-level statistics
 * when the document set covers the whole live segment and nothing has been deleted; otherwise
 * walks the postings and counts only the requested documents.
 *
 * @param docSet the sorted list of top-level (index-wide) document ids to restrict the counts to
 * @param termDocId the segment-local document id the postings enum is currently positioned on
 * @param termsEnum the terms enum positioned on the current term
 * @param r the leaf reader
 * @param lrc the leaf reader context, used for its docBase offset
 * @param postingsEnum a postings enum to reuse, may be null
 * @return the basic term vector numbers
 * @throws IOException Signals that an I/O exception has occurred.
 */
private static TermvectorNumberBasic computeTermvectorNumberBasic(
    List<Integer> docSet, int termDocId, TermsEnum termsEnum, LeafReader r,
    LeafReaderContext lrc, PostingsEnum postingsEnum) throws IOException {
  TermvectorNumberBasic result = new TermvectorNumberBasic();
  boolean hasDeletedDocuments = (r.getLiveDocs() != null);
  if ((docSet.size() == r.numDocs()) && !hasDeletedDocuments) {
    try {
      return computeTermvectorNumberBasic(termsEnum, r);
    } catch (IOException e) {
      log.debug("problem", e);
      // fall through and count per document
    }
  }
  result.docNumber = 0;
  result.valueSum[0] = 0;
  int localTermDocId = termDocId;
  Iterator<Integer> docIterator = docSet.iterator();
  postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.FREQS);
  int docId;
  while (docIterator.hasNext()) {
    docId = docIterator.next() - lrc.docBase;
    if (docId >= localTermDocId && ((docId == localTermDocId)
        || ((localTermDocId = postingsEnum.advance(docId)) == docId))) {
      result.docNumber++;
      result.valueSum[0] += postingsEnum.freq();
    }
    if (localTermDocId == DocIdSetIterator.NO_MORE_DOCS) {
      break;
    }
  }
  return result;
}
@SuppressWarnings({"ConstantConditions", "PointlessBooleanExpression"})
@Nightly
public void testTriggerUnInvertLimit() throws IOException {
  final boolean SHOULD_TRIGGER = false; // Set this to true to use the test with the old implementation
  // Ensure enough terms inside of a single UnInvert-pass-structure to trigger the limit
  final int REF_LIMIT = (int) Math.pow(2, 24); // Maximum number of references within a single pass-structure
  final int DOCS = (1 << 16) - 1; // The number of documents within a single pass (simplified)
  final int TERMS = REF_LIMIT / DOCS; // Each document must have this many references (terms) to hit the limit
  // Disk-based Directory and IWC settings to reduce the risk of OOM
  Directory dir = newFSDirectory(createTempDir("TestDocTermOrdsUninvertLimit"));
  final IndexWriter w = new IndexWriter(dir,
      new IndexWriterConfig(new MockAnalyzer(random()))
          .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
          .setRAMBufferSizeMB(256.0)
          .setMergeScheduler(new ConcurrentMergeScheduler())
          .setMergePolicy(newLogMergePolicy(false, 10))
          .setOpenMode(IndexWriterConfig.OpenMode.CREATE)
          .setCodec(TestUtil.getDefaultCodec()));
  Document doc = new Document();
  Field field = newTextField("field", "", Field.Store.NO);
  doc.add(field);
  StringBuilder sb = new StringBuilder(TERMS * (Integer.toString(TERMS).length() + 1));
  for (int i = 0; i < TERMS; i++) {
    sb.append(" ").append(Integer.toString(i));
  }
  field.setStringValue(sb.toString());
  for (int i = 0; i < DOCS; i++) {
    w.addDocument(doc);
  }
  //System.out.println("\n Finished adding " + DOCS + " documents of " + TERMS + " unique terms");
  w.close();
  final IndexReader r = DirectoryReader.open(dir);
  try {
    final LeafReader ar = SlowCompositeReaderWrapper.wrap(r);
    TestUtil.checkReader(ar);
    final DocTermOrds dto = new DocTermOrds(ar, ar.getLiveDocs(), "field"); // bigTerms turned off
    if (SHOULD_TRIGGER) {
      fail("DocTermOrds should have failed with a \"Too many values for UnInvertedField\" message");
    }
  } catch (IllegalStateException e) {
    if (!SHOULD_TRIGGER) {
      fail("DocTermOrds should not have failed with this implementation, but got exception " +
          e.getClass().getSimpleName() + " with message " + e.getMessage());
    }
    // This is (hopefully) "Too many values for UnInvertedField faceting on field field", so all is as expected
  } finally {
    r.close();
    dir.close();
  }
}
public void testSimple() throws Exception {
  Directory dir = newDirectory();
  final RandomIndexWriter w = new RandomIndexWriter(random(), dir,
      newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(newLogMergePolicy()));
  Document doc = new Document();
  Field field = newTextField("field", "", Field.Store.NO);
  doc.add(field);
  field.setStringValue("a b c");
  w.addDocument(doc);
  field.setStringValue("d e f");
  w.addDocument(doc);
  field.setStringValue("a f");
  w.addDocument(doc);
  final IndexReader r = w.getReader();
  w.close();
  final LeafReader ar = SlowCompositeReaderWrapper.wrap(r);
  TestUtil.checkReader(ar);
  final DocTermOrds dto = new DocTermOrds(ar, ar.getLiveDocs(), "field");
  SortedSetDocValues iter = dto.iterator(ar);
  assertEquals(0, iter.nextDoc());
  assertEquals(0, iter.nextOrd());
  assertEquals(1, iter.nextOrd());
  assertEquals(2, iter.nextOrd());
  assertEquals(SortedSetDocValues.NO_MORE_ORDS, iter.nextOrd());
  assertEquals(1, iter.nextDoc());
  assertEquals(3, iter.nextOrd());
  assertEquals(4, iter.nextOrd());
  assertEquals(5, iter.nextOrd());
  assertEquals(SortedSetDocValues.NO_MORE_ORDS, iter.nextOrd());
  assertEquals(2, iter.nextDoc());
  assertEquals(0, iter.nextOrd());
  assertEquals(5, iter.nextOrd());
  assertEquals(SortedSetDocValues.NO_MORE_ORDS, iter.nextOrd());
  r.close();
  dir.close();
}
@Test
public void testEstimator() throws Exception {
  JettySolrRunner jetty = cluster.getRandomJetty(random());
  String randomCoreName = jetty.getCoreContainer().getAllCoreNames().iterator().next();
  SolrCore core = jetty.getCoreContainer().getCore(randomCoreName);
  RefCounted<SolrIndexSearcher> searcherRef = core.getSearcher();
  try {
    SolrIndexSearcher searcher = searcherRef.get();
    // limit the max length
    IndexSizeEstimator estimator = new IndexSizeEstimator(searcher.getRawReader(), 20, 50, true, true);
    IndexSizeEstimator.Estimate estimate = estimator.estimate();
    Map<String, Long> fieldsBySize = estimate.getFieldsBySize();
    assertFalse("empty fieldsBySize", fieldsBySize.isEmpty());
    assertEquals(fieldsBySize.toString(), fields.size(), fieldsBySize.size());
    fieldsBySize.forEach((k, v) -> assertTrue("unexpected size of " + k + ": " + v, v > 0));
    Map<String, Long> typesBySize = estimate.getTypesBySize();
    assertFalse("empty typesBySize", typesBySize.isEmpty());
    assertTrue("expected at least 8 types: " + typesBySize.toString(), typesBySize.size() >= 8);
    typesBySize.forEach((k, v) -> assertTrue("unexpected size of " + k + ": " + v, v > 0));
    Map<String, Object> summary = estimate.getSummary();
    assertNotNull("summary", summary);
    assertFalse("empty summary", summary.isEmpty());
    assertEquals(summary.keySet().toString(), fields.size(), summary.keySet().size());
    Map<String, Object> details = estimate.getDetails();
    assertNotNull("details", details);
    assertFalse("empty details", details.isEmpty());
    // by type
    assertEquals(details.keySet().toString(), 6, details.keySet().size());
    // check sampling
    estimator.setSamplingThreshold(searcher.getRawReader().maxDoc() / 2);
    IndexSizeEstimator.Estimate sampledEstimate = estimator.estimate();
    Map<String, Long> sampledFieldsBySize = sampledEstimate.getFieldsBySize();
    assertFalse("empty fieldsBySize", sampledFieldsBySize.isEmpty());
    // verify that the sampled values are within 50% of the original values
    fieldsBySize.forEach((field, size) -> {
      Long sampledSize = sampledFieldsBySize.get(field);
      assertNotNull("sampled size for " + field + " is missing in " + sampledFieldsBySize, sampledSize);
      double delta = (double) size * 0.5;
      assertEquals("sampled size of " + field + " is wildly off", (double) size, (double) sampledSize, delta);
    });
    // verify the reader is still usable - SOLR-13694
    IndexReader reader = searcher.getRawReader();
    for (LeafReaderContext context : reader.leaves()) {
      LeafReader leafReader = context.reader();
      assertTrue("unexpected LeafReader class: " + leafReader.getClass().getName(), leafReader instanceof CodecReader);
      Bits liveDocs = leafReader.getLiveDocs();
      CodecReader codecReader = (CodecReader) leafReader;
      StoredFieldsReader storedFieldsReader = codecReader.getFieldsReader();
      StoredFieldVisitor visitor = new DocumentStoredFieldVisitor();
      assertNotNull(storedFieldsReader);
      for (int docId = 0; docId < leafReader.maxDoc(); docId++) {
        if (liveDocs != null && !liveDocs.get(docId)) {
          continue;
        }
        storedFieldsReader.visitDocument(docId, visitor);
      }
    }
  } finally {
    searcherRef.decref();
    core.close();
  }
}