Listed below are example usages of org.apache.lucene.index.LeafReader#terms(), taken from open-source projects.
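Before the project examples, here is a minimal, self-contained sketch of the core pattern they all share: ask each segment's LeafReader for the Terms of a field, treat a null result as "field not indexed in this segment", and then walk the TermsEnum and its PostingsEnum. The class and method names (TermsWalkExample, walk) and the printing at the end are illustrative only and are not taken from any of the examples below.

import java.io.IOException;
import org.apache.lucene.index.*;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

public class TermsWalkExample {
    // Illustrative sketch: print every term of `field` and the docs it occurs in.
    public static void walk(IndexReader indexReader, String field) throws IOException {
        for (LeafReaderContext ctx : indexReader.leaves()) {
            LeafReader leaf = ctx.reader();
            // terms() may return null if the field is not indexed in this segment
            Terms terms = leaf.terms(field);
            if (terms == null) {
                continue;
            }
            TermsEnum termsEnum = terms.iterator();
            PostingsEnum postings = null;
            BytesRef term;
            while ((term = termsEnum.next()) != null) {
                // pass the previous PostingsEnum back in so it can be reused
                postings = termsEnum.postings(postings, PostingsEnum.NONE);
                int doc;
                while ((doc = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    // doc is segment-local; add ctx.docBase for an index-wide doc id
                    System.out.println(term.utf8ToString() + " -> " + (ctx.docBase + doc));
                }
            }
        }
    }
}

Two details in this sketch recur in most of the snippets below: the PostingsEnum is handed back into postings() for reuse between terms, and the document ids it returns are segment-local, so ctx.docBase must be added whenever an index-wide id is needed.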
@Override
public AtomicGeoPointFieldData loadDirect(LeafReaderContext context) throws Exception {
    LeafReader reader = context.reader();
    Terms terms = reader.terms(getFieldNames().indexName());
    AtomicGeoPointFieldData data = null;
    // TODO: Use an actual estimator to estimate before loading.
    NonEstimatingEstimator estimator = new NonEstimatingEstimator(breakerService.getBreaker(CircuitBreaker.FIELDDATA));
    if (terms == null) {
        data = AbstractAtomicGeoPointFieldData.empty(reader.maxDoc());
        estimator.afterLoad(null, data.ramBytesUsed());
        return data;
    }
    return (Version.indexCreated(indexSettings).before(Version.V_2_2_0)) ?
            loadLegacyFieldData(reader, estimator, terms, data) : loadFieldData22(reader, estimator, terms, data);
}
/**
 * Collect collection.
 *
 * @param reader
 *          the reader
 * @param docSet
 *          the doc set
 * @param collectionInfo
 *          the collection info
 * @throws IOException
 *           Signals that an I/O exception has occurred.
 */
public static void collectCollection(IndexReader reader, List<Integer> docSet,
        ComponentCollection collectionInfo) throws IOException {
    if (collectionInfo.action().equals(ComponentCollection.ACTION_CHECK)) {
        // can't do anything in lucene for check
    } else if (collectionInfo.action()
            .equals(ComponentCollection.ACTION_LIST)) {
        // can't do anything in lucene for list
    } else if (collectionInfo.action()
            .equals(ComponentCollection.ACTION_CREATE)) {
        BytesRef term = null;
        PostingsEnum postingsEnum = null;
        Integer docId;
        Integer termDocId = -1;
        Terms terms;
        LeafReaderContext lrc;
        LeafReader r;
        ListIterator<LeafReaderContext> iterator = reader.leaves().listIterator();
        while (iterator.hasNext()) {
            lrc = iterator.next();
            r = lrc.reader();
            for (String field : collectionInfo.fields()) {
                if ((terms = r.terms(field)) != null) {
                    TermsEnum termsEnum = terms.iterator();
                    while ((term = termsEnum.next()) != null) {
                        Iterator<Integer> docIterator = docSet.iterator();
                        postingsEnum = termsEnum.postings(postingsEnum,
                                PostingsEnum.NONE);
                        termDocId = -1;
                        while (docIterator.hasNext()) {
                            docId = docIterator.next() - lrc.docBase;
                            if ((docId >= termDocId) && ((docId.equals(termDocId))
                                    || ((termDocId = postingsEnum.advance(docId))
                                            .equals(docId)))) {
                                collectionInfo.addValue(term.utf8ToString());
                                break;
                            }
                            if (termDocId.equals(PostingsEnum.NO_MORE_DOCS)) {
                                break;
                            }
                        }
                    }
                }
            }
        }
    }
}
private static void findSplitDocs(String idField, Predicate<BytesRef> includeInShard, LeafReader leafReader,
        IntConsumer consumer) throws IOException {
    Terms terms = leafReader.terms(idField);
    TermsEnum iterator = terms.iterator();
    BytesRef idTerm;
    PostingsEnum postingsEnum = null;
    while ((idTerm = iterator.next()) != null) {
        if (includeInShard.test(idTerm) == false) {
            postingsEnum = iterator.postings(postingsEnum);
            int doc;
            while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                consumer.accept(doc);
            }
        }
    }
}
public BaseTermsEnumTraverser(LeafReaderContext context) throws IOException {
    this.context = context;
    LeafReader reader = context.reader();
    this.maxDoc = reader.maxDoc();
    terms = reader.terms(fieldName);
    if (terms != null) {
        this.termsEnum = terms.iterator();
    } else {
        this.termsEnum = null;
    }
}
@Override
public BulkScorer bulkScorer(final LeafReaderContext context) throws IOException {
    final LeafReader reader = context.reader();
    final Terms terms;
    final NRTSuggester suggester;
    if ((terms = reader.terms(completionQuery.getField())) == null) {
        return null;
    }
    if (terms instanceof CompletionTerms) {
        CompletionTerms completionTerms = (CompletionTerms) terms;
        if ((suggester = completionTerms.suggester()) == null) {
            // a segment can have a null suggester
            // i.e. no FST was built
            return null;
        }
    } else {
        throw new IllegalArgumentException(completionQuery.getField() + " is not a SuggestField");
    }
    BitsProducer filter = completionQuery.getFilter();
    Bits filteredDocs = null;
    if (filter != null) {
        filteredDocs = filter.getBits(context);
        if (filteredDocs.getClass() == Bits.MatchNoBits.class) {
            return null;
        }
    }
    return new CompletionScorer(this, suggester, reader, filteredDocs, filter != null, automaton);
}
/**
 * Takes the categories from the given taxonomy directory, and adds the
 * missing ones to this taxonomy. Additionally, it fills the given
 * {@link OrdinalMap} with a mapping from the original ordinal to the new
 * ordinal.
 */
public void addTaxonomy(Directory taxoDir, OrdinalMap map) throws IOException {
    ensureOpen();
    DirectoryReader r = DirectoryReader.open(taxoDir);
    try {
        final int size = r.numDocs();
        final OrdinalMap ordinalMap = map;
        ordinalMap.setSize(size);
        int base = 0;
        PostingsEnum docs = null;
        for (final LeafReaderContext ctx : r.leaves()) {
            final LeafReader ar = ctx.reader();
            final Terms terms = ar.terms(Consts.FULL);
            // TODO: share per-segment TermsEnum here!
            TermsEnum te = terms.iterator();
            while (te.next() != null) {
                FacetLabel cp = new FacetLabel(FacetsConfig.stringToPath(te.term().utf8ToString()));
                final int ordinal = addCategory(cp);
                docs = te.postings(docs, PostingsEnum.NONE);
                ordinalMap.addMapping(docs.nextDoc() + base, ordinal);
            }
            base += ar.maxDoc(); // no deletions, so we're ok
        }
        ordinalMap.addDone();
    } finally {
        r.close();
    }
}
private Query buildFilterClause(LeafReader reader, String field) throws IOException {
    Terms terms = reader.terms(field);
    if (terms == null)
        return null;
    BooleanQuery.Builder bq = new BooleanQuery.Builder();
    int docsInBatch = reader.maxDoc();
    BytesRef term;
    TermsEnum te = terms.iterator();
    while ((term = te.next()) != null) {
        // we need to check that every document in the batch has the same field values, otherwise
        // this filtering will not work
        if (te.docFreq() != docsInBatch)
            throw new IllegalArgumentException("Some documents in this batch do not have a term value of "
                    + field + ":" + Term.toString(term));
        bq.add(new TermQuery(new Term(field, BytesRef.deepCopyOf(term))), BooleanClause.Occur.SHOULD);
    }
    BooleanQuery built = bq.build();
    if (built.clauses().size() == 0)
        return null;
    return built;
}
@Override
public Query rewrite(IndexReader reader) throws IOException {
    byte type = 0;
    boolean first = true;
    Terms terms;
    for (LeafReaderContext context : reader.leaves()) {
        LeafReader leafReader = context.reader();
        try {
            if ((terms = leafReader.terms(getField())) == null) {
                continue;
            }
        } catch (IOException e) {
            continue;
        }
        if (terms instanceof CompletionTerms) {
            CompletionTerms completionTerms = (CompletionTerms) terms;
            byte t = completionTerms.getType();
            if (first) {
                type = t;
                first = false;
            } else if (type != t) {
                throw new IllegalStateException(getField() + " has values of multiple types");
            }
        }
    }
    if (first == false) {
        if (this instanceof ContextQuery) {
            if (type == SuggestField.TYPE) {
                throw new IllegalStateException(this.getClass().getSimpleName()
                        + " can not be executed against a non context-enabled SuggestField: "
                        + getField());
            }
        } else {
            if (type == ContextSuggestField.TYPE) {
                return new ContextQuery(this);
            }
        }
    }
    return super.rewrite(reader);
}
@Override
public final Query buildQuery(LeafReader reader, BiPredicate<String, BytesRef> termAcceptor) {
    try {
        DocumentQueryBuilder queryBuilder = getQueryBuilder();
        for (FieldInfo field : reader.getFieldInfos()) {
            Terms terms = reader.terms(field.name);
            if (terms == null) {
                continue;
            }
            TokenStream ts = new TermsEnumTokenStream(terms.iterator());
            for (CustomQueryHandler handler : queryHandlers) {
                ts = handler.wrapTermStream(field.name, ts);
            }
            ts = new FilteringTokenFilter(ts) {
                TermToBytesRefAttribute termAtt = addAttribute(TermToBytesRefAttribute.class);
                @Override
                protected boolean accept() {
                    return filterFields.contains(field.name) == false && termAcceptor.test(field.name, termAtt.getBytesRef());
                }
            };
            TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
            while (ts.incrementToken()) {
                queryBuilder.addTerm(field.name, BytesRef.deepCopyOf(termAtt.getBytesRef()));
            }
            ts.close();
        }
        Query presearcherQuery = queryBuilder.build();
        BooleanQuery.Builder bq = new BooleanQuery.Builder();
        bq.add(presearcherQuery, BooleanClause.Occur.SHOULD);
        bq.add(new TermQuery(new Term(ANYTOKEN_FIELD, ANYTOKEN)), BooleanClause.Occur.SHOULD);
        presearcherQuery = bq.build();
        if (filterFields.isEmpty() == false) {
            bq = new BooleanQuery.Builder();
            bq.add(presearcherQuery, BooleanClause.Occur.MUST);
            Query filterQuery = buildFilterFields(reader);
            if (filterQuery != null) {
                bq.add(filterQuery, BooleanClause.Occur.FILTER);
                presearcherQuery = bq.build();
            }
        }
        return presearcherQuery;
    } catch (IOException e) {
        // We're a MemoryIndex, so this shouldn't happen...
        throw new RuntimeException(e);
    }
}
@Override
protected Accountable createValue(LeafReader reader, CacheKey key)
        throws IOException {
    final int maxDoc = reader.maxDoc();
    Terms terms = reader.terms(key.field);
    final float acceptableOverheadRatio = ((Float) key.custom).floatValue();
    final PagedBytes bytes = new PagedBytes(15);
    int startTermsBPV;
    // TODO: use Uninvert?
    if (terms != null) {
        // Try for coarse estimate for number of bits; this
        // should be an underestimate most of the time, which
        // is fine -- GrowableWriter will reallocate as needed
        long numUniqueTerms = terms.size();
        if (numUniqueTerms != -1L) {
            if (numUniqueTerms > maxDoc) {
                throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
            }
            startTermsBPV = PackedInts.bitsRequired(numUniqueTerms);
        } else {
            startTermsBPV = 1;
        }
    } else {
        startTermsBPV = 1;
    }
    PackedLongValues.Builder termOrdToBytesOffset = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
    final GrowableWriter docToTermOrd = new GrowableWriter(startTermsBPV, maxDoc, acceptableOverheadRatio);
    int termOrd = 0;
    // TODO: use Uninvert?
    if (terms != null) {
        final TermsEnum termsEnum = terms.iterator();
        PostingsEnum docs = null;
        while (true) {
            final BytesRef term = termsEnum.next();
            if (term == null) {
                break;
            }
            if (termOrd >= maxDoc) {
                throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
            }
            termOrdToBytesOffset.add(bytes.copyUsingLengthPrefix(term));
            docs = termsEnum.postings(docs, PostingsEnum.NONE);
            while (true) {
                final int docID = docs.nextDoc();
                if (docID == DocIdSetIterator.NO_MORE_DOCS) {
                    break;
                }
                // Store 1+ ord into packed bits
                docToTermOrd.set(docID, 1 + termOrd);
            }
            termOrd++;
        }
    }
    // maybe an int-only impl?
    return new SortedDocValuesImpl(bytes.freeze(true), termOrdToBytesOffset.build(), docToTermOrd.getMutable(), termOrd);
}
@Override
protected Accountable createValue(LeafReader reader, CacheKey key)
        throws IOException {
    // TODO: would be nice to first check if DocTermsIndex
    // was already cached for this field and then return
    // that instead, to avoid insanity
    final int maxDoc = reader.maxDoc();
    Terms terms = reader.terms(key.field);
    final float acceptableOverheadRatio = ((Float) key.custom).floatValue();
    final int termCountHardLimit = maxDoc;
    // Holds the actual term data, expanded.
    final PagedBytes bytes = new PagedBytes(15);
    int startBPV;
    if (terms != null) {
        // Try for coarse estimate for number of bits; this
        // should be an underestimate most of the time, which
        // is fine -- GrowableWriter will reallocate as needed
        long numUniqueTerms = terms.size();
        if (numUniqueTerms != -1L) {
            if (numUniqueTerms > termCountHardLimit) {
                numUniqueTerms = termCountHardLimit;
            }
            startBPV = PackedInts.bitsRequired(numUniqueTerms * 4);
        } else {
            startBPV = 1;
        }
    } else {
        startBPV = 1;
    }
    final GrowableWriter docToOffset = new GrowableWriter(startBPV, maxDoc, acceptableOverheadRatio);
    // pointer==0 means not set
    bytes.copyUsingLengthPrefix(new BytesRef());
    if (terms != null) {
        int termCount = 0;
        final TermsEnum termsEnum = terms.iterator();
        PostingsEnum docs = null;
        while (true) {
            if (termCount++ == termCountHardLimit) {
                // app is misusing the API (there is more than
                // one term per doc); in this case we make best
                // effort to load what we can (see LUCENE-2142)
                break;
            }
            final BytesRef term = termsEnum.next();
            if (term == null) {
                break;
            }
            final long pointer = bytes.copyUsingLengthPrefix(term);
            docs = termsEnum.postings(docs, PostingsEnum.NONE);
            while (true) {
                final int docID = docs.nextDoc();
                if (docID == DocIdSetIterator.NO_MORE_DOCS) {
                    break;
                }
                docToOffset.set(docID, pointer);
            }
        }
    }
    final PackedInts.Reader offsetReader = docToOffset.getMutable();
    Bits docsWithField = new Bits() {
        @Override
        public boolean get(int index) {
            return offsetReader.get(index) != 0;
        }
        @Override
        public int length() {
            return maxDoc;
        }
    };
    wrapper.setDocsWithField(reader, key.field, docsWithField, null);
    // maybe an int-only impl?
    return new BinaryDocValuesImpl(bytes.freeze(true), offsetReader, docsWithField);
}
private static SimpleOrderedMap<Object> getIndexedFieldsInfo(SolrQueryRequest req)
        throws Exception {
    SolrIndexSearcher searcher = req.getSearcher();
    SolrParams params = req.getParams();
    Set<String> fields = null;
    String fl = params.get(CommonParams.FL);
    if (fl != null) {
        fields = new TreeSet<>(Arrays.asList(fl.split("[,\\s]+")));
    }
    LeafReader reader = searcher.getSlowAtomicReader();
    IndexSchema schema = searcher.getSchema();
    // Don't be tempted to put this in the loop below, the whole point here is to alphabetize the fields!
    Set<String> fieldNames = new TreeSet<>();
    for (FieldInfo fieldInfo : reader.getFieldInfos()) {
        fieldNames.add(fieldInfo.name);
    }
    // Walk the term enum and keep a priority queue for each map in our set
    SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<>();
    for (String fieldName : fieldNames) {
        if (fields != null && !fields.contains(fieldName) && !fields.contains("*")) {
            continue; // we're not interested in this field; still an issue here
        }
        SimpleOrderedMap<Object> fieldMap = new SimpleOrderedMap<>();
        SchemaField sfield = schema.getFieldOrNull(fieldName);
        FieldType ftype = (sfield == null) ? null : sfield.getType();
        fieldMap.add("type", (ftype == null) ? null : ftype.getTypeName());
        fieldMap.add("schema", getFieldFlags(sfield));
        if (sfield != null && schema.isDynamicField(sfield.getName()) && schema.getDynamicPattern(sfield.getName()) != null) {
            fieldMap.add("dynamicBase", schema.getDynamicPattern(sfield.getName()));
        }
        Terms terms = reader.terms(fieldName);
        if (terms == null) { // Not indexed, so we need to report what we can (it made it through the fl param if specified)
            finfo.add(fieldName, fieldMap);
            continue;
        }
        if (sfield != null && sfield.indexed()) {
            if (params.getBool(INCLUDE_INDEX_FIELD_FLAGS, true)) {
                Document doc = getFirstLiveDoc(terms, reader);
                if (doc != null) {
                    // Found a document with this field
                    try {
                        IndexableField fld = doc.getField(fieldName);
                        if (fld != null) {
                            fieldMap.add("index", getFieldFlags(fld));
                        } else {
                            // it is a non-stored field...
                            fieldMap.add("index", "(unstored field)");
                        }
                    } catch (Exception ex) {
                        log.warn("error reading field: {}", fieldName);
                    }
                }
            }
            fieldMap.add("docs", terms.getDocCount());
        }
        if (fields != null && (fields.contains(fieldName) || fields.contains("*"))) {
            getDetailedFieldInfo(req, fieldName, fieldMap);
        }
        // Add the field
        finfo.add(fieldName, fieldMap);
    }
    return finfo;
}
@Override
public void execute(Namespace args, PrintStream out) throws Exception {
    String field = args.getString("field");
    String termVal = null;
    int bucketSize = args.getInt("size");
    if (field != null) {
        String[] parts = field.split(":");
        if (parts.length > 1) {
            field = parts[0];
            termVal = parts[1];
        }
    }
    IndexReader reader = ctx.getIndexReader();
    List<LeafReaderContext> leaves = reader.leaves();
    PostingsEnum postingsEnum = null;
    for (LeafReaderContext leaf : leaves) {
        LeafReader atomicReader = leaf.reader();
        Terms terms = atomicReader.terms(field);
        if (terms == null) {
            continue;
        }
        if (terms != null && termVal != null) {
            TermsEnum te = terms.iterator();
            if (te.seekExact(new BytesRef(termVal))) {
                postingsEnum = te.postings(postingsEnum, PostingsEnum.FREQS);
                int docFreq = te.docFreq();
                int minDocId = -1, maxDocId = -1;
                int doc, count = 0;
                int[] percentDocs = new int[PERCENTILES.length];
                int percentileIdx = 0;
                while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    maxDocId = doc;
                    if (minDocId == -1) {
                        minDocId = doc;
                    }
                    count++;
                    double perDocs = (double) count / (double) docFreq * 100.0;
                    while (percentileIdx < percentDocs.length) {
                        if (perDocs > PERCENTILES[percentileIdx]) {
                            percentDocs[percentileIdx] = doc;
                            percentileIdx++;
                        } else {
                            break;
                        }
                    }
                }
                // calculate histogram
                int[] buckets = null;
                if (maxDocId > 0) {
                    buckets = new int[maxDocId / bucketSize + 1];
                    postingsEnum = te.postings(postingsEnum, PostingsEnum.FREQS);
                    while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                        int bucketIdx = doc / bucketSize;
                        buckets[bucketIdx]++;
                    }
                }
                double density = (double) docFreq / (double) (maxDocId - minDocId);
                out.println(String.format("min: %d, max: %d, count: %d, density: %.2f", minDocId, maxDocId, docFreq, density));
                out.println("percentiles: " + Arrays.toString(PERCENTILES) + " => " + Arrays.toString(percentDocs));
                out.println("histogram: (bucketsize=" + bucketSize + ")");
                out.println(Arrays.toString(buckets));
            }
        }
    }
}