The following examples show how to use the org.apache.lucene.index.PostingsEnum API class; follow the links to GitHub to view the full source.
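Before the project examples, here is a minimal sketch of the life cycle every snippet below follows; the method name, field ("body") and term ("lucene") are placeholders, not taken from any of the projects. The contract to keep in mind: obtain the enum from a TermsEnum with the flags you need, call nextDoc() before reading freq(), and call nextPosition() at most freq() times per document.

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

// Minimal sketch of the PostingsEnum contract (placeholder names throughout).
static void walkPostings(LeafReader reader) throws IOException {
  Terms terms = reader.terms("body"); // null if the field does not exist in this segment
  if (terms == null) {
    return;
  }
  TermsEnum te = terms.iterator();
  if (!te.seekExact(new BytesRef("lucene"))) { // position the terms enum on a concrete term
    return;
  }
  PostingsEnum pe = te.postings(null, PostingsEnum.POSITIONS);
  for (int doc = pe.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = pe.nextDoc()) {
    int freq = pe.freq(); // only valid once nextDoc() has positioned the enum
    for (int i = 0; i < freq; i++) {
      int position = pe.nextPosition(); // at most freq() calls per document
    }
  }
}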
/** Returns docID if found, else -1. */
public int lookup(BytesRef id, long version) throws IOException {
  for (int seg = 0; seg < numSegs; seg++) {
    if (((IDVersionSegmentTermsEnum) termsEnums[seg]).seekExact(id, version)) {
      if (VERBOSE) {
        System.out.println(" found in seg=" + termsEnums[seg]);
      }
      postingsEnums[seg] = termsEnums[seg].postings(postingsEnums[seg], 0);
      int docID = postingsEnums[seg].nextDoc();
      if (docID != PostingsEnum.NO_MORE_DOCS && (liveDocs[seg] == null || liveDocs[seg].get(docID))) {
        lastVersion = ((IDVersionSegmentTermsEnum) termsEnums[seg]).getVersion();
        return docBases[seg] + docID;
      }
      assert hasDeletions;
    }
  }
  return -1;
}
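A note on the flag literal used above: postings(postingsEnums[seg], 0) passes 0, which is exactly PostingsEnum.NONE, i.e. enumerate documents only, with no freqs, positions, offsets or payloads. The equivalent, more explicit spelling:

// 0 == PostingsEnum.NONE: docs only, no per-position data.
postingsEnums[seg] = termsEnums[seg].postings(postingsEnums[seg], PostingsEnum.NONE);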
/**
 * Collect collection.
 *
 * @param reader the reader
 * @param docSet the doc set
 * @param collectionInfo the collection info
 * @throws IOException Signals that an I/O exception has occurred.
 */
public static void collectCollection(IndexReader reader, List<Integer> docSet,
    ComponentCollection collectionInfo) throws IOException {
  if (collectionInfo.action().equals(ComponentCollection.ACTION_CHECK)) {
    // can't do anything in lucene for check
  } else if (collectionInfo.action().equals(ComponentCollection.ACTION_LIST)) {
    // can't do anything in lucene for list
  } else if (collectionInfo.action().equals(ComponentCollection.ACTION_CREATE)) {
    BytesRef term = null;
    PostingsEnum postingsEnum = null;
    Integer docId;
    Integer termDocId = -1;
    Terms terms;
    LeafReaderContext lrc;
    LeafReader r;
    ListIterator<LeafReaderContext> iterator = reader.leaves().listIterator();
    while (iterator.hasNext()) {
      lrc = iterator.next();
      r = lrc.reader();
      for (String field : collectionInfo.fields()) {
        if ((terms = r.terms(field)) != null) {
          TermsEnum termsEnum = terms.iterator();
          while ((term = termsEnum.next()) != null) {
            Iterator<Integer> docIterator = docSet.iterator();
            postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
            termDocId = -1;
            while (docIterator.hasNext()) {
              docId = docIterator.next() - lrc.docBase;
              if ((docId >= termDocId) && ((docId.equals(termDocId))
                  || ((termDocId = postingsEnum.advance(docId)).equals(docId)))) {
                collectionInfo.addValue(term.utf8ToString());
                break;
              }
              if (termDocId.equals(PostingsEnum.NO_MORE_DOCS)) {
                break;
              }
            }
          }
        }
      }
    }
  }
}
/**
 * Creates the TermsEnum (if not already created) and must be called before any calls to getBackgroundFrequency.
 * @param context The aggregation context
 * @return The number of documents in the index (after an optional filter might have been applied)
 */
public long prepareBackground(AggregationContext context) {
  if (termsEnum != null) {
    // already prepared - return
    return termsEnum.getNumDocs();
  }
  SearchContext searchContext = context.searchContext();
  IndexReader reader = searchContext.searcher().getIndexReader();
  try {
    if (numberOfAggregatorsCreated == 1) {
      // Setup a termsEnum for sole use by one aggregator
      termsEnum = new FilterableTermsEnum(reader, indexedFieldName, PostingsEnum.NONE, filter);
    } else {
      // When we have > 1 agg we have possibility of duplicate term frequency lookups
      // and so use a TermsEnum that caches results of all term lookups
      termsEnum = new FreqTermsEnum(reader, indexedFieldName, true, false, filter, searchContext.bigArrays());
    }
  } catch (IOException e) {
    throw new ElasticsearchException("failed to build terms enumeration", e);
  }
  return termsEnum.getNumDocs();
}
private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter, BoostAttribute boostAtt) throws IOException {
  // start term, optimized writing
  BytesRef term = termIter.next();
  spare.copyUTF8Bytes(term);
  builder.startObject(spare.toString());
  buildTermStatistics(builder, termIter);
  // finally write the term vectors
  PostingsEnum posEnum = termIter.postings(null, PostingsEnum.ALL);
  // position the enum on the single document backing this term vector;
  // freq()/nextPosition() are undefined before nextDoc() has been called
  posEnum.nextDoc();
  int termFreq = posEnum.freq();
  builder.field(FieldStrings.TERM_FREQ, termFreq);
  initMemory(curTerms, termFreq);
  initValues(curTerms, posEnum, termFreq);
  buildValues(builder, curTerms, termFreq);
  buildScore(builder, boostAtt);
  builder.endObject();
}
private void initValues(Terms curTerms, PostingsEnum posEnum, int termFreq) throws IOException {
  for (int j = 0; j < termFreq; j++) {
    int nextPos = posEnum.nextPosition();
    if (curTerms.hasPositions()) {
      currentPositions[j] = nextPos;
    }
    if (curTerms.hasOffsets()) {
      currentStartOffset[j] = posEnum.startOffset();
      currentEndOffset[j] = posEnum.endOffset();
    }
    if (curTerms.hasPayloads()) {
      BytesRef curPayload = posEnum.getPayload();
      if (curPayload != null) {
        currentPayloads[j] = new BytesArray(curPayload.bytes, 0, curPayload.length);
      } else {
        currentPayloads[j] = null;
      }
    }
  }
}
@Override
public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException {
  if (!fieldMatcher.test(term.field())) {
    return;
  }
  SpanCollectedOffsetsEnum offsetsEnum = termToOffsetsEnums.get(term.bytes());
  if (offsetsEnum == null) {
    // If it's pos insensitive we handle it outside of PhraseHelper. term.field() is from the Query.
    if (positionInsensitiveTerms.contains(term.bytes())) {
      return;
    }
    offsetsEnum = new SpanCollectedOffsetsEnum(term.bytes(), postings.freq());
    termToOffsetsEnums.put(term.bytes(), offsetsEnum);
  }
  offsetsEnum.add(postings.startOffset(), postings.endOffset());
}
private static Document getFirstLiveDoc(Terms terms, LeafReader reader) throws IOException {
  PostingsEnum postingsEnum = null;
  TermsEnum termsEnum = terms.iterator();
  BytesRef text;
  final Bits liveDocs = reader.getLiveDocs();
  // Deal with the chance that the first bunch of terms are in deleted documents. Is there a better way?
  for (int idx = 0; idx < 1000; ++idx) {
    text = termsEnum.next();
    if (text == null) { // Ran off the end of the terms enum without finding any live docs with that field in them.
      return null;
    }
    postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
    if (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      // skip deleted docs; a null liveDocs means no deletions in this reader
      if (liveDocs != null && !liveDocs.get(postingsEnum.docID())) {
        continue;
      }
      return reader.document(postingsEnum.docID());
    }
  }
  return null;
}
/**
 * Returns a new term vector entry representing the specified term, and optionally, positions.
 *
 * @param te - positioned terms iterator
 * @return term vector entry
 * @throws IOException - if there is a low level IO error.
 */
static TermVectorEntry of(TermsEnum te) throws IOException {
  Objects.requireNonNull(te);
  String termText = BytesRefUtils.decode(te.term());
  List<TermVectorEntry.TermVectorPosition> tvPositions = new ArrayList<>();
  PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
  pe.nextDoc();
  int freq = pe.freq();
  for (int i = 0; i < freq; i++) {
    int pos = pe.nextPosition();
    if (pos < 0) {
      // no position information available
      continue;
    }
    TermVectorPosition tvPos = TermVectorPosition.of(pos, pe);
    tvPositions.add(tvPos);
  }
  return new TermVectorEntry(termText, te.totalTermFreq(), tvPositions);
}
static TermPosting of(int position, PostingsEnum penum) throws IOException {
  TermPosting posting = new TermPosting();
  // set position
  posting.position = position;
  // set offset (if available)
  int sOffset = penum.startOffset();
  int eOffset = penum.endOffset();
  if (sOffset >= 0 && eOffset >= 0) {
    posting.startOffset = sOffset;
    posting.endOffset = eOffset;
  }
  // set payload (if available)
  if (penum.getPayload() != null) {
    posting.payload = BytesRef.deepCopyOf(penum.getPayload());
  }
  return posting;
}
@Override
public Optional<Integer> firstTermDoc() {
  if (tenum == null) {
    // terms enum is not set
    log.warn("Terms enum un-positioned.");
    return Optional.empty();
  }
  try {
    setPostingsIterator(tenum.postings(penum, PostingsEnum.ALL));
    if (penum.nextDoc() == PostingsEnum.NO_MORE_DOCS) {
      // no docs available for this term
      resetPostingsIterator();
      log.warn("No docs available for term: {} in field: {}.", BytesRefUtils.decode(tenum.term()), curField);
      return Optional.empty();
    } else {
      return Optional.of(penum.docID());
    }
  } catch (IOException e) {
    resetPostingsIterator();
    throw new LukeException(String.format(Locale.ENGLISH, "Term docs not available for field: %s.", curField), e);
  }
}
@Override
public Optional<Integer> nextTermDoc() {
  if (penum == null) {
    // postings enum is not initialized
    log.warn("Postings enum un-positioned for field: {}.", curField);
    return Optional.empty();
  }
  try {
    if (penum.nextDoc() == PostingsEnum.NO_MORE_DOCS) {
      // end of the iterator
      resetPostingsIterator();
      if (log.isInfoEnabled()) {
        log.info("Reached the end of the postings iterator for term: {} in field: {}", BytesRefUtils.decode(tenum.term()), curField);
      }
      return Optional.empty();
    } else {
      return Optional.of(penum.docID());
    }
  } catch (IOException e) {
    resetPostingsIterator();
    throw new LukeException(String.format(Locale.ENGLISH, "Term docs not available for field: %s.", curField), e);
  }
}
@Override
public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException {
  if (!matches)
    return;
  if (upto >= payloadToMatch.size()) {
    matches = false;
    return;
  }
  BytesRef payload = postings.getPayload();
  if (payloadToMatch.get(upto) == null) {
    matches = payload == null;
    upto++;
    return;
  }
  if (payload == null) {
    matches = false;
    upto++;
    return;
  }
  matches = payloadToMatch.get(upto).bytesEquals(payload);
  upto++;
}
private static void findSplitDocs(String idField, Predicate<BytesRef> includeInShard, LeafReader leafReader,
    IntConsumer consumer) throws IOException {
  Terms terms = leafReader.terms(idField);
  TermsEnum iterator = terms.iterator();
  BytesRef idTerm;
  PostingsEnum postingsEnum = null;
  while ((idTerm = iterator.next()) != null) {
    if (includeInShard.test(idTerm) == false) {
      postingsEnum = iterator.postings(postingsEnum);
      int doc;
      while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        consumer.accept(doc);
      }
    }
  }
}
public PostingsAndFreq(PostingsEnum postings, ImpactsEnum impacts, int position, Term... terms) {
  this.postings = postings;
  this.impacts = impacts;
  this.position = position;
  nTerms = terms == null ? 0 : terms.length;
  if (nTerms > 0) {
    if (terms.length == 1) {
      this.terms = terms;
    } else {
      Term[] terms2 = new Term[terms.length];
      System.arraycopy(terms, 0, terms2, 0, terms.length);
      Arrays.sort(terms2);
      this.terms = terms2;
    }
  } else {
    this.terms = null;
  }
}
public static PostingsEnum docs(Random random, TermsEnum termsEnum, PostingsEnum reuse, int flags) throws IOException {
  // TODO: simplify this method? it would be easier to either use the flags passed in, or do the random selection;
  // should FREQS be part of the random selection instead of handled on its own?
  if (random.nextBoolean()) {
    if (random.nextBoolean()) {
      final int posFlags;
      switch (random.nextInt(4)) {
        case 0: posFlags = PostingsEnum.POSITIONS; break;
        case 1: posFlags = PostingsEnum.OFFSETS; break;
        case 2: posFlags = PostingsEnum.PAYLOADS; break;
        default: posFlags = PostingsEnum.ALL; break;
      }
      return termsEnum.postings(null, posFlags);
    }
    flags |= PostingsEnum.FREQS;
  }
  return termsEnum.postings(reuse, flags);
}
/**
 * Create a {@link DisjunctionMatchesIterator} over a list of terms extracted from a {@link BytesRefIterator}.
 *
 * Only terms that have at least one match in the given document will be included.
 */
static MatchesIterator fromTermsEnum(LeafReaderContext context, int doc, Query query, String field, BytesRefIterator terms) throws IOException {
  Objects.requireNonNull(field);
  Terms t = context.reader().terms(field);
  if (t == null)
    return null;
  TermsEnum te = t.iterator();
  PostingsEnum reuse = null;
  for (BytesRef term = terms.next(); term != null; term = terms.next()) {
    if (te.seekExact(term)) {
      PostingsEnum pe = te.postings(reuse, PostingsEnum.OFFSETS);
      if (pe.advance(doc) == doc) {
        return new TermsEnumDisjunctionMatchesIterator(new TermMatchesIterator(query, pe), terms, te, doc, query);
      } else {
        reuse = pe;
      }
    }
  }
  return null;
}
private void init() throws IOException {
  List<MatchesIterator> mis = new ArrayList<>();
  mis.add(first);
  PostingsEnum reuse = null;
  for (BytesRef term = terms.next(); term != null; term = terms.next()) {
    if (te.seekExact(term)) {
      PostingsEnum pe = te.postings(reuse, PostingsEnum.OFFSETS);
      if (pe.advance(doc) == doc) {
        mis.add(new TermMatchesIterator(query, pe));
        reuse = null;
      } else {
        reuse = pe;
      }
    }
  }
  it = fromSubIterators(mis);
}
protected void fillDocsAndScores(FixedBitSet matchingDocs, TermsEnum termsEnum) throws IOException {
  BytesRef spare = new BytesRef();
  PostingsEnum postingsEnum = null;
  for (int i = 0; i < terms.size(); i++) {
    if (termsEnum.seekExact(terms.get(ords[i], spare))) {
      postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
      float score = TermsIncludingScoreQuery.this.scores[ords[i]];
      for (int doc = postingsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postingsEnum.nextDoc()) {
        matchingDocs.set(doc);
        // In the case the same doc is also related to another doc, a score might be overwritten.
        // I think this can only happen in a many-to-many relation.
        scores[doc] = score;
      }
    }
  }
}
@Override
protected void fillDocsAndScores(FixedBitSet matchingDocs, TermsEnum termsEnum) throws IOException {
  BytesRef spare = new BytesRef();
  PostingsEnum postingsEnum = null;
  for (int i = 0; i < terms.size(); i++) {
    if (termsEnum.seekExact(terms.get(ords[i], spare))) {
      postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
      float score = TermsIncludingScoreQuery.this.scores[ords[i]];
      for (int doc = postingsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postingsEnum.nextDoc()) {
        // I prefer this:
        /*if (scores[doc] < score) {
          scores[doc] = score;
          matchingDocs.set(doc);
        }*/
        // But this behaves the same as MVInnerScorer and only then the tests will pass:
        if (!matchingDocs.get(doc)) {
          scores[doc] = score;
          matchingDocs.set(doc);
        }
      }
    }
  }
}
@Override
public PostingsEnum postings(FieldInfo fieldInfo, BlockTermState termState, PostingsEnum reuse, int flags) throws IOException {
  SingleDocsEnum docsEnum;
  if (PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS)) {
    SinglePostingsEnum posEnum;
    if (reuse instanceof SinglePostingsEnum) {
      posEnum = (SinglePostingsEnum) reuse;
    } else {
      posEnum = new SinglePostingsEnum();
    }
    IDVersionTermState _termState = (IDVersionTermState) termState;
    posEnum.reset(_termState.docID, _termState.idVersion);
    return posEnum;
  }
  if (reuse instanceof SingleDocsEnum) {
    docsEnum = (SingleDocsEnum) reuse;
  } else {
    docsEnum = new SingleDocsEnum();
  }
  docsEnum.reset(((IDVersionTermState) termState).docID);
  return docsEnum;
}
public PostingsAndFreq(PostingsEnum postings, ImpactsEnum impacts, int position, List<Term> terms) {
  this.postings = postings;
  this.impacts = impacts;
  this.position = position;
  nTerms = terms == null ? 0 : terms.size();
  if (nTerms > 0) {
    Term[] terms2 = terms.toArray(new Term[0]);
    if (nTerms > 1) {
      Arrays.sort(terms2);
    }
    this.terms = terms2;
  } else {
    this.terms = null;
  }
}
private int convertToLuceneFlags(int flags) {
  int lucenePositionsFlags = PostingsEnum.NONE;
  lucenePositionsFlags |= (flags & IndexLookup.FLAG_FREQUENCIES) > 0 ? PostingsEnum.FREQS : 0x0;
  lucenePositionsFlags |= (flags & IndexLookup.FLAG_POSITIONS) > 0 ? PostingsEnum.POSITIONS : 0x0;
  lucenePositionsFlags |= (flags & IndexLookup.FLAG_PAYLOADS) > 0 ? PostingsEnum.PAYLOADS : 0x0;
  lucenePositionsFlags |= (flags & IndexLookup.FLAG_OFFSETS) > 0 ? PostingsEnum.OFFSETS : 0x0;
  return lucenePositionsFlags;
}
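A hypothetical call site for the mapping above (the flag combination is illustrative, not taken from the plugin source):

// Hypothetical usage: request positions and payloads, then open postings with the mapped Lucene flags.
int luceneFlags = convertToLuceneFlags(IndexLookup.FLAG_POSITIONS | IndexLookup.FLAG_PAYLOADS);
PostingsEnum postings = termsEnum.postings(null, luceneFlags);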
public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer));
  Document doc = new Document();
  TokenStream tokenStream = analyzer.tokenStream("field", "abcd ");
  TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenStream);
  TokenStream sink = tee.newSinkTokenStream();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorOffsets(true);
  ft.setStoreTermVectorPositions(true);
  Field f1 = new Field("field", tee, ft);
  Field f2 = new Field("field", sink, ft);
  doc.add(f1);
  doc.add(f2);
  w.addDocument(doc);
  w.close();
  IndexReader r = DirectoryReader.open(dir);
  Terms vector = r.getTermVectors(0).terms("field");
  assertEquals(1, vector.size());
  TermsEnum termsEnum = vector.iterator();
  termsEnum.next();
  assertEquals(2, termsEnum.totalTermFreq());
  PostingsEnum positions = termsEnum.postings(null, PostingsEnum.ALL);
  assertTrue(positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(2, positions.freq());
  positions.nextPosition();
  assertEquals(0, positions.startOffset());
  assertEquals(4, positions.endOffset());
  positions.nextPosition();
  assertEquals(8, positions.startOffset());
  assertEquals(12, positions.endOffset());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.nextDoc());
  r.close();
  dir.close();
  analyzer.close();
}
@Override
public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException {
  BytesRef payload = postings.getPayload();
  if (payload == null)
    return;
  final byte[] bytes = new byte[payload.length];
  System.arraycopy(payload.bytes, payload.offset, bytes, 0, payload.length);
  payloads.add(bytes);
}
public TermSpans(LeafSimScorer scorer,
    PostingsEnum postings, Term term, float positionsCost) {
  this.postings = Objects.requireNonNull(postings);
  this.term = Objects.requireNonNull(term);
  this.doc = -1;
  this.position = -1;
  assert positionsCost > 0; // otherwise the TermSpans should not be created.
  this.positionsCost = positionsCost;
}
private Map<Integer, String> getTermVectorWithException(String field, String id) throws IOException {
  TermVectorsResponse response = client.prepareTermVector(indexName, documentType, id)
      .setOffsets(false).setPositions(true).setFieldStatistics(false)
      .setTermStatistics(false)
      .setSelectedFields(field)
      .execute().actionGet();
  Map<Integer, String> map = new HashMap<>();
  Terms terms = response.getFields().terms(field);
  if (terms == null) {
    return map;
  }
  TermsEnum iterator = terms.iterator();
  PostingsEnum postings = null;
  for (BytesRef termBytes = null; (termBytes = iterator.next()) != null; ) {
    String term = termBytes.utf8ToString();
    postings = iterator.postings(postings, PostingsEnum.ALL);
    // there can only be one doc since we are fetching by id; get the doc and the positions
    postings.nextDoc();
    int tf = postings.freq();
    for (int i = 0; i < tf; i++) {
      int pos = postings.nextPosition();
      map.put(pos, term);
    }
  }
  return map;
}
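Because the returned map is keyed by position, a caller can for instance rebuild the token order of the field. This usage is illustrative only; the field name and document id are made up:

// Illustrative only: reconstruct the token sequence from the position -> term map.
Map<Integer, String> posToTerm = getTermVectorWithException("title", "42");
String tokenOrder = posToTerm.entrySet().stream()
    .sorted(Map.Entry.comparingByKey())
    .map(Map.Entry::getValue)
    .collect(java.util.stream.Collectors.joining(" "));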
private static DocSet createBigSet(List<LeafReaderContext> leaves, PostingsEnum[] postList, int maxDoc, int firstReader) throws IOException {
  long[] bits = new long[FixedBitSet.bits2words(maxDoc)];
  int sz = 0;
  for (int i = firstReader; i < postList.length; i++) {
    PostingsEnum postings = postList[i];
    if (postings == null) continue;
    LeafReaderContext ctx = leaves.get(i);
    Bits liveDocs = ctx.reader().getLiveDocs();
    int base = ctx.docBase;
    for (;;) {
      int subId = postings.nextDoc();
      if (subId == DocIdSetIterator.NO_MORE_DOCS) break;
      if (liveDocs != null && !liveDocs.get(subId)) continue;
      int globalId = subId + base;
      bits[globalId >> 6] |= (1L << globalId);
      sz++;
    }
  }
  BitDocSet docSet = new BitDocSet(new FixedBitSet(bits, maxDoc), sz);
  int smallSetSize = smallSetSize(maxDoc);
  if (sz < smallSetSize) {
    // make this optional?
    DocSet smallSet = toSmallSet(docSet);
    // assert equals(docSet, smallSet);
    return smallSet;
  }
  return docSet;
}
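One detail worth spelling out in createBigSet: Java masks the shift count of a long shift to its low six bits, so bits[globalId >> 6] |= (1L << globalId) selects word globalId / 64 and sets bit globalId % 64 without an explicit mask:

// Java masks long shift counts to 6 bits: (1L << n) == (1L << (n & 63)).
int globalId = 70;
assert (1L << globalId) == (1L << (globalId & 63)); // bit 6 within word 1
assert (globalId >> 6) == globalId / 64;            // word index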
@Before
public void setUp() throws IOException {
  initMocks(this);
  // Link up the mocks.
  when(mockTerms.iterator()).thenReturn(mockTermsEnum);
  when(mockTermsEnum.postings(null, PostingsEnum.NONE)).thenReturn(mockPostingsEnum);
}
@Override
public Terms terms(String field) throws IOException {
  // ensure the underlying PostingsEnum returns offsets. It's sad we have to do this to use the SpanCollector.
  return new FilterTerms(super.terms(fieldName)) {
    @Override
    public TermsEnum iterator() throws IOException {
      return new FilterTermsEnum(in.iterator()) {
        @Override
        public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
          return super.postings(reuse, flags | PostingsEnum.OFFSETS);
        }
      };
    }
  };
}
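To see what the wrapper buys its callers, here is a hypothetical usage (the reader variable and field name are assumptions, not from the source): whatever flags are requested, the postings were opened with OFFSETS ORed in, so the offset accessors work after nextPosition(), provided offsets were indexed for the field.

// Hypothetical caller: even requesting PostingsEnum.NONE yields offsets, because the wrapper ORs in OFFSETS.
Terms wrapped = offsetForcingReader.terms("anyField");
TermsEnum te = wrapped.iterator();
if (te.next() != null) {
  PostingsEnum pe = te.postings(null, PostingsEnum.NONE);
  if (pe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
    pe.nextPosition();
    int start = pe.startOffset(); // valid here, assuming offsets were indexed
  }
}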