Below are code examples that use org.apache.lucene.index.PostingsEnum#NO_MORE_DOCS, collected from open source projects.
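PostingsEnum extends DocIdSetIterator, so NO_MORE_DOCS (Integer.MAX_VALUE) is the sentinel that nextDoc() and advance() return once the postings are exhausted. Before the project examples, here is a minimal, self-contained sketch of the canonical iteration idiom; the class and method names are hypothetical, and the reader and term are assumed to be supplied by the caller:

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;

public final class PostingsIterationSketch {
  // Walks every document containing 'term' in one segment and prints its frequency.
  static void printDocsForTerm(LeafReader reader, Term term) throws IOException {
    PostingsEnum postings = reader.postings(term); // null when the term does not exist
    if (postings == null) {
      return;
    }
    for (int docId = postings.nextDoc();
        docId != PostingsEnum.NO_MORE_DOCS;
        docId = postings.nextDoc()) {
      System.out.println("doc=" + docId + " freq=" + postings.freq());
    }
  }
}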
@Override
public FieldFeatureExtractor[] create(LeafReaderContext context, Set<Integer> allDocs) throws IOException {
  FieldFeatureExtractor[] extractors = new FieldFeatureExtractor[terms.length];
  int i = 0;
  for (Term term : terms) {
    final TermsEnum termsEnum = getTermsEnum(context, term);
    if (termsEnum == null) {
      extractors[i] = new FieldFeatureNullExtractor();
    } else {
      extractors[i] = new FieldFeatureTFExtractor(termsEnum.postings(null, PostingsEnum.FREQS));
      // Get a second, independent enum (no reuse) so the extractor's enum stays untouched;
      // see the note on the reuse parameter after this method.
      PostingsEnum docs = termsEnum.postings(null, PostingsEnum.FREQS);
      for (int docId = docs.nextDoc(); docId != PostingsEnum.NO_MORE_DOCS; docId = docs.nextDoc()) {
        allDocs.add(docId);
      }
    }
    i++;
  }
  return extractors;
}
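The comment above leans on the reuse contract of TermsEnum.postings(PostingsEnum reuse, int flags): passing null always yields an enum independent of previously returned ones, while passing an old enum allows Lucene to recycle it. A minimal sketch of the distinction, assuming a positioned TermsEnum (names hypothetical):

static void reuseDemo(TermsEnum termsEnum) throws IOException {
  // Independent enums: both may be iterated side by side.
  PostingsEnum first = termsEnum.postings(null, PostingsEnum.FREQS);
  PostingsEnum second = termsEnum.postings(null, PostingsEnum.FREQS);
  // Recycled enum: 'recycled' may be the very same object as 'first',
  // so 'first' must not be used after this call.
  PostingsEnum recycled = termsEnum.postings(first, PostingsEnum.FREQS);
}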
private static Document getDocument(LeafReader reader, Term term) throws IOException {
  PostingsEnum docs = reader.postings(term);
  if (docs != null) {
    Bits liveDocs = reader.getLiveDocs();
    int found = -1;
    // The PostingsEnum may still contain deleted documents, so we have to cope with them:
    // skip any doc that is not live, and only complain if a second live doc shows up.
    for (int docId = docs.nextDoc(); docId != PostingsEnum.NO_MORE_DOCS; docId = docs.nextDoc()) {
      if (liveDocs != null && !liveDocs.get(docId)) {
        continue; // deleted document, skip it
      }
      if (found != -1) {
        throw new IllegalStateException("Multiple Documents for term " + term.text());
      }
      found = docId;
    }
    if (found != -1) {
      return readDocument(reader, found, null);
    }
  }
  return null;
}
@Override
public Optional<Integer> firstTermDoc() {
  if (tenum == null) {
    // terms enum is not set
    log.warn("Terms enum un-positioned.");
    return Optional.empty();
  }
  try {
    setPostingsIterator(tenum.postings(penum, PostingsEnum.ALL));
    if (penum.nextDoc() == PostingsEnum.NO_MORE_DOCS) {
      // no docs available for this term
      resetPostingsIterator();
      log.warn("No docs available for term: {} in field: {}.", BytesRefUtils.decode(tenum.term()), curField);
      return Optional.empty();
    } else {
      return Optional.of(penum.docID());
    }
  } catch (IOException e) {
    resetPostingsIterator();
    throw new LukeException(String.format(Locale.ENGLISH, "Term docs not available for field: %s.", curField), e);
  }
}
@Override
public Optional<Integer> nextTermDoc() {
  if (penum == null) {
    // postings enum is not initialized
    log.warn("Postings enum un-positioned for field: {}.", curField);
    return Optional.empty();
  }
  try {
    if (penum.nextDoc() == PostingsEnum.NO_MORE_DOCS) {
      // end of the iterator
      resetPostingsIterator();
      if (log.isInfoEnabled()) {
        log.info("Reached the end of the postings iterator for term: {} in field: {}", BytesRefUtils.decode(tenum.term()), curField);
      }
      return Optional.empty();
    } else {
      return Optional.of(penum.docID());
    }
  } catch (IOException e) {
    resetPostingsIterator();
    throw new LukeException(String.format(Locale.ENGLISH, "Term docs not available for field: %s.", curField), e);
  }
}
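The two methods above are designed to be paired: firstTermDoc positions the postings iterator on the current term's first document and nextTermDoc advances it until the Optional comes back empty. A hypothetical driver loop, assuming an object named operator that exposes both methods:

// Walk every document for the currently selected term.
Optional<Integer> doc = operator.firstTermDoc();
while (doc.isPresent()) {
  System.out.println("doc id: " + doc.get());
  doc = operator.nextTermDoc();
}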
/** Returns docID if found, else -1. */
public int lookup(BytesRef id, long version) throws IOException {
  for (int seg = 0; seg < numSegs; seg++) {
    if (((IDVersionSegmentTermsEnum) termsEnums[seg]).seekExact(id, version)) {
      if (VERBOSE) {
        System.out.println("  found in seg=" + termsEnums[seg]);
      }
      postingsEnums[seg] = termsEnums[seg].postings(postingsEnums[seg], 0);
      int docID = postingsEnums[seg].nextDoc();
      if (docID != PostingsEnum.NO_MORE_DOCS && (liveDocs[seg] == null || liveDocs[seg].get(docID))) {
        lastVersion = ((IDVersionSegmentTermsEnum) termsEnums[seg]).getVersion();
        return docBases[seg] + docID;
      }
      assert hasDeletions;
    }
  }
  return -1;
}
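The docBases[seg] + docID addition at the end converts a segment-local doc id into an index-wide one. The same mapping is available from a composite reader's leaves, as in this minimal sketch (reader, seg, and localDocId are assumed to be in scope):

// Each LeafReaderContext carries the offset (docBase) of its segment's first doc,
// so a segment-local hit maps to an index-wide id by simple addition.
LeafReaderContext leaf = reader.leaves().get(seg);
int globalDocId = leaf.docBase + localDocId;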
/**
 * Gets the log-entropy weight (i.e. 1 + sum of p*log(p)) of a term,
 * a function that favors terms that are focally distributed.
 * We use the definition of log-entropy weighting provided in
 * Martin and Berry (2007):
 * Entropy = 1 + sum((Pij * log2(Pij)) / log2(n))
 * where Pij = frequency of term i in doc j / global frequency of term i,
 * and n = number of documents in the collection.
 * @param term the term whose entropy weight is wanted
 * Thanks to Vidya Vasuki for adding the hash table to
 * eliminate redundant calculation
 */
private float getEntropy(Term term) {
  if (termEntropy.containsKey(term.field() + "_" + term.text()))
    return termEntropy.get(term.field() + "_" + term.text());
  int gf = getGlobalTermFreq(term);
  double entropy = 0;
  try {
    PostingsEnum docsEnum = this.getDocsForTerm(term);
    while ((docsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
      double p = docsEnum.freq(); // tf: frequency of the term in this document
      p = p / gf;                 // Pij: local frequency divided by global frequency
      entropy += p * (Math.log(p) / Math.log(2)); // accumulate Pij * log2(Pij)
    }
    int n = this.getNumDocs();
    double log2n = Math.log(n) / Math.log(2);
    entropy = entropy / log2n;
  } catch (IOException e) {
    logger.info("Couldn't get term entropy for term " + term.text());
  }
  termEntropy.put(term.field() + "_" + term.text(), 1 + (float) entropy);
  return (float) (1 + entropy);
}
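In standard notation, the weight the method computes is (rendering the javadoc's formula, symbols as defined there):

$$w_i \;=\; 1 + \sum_{j} \frac{P_{ij}\,\log_2 P_{ij}}{\log_2 n}, \qquad P_{ij} = \frac{\mathrm{tf}_{ij}}{\mathrm{gf}_i}$$

As a quick check: a term occurring once in each of two documents of an n = 4 collection has Pij = 0.5 twice, so the sum is 2 * 0.5 * log2(0.5) = -1; dividing by log2(4) = 2 gives -0.5, for a weight of 0.5. A term concentrated in a single document has Pij = 1 and gets the full weight of 1, while a term spread uniformly over all n documents gets a weight of 0, which is the "favors focally distributed terms" behavior the javadoc describes.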
private static void addDocuments(LeafReader reader, Term term, Collection<Document> documents) throws IOException {
  PostingsEnum docs = reader.postings(term);
  if (docs != null) {
    Bits liveDocs = reader.getLiveDocs();
    int docId;
    while ((docId = docs.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
      // Some of the docs may have been deleted; check that too.
      if (liveDocs != null && !liveDocs.get(docId)) {
        continue;
      }
      Document document = readDocument(reader, docId, null);
      documents.add(document);
    }
  }
}
private List<MWESentenceContext> collectTermOffsets(Terms termVectorLookup) throws IOException {
  List<MWESentenceContext> result = new ArrayList<>();
  TermsEnum tiRef = termVectorLookup.iterator();
  BytesRef luceneTerm = tiRef.next();
  while (luceneTerm != null) {
    if (luceneTerm.length == 0) {
      luceneTerm = tiRef.next();
      continue;
    }
    String tString = luceneTerm.utf8ToString();
    if (!allCandidates.contains(tString)) {
      luceneTerm = tiRef.next();
      continue;
    }
    PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
    //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);
    int doc = postingsEnum.nextDoc(); // should be just one doc, i.e., the one this term vector belongs to
    if (doc != PostingsEnum.NO_MORE_DOCS) {
      int totalOccurrence = postingsEnum.freq();
      for (int i = 0; i < totalOccurrence; i++) {
        postingsEnum.nextPosition();
        int start = postingsEnum.startOffset();
        int end = postingsEnum.endOffset();
        BytesRef payload = postingsEnum.getPayload();
        int sentenceId = -1;
        if (payload != null) {
          sentenceId = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString())).getSentenceId();
        }
        result.add(new MWESentenceContext(tString, sentenceId, start, end));
      }
    }
    luceneTerm = tiRef.next();
  }
  Collections.sort(result);
  return result;
}
/**
 * Returns an IntsRef of doc ids, either from the cache or by reading postingsEnum. Never null.
 * @param postingsEnum the postings to read when there is no cached entry
 */
private IntsRef postingsEnumToIntsRef(PostingsEnum postingsEnum, Bits liveDocs) throws IOException {
  // (The cache can have empty IntsRefs)
  // look up prefixBuf in the cache
  if (docIdsCache != null) {
    docIds = docIdsCache.get(prefixBuf);
    if (docIds != null) {
      return docIds;
    }
  }
  // read postingsEnum
  docIds = new IntsRef(termsEnum.docFreq());
  int docId;
  while ((docId = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
    if (liveDocs != null && !liveDocs.get(postingsEnum.docID())) {
      continue;
    }
    docIds.ints[docIds.length++] = docId;
  }
  if (docIds.length == 0)
    docIds = EMPTY_INTSREF;
  // cache
  if (docIdsCache != null) {
    ensureBufIsACopy();
    // clone is shallow; that's okay as the prefix isn't overwritten, just appended to
    docIdsCache.put(prefixBuf.clone(), docIds);
  }
  return docIds;
}
private static boolean next(PostingsEnum docs) throws IOException {
  return docs.nextDoc() != PostingsEnum.NO_MORE_DOCS;
}
/** Used when base query is highly constraining vs the
 *  drilldowns, or when the docs must be scored at once
 *  (i.e., like BooleanScorer2, not BooleanScorer). In
 *  this case we just .next() on base and .advance() on
 *  the dim filters. */
private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims) throws IOException {
  //if (DEBUG) {
  //  System.out.println("  doQueryFirstScoring");
  //}
  int docID = baseScorer.docID();
  nextDoc: while (docID != PostingsEnum.NO_MORE_DOCS) {
    if (acceptDocs != null && acceptDocs.get(docID) == false) {
      docID = baseIterator.nextDoc();
      continue;
    }
    LeafCollector failedCollector = null;
    for (DocsAndCost dim : dims) {
      // TODO: should we sort this 2nd dimension of
      // docsEnums from most frequent to least?
      if (dim.approximation.docID() < docID) {
        dim.approximation.advance(docID);
      }
      boolean matches = false;
      if (dim.approximation.docID() == docID) {
        if (dim.twoPhase == null) {
          matches = true;
        } else {
          matches = dim.twoPhase.matches();
        }
      }
      if (matches == false) {
        if (failedCollector != null) {
          // More than one dim fails on this document, so
          // it's neither a hit nor a near-miss; move to
          // next doc:
          docID = baseIterator.nextDoc();
          continue nextDoc;
        } else {
          failedCollector = dim.sidewaysLeafCollector;
        }
      }
    }
    collectDocID = docID;
    // TODO: we could score on demand instead since we are
    // daat here:
    collectScore = baseScorer.score();
    if (failedCollector == null) {
      // Hit passed all filters, so it's "real":
      collectHit(collector, dims);
    } else {
      // Hit missed exactly one filter:
      collectNearMiss(failedCollector);
    }
    docID = baseIterator.nextDoc();
  }
}
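The dim.approximation.advance(docID) step above is the standard DocIdSetIterator alignment idiom: advance(target) jumps to the first doc id at or beyond target (or NO_MORE_DOCS), so equality afterwards signals a candidate match. A stripped-down sketch of the same idiom, with base and other standing in for the two iterators (both hypothetical DocIdSetIterators):

// Align 'other' with the candidate doc produced by 'base'.
int docID = base.nextDoc();
if (other.docID() < docID) {
  other.advance(docID); // lands on the first doc >= docID, or NO_MORE_DOCS
}
boolean candidateMatches = other.docID() == docID;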
/**
 * Creates doc vectors, iterating over terms.
 */
private void trainDocVectors() throws IOException {
  VerbatimLogger.info("Building document vectors ... ");
  Enumeration<ObjectVector> termEnum = termVectors.getAllVectors();
  try {
    int tc = 0;
    while (termEnum.hasMoreElements()) {
      // Output progress counter.
      if ((tc % 10000 == 0) || (tc < 10000 && tc % 1000 == 0)) {
        VerbatimLogger.info("Processed " + tc + " terms ... ");
      }
      tc++;
      ObjectVector termVectorObject = termEnum.nextElement();
      Vector termVector = termVectorObject.getVector();
      String word = (String) termVectorObject.getObject();
      // Go through checking terms for each fieldName.
      for (String fieldName : flagConfig.contentsfields()) {
        Term term = new Term(fieldName, word);
        float globalweight = luceneUtils.getGlobalTermWeight(term);
        float fieldweight = 1;
        // Get any docs for this term.
        PostingsEnum docsEnum = this.luceneUtils.getDocsForTerm(term);
        // This may occur frequently if one term vector store is derived from multiple fields.
        if (docsEnum == null) { continue; }
        while (docsEnum.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
          String externalDocID = luceneUtils.getExternalDocId(docsEnum.docID());
          // Add vector from this term, taking freq into account.
          Vector docVector = this.docVectors.getVector(externalDocID);
          float localweight = docsEnum.freq();
          if (flagConfig.fieldweight()) {
            // field weight: 1 / sqrt(number of terms in field)
            TermsEnum terms = luceneUtils.getTermVector(docsEnum.docID(), fieldName).iterator();
            int numTerms = 0;
            while (terms.next() != null) {
              numTerms++;
            }
            fieldweight = (float) (1 / Math.sqrt(numTerms));
          }
          docVector.superpose(termVector, localweight * globalweight * fieldweight, null);
        }
      }
    }
  } catch (IOException e) { // catches from indexReader.
    e.printStackTrace();
  }
  VerbatimLogger.info("\nNormalizing doc vectors ...\n");
  Enumeration<ObjectVector> docEnum = docVectors.getAllVectors();
  while (docEnum.hasMoreElements()) {
    docEnum.nextElement().getVector().normalize();
  }
}
private List<MWEInSentence> collectTermSentenceContext(Terms termVectorLookup,
    Map<Integer, Integer> sentenceBoundaries) throws IOException {
  List<MWEInSentence> result = new ArrayList<>();
  TermsEnum tiRef = termVectorLookup.iterator();
  BytesRef luceneTerm = tiRef.next();
  while (luceneTerm != null) {
    if (luceneTerm.length == 0) {
      luceneTerm = tiRef.next();
      continue;
    }
    String tString = luceneTerm.utf8ToString();
    if (!allCandidates.contains(tString)) {
      luceneTerm = tiRef.next();
      continue;
    }
    PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
    //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);
    int doc = postingsEnum.nextDoc(); // should be just one doc, i.e., the one this term vector belongs to
    if (doc != PostingsEnum.NO_MORE_DOCS) {
      int totalOccurrence = postingsEnum.freq();
      for (int i = 0; i < totalOccurrence; i++) {
        postingsEnum.nextPosition();
        int start = postingsEnum.startOffset();
        int end = postingsEnum.endOffset();
        BytesRef payload = postingsEnum.getPayload();
        SentenceContext sentenceContextInfo = null;
        if (payload != null) {
          sentenceContextInfo = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString()));
        }
        if (sentenceContextInfo == null) {
          result.add(new MWEInSentence(tString, start, end, 0, 0, 0));
        } else {
          result.add(new MWEInSentence(tString, start, end,
              sentenceContextInfo.getFirstTokenIdx(),
              sentenceContextInfo.getLastTokenIdx(),
              sentenceContextInfo.getSentenceId()));
          Integer endBound = sentenceBoundaries.get(sentenceContextInfo.getSentenceId());
          if (endBound == null || endBound < sentenceContextInfo.getLastTokenIdx()) {
            sentenceBoundaries.put(sentenceContextInfo.getSentenceId(),
                sentenceContextInfo.getLastTokenIdx());
          }
        }
      }
    }
    luceneTerm = tiRef.next();
  }
  Collections.sort(result);
  return result;
}