Listed below are example usages of org.apache.lucene.index.PostingsEnum#freq(), collected from open-source projects.
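Before the project snippets, here is a minimal self-contained sketch of the usual call pattern (FreqExample and totalOccurrences are illustrative names, not taken from any project below): request a PostingsEnum with at least the FREQS flag, position it on a document with nextDoc() or advance(), and only then read freq().

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

public class FreqExample {
    /** Sums freq() over all documents for every term of one field. */
    static long totalOccurrences(LeafReader reader, String field) throws IOException {
        Terms terms = reader.terms(field);
        if (terms == null) {
            return 0; // field is not indexed in this segment
        }
        long total = 0;
        TermsEnum termsEnum = terms.iterator();
        PostingsEnum postings = null; // reused across terms
        for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
            postings = termsEnum.postings(postings, PostingsEnum.FREQS);
            while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                // freq() is only valid once the enum is positioned on a real doc id,
                // and it returns 1 if the field was indexed without frequencies
                total += postings.freq();
            }
        }
        return total;
    }
}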
private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter, BoostAttribute boostAtt) throws IOException {
// start term, optimized writing
BytesRef term = termIter.next();
spare.copyUTF8Bytes(term);
builder.startObject(spare.toString());
buildTermStatistics(builder, termIter);
// finally write the term vectors
PostingsEnum posEnum = termIter.postings(null, PostingsEnum.ALL);
int termFreq = posEnum.freq();
builder.field(FieldStrings.TERM_FREQ, termFreq);
initMemory(curTerms, termFreq);
initValues(curTerms, posEnum, termFreq);
buildValues(builder, curTerms, termFreq);
buildScore(builder, boostAtt);
builder.endObject();
}
@Override
public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException {
if (!fieldMatcher.test(term.field())) {
return;
}
SpanCollectedOffsetsEnum offsetsEnum = termToOffsetsEnums.get(term.bytes());
if (offsetsEnum == null) {
// If it's pos insensitive we handle it outside of PhraseHelper. term.field() is from the Query.
if (positionInsensitiveTerms.contains(term.bytes())) {
return;
}
offsetsEnum = new SpanCollectedOffsetsEnum(term.bytes(), postings.freq());
termToOffsetsEnums.put(term.bytes(), offsetsEnum);
}
offsetsEnum.add(postings.startOffset(), postings.endOffset());
}
/**
* Returns a new term vector entry representing the specified term, and optionally, positions.
*
* @param te - positioned terms iterator
* @return term vector entry
* @throws IOException - if there is a low level IO error.
*/
static TermVectorEntry of(TermsEnum te) throws IOException {
Objects.requireNonNull(te);
String termText = BytesRefUtils.decode(te.term());
List<TermVectorEntry.TermVectorPosition> tvPositions = new ArrayList<>();
PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
pe.nextDoc();
int freq = pe.freq();
for (int i = 0; i < freq; i++) {
int pos = pe.nextPosition();
if (pos < 0) {
// no position information available
continue;
}
TermVectorPosition tvPos = TermVectorPosition.of(pos, pe);
tvPositions.add(tvPos);
}
return new TermVectorEntry(termText, te.totalTermFreq(), tvPositions);
}
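A hedged usage sketch for the factory above, in the same fragment style as the other snippets on this page (entriesFor, reader and docId are illustrative assumptions, not part of the original source): drive it with a TermsEnum positioned on each term of a document's term vector.

static List<TermVectorEntry> entriesFor(IndexReader reader, int docId, String field) throws IOException {
    List<TermVectorEntry> entries = new ArrayList<>();
    Terms termVectorTerms = reader.getTermVector(docId, field); // null if no term vector was stored
    if (termVectorTerms != null) {
        TermsEnum te = termVectorTerms.iterator();
        while (te.next() != null) { // positions te on each term in turn
            entries.add(TermVectorEntry.of(te));
        }
    }
    return entries;
}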
/**
* checks docs + freqs + positions + payloads, sequentially
*/
public void assertDocsAndPositionsEnum(PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception {
assertNotNull(leftDocs);
assertNotNull(rightDocs);
assertEquals(-1, leftDocs.docID());
assertEquals(-1, rightDocs.docID());
int docid;
while ((docid = leftDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
assertEquals(docid, rightDocs.nextDoc());
int freq = leftDocs.freq();
assertEquals(freq, rightDocs.freq());
for (int i = 0; i < freq; i++) {
assertEquals(leftDocs.nextPosition(), rightDocs.nextPosition());
// we don't assert offsets/payloads, they are allowed to be different
}
}
assertEquals(DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc());
}
/**
* Compute termvector number full.
*
* @param docSet
* the documents to inspect (global doc ids)
* @param termDocId
* the term's current doc id (leaf-local), used as a lower bound before advancing
* @param termsEnum
* the terms enum, positioned on the term
* @param lrc
* the leaf reader context (supplies the docBase offset)
* @param postingsEnum
* a postings enum to reuse
* @param positionsData
* the per-document position counts, keyed by global doc id (may be null)
* @return the termvector number full
* @throws IOException
* Signals that an I/O exception has occurred.
*/
private static TermvectorNumberFull computeTermvectorNumberFull(
List<Integer> docSet, int termDocId, TermsEnum termsEnum,
LeafReaderContext lrc, PostingsEnum postingsEnum,
Map<Integer, Integer> positionsData) throws IOException {
TermvectorNumberFull result = new TermvectorNumberFull(docSet.size());
Iterator<Integer> docIterator = docSet.iterator();
int localTermDocId = termDocId;
postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.FREQS);
while (docIterator.hasNext()) {
int docId = docIterator.next() - lrc.docBase;
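// either we are already positioned on docId, or we advance to it; the
// docId >= localTermDocId guard keeps advance() moving strictly forward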
if (docId >= localTermDocId && ((docId == localTermDocId)
|| ((localTermDocId = postingsEnum.advance(docId)) == docId))) {
result.args[result.docNumber] = postingsEnum.freq();
result.positions[result.docNumber] = (positionsData == null) ? 0
: positionsData.get(docId + lrc.docBase);
result.docNumber++;
}
}
return result;
}
/**
* Gets 1 minus the normalized entropy of a term (i.e. 1 + sum of p*log(p)),
* a function that favors terms that are focally distributed.
* We use the definition of log-entropy weighting provided in
* Martin and Berry (2007):
* Entropy = 1 + sum((Pij log2(Pij)) / log2(n))
* where Pij = frequency of term i in doc j / global frequency of term i
* and n = number of documents in the collection.
* The weight is 1 for a term concentrated in a single document and 0 for
* a term spread evenly across the whole collection.
* @param term the term whose entropy weight you want
* Thanks to Vidya Vasuki for adding the hash table to
* eliminate redundant calculation
*/
private float getEntropy(Term term) {
if (termEntropy.containsKey(term.field()+"_"+term.text()))
return termEntropy.get(term.field()+"_"+term.text());
int gf = getGlobalTermFreq(term);
double entropy = 0;
try {
PostingsEnum docsEnum = this.getDocsForTerm(term);
while ((docsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
double p = docsEnum.freq(); //frequency in this document
p = p / gf; //frequency across all documents
entropy += p * (Math.log(p) / Math.log(2)); //sum of Plog(P)
}
int n = this.getNumDocs();
double log2n = Math.log(n) / Math.log(2);
entropy = entropy / log2n;
} catch (IOException e) {
logger.info("Couldn't get term entropy for term " + term.text());
}
termEntropy.put(term.field()+"_"+term.text(), 1 + (float) entropy);
return (float) (1 + entropy);
}
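The boundary behaviour is easy to check with a standalone toy version of the same formula (logEntropyWeight is an illustrative name, not part of the class above):

static double logEntropyWeight(int[] perDocFreqs) {
    int gf = 0; // global frequency of the term
    for (int f : perDocFreqs) gf += f;
    double sum = 0; // sum of Pij * log2(Pij), always <= 0
    for (int f : perDocFreqs) {
        if (f > 0) {
            double p = (double) f / gf;
            sum += p * (Math.log(p) / Math.log(2));
        }
    }
    return 1 + sum / (Math.log(perDocFreqs.length) / Math.log(2));
}
// logEntropyWeight(new int[] {1, 1, 1, 1}) == 0.0 (evenly spread: uninformative)
// logEntropyWeight(new int[] {4, 0, 0, 0}) == 1.0 (focal: maximally informative)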
private void initParents(IndexReader reader, int first) throws IOException {
if (reader.maxDoc() == first) {
return;
}
// it's ok to use MultiTerms because we only iterate on one posting list.
// breaking it to loop over the leaves() only complicates code for no
// apparent gain.
PostingsEnum positions = MultiTerms.getTermPostingsEnum(reader,
Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF,
PostingsEnum.PAYLOADS);
// shouldn't really happen, if it does, something's wrong
if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) {
throw new CorruptIndexException("Missing parent data for category " + first, reader.toString());
}
int num = reader.maxDoc();
for (int i = first; i < num; i++) {
if (positions.docID() == i) {
if (positions.freq() == 0) { // shouldn't happen
throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
}
parents[i] = positions.nextPosition();
if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
if (i + 1 < num) {
throw new CorruptIndexException("Missing parent data for category "+ (i + 1), reader.toString());
}
break;
}
} else { // this shouldn't happen
throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
}
}
}
/**
* checks advancing docs + positions
*/
public void assertPositionsSkipping(int docFreq, PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception {
if (leftDocs == null || rightDocs == null) {
assertNull(leftDocs);
assertNull(rightDocs);
return;
}
int docid = -1;
int averageGap = MAXDOC / (1+docFreq);
int skipInterval = 16;
while (true) {
if (random().nextBoolean()) {
// nextDoc()
docid = leftDocs.nextDoc();
assertEquals(docid, rightDocs.nextDoc());
} else {
// advance()
int skip = docid + (int) Math.ceil(Math.abs(skipInterval + random().nextGaussian() * averageGap));
docid = leftDocs.advance(skip);
assertEquals(docid, rightDocs.advance(skip));
}
if (docid == DocIdSetIterator.NO_MORE_DOCS) {
return;
}
int freq = leftDocs.freq();
assertEquals(freq, rightDocs.freq());
for (int i = 0; i < freq; i++) {
assertEquals(leftDocs.nextPosition(), rightDocs.nextPosition());
// we don't compare the payloads; it's allowed that one is empty, etc.
}
}
}
/**
* Compute termvector number basic.
*
* @param docSet
* the documents to inspect (global doc ids)
* @param termDocId
* the term's current doc id (leaf-local), used as a lower bound before advancing
* @param termsEnum
* the terms enum, positioned on the term
* @param r
* the leaf reader
* @param lrc
* the leaf reader context (supplies the docBase offset)
* @param postingsEnum
* a postings enum to reuse
* @return the termvector number basic
* @throws IOException
* Signals that an I/O exception has occurred.
*/
private static TermvectorNumberBasic computeTermvectorNumberBasic(
List<Integer> docSet, int termDocId, TermsEnum termsEnum, LeafReader r,
LeafReaderContext lrc, PostingsEnum postingsEnum) throws IOException {
TermvectorNumberBasic result = new TermvectorNumberBasic();
boolean hasDeletedDocuments = (r.getLiveDocs() != null);
if ((docSet.size() == r.numDocs()) && !hasDeletedDocuments) {
try {
return computeTermvectorNumberBasic(termsEnum, r);
} catch (IOException e) {
log.debug("problem", e);
// problem
}
}
result.docNumber = 0;
result.valueSum[0] = 0;
int localTermDocId = termDocId;
Iterator<Integer> docIterator = docSet.iterator();
postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.FREQS);
int docId;
while (docIterator.hasNext()) {
docId = docIterator.next() - lrc.docBase;
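// either we are already positioned on docId, or we advance to it; the
// docId >= localTermDocId guard keeps advance() moving strictly forward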
if (docId >= localTermDocId && ((docId == localTermDocId)
|| ((localTermDocId = postingsEnum.advance(docId)) == docId))) {
result.docNumber++;
result.valueSum[0] += postingsEnum.freq();
}
if (localTermDocId == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
}
return result;
}
private Map<Integer,String> getTermVectorWithException(String field, String id) throws IOException {
TermVectorsResponse response = client.prepareTermVector(indexName, documentType, id)
.setOffsets(false).setPositions(true).setFieldStatistics(false)
.setTermStatistics(false)
.setSelectedFields(field)
.execute().actionGet();
Map<Integer,String> map = new HashMap<>();
Terms terms = response.getFields().terms(field);
if (terms == null) {
return map;
}
TermsEnum iterator = terms.iterator();
PostingsEnum postings = null;
for (BytesRef termBytes = null; (termBytes = iterator.next()) != null; ) {
String term = termBytes.utf8ToString();
postings = iterator.postings(postings, PostingsEnum.ALL);
// there can only be one doc since we fetch by id; get the doc and its positions
postings.nextDoc();
int tf = postings.freq();
for (int i = 0; i < tf; i++) {
int pos = postings.nextPosition();
map.put(pos, term);
}
}
return map;
}
private List<MWESentenceContext> collectTermOffsets(Terms termVectorLookup) throws IOException {
List<MWESentenceContext> result = new ArrayList<>();
TermsEnum tiRef = termVectorLookup.iterator();
BytesRef luceneTerm = tiRef.next();
while (luceneTerm != null) {
if (luceneTerm.length == 0) {
luceneTerm = tiRef.next();
continue;
}
String tString = luceneTerm.utf8ToString();
if (!allCandidates.contains(tString)) {
luceneTerm = tiRef.next();
continue;
}
PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
//PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);
int doc = postingsEnum.nextDoc(); // there should be exactly one doc, since this term vector was fetched for a single document
if (doc != PostingsEnum.NO_MORE_DOCS) {
int totalOccurrence = postingsEnum.freq();
for (int i = 0; i < totalOccurrence; i++) {
postingsEnum.nextPosition();
int start = postingsEnum.startOffset();
int end = postingsEnum.endOffset();
BytesRef payload = postingsEnum.getPayload();
int sentenceId = -1;
if (payload != null) {
sentenceId = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString())).getSentenceId();
}
result.add(new MWESentenceContext(tString, sentenceId, start, end));
}
}
luceneTerm = tiRef.next();
}
Collections.sort(result);
return result;
}
@Override
public boolean seekExact(BytesRef text) throws IOException {
int docFreq = 0;
long totalTermFreq = 0;
for (Holder anEnum : enums) {
if (anEnum.termsEnum.seekExact(text)) {
if (anEnum.bits == null) {
docFreq += anEnum.termsEnum.docFreq();
if (docsEnumFlag == PostingsEnum.FREQS) {
long leafTotalTermFreq = anEnum.termsEnum.totalTermFreq();
if (totalTermFreq == -1 || leafTotalTermFreq == -1) {
totalTermFreq = -1;
continue;
}
totalTermFreq += leafTotalTermFreq;
}
} else {
final PostingsEnum docsEnum = anEnum.docsEnum = anEnum.termsEnum.postings(anEnum.docsEnum, docsEnumFlag);
// Two choices for performing the same heavy loop: one attempts to calculate totalTermFreq and the other does not
if (docsEnumFlag == PostingsEnum.FREQS) {
for (int docId = docsEnum.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
if (anEnum.bits != null && anEnum.bits.get(docId) == false) {
continue;
}
docFreq++;
// docsEnum.freq() returns 1 if the doc was indexed with IndexOptions.DOCS_ONLY, so when filtering
// like this there is no way of knowing whether the value is really 1 or simply unrecorded
totalTermFreq += docsEnum.freq();
}
} else {
for (int docId = docsEnum.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
if (anEnum.bits != null && anEnum.bits.get(docId) == false) {
continue;
}
// docsEnum.freq() behaviour is undefined if docsEnumFlag == PostingsEnum.FLAG_NONE, so don't bother calling it
docFreq++;
}
}
}
}
}
if (docFreq > 0) {
currentDocFreq = docFreq;
currentTotalTermFreq = totalTermFreq;
current = text;
return true;
} else {
currentDocFreq = NOT_FOUND;
currentTotalTermFreq = NOT_FOUND;
current = null;
return false;
}
}
@Override
protected CustomScoreProvider getCustomScoreProvider(LeafReaderContext context) throws IOException {
return new CustomScoreProvider(context){
@Override
public float customScore(int docID, float subQueryScore, float valSrcScore) throws IOException {
float score = 0;
double docVectorNorm = 0;
LeafReader reader = context.reader();
Terms terms = reader.getTermVector(docID, field);
if (vector.size() != terms.size()) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "indexed and input vector array must have same length");
}
TermsEnum iter = terms.iterator();
BytesRef text;
while ((text = iter.next()) != null) {
String term = text.utf8ToString();
float payloadValue = 0f;
PostingsEnum postings = iter.postings(null, PostingsEnum.ALL);
while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
int freq = postings.freq();
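// consume every position; getPayload() then returns the payload of the last position read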
while (freq-- > 0) postings.nextPosition();
BytesRef payload = postings.getPayload();
payloadValue = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
if (cosine)
docVectorNorm += Math.pow(payloadValue, 2.0);
}
score = (float)(score + payloadValue * (vector.get(Integer.parseInt(term))));
}
if (cosine) {
if ((docVectorNorm == 0) || (queryVectorNorm == 0)) return 0f;
return (float)(score / (Math.sqrt(docVectorNorm) * Math.sqrt(queryVectorNorm)));
}
return score;
}
};
}
public OfPostings(BytesRef term, PostingsEnum postingsEnum) throws IOException {
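// read freq() eagerly while the enum is still positioned on the current doc,
// before any of its positions are consumed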
this(term, postingsEnum.freq(), postingsEnum);
}
/**
* Test ReadTokensTask
*/
public void testReadTokens() throws Exception {
// We will call ReadTokens on this many docs
final int NUM_DOCS = 20;
// Read tokens from first NUM_DOCS docs from Reuters and
// then build index from the same docs
String[] algLines1 = {
"# ----- properties ",
"analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"# ----- alg ",
"{ReadTokens}: " + NUM_DOCS,
"ResetSystemErase",
"CreateIndex",
"{AddDoc}: " + NUM_DOCS,
"CloseIndex",
};
// Run algo
Benchmark benchmark = execBenchmark(algLines1);
List<TaskStats> stats = benchmark.getRunData().getPoints().taskStats();
// Count how many tokens all ReadTokens saw
int totalTokenCount1 = 0;
for (final TaskStats stat : stats) {
if (stat.getTask().getName().equals("ReadTokens")) {
totalTokenCount1 += stat.getCount();
}
}
// Separately count how many tokens are actually in the index:
IndexReader reader = DirectoryReader.open(benchmark.getRunData().getDirectory());
assertEquals(NUM_DOCS, reader.numDocs());
int totalTokenCount2 = 0;
Collection<String> fields = FieldInfos.getIndexedFields(reader);
for (String fieldName : fields) {
if (fieldName.equals(DocMaker.ID_FIELD) || fieldName.equals(DocMaker.DATE_MSEC_FIELD) || fieldName.equals(DocMaker.TIME_SEC_FIELD)) {
continue;
}
Terms terms = MultiTerms.getTerms(reader, fieldName);
if (terms == null) {
continue;
}
TermsEnum termsEnum = terms.iterator();
PostingsEnum docs = null;
while (termsEnum.next() != null) {
docs = TestUtil.docs(random(), termsEnum, docs, PostingsEnum.FREQS);
while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
totalTokenCount2 += docs.freq();
}
}
}
reader.close();
// Make sure they are the same
assertEquals(totalTokenCount1, totalTokenCount2);
}
private IntervalMatchesIterator matches(TermsEnum te, int doc) throws IOException {
PostingsEnum pe = te.postings(null, PostingsEnum.ALL);
if (pe.advance(doc) != doc) {
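// the term does not occur in this doc, so there are no matches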
return null;
}
return new IntervalMatchesIterator() {
@Override
public int gaps() {
return 0;
}
@Override
public int width() {
return 1;
}
int upto = pe.freq();
int pos = -1;
@Override
public boolean next() throws IOException {
do {
if (upto <= 0) {
pos = IntervalIterator.NO_MORE_INTERVALS;
return false;
}
upto--;
pos = pe.nextPosition();
}
while (filter.test(pe.getPayload()) == false);
return true;
}
@Override
public int startPosition() {
return pos;
}
@Override
public int endPosition() {
return pos;
}
@Override
public int startOffset() throws IOException {
return pe.startOffset();
}
@Override
public int endOffset() throws IOException {
return pe.endOffset();
}
@Override
public MatchesIterator getSubMatches() {
return null;
}
@Override
public Query getQuery() {
throw new UnsupportedOperationException();
}
};
}
static IntervalMatchesIterator matches(TermsEnum te, int doc, String field) throws IOException {
TermQuery query = new TermQuery(new Term(field, te.term()));
PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
if (pe.advance(doc) != doc) {
return null;
}
return new IntervalMatchesIterator() {
@Override
public int gaps() {
return 0;
}
@Override
public int width() {
return 1;
}
int upto = pe.freq();
int pos = -1;
@Override
public boolean next() throws IOException {
if (upto <= 0) {
pos = IntervalIterator.NO_MORE_INTERVALS;
return false;
}
upto--;
pos = pe.nextPosition();
return true;
}
@Override
public int startPosition() {
return pos;
}
@Override
public int endPosition() {
return pos;
}
@Override
public int startOffset() throws IOException {
return pe.startOffset();
}
@Override
public int endOffset() throws IOException {
return pe.endOffset();
}
@Override
public MatchesIterator getSubMatches() {
return null;
}
@Override
public Query getQuery() {
return query;
}
};
}
/**
* Create a new {@link TermMatchesIterator} for the given term and postings list
*/
TermMatchesIterator(Query query, PostingsEnum pe) throws IOException {
this.pe = pe;
this.query = query;
this.upto = pe.freq();
}
/**
* Creates doc vectors, iterating over terms.
*/
private void trainDocVectors() throws IOException {
VerbatimLogger.info("Building document vectors ... ");
Enumeration<ObjectVector> termEnum = termVectors.getAllVectors();
try {
int tc = 0;
while (termEnum.hasMoreElements()) {
// Output progress counter.
if ((tc % 10000 == 0) || (tc < 10000 && tc % 1000 == 0)) {
VerbatimLogger.info("Processed " + tc + " terms ... ");
}
tc++;
ObjectVector termVectorObject = termEnum.nextElement();
Vector termVector = termVectorObject.getVector();
String word = (String) termVectorObject.getObject();
// Go through checking terms for each fieldName.
for (String fieldName : flagConfig.contentsfields()) {
Term term = new Term(fieldName, word);
float globalweight = luceneUtils.getGlobalTermWeight(term);
float fieldweight = 1;
// Get any docs for this term.
PostingsEnum docsEnum = this.luceneUtils.getDocsForTerm(term);
// This may occur frequently if one term vector store is derived from multiple fields
if (docsEnum == null) { continue; }
while (docsEnum.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
String externalDocID = luceneUtils.getExternalDocId(docsEnum.docID());
// Add vector from this term, taking freq into account.
Vector docVector = this.docVectors.getVector(externalDocID);
float localweight = docsEnum.freq();
if (flagConfig.fieldweight()) {
//field weight: 1/sqrt(number of terms in field)
TermsEnum terms = luceneUtils.getTermVector(docsEnum.docID(), fieldName).iterator();
int numTerms = 0;
while (terms.next() != null) {
numTerms++;
}
fieldweight = (float) (1/Math.sqrt(numTerms));
}
docVector.superpose(
termVector, localweight * globalweight * fieldweight, null);
}
}
}
}
catch (IOException e) { // catches from indexReader.
e.printStackTrace();
}
VerbatimLogger.info("\nNormalizing doc vectors ...\n");
Enumeration<ObjectVector> docEnum = docVectors.getAllVectors();
while (docEnum.hasMoreElements())
docEnum.nextElement().getVector().normalize();
}
private List<MWEInSentence> collectTermSentenceContext(Terms termVectorLookup,
Map<Integer, Integer> sentenceBoundaries) throws IOException {
List<MWEInSentence> result = new ArrayList<>();
TermsEnum tiRef = termVectorLookup.iterator();
BytesRef luceneTerm = tiRef.next();
while (luceneTerm != null) {
if (luceneTerm.length == 0) {
luceneTerm = tiRef.next();
continue;
}
String tString = luceneTerm.utf8ToString();
if (!allCandidates.contains(tString)) {
luceneTerm = tiRef.next();
continue;
}
PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
//PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);
int doc = postingsEnum.nextDoc(); // there should be exactly one doc, since this term vector was fetched for a single document
if (doc != PostingsEnum.NO_MORE_DOCS) {
int totalOccurrence = postingsEnum.freq();
for (int i = 0; i < totalOccurrence; i++) {
postingsEnum.nextPosition();
int start = postingsEnum.startOffset();
int end = postingsEnum.endOffset();
BytesRef payload = postingsEnum.getPayload();
SentenceContext sentenceContextInfo = null;
if (payload != null) {
sentenceContextInfo = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString()));
}
if (sentenceContextInfo == null)
result.add(new MWEInSentence(tString, start, end, 0, 0, 0));
else {
result.add(new MWEInSentence(tString, start, end,
sentenceContextInfo.getFirstTokenIdx(),
sentenceContextInfo.getLastTokenIdx(),
sentenceContextInfo.getSentenceId()));
Integer endBound = sentenceBoundaries.get(sentenceContextInfo.getSentenceId());
if (endBound == null || endBound < sentenceContextInfo.getLastTokenIdx())
sentenceBoundaries.put(sentenceContextInfo.getSentenceId(),
sentenceContextInfo.getLastTokenIdx());
}
}
}
luceneTerm = tiRef.next();
}
Collections.sort(result);
return result;
}