Listed below are example usages of org.apache.lucene.search.CollectionStatistics#docCount(). You can follow the link to view the full source code on GitHub, or leave a comment on the right.
/**
 * Serializes the collection-level statistics of one field to the output.
 * Writes, in order: sumTotalTermFreq, sumDocFreq, then docCount; each value
 * may legitimately be -1 when the codec does not record the statistic, hence
 * the "potentially negative" variable-length encodings.
 *
 * @param fieldStats the per-field collection statistics to serialize
 * @throws IOException if writing to the underlying output fails
 */
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
    final long sumTotalTermFreq = fieldStats.sumTotalTermFreq();
    assert sumTotalTermFreq >= -1;
    writePotentiallyNegativeVLong(sumTotalTermFreq);

    final long sumDocFreq = fieldStats.sumDocFreq();
    assert sumDocFreq >= -1;
    writePotentiallyNegativeVLong(sumDocFreq);

    // Doc counts fit in an int (an index holds at most ~2^31 documents),
    // so the narrowing cast is safe here.
    final int docCount = (int) fieldStats.docCount();
    assert docCount >= -1;
    writePotentiallyNegativeVInt(docCount);
}
/**
 * Explains the idf score factor for a single term, using the per-field
 * document count rather than the reader-wide one.
 *
 * @param collectionStats collection-level statistics for the field
 * @param termStats term-level statistics for the term
 * @return an Explanation carrying the idf value and its inputs
 */
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long docFreq = termStats.docFreq();
    final long docCount = collectionStats.docCount();
    final Explanation dfDetail =
        Explanation.match(docFreq, "docFreq, number of documents containing term");
    final Explanation dcDetail =
        Explanation.match(docCount, "docCount, total number of documents with field");
    return Explanation.match(idf(docFreq, docCount),
        "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:", dfDetail, dcDetail);
}
/**
 * Snapshots the given Lucene field-level collection statistics into this
 * instance so it is self-contained (independent of the reader that
 * produced it).
 *
 * @param stats the Lucene statistics to copy from
 */
public CollectionStats(CollectionStatistics stats) {
    // Copy every statistic out of the Lucene object; field assignments are
    // independent of one another.
    this.field = stats.field();
    this.maxDoc = stats.maxDoc();
    this.sumDocFreq = stats.sumDocFreq();
    this.sumTotalTermFreq = stats.sumTotalTermFreq();
    this.docCount = stats.docCount();
}
/**
 * Computes the average field length as <code>sumTotalTermFreq / docCount</code>.
 * Returns <code>1</code> when the index does not store sumTotalTermFreq
 * (e.g. the field omits frequency information) or the field does not exist.
 *
 * @param collectionStats collection-level statistics for the field
 * @return the average number of tokens per document for this field
 */
protected float avgFieldLength(CollectionStatistics collectionStats) {
    final long totalTerms = collectionStats.sumTotalTermFreq();
    if (totalTerms > 0) {
        // docCount() reports -1 on older codecs that do not track it;
        // fall back to maxDoc() in that case.
        long docs = collectionStats.docCount();
        if (docs == -1) {
            docs = collectionStats.maxDoc();
        }
        return (float) (totalTerms / (double) docs);
    }
    return 1f; // field does not exist, or stat is unsupported
}
/**
 * Builds the per-query weight: the idf factor for the term (or the summed
 * idf for a phrase) plus the collection's average document length.
 *
 * @param collectionStats collection-level statistics for the field
 * @param termStats one entry for a single term, several for a phrase
 * @return a TFIDFWeight carrying field name, idf and average doc length
 */
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats,
                                     TermStatistics... termStats)
{
    // N: number of documents. docCount() is -1 when the codec does not
    // record per-field counts; fall back to maxDoc() in that case.
    long N = collectionStats.docCount();
    if (N == -1) {
        N = collectionStats.maxDoc();
    }
    // BUGFIX: the original computed sumTotalTermFreq() / N with long integer
    // division, silently truncating the average document length (and threw
    // ArithmeticException on an empty index). Divide in double precision and
    // guard N == 0.
    final float avdl = N > 0
        ? (float) (collectionStats.sumTotalTermFreq() / (double) N)
        : 1.0f;
    // NOTE(review): in the phrase branch idf_ accumulates on top of the 1.0f
    // seed, so a k-term phrase scores 1 + sum(idf). Preserved as-is — confirm
    // whether that offset is intentional before changing it.
    float idf_ = 1.0f;
    if (termStats.length == 1) {
        idf_ = idf(termStats[0].docFreq(), N);
    } else { /* computation for a phrase */
        for (final TermStatistics stat : termStats) {
            idf_ += idf(stat.docFreq(), N);
        }
    }
    return new TFIDFWeight(collectionStats.field(), idf_, avdl);
}
/**
 * Prints collection-level statistics (token count, document count, sum of
 * doc frequencies, average document length) for the ALL, TITLE and CONTENT
 * fields of the index.
 *
 * @throws IOException if the index cannot be read
 */
public void reportCollectionStatistics() throws IOException {
    IndexSearcher searcher = new IndexSearcher(reader);
    // The original repeated the same five statements three times; the logic
    // now lives in one helper called once per field.
    reportFieldStatistics(searcher, "ALL", lucene4ir.Lucene4IRConstants.FIELD_ALL);
    reportFieldStatistics(searcher, "TITLE", lucene4ir.Lucene4IRConstants.FIELD_TITLE);
    reportFieldStatistics(searcher, "CONTENT", lucene4ir.Lucene4IRConstants.FIELD_CONTENT);
}

/** Prints one field's statistics, prefixed with the given label. */
private void reportFieldStatistics(IndexSearcher searcher, String label, String field)
        throws IOException {
    CollectionStatistics collectionStats = searcher.collectionStatistics(field);
    long token_count = collectionStats.sumTotalTermFreq();
    long doc_count = collectionStats.docCount();
    long sum_doc_count = collectionStats.sumDocFreq();
    // BUGFIX: the original used long integer division, truncating the average
    // length, and divided by zero on an empty field. Compute in double and
    // report 0 when there are no documents.
    double avg_doc_length = doc_count > 0 ? token_count / (double) doc_count : 0;
    System.out.println(label + ": Token count: " + token_count + " Doc Count: " + doc_count
        + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);
}
/**
 * Prints collection-level statistics (token count, document count, sum of
 * doc frequencies, average document length) for the ALL, TITLE and CONTENT
 * fields of the index.
 *
 * @throws IOException if the index cannot be read
 */
public void reportCollectionStatistics() throws IOException {
    IndexSearcher searcher = new IndexSearcher(reader);
    // De-duplicated: the same five statements were repeated for each field.
    printFieldStatistics(searcher, "ALL", Lucene4IRConstants.FIELD_ALL);
    printFieldStatistics(searcher, "TITLE", Lucene4IRConstants.FIELD_TITLE);
    printFieldStatistics(searcher, "CONTENT", Lucene4IRConstants.FIELD_CONTENT);
}

/** Prints one field's statistics, prefixed with the given label. */
private void printFieldStatistics(IndexSearcher searcher, String label, String field)
        throws IOException {
    CollectionStatistics collectionStats = searcher.collectionStatistics(field);
    long token_count = collectionStats.sumTotalTermFreq();
    long doc_count = collectionStats.docCount();
    long sum_doc_count = collectionStats.sumDocFreq();
    // BUGFIX: the original used long integer division, truncating the average
    // length, and divided by zero on an empty field. Compute in double and
    // report 0 when there are no documents.
    double avg_doc_length = doc_count > 0 ? token_count / (double) doc_count : 0;
    System.out.println(label + ": Token count: " + token_count + " Doc Count: " + doc_count
        + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);
}
/**
 * Returns a new random term whose statistics fit within the bounds of the
 * corpus: docFreq lies in [1, corpus.docCount()] and totalTermFreq lies in
 * [docFreq, min(corpus.sumTotalTermFreq(), docFreq * Integer.MAX_VALUE)].
 *
 * NOTE: the exact sequence of random draws below is part of the behavior
 * (seeded-Random reproducibility) — do not reorder the calls.
 *
 * @param random source of randomness for choosing the term's shape
 * @param corpus collection-level statistics that bound the generated term
 * @return a TermStatistics for the shared TERM with mutually consistent counts
 */
static TermStatistics newTerm(Random random, CollectionStatistics corpus) {
final long docFreq;
switch (random.nextInt(3)) {
case 0:
// rare term: matches a single document
docFreq = 1;
break;
case 1:
// common term: matches every document that has the field
docFreq = corpus.docCount();
break;
default:
// random specificity between the two extremes
docFreq = TestUtil.nextLong(random, 1, corpus.docCount());
break;
}
final long totalTermFreq;
// can't require docs to have > 2B tokens: cap totalTermFreq at
// docFreq * Integer.MAX_VALUE, falling back to the corpus-wide total if
// that product overflows a long
long upperBound;
try {
upperBound = Math.min(corpus.sumTotalTermFreq(), Math.multiplyExact(docFreq, Integer.MAX_VALUE));
} catch (ArithmeticException overflow) {
upperBound = corpus.sumTotalTermFreq();
}
if (corpus.sumTotalTermFreq() == corpus.sumDocFreq()) {
// omitTF: every posting has frequency 1, so totalTermFreq must equal docFreq
totalTermFreq = docFreq;
} else {
switch (random.nextInt(3)) {
case 0:
// no repetition: the term occurs once per matching document
totalTermFreq = docFreq;
break;
case 1:
// maximum repetition allowed by the bound computed above
totalTermFreq = upperBound;
break;
default:
// random repetition
totalTermFreq = TestUtil.nextLong(random, docFreq, upperBound);
break;
}
}
return new TermStatistics(TERM, docFreq, totalTermFreq);
}
/**
 * Computes the average field length as <code>sumTotalTermFreq / docCount</code>.
 * Returns <code>1</code> when either statistic is unavailable (-1) or zero.
 * ROBUSTNESS FIX: the original divided unconditionally, yielding Infinity/NaN
 * (or a negative average) for missing fields; the sibling overload in this
 * file already guards these cases the same way.
 *
 * @param collectionStats collection-level statistics for the field
 * @return the average number of tokens per document for this field
 */
protected float avgFieldLength(CollectionStatistics collectionStats) {
    final long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
    final long docCount = collectionStats.docCount();
    if (sumTotalTermFreq <= 0 || docCount <= 0) {
        return 1f; // field does not exist, or stat is unsupported
    }
    return (float) (sumTotalTermFreq / (double) docCount);
}
/**
 * Computes the idf score factor for a single term and returns an
 * explanation for it.
 *
 * <p>The default implementation uses:
 *
 * <pre class="prettyprint">
 * idf(docFreq, docCount);
 * </pre>
 *
 * {@link CollectionStatistics#docCount()} is used instead of
 * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()}
 * because it moves in the same direction as {@link TermStatistics#docFreq()}
 * when statistics are inaccurate, and it does not skew for sparse fields.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an Explain object that includes both an idf score factor
 *         and an explanation for the term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long n = termStats.docFreq();
    final long N = collectionStats.docCount();
    return Explanation.match(
        idf(n, N),
        "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
        Explanation.match(n, "n, number of documents containing term"),
        Explanation.match(N, "N, total number of documents with field"));
}
/**
 * Computes the idf score factor for a single term and returns an
 * explanation for it.
 *
 * <p>The default implementation uses:
 *
 * <pre class="prettyprint">
 * idf(docFreq, docCount);
 * </pre>
 *
 * {@link CollectionStatistics#docCount()} is used instead of
 * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()}
 * because it moves in the same direction as {@link TermStatistics#docFreq()}
 * when statistics are inaccurate, and it does not skew for sparse fields.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an Explain object that includes both an idf score factor
 *         and an explanation for the term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long docFreq = termStats.docFreq();
    final long docCount = collectionStats.docCount();
    final Explanation dfDetail =
        Explanation.match(docFreq, "docFreq, number of documents containing term");
    final Explanation dcDetail =
        Explanation.match(docCount, "docCount, total number of documents with field");
    return Explanation.match(idf(docFreq, docCount), "idf(docFreq, docCount)", dfDetail, dcDetail);
}
/**
 * Computes the idf score factor for a single term and returns an
 * explanation for it.
 *
 * <p>The default implementation uses:
 *
 * <pre class="prettyprint">
 * idf(docFreq, docCount);
 * </pre>
 *
 * {@link CollectionStatistics#docCount()} is used instead of
 * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()}
 * because it moves in the same direction as {@link TermStatistics#docFreq()}
 * when statistics are inaccurate, and it does not skew for sparse fields.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an Explain object that includes both an idf score factor
 *         and an explanation for the term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    // Codecs that predate per-field docCount report -1; use maxDoc then.
    long docCount = collectionStats.docCount();
    if (docCount == -1) {
        docCount = collectionStats.maxDoc();
    }
    final long df = termStats.docFreq();
    return Explanation.match(idf(df, docCount),
        "idf(docFreq=" + df + ", docCount=" + docCount + ")");
}
/**
 * Computes the idf score factor for a phrase and returns an explanation
 * for it.
 *
 * <p>The default implementation sums the idf factor of each term in the
 * phrase and attaches one sub-explanation per term.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the terms in the phrase
 * @return an Explain object that includes both an idf
 *         score factor for the phrase and an explanation
 *         for each term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
    // Codecs that predate per-field docCount report -1; use maxDoc then.
    long docCount = collectionStats.docCount();
    if (docCount == -1) {
        docCount = collectionStats.maxDoc();
    }
    List<Explanation> perTerm = new ArrayList<>();
    float total = 0.0f;
    for (TermStatistics stat : termStats) {
        final long df = stat.docFreq();
        final float termIdf = idf(df, docCount);
        total += termIdf;
        perTerm.add(Explanation.match(termIdf,
            "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
    }
    return Explanation.match(total, "idf(), sum of:", perTerm);
}