下面列出了 org.apache.lucene.search.CollectionStatistics#sumTotalTermFreq() 的实例代码，或者点击链接到 GitHub 查看源代码，也可以在右侧发表评论。
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats,
TermStatistics... termStats)
{
    // Collection-wide figures: document total and average document length.
    // numDocs is a float, so the division below is floating-point.
    final float numDocs = collectionStats.maxDoc();
    final float avgDocLen = collectionStats.sumTotalTermFreq() / numDocs;

    float idfValue;
    if (termStats.length == 1) {
        // Single query term: the idf replaces the 1.0 starting value outright.
        idfValue = log(numDocs / termStats[0].docFreq());
    } else {
        // Several terms (phrase): accumulate per-term idf on top of the 1.0 base.
        idfValue = 1.0f;
        for (final TermStatistics stat : termStats) {
            idfValue += log(numDocs / stat.docFreq());
        }
    }
    return new TFIDFWeight(collectionStats.field(), idfValue, avgDocLen);
}
/**
 * Serializes the per-field collection statistics. Each statistic may be -1
 * when the index does not store it, hence the "potentially negative" encoders
 * and the {@code >= -1} assertions.
 */
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
    final long sumTotalTermFreq = fieldStats.sumTotalTermFreq();
    assert (sumTotalTermFreq >= -1);
    writePotentiallyNegativeVLong(sumTotalTermFreq);

    final long sumDocFreq = fieldStats.sumDocFreq();
    assert (sumDocFreq >= -1);
    writePotentiallyNegativeVLong(sumDocFreq);

    // docCount is narrowed from long to int before writing.
    // NOTE(review): assumes the document count fits in an int — confirm upstream bound.
    final int docCount = (int) fieldStats.docCount();
    assert (docCount >= -1);
    writePotentiallyNegativeVInt(docCount);
}
/**
 * Snapshots every scalar out of {@code stats} so this object is
 * independent of the source searcher/reader afterwards.
 */
public CollectionStats(CollectionStatistics stats) {
    this.field = stats.field();
    this.maxDoc = stats.maxDoc();
    this.docCount = stats.docCount();
    this.sumDocFreq = stats.sumDocFreq();
    this.sumTotalTermFreq = stats.sumTotalTermFreq();
}
/**
 * Computes the average field length as <code>sumTotalTermFreq / docCount</code>,
 * or returns <code>1</code> if the index does not store sumTotalTermFreq
 * (any field that omits frequency information).
 */
protected float avgFieldLength(CollectionStatistics collectionStats) {
    final long totalTokens = collectionStats.sumTotalTermFreq();
    // Non-positive means the field does not exist or the stat is unsupported.
    if (totalTokens <= 0) {
        return 1f;
    }
    // -1 signals docCount is unavailable; fall back to maxDoc in that case.
    long docs = collectionStats.docCount();
    if (docs == -1) {
        docs = collectionStats.maxDoc();
    }
    return (float) (totalTokens / (double) docs);
}
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats,
TermStatistics... termStats)
{
    long N, n;
    float idf_, avdl;
    idf_ = 1.0f;
    // Prefer docCount; it is -1 when unavailable, so fall back to maxDoc.
    N = collectionStats.docCount();
    if (N == -1)
        N = collectionStats.maxDoc();
    // BUGFIX: the original computed sumTotalTermFreq() / N with both operands
    // long, truncating the average document length to a whole number before
    // the float assignment. Divide in double precision instead, matching the
    // avgFieldLength implementation elsewhere in this codebase.
    avdl = (float) (collectionStats.sumTotalTermFreq() / (double) N);
    if (termStats.length == 1) {
        // Single query term: idf replaces the 1.0 initial value.
        n = termStats[0].docFreq();
        idf_ = idf(n, N);
    }
    else { /* computation for a phrase */
        // Accumulate per-term idf contributions on top of the 1.0 base.
        for (final TermStatistics stat : termStats) {
            n = stat.docFreq();
            idf_ += idf(n, N);
        }
    }
    return new TFIDFWeight(collectionStats.field(), idf_, avdl);
}
/**
 * Prints token count, document count, summed docFreq and (truncated) average
 * document length for the ALL, TITLE and CONTENT fields of the index.
 *
 * @throws IOException if reading the index statistics fails
 */
public void reportCollectionStatistics()throws IOException {
    IndexSearcher searcher = new IndexSearcher(reader);
    // The original repeated this block three times verbatim; factored into a helper.
    reportFieldStatistics(searcher, "ALL", lucene4ir.Lucene4IRConstants.FIELD_ALL);
    reportFieldStatistics(searcher, "TITLE", lucene4ir.Lucene4IRConstants.FIELD_TITLE);
    reportFieldStatistics(searcher, "CONTENT", lucene4ir.Lucene4IRConstants.FIELD_CONTENT);
}

/** Prints the collection-level statistics of one field, prefixed with {@code label}. */
private void reportFieldStatistics(IndexSearcher searcher, String label, String field) throws IOException {
    CollectionStatistics collectionStats = searcher.collectionStatistics(field);
    long token_count = collectionStats.sumTotalTermFreq();
    long doc_count = collectionStats.docCount();
    long sum_doc_count = collectionStats.sumDocFreq();
    // Integer (truncated) average, as in the original report; guard against an
    // empty index so we do not divide by zero.
    long avg_doc_length = doc_count > 0 ? token_count / doc_count : 0;
    System.out.println(label + ": Token count: " + token_count + " Doc Count: " + doc_count
            + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);
}
/**
 * Prints token count, document count, summed docFreq and (truncated) average
 * document length for the ALL, TITLE and CONTENT fields of the index.
 *
 * @throws IOException if reading the index statistics fails
 */
public void reportCollectionStatistics()throws IOException {
    IndexSearcher searcher = new IndexSearcher(reader);
    // The original repeated this block three times verbatim; factored into a helper.
    printFieldReport(searcher, "ALL", Lucene4IRConstants.FIELD_ALL);
    printFieldReport(searcher, "TITLE", Lucene4IRConstants.FIELD_TITLE);
    printFieldReport(searcher, "CONTENT", Lucene4IRConstants.FIELD_CONTENT);
}

/** Prints the collection-level statistics of one field, prefixed with {@code label}. */
private void printFieldReport(IndexSearcher searcher, String label, String field) throws IOException {
    CollectionStatistics collectionStats = searcher.collectionStatistics(field);
    long token_count = collectionStats.sumTotalTermFreq();
    long doc_count = collectionStats.docCount();
    long sum_doc_count = collectionStats.sumDocFreq();
    // Integer (truncated) average, as in the original report; guard against an
    // empty index so we do not divide by zero.
    long avg_doc_length = doc_count > 0 ? token_count / doc_count : 0;
    System.out.println(label + ": Token count: " + token_count + " Doc Count: " + doc_count
            + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);
}
/**
 * Returns a new random term whose statistics fit within the bounds of the
 * given corpus: docFreq in [1, docCount], totalTermFreq in [docFreq, cap].
 */
static TermStatistics newTerm(Random random, CollectionStatistics corpus) {
    // Pick the term's document frequency: rare, maximally common, or random.
    final long df;
    final int dfChoice = random.nextInt(3);
    if (dfChoice == 0) {
        df = 1;                                               // rare term
    } else if (dfChoice == 1) {
        df = corpus.docCount();                               // common term
    } else {
        df = TestUtil.nextLong(random, 1, corpus.docCount()); // random specificity
    }

    // Cap the total term frequency: no document may hold more than ~2B tokens,
    // so bound by df * Integer.MAX_VALUE unless that product overflows a long.
    long cap;
    try {
        cap = Math.min(corpus.sumTotalTermFreq(), Math.multiplyExact(df, Integer.MAX_VALUE));
    } catch (ArithmeticException overflow) {
        cap = corpus.sumTotalTermFreq();
    }

    final long ttf;
    if (corpus.sumTotalTermFreq() == corpus.sumDocFreq()) {
        // omitTF: every posting carries frequency 1, so ttf must equal df.
        ttf = df;
    } else {
        final int ttfChoice = random.nextInt(3);
        if (ttfChoice == 0) {
            ttf = df;                                  // no repetition
        } else if (ttfChoice == 1) {
            ttf = cap;                                 // maximum repetition
        } else {
            ttf = TestUtil.nextLong(random, df, cap);  // random repetition
        }
    }
    return new TermStatistics(TERM, df, ttf);
}
/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code> */
protected float avgFieldLength(CollectionStatistics collectionStats) {
    // Promote both counts to double before dividing so the average keeps
    // its fractional part; narrow to float only at the end.
    final double totalTokens = collectionStats.sumTotalTermFreq();
    final double docs = collectionStats.docCount();
    return (float) (totalTokens / docs);
}