下面列出了org.apache.lucene.search.TermStatistics#docFreq ( ) 实例代码,或者点击链接到github查看源代码,也可以在右侧发表评论。
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats,
TermStatistics... termStats)
{
float N, n, idf, adl;
idf = 1.0f;
N = collectionStats.maxDoc();
adl = collectionStats.sumTotalTermFreq() / N;
if (termStats.length == 1) {
n = termStats[0].docFreq();
idf = log(N/n);
}
else {
for (final TermStatistics stat : termStats) {
n = stat.docFreq();
idf += log(N/n);
}
}
return new TFIDFWeight(collectionStats.field(), idf, adl);
}
private void writeTermStatistics(TermStatistics termStatistics) throws IOException {
int docFreq = (int) termStatistics.docFreq();
assert (docFreq >= -1);
writePotentiallyNegativeVInt(docFreq);
long ttf = termStatistics.totalTermFreq();
assert (ttf >= -1);
writePotentiallyNegativeVLong(ttf);
}
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
final long df = termStats.docFreq();
final long max = collectionStats.maxDoc();
final float idf = idfManager.getIDF(termStats.term().utf8ToString());
return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
}
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
final long df = termStats.docFreq();
final long docCount = collectionStats.docCount();
final float idf = idf(df, docCount);
return Explanation.match(idf, "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
Explanation.match(df, "docFreq, number of documents containing term"),
Explanation.match(docCount, "docCount, total number of documents with field"));
}
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats,
TermStatistics... termStats)
{
long N, n;
float idf_, avdl;
idf_ = 1.0f;
N = collectionStats.docCount();
if (N == -1)
N = collectionStats.maxDoc();
avdl = collectionStats.sumTotalTermFreq() / N;
if (termStats.length == 1) {
n = termStats[0].docFreq();
idf_ = idf(n, N);
}
else { /* computation for a phrase */
for (final TermStatistics stat : termStats) {
n = stat.docFreq();
idf_ += idf(n, N);
}
}
return new TFIDFWeight(collectionStats.field(), idf_, avdl);
}
public TermStats(String field, TermStatistics stats) {
this.term = field + ":" + stats.term().utf8ToString();
this.t = new Term(field, stats.term());
this.docFreq = stats.docFreq();
this.totalTermFreq = stats.totalTermFreq();
}
/**
* Computes a score factor for a simple term and returns an explanation
* for that score factor.
*
* <p>
* The default implementation uses:
*
* <pre class="prettyprint">
* idf(docFreq, docCount);
* </pre>
*
* Note that {@link CollectionStatistics#docCount()} is used instead of
* {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also
* {@link TermStatistics#docFreq()} is used, and when the latter
* is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction.
* In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
*
* @param collectionStats collection-level statistics
* @param termStats term-level statistics for the term
* @return an Explain object that includes both an idf score factor
and an explanation for the term.
*/
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
final long df = termStats.docFreq();
final long docCount = collectionStats.docCount();
final float idf = idf(df, docCount);
return Explanation.match(idf, "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
Explanation.match(df, "n, number of documents containing term"),
Explanation.match(docCount, "N, total number of documents with field"));
}
/**
* Computes a score factor for a simple term and returns an explanation
* for that score factor.
*
* <p>
* The default implementation uses:
*
* <pre class="prettyprint">
* idf(docFreq, docCount);
* </pre>
*
* Note that {@link CollectionStatistics#docCount()} is used instead of
* {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also
* {@link TermStatistics#docFreq()} is used, and when the latter
* is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction.
* In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
*
* @param collectionStats collection-level statistics
* @param termStats term-level statistics for the term
* @return an Explain object that includes both an idf score factor
and an explanation for the term.
*/
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
final long df = termStats.docFreq();
final long docCount = collectionStats.docCount();
final float idf = idf(df, docCount);
return Explanation.match(idf, "idf(docFreq, docCount)",
Explanation.match(df, "docFreq, number of documents containing term"),
Explanation.match(docCount, "docCount, total number of documents with field"));
}
/**
* Computes a score factor for a simple term and returns an explanation
* for that score factor.
*
* <p>
* The default implementation uses:
*
* <pre class="prettyprint">
* idf(docFreq, docCount);
* </pre>
*
* Note that {@link CollectionStatistics#docCount()} is used instead of
* {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also
* {@link TermStatistics#docFreq()} is used, and when the latter
* is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction.
* In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
*
* @param collectionStats collection-level statistics
* @param termStats term-level statistics for the term
* @return an Explain object that includes both an idf score factor
and an explanation for the term.
*/
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
final long df = termStats.docFreq();
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
final float idf = idf(df, docCount);
return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
}
/**
* Computes a score factor for a phrase.
*
* <p>
* The default implementation sums the idf factor for
* each term in the phrase.
*
* @param collectionStats collection-level statistics
* @param termStats term-level statistics for the terms in the phrase
* @return an Explain object that includes both an idf
* score factor for the phrase and an explanation
* for each term.
*/
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
float idf = 0.0f;
List<Explanation> details = new ArrayList<>();
for (final TermStatistics stat : termStats ) {
final long df = stat.docFreq();
final float termIdf = idf(df, docCount);
details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
idf += termIdf;
}
return Explanation.match(idf, "idf(), sum of:", details);
}