org.apache.lucene.search.CollectionStatistics#sumTotalTermFreq() 源码实例 Demo

下面列出了 org.apache.lucene.search.CollectionStatistics#sumTotalTermFreq() 的实例代码。您可以点击链接到 GitHub 查看源代码,也可以在右侧发表评论。

源代码1 项目: lucene4ir   文件: SMARTBNNBNNSimilarity.java
/**
 * Builds the collection-level weight for the supplied term(s).
 *
 * <p>The collection size is taken from {@code maxDoc()} and the average document length is
 * {@code sumTotalTermFreq / maxDoc}. A single term gets {@code log(N / docFreq)} as its idf;
 * for any other number of terms the per-term values are accumulated on top of an initial
 * value of {@code 1.0f}.
 *
 * @param collectionStats statistics of the field being scored
 * @param termStats statistics of each query term
 * @return a {@code TFIDFWeight} holding the field name, the idf and the average length
 */
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats,
                                     TermStatistics... termStats) {
    // Kept as floats so that every division below is a floating-point division.
    final float numDocs = collectionStats.maxDoc();
    final float avgDocLength = collectionStats.sumTotalTermFreq() / numDocs;

    float idf;
    if (termStats.length == 1) {
        final float docFreq = termStats[0].docFreq();
        idf = log(numDocs / docFreq);
    } else {
        idf = 1.0f;
        for (final TermStatistics stat : termStats) {
            final float docFreq = stat.docFreq();
            idf += log(numDocs / docFreq);
        }
    }

    return new TFIDFWeight(collectionStats.field(), idf, avgDocLength);
}
 
源代码2 项目: Elasticsearch   文件: TermVectorsWriter.java
/**
 * Writes the three field-level statistics in a fixed order: sumTotalTermFreq, sumDocFreq,
 * then docCount. Each value is asserted to be at least {@code -1} before it is handed to
 * the corresponding {@code writePotentiallyNegative*} helper.
 *
 * @param fieldStats statistics of the field to serialize
 * @throws IOException declared because the write helpers may throw it
 */
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
    final long sumTotalTermFreq = fieldStats.sumTotalTermFreq();
    assert sumTotalTermFreq >= -1;
    writePotentiallyNegativeVLong(sumTotalTermFreq);

    final long sumDocFreq = fieldStats.sumDocFreq();
    assert sumDocFreq >= -1;
    writePotentiallyNegativeVLong(sumDocFreq);

    // docCount is narrowed to int because it is written with the VInt helper
    final int docCount = (int) fieldStats.docCount();
    assert docCount >= -1;
    writePotentiallyNegativeVInt(docCount);
}
 
源代码3 项目: lucene-solr   文件: CollectionStats.java
/**
 * Creates an instance by copying every statistic exposed by the given
 * {@code CollectionStatistics}: field name, maxDoc, docCount, sumTotalTermFreq and sumDocFreq.
 *
 * @param stats the statistics object whose values are copied into this instance
 */
public CollectionStats(CollectionStatistics stats) {
  this.field = stats.field();
  this.maxDoc = stats.maxDoc();
  this.docCount = stats.docCount();
  this.sumTotalTermFreq = stats.sumTotalTermFreq();
  this.sumDocFreq = stats.sumDocFreq();
}
 
源代码4 项目: lucene4ir   文件: BM25Similarity.java
/**
 * Computes the average field length as <code>sumTotalTermFreq / docCount</code>.
 *
 * <p>Returns <code>1</code> when <code>sumTotalTermFreq</code> is zero or negative (the field
 * does not exist, or the statistic is unsupported). When <code>docCount</code> is
 * <code>-1</code>, <code>maxDoc</code> is used as the divisor instead.
 *
 * @param collectionStats statistics of the field
 * @return the average number of tokens per document, or <code>1</code> if it cannot be computed
 */
protected float avgFieldLength(CollectionStatistics collectionStats) {
  final long totalTokens = collectionStats.sumTotalTermFreq();
  if (totalTokens <= 0) {
    // field does not exist, or stat is unsupported
    return 1f;
  }

  long documents = collectionStats.docCount();
  if (documents == -1) {
    documents = collectionStats.maxDoc();
  }
  return (float) (totalTokens / (double) documents);
}
 
源代码5 项目: lucene4ir   文件: OKAPIBM25Similarity.java
/**
 * Builds the collection-level weight for the supplied term(s).
 *
 * <p>{@code N} is the field's document count, falling back to {@code maxDoc()} when
 * {@code docCount()} reports {@code -1}. The average document length is
 * {@code sumTotalTermFreq / N}, evaluated in floating point so the fractional part is kept.
 * A single term gets {@code idf(n, N)}; for any other number of terms the per-term values
 * are accumulated on top of an initial value of {@code 1.0f}.
 *
 * @param collectionStats statistics of the field being scored
 * @param termStats statistics of each query term
 * @return a {@code TFIDFWeight} holding the field name, the idf and the average length
 */
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats,
                                     TermStatistics... termStats) {
    long N = collectionStats.docCount();
    if (N == -1) {
        N = collectionStats.maxDoc();
    }

    // Both operands are long: a plain '/' would be an integer division and silently drop
    // the fractional part of the average document length, so divide as double instead.
    final float avdl = (float) (collectionStats.sumTotalTermFreq() / (double) N);

    float idf_ = 1.0f;
    if (termStats.length == 1) {
        final long n = termStats[0].docFreq();
        idf_ = idf(n, N);
    } else { /* computation for a phrase */
        for (final TermStatistics stat : termStats) {
            final long n = stat.docFreq();
            idf_ += idf(n, N);
        }
    }

    return new TFIDFWeight(collectionStats.field(), idf_, avdl);
}
 
源代码6 项目: lucene4ir   文件: DumpTermsApp.java
/**
 * Prints one line of collection statistics (token count, document count, summed document
 * frequency and average document length) for each of the ALL, TITLE and CONTENT fields.
 *
 * <p>The average is the integer quotient {@code token_count / doc_count}. It is reported as
 * {@code 0} when the document count is not positive, instead of failing with a
 * division by zero.
 *
 * @throws IOException declared because reading the statistics may throw it
 */
public void reportCollectionStatistics() throws IOException {
    IndexSearcher searcher = new IndexSearcher(reader);

    // Each entry pairs the label printed at the start of the line with the field it describes.
    final String[][] labelledFields = {
        {"ALL", lucene4ir.Lucene4IRConstants.FIELD_ALL},
        {"TITLE", lucene4ir.Lucene4IRConstants.FIELD_TITLE},
        {"CONTENT", lucene4ir.Lucene4IRConstants.FIELD_CONTENT},
    };

    for (final String[] labelledField : labelledFields) {
        final String label = labelledField[0];
        final CollectionStatistics collectionStats = searcher.collectionStatistics(labelledField[1]);

        final long token_count = collectionStats.sumTotalTermFreq();
        final long doc_count = collectionStats.docCount();
        final long sum_doc_count = collectionStats.sumDocFreq();
        // A field without documents would otherwise raise ArithmeticException (/ by zero).
        final long avg_doc_length = doc_count > 0 ? token_count / doc_count : 0;

        System.out.println(label + ": Token count: " + token_count+ " Doc Count: " + doc_count + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);
    }
}
 
源代码7 项目: lucene4ir   文件: ExampleStatsApp.java
/**
 * Prints one line of collection statistics (token count, document count, summed document
 * frequency and average document length) for each of the ALL, TITLE and CONTENT fields.
 *
 * <p>The average is the integer quotient {@code token_count / doc_count}. It is reported as
 * {@code 0} when the document count is not positive, instead of failing with a
 * division by zero.
 *
 * @throws IOException declared because reading the statistics may throw it
 */
public void reportCollectionStatistics() throws IOException {
    IndexSearcher searcher = new IndexSearcher(reader);

    // Each entry pairs the label printed at the start of the line with the field it describes.
    final String[][] labelledFields = {
        {"ALL", Lucene4IRConstants.FIELD_ALL},
        {"TITLE", Lucene4IRConstants.FIELD_TITLE},
        {"CONTENT", Lucene4IRConstants.FIELD_CONTENT},
    };

    for (final String[] labelledField : labelledFields) {
        final String label = labelledField[0];
        final CollectionStatistics collectionStats = searcher.collectionStatistics(labelledField[1]);

        final long token_count = collectionStats.sumTotalTermFreq();
        final long doc_count = collectionStats.docCount();
        final long sum_doc_count = collectionStats.sumDocFreq();
        // A field without documents would otherwise raise ArithmeticException (/ by zero).
        final long avg_doc_length = doc_count > 0 ? token_count / doc_count : 0;

        System.out.println(label + ": Token count: " + token_count+ " Doc Count: " + doc_count + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);
    }
}
 
源代码8 项目: lucene-solr   文件: BaseSimilarityTestCase.java
/**
 * Creates random statistics for a term that are consistent with the given corpus.
 *
 * <p>The document frequency is 1, the corpus document count, or a random value in between.
 * The total term frequency is never below the document frequency and never above the
 * smaller of the corpus token count and {@code docFreq * Integer.MAX_VALUE}. If the corpus
 * token count equals its summed document frequency, the total term frequency is simply the
 * document frequency.
 *
 * @param random source of randomness
 * @param corpus collection statistics that bound the generated values
 * @return statistics for {@code TERM} with the chosen docFreq and totalTermFreq
 */
static TermStatistics newTerm(Random random, CollectionStatistics corpus) {
  final int specificity = random.nextInt(3);
  final long docFreq;
  if (specificity == 0) {
    // rare term
    docFreq = 1;
  } else if (specificity == 1) {
    // common term
    docFreq = corpus.docCount();
  } else {
    // random specificity
    docFreq = TestUtil.nextLong(random, 1, corpus.docCount());
  }

  // can't require docs to have > 2B tokens
  final long corpusTokens = corpus.sumTotalTermFreq();
  long upperBound;
  try {
    upperBound = Math.min(corpusTokens, Math.multiplyExact(docFreq, Integer.MAX_VALUE));
  } catch (ArithmeticException overflow) {
    // the product does not fit in a long, so only the corpus token count limits the term
    upperBound = corpusTokens;
  }

  final long totalTermFreq;
  if (corpusTokens == corpus.sumDocFreq()) {
    // omitTF
    totalTermFreq = docFreq;
  } else {
    final int repetition = random.nextInt(3);
    if (repetition == 0) {
      // no repetition
      totalTermFreq = docFreq;
    } else if (repetition == 1) {
      // maximum repetition
      totalTermFreq = upperBound;
    } else {
      // random repetition
      totalTermFreq = TestUtil.nextLong(random, docFreq, upperBound);
    }
  }

  return new TermStatistics(TERM, docFreq, totalTermFreq);
}
 
源代码9 项目: lucene-solr   文件: BM25Similarity.java
/**
 * Computes the average field length as <code>sumTotalTermFreq / docCount</code>.
 *
 * @param collectionStats statistics of the field
 * @return the quotient, evaluated in double precision and narrowed to float
 */
protected float avgFieldLength(CollectionStatistics collectionStats) {
  final long totalTokens = collectionStats.sumTotalTermFreq();
  // widening the divisor to double makes the division below a floating-point one
  final double documents = collectionStats.docCount();
  return (float) (totalTokens / documents);
}