The following code examples show how to use the org.apache.lucene.search.CollectionStatistics API, or you can follow the links to view the full source code on GitHub.
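Before the project examples, here is a minimal, self-contained sketch of how a CollectionStatistics instance is usually obtained and read. The index path and the field name "body" are assumptions for illustration; the null check applies to Lucene 8.x, where collectionStatistics returns null for a field that no indexed document contains.

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.FSDirectory;

public class CollectionStatisticsExample {
  public static void main(String[] args) throws Exception {
    // Open an existing index and ask the searcher for the per-field collection statistics.
    try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/index")))) {
      IndexSearcher searcher = new IndexSearcher(reader);
      CollectionStatistics stats = searcher.collectionStatistics("body"); // hypothetical field name
      if (stats != null) {
        System.out.println("field=" + stats.field()
            + " maxDoc=" + stats.maxDoc()
            + " docCount=" + stats.docCount()
            + " sumTotalTermFreq=" + stats.sumTotalTermFreq()
            + " sumDocFreq=" + stats.sumDocFreq());
      }
    }
  }
}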
public static ObjectObjectHashMap<String, CollectionStatistics> readFieldStats(StreamInput in, ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics) throws IOException {
  final int numFieldStatistics = in.readVInt();
  if (fieldStatistics == null) {
    fieldStatistics = HppcMaps.newNoNullKeysMap(numFieldStatistics);
  }
  for (int i = 0; i < numFieldStatistics; i++) {
    final String field = in.readString();
    assert field != null;
    final long maxDoc = in.readVLong();
    final long docCount = subOne(in.readVLong());
    final long sumTotalTermFreq = subOne(in.readVLong());
    final long sumDocFreq = subOne(in.readVLong());
    CollectionStatistics stats = new CollectionStatistics(field, maxDoc, docCount, sumTotalTermFreq, sumDocFreq);
    fieldStatistics.put(field, stats);
  }
  return fieldStatistics;
}
@Override
public FunctionValues getValues(Map<Object, Object> context, LeafReaderContext readerContext) throws IOException {
  IndexSearcher searcher = (IndexSearcher)context.get("searcher");
  final TFIDFSimilarity similarity = IDFValueSource.asTFIDF(searcher.getSimilarity(), field);
  if (similarity == null) {
    throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as ClassicSimilarity)");
  }
  // Only works if the contribution of the tf is 1 when the freq is 1 and contribution of the idf
  // is 1 when docCount == docFreq == 1
  final SimScorer simScorer = similarity.scorer(1f,
      new CollectionStatistics(field, 1, 1, 1, 1),
      new TermStatistics(new BytesRef("bogus"), 1, 1));
  final LeafSimScorer leafSimScorer = new LeafSimScorer(simScorer, readerContext.reader(), field, true);
  return new FloatDocValues(this) {
    int lastDocID = -1;
    @Override
    public float floatVal(int docID) throws IOException {
      if (docID < lastDocID) {
        throw new AssertionError("docs out of order: lastDocID=" + lastDocID + " docID=" + docID);
      }
      lastDocID = docID;
      return leafSimScorer.score(docID, 1f);
    }
  };
}
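The dummy statistics passed to scorer() above are what make this trick work: with docCount == docFreq == 1 the classic idf formula evaluates to exactly 1, and with freq == 1 the classic tf contribution is also 1, so the returned value essentially reduces to the document's length norm. A quick check of that arithmetic, not part of the original source, only mirroring the formulas documented for ClassicSimilarity:

// Not from the original source: verify that the dummy statistics neutralize the classic formulas.
static void checkNeutralStats() {
  double idf = Math.log((1 + 1) / (double) (1 + 1)) + 1.0; // log((docCount+1)/(docFreq+1)) + 1 = log(1) + 1 = 1
  double tf = Math.sqrt(1);                                // classic tf contribution sqrt(freq) = 1
  System.out.println(idf == 1.0 && tf == 1.0);             // prints true
}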
private Similarity.SimScorer buildSimWeight(SpanQuery query, IndexSearcher searcher, Map<Term, TermStates> termStates, float boost) throws IOException {
  if (termStates == null || termStates.size() == 0 || query.getField() == null)
    return null;
  TermStatistics[] termStats = new TermStatistics[termStates.size()];
  int termUpTo = 0;
  for (Map.Entry<Term, TermStates> entry : termStates.entrySet()) {
    TermStates ts = entry.getValue();
    if (ts.docFreq() > 0) {
      termStats[termUpTo++] = searcher.termStatistics(entry.getKey(), ts.docFreq(), ts.totalTermFreq());
    }
  }
  CollectionStatistics collectionStats = searcher.collectionStatistics(query.getField());
  if (termUpTo > 0) {
    return similarity.scorer(boost, collectionStats, ArrayUtil.copyOfSubArray(termStats, 0, termUpTo));
  } else {
    return null; // no terms at all exist, we won't use similarity
  }
}
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats,
                                     TermStatistics... termStats)
{
  float N, n, idf, adl;
  idf = 1.0f;
  N = collectionStats.maxDoc();
  adl = collectionStats.sumTotalTermFreq() / N;
  if (termStats.length == 1) {
    n = termStats[0].docFreq();
    idf = log(N/n);
  }
  else {
    for (final TermStatistics stat : termStats) {
      n = stat.docFreq();
      idf += log(N/n);
    }
  }
  return new TFIDFWeight(collectionStats.field(), idf, adl);
}
public static void writeFieldStats(StreamOutput out, ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics) throws IOException {
  out.writeVInt(fieldStatistics.size());
  for (ObjectObjectCursor<String, CollectionStatistics> c : fieldStatistics) {
    out.writeString(c.key);
    CollectionStatistics statistics = c.value;
    assert statistics.maxDoc() >= 0;
    out.writeVLong(statistics.maxDoc());
    out.writeVLong(addOne(statistics.docCount()));
    out.writeVLong(addOne(statistics.sumTotalTermFreq()));
    out.writeVLong(addOne(statistics.sumDocFreq()));
  }
}
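readFieldStats and writeFieldStats shift docCount, sumTotalTermFreq and sumDocFreq by one on the wire. The addOne/subOne helpers are not included in this listing, so the following is only a plausible sketch of what they do: the statistics use -1 to mean "not available", and adding one keeps the value non-negative so it can be encoded as an unsigned variable-length long.

// Hypothetical helpers matching the calls above (assumption, not the project's actual code):
static long addOne(long value) {
  assert value + 1 >= 0;
  return value + 1; // -1 ("not available") becomes 0, which a VLong can encode
}

static long subOne(long value) {
  assert value >= 0;
  return value - 1; // undo the shift after reading
}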
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
  long sttf = fieldStats.sumTotalTermFreq();
  assert (sttf >= -1);
  writePotentiallyNegativeVLong(sttf);
  long sdf = fieldStats.sumDocFreq();
  assert (sdf >= -1);
  writePotentiallyNegativeVLong(sdf);
  int dc = (int) fieldStats.docCount();
  assert (dc >= -1);
  writePotentiallyNegativeVInt(dc);
}
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
  final long df = termStats.docFreq();
  final long max = collectionStats.maxDoc();
  final float idf = idfManager.getIDF(termStats.term().utf8ToString());
  return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
}
@Test
public void testSimilarities() throws IOException {
  MemoryIndex mi = new MemoryIndex();
  mi.addField("f1", "a long text field that contains many many terms", analyzer);
  IndexSearcher searcher = mi.createSearcher();
  LeafReader reader = (LeafReader) searcher.getIndexReader();
  NumericDocValues norms = reader.getNormValues("f1");
  assertEquals(0, norms.nextDoc());
  float n1 = norms.longValue();
  // Norms are re-computed when we change the Similarity
  mi.setSimilarity(new Similarity() {
    @Override
    public long computeNorm(FieldInvertState state) {
      return 74;
    }
    @Override
    public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
      throw new UnsupportedOperationException();
    }
  });
  norms = reader.getNormValues("f1");
  assertEquals(0, norms.nextDoc());
  float n2 = norms.longValue();
  assertTrue(n1 != n2);
  TestUtil.checkReader(reader);
}
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  assert boost >= 0;
  assert collectionStats != null;
  assert termStats.length > 0;
  for (TermStatistics term : termStats) {
    assert term != null;
  }
  // TODO: check that TermStats is in bounds with respect to collection? e.g. docFreq <= maxDoc
  SimScorer scorer = delegate.scorer(boost, collectionStats, termStats);
  assert scorer != null;
  return new AssertingSimScorer(scorer, boost);
}
@Override
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  SimScorer weights[] = new SimScorer[termStats.length];
  for (int i = 0; i < termStats.length; i++) {
    BasicStats stats = newStats(collectionStats.field(), boost);
    fillBasicStats(stats, collectionStats, termStats[i]);
    weights[i] = new BasicSimScorer(stats);
  }
  if (weights.length == 1) {
    return weights[0];
  } else {
    return new MultiSimilarity.MultiSimScorer(weights);
  }
}
/** Fills all member fields defined in {@code BasicStats} in {@code stats}.
 *  Subclasses can override this method to fill additional stats. */
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
  // TODO: validate this for real, somewhere else
  assert termStats.totalTermFreq() <= collectionStats.sumTotalTermFreq();
  assert termStats.docFreq() <= collectionStats.sumDocFreq();
  // TODO: add sumDocFreq for field (numberOfFieldPostings)
  stats.setNumberOfDocuments(collectionStats.docCount());
  stats.setNumberOfFieldTokens(collectionStats.sumTotalTermFreq());
  stats.setAvgFieldLength(collectionStats.sumTotalTermFreq() / (double) collectionStats.docCount());
  stats.setDocFreq(termStats.docFreq());
  stats.setTotalTermFreq(termStats.totalTermFreq());
}
@Override
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats);
  float avgdl = avgFieldLength(collectionStats);
  float[] cache = new float[256];
  for (int i = 0; i < cache.length; i++) {
    cache[i] = 1f / (k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl));
  }
  return new BM25Scorer(boost, k1, b, idf, avgdl, cache);
}
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  SimScorer subScorers[] = new SimScorer[sims.length];
  for (int i = 0; i < subScorers.length; i++) {
    subScorers[i] = sims[i].scorer(boost, collectionStats, termStats);
  }
  return new MultiSimScorer(subScorers);
}
/**
 * Computes the collection probability of the current term in addition to the
 * usual statistics.
 */
@Override
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
  super.fillBasicStats(stats, collectionStats, termStats);
  LMStats lmStats = (LMStats) stats;
  lmStats.setCollectionProbability(collectionModel.computeProbability(stats));
}
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
  final long df = termStats.docFreq();
  final long docCount = collectionStats.docCount();
  final float idf = idf(df, docCount);
  return Explanation.match(idf, "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
      Explanation.match(df, "docFreq, number of documents containing term"),
      Explanation.match(docCount, "docCount, total number of documents with field"));
}
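As a quick illustration of the formula quoted in that explanation, here is a small computation with made-up numbers (not part of the original source):

// Made-up numbers, purely to illustrate idf = log((docCount+1)/(docFreq+1)) + 1.
static void idfExample() {
  long docFreq = 5, docCount = 100;
  double idf = Math.log((docCount + 1) / (double) (docFreq + 1)) + 1.0;
  System.out.println(idf); // ~3.82; a term occurring in every document (docFreq == docCount) gets exactly 1.0
}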
private CollectionStatistics toCollectionStats(BasicStats stats) {
  long sumTtf = stats.getNumberOfFieldTokens();
  long sumDf;
  if (sumTtf == -1) {
    sumDf = TestUtil.nextLong(random(), stats.getNumberOfDocuments(), 2L * stats.getNumberOfDocuments());
  } else {
    sumDf = TestUtil.nextLong(random(), Math.min(stats.getNumberOfDocuments(), sumTtf), sumTtf);
  }
  int docCount = Math.toIntExact(Math.min(sumDf, stats.getNumberOfDocuments()));
  int maxDoc = TestUtil.nextInt(random(), docCount, docCount + 10);
  return new CollectionStatistics(stats.field, maxDoc, docCount, sumTtf, sumDf);
}
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  return new SimScorer() {
    @Override
    public float score(float freq, long norm) {
      return 0;
    }
  };
}
@Override
public CollectionStatistics collectionStatistics(SolrIndexSearcher localSearcher, String field)
    throws IOException {
  CollectionStats colStats = currentGlobalColStats.get(field);
  if (colStats == null) {
    log.debug("## Missing global colStats info: {}, using local", field);
    missingColStats.add(field);
    metrics.missingGlobalFieldStats.increment();
    return localSearcher != null ? localSearcher.localCollectionStatistics(field) : null;
  } else {
    return colStats.toCollectionStatistics();
  }
}
@Override
public CollectionStatistics collectionStatistics(SolrIndexSearcher localSearcher, String field)
    throws IOException {
  CollectionStats colStats = colStatsCache.get(field);
  if (colStats == null) {
    log.debug("Missing global colStats info for field={}, using local", field);
    metrics.missingGlobalFieldStats.increment();
    return localSearcher != null ? localSearcher.localCollectionStatistics(field) : null;
  } else {
    return colStats.toCollectionStatistics();
  }
}
@Override
public CollectionStatistics collectionStatistics(String field) throws IOException {
  if (statsSource.collectionStatistics(null, field) == null) {
    missingFieldStats.accept(field);
    missingFieldsCount++;
  }
  return super.collectionStatistics(field);
}
public CollectionStats(CollectionStatistics stats) {
  this.field = stats.field();
  this.maxDoc = stats.maxDoc();
  this.docCount = stats.docCount();
  this.sumTotalTermFreq = stats.sumTotalTermFreq();
  this.sumDocFreq = stats.sumDocFreq();
}
/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code>,
 *  or returns <code>1</code> if the index does not store sumTotalTermFreq
 *  (any field that omits frequency information). */
protected float avgFieldLength(CollectionStatistics collectionStats) {
  final long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
  if (sumTotalTermFreq <= 0) {
    return 1f; // field does not exist, or stat is unsupported
  } else {
    final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
    return (float) (sumTotalTermFreq / (double) docCount);
  }
}
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
  Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats);
  float avgdl = avgFieldLength(collectionStats);
  // compute freq-independent part of bm25 equation across all norm values
  float cache[] = new float[256];
  for (int i = 0; i < cache.length; i++) {
    cache[i] = k1 * ((1 - b) + b * decodeNormValue((byte)i) / avgdl);
  }
  return new BM25Stats(collectionStats.field(), idf, avgdl, cache);
}
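The cache above holds the frequency-independent part of BM25, k1 * ((1 - b) + b * dl / avgdl), one entry per possible encoded length norm. The scorer that consumes it is not shown in this listing, so the following is only a hedged sketch of how such a cached value is typically combined with the term frequency, with "weight" standing for boost * idf:

// BM25 term-frequency saturation using a pre-computed length-norm factor (sketch, not the project's scorer).
float bm25TermScore(float weight, float freq, float cachedNormFactor) {
  return weight * freq / (freq + cachedNormFactor);
}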
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats,
                                     TermStatistics... termStats)
{
  long N, n;
  float idf_, avdl;
  idf_ = 1.0f;
  N = collectionStats.docCount();
  if (N == -1)
    N = collectionStats.maxDoc();
  avdl = collectionStats.sumTotalTermFreq() / N; // note: long/long division truncates before the float assignment
  if (termStats.length == 1) {
    n = termStats[0].docFreq();
    idf_ = idf(n, N);
  }
  else { /* computation for a phrase */
    for (final TermStatistics stat : termStats) {
      n = stat.docFreq();
      idf_ += idf(n, N);
    }
  }
  return new TFIDFWeight(collectionStats.field(), idf_, avdl);
}
public void reportCollectionStatistics() throws IOException {
  IndexSearcher searcher = new IndexSearcher(reader);
  CollectionStatistics collectionStats = searcher.collectionStatistics(lucene4ir.Lucene4IRConstants.FIELD_ALL);
  long token_count = collectionStats.sumTotalTermFreq();
  long doc_count = collectionStats.docCount();
  long sum_doc_count = collectionStats.sumDocFreq();
  long avg_doc_length = token_count / doc_count;
  System.out.println("ALL: Token count: " + token_count + " Doc Count: " + doc_count + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);
  collectionStats = searcher.collectionStatistics(lucene4ir.Lucene4IRConstants.FIELD_TITLE);
  token_count = collectionStats.sumTotalTermFreq();
  doc_count = collectionStats.docCount();
  sum_doc_count = collectionStats.sumDocFreq();
  avg_doc_length = token_count / doc_count;
  System.out.println("TITLE: Token count: " + token_count + " Doc Count: " + doc_count + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);
  collectionStats = searcher.collectionStatistics(lucene4ir.Lucene4IRConstants.FIELD_CONTENT);
  token_count = collectionStats.sumTotalTermFreq();
  doc_count = collectionStats.docCount();
  sum_doc_count = collectionStats.sumDocFreq();
  avg_doc_length = token_count / doc_count;
  System.out.println("CONTENT: Token count: " + token_count + " Doc Count: " + doc_count + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);
}
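Note that avg_doc_length above is computed with integer division, so the average is truncated. If a fractional average document length is wanted, the division has to be done in floating point, for example:

// Illustration only: cast before dividing to keep the fractional part of the average.
static double averageDocumentLength(long token_count, long doc_count) {
  return (double) token_count / doc_count;
}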
public AggregatedDfs(ObjectObjectHashMap<Term, TermStatistics> termStatistics, ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics, long maxDoc) {
  this.termStatistics = termStatistics;
  this.fieldStatistics = fieldStatistics;
  this.maxDoc = maxDoc;
}
public ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics() {
  return fieldStatistics;
}
public static ObjectObjectHashMap<String, CollectionStatistics> readFieldStats(StreamInput in) throws IOException {
  return readFieldStats(in, null);
}