下面列出了如何使用 org.apache.lucene.search.TermStatistics API 类的实例代码及写法，也可以点击链接到 GitHub 查看完整源代码。
@Override
public void writeTo(final StreamOutput out) throws IOException {
    // Wire format (must mirror readFrom): entry count, then per entry the term's
    // field, term bytes, the stats' term bytes, docFreq and totalTermFreq+1,
    // followed by the field statistics and maxDoc.
    out.writeVInt(termStatistics.size());
    for (ObjectObjectCursor<Term, TermStatistics> c : termStatistics()) {
        // The cursor is generically typed; the previous explicit casts were redundant.
        Term term = c.key;
        out.writeString(term.field());
        out.writeBytesRef(term.bytes());
        TermStatistics stats = c.value;
        out.writeBytesRef(stats.term());
        out.writeVLong(stats.docFreq());
        // totalTermFreq may be -1 (not tracked); addOne shifts it into vlong range.
        out.writeVLong(DfsSearchResult.addOne(stats.totalTermFreq()));
    }
    DfsSearchResult.writeFieldStats(out, fieldStatistics);
    out.writeVLong(maxDoc);
}
/**
 * Deserializes per-term statistics previously written by the matching writer.
 * The term bytes themselves are not on the wire; they are taken from the
 * supplied {@code terms} array, which must align one-to-one with the stream.
 */
public static TermStatistics[] readTermStats(StreamInput in, Term[] terms) throws IOException {
    final int count = in.readVInt();
    if (count == 0) {
        return EMPTY_TERM_STATS;
    }
    assert terms.length == count;
    final TermStatistics[] result = new TermStatistics[count];
    for (int i = 0; i < count; i++) {
        final long docFreq = in.readVLong();
        assert docFreq >= 0;
        // totalTermFreq was written shifted by one so that -1 fits in a vlong.
        final long totalTermFreq = subOne(in.readVLong());
        result[i] = new TermStatistics(terms[i].bytes(), docFreq, totalTermFreq);
    }
    return result;
}
@Override
public FunctionValues getValues(Map<Object, Object> context, LeafReaderContext readerContext) throws IOException {
    IndexSearcher searcher = (IndexSearcher) context.get("searcher");
    final TFIDFSimilarity similarity = IDFValueSource.asTFIDF(searcher.getSimilarity(), field);
    if (similarity == null) {
        throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as ClassicSimilarity)");
    }
    // Feed the similarity degenerate statistics (freq == docFreq == docCount == 1)
    // so that both the tf and the idf contributions are exactly 1 and only the
    // norm factor remains.
    final SimScorer simScorer = similarity.scorer(1f,
        new CollectionStatistics(field, 1, 1, 1, 1),
        new TermStatistics(new BytesRef("bogus"), 1, 1));
    final LeafSimScorer leafSimScorer = new LeafSimScorer(simScorer, readerContext.reader(), field, true);
    return new FloatDocValues(this) {
        // Tracks the previously scored doc to enforce forward-only access.
        int previousDocID = -1;

        @Override
        public float floatVal(int docID) throws IOException {
            if (docID < previousDocID) {
                throw new AssertionError("docs out of order: lastDocID=" + previousDocID + " docID=" + docID);
            }
            previousDocID = docID;
            return leafSimScorer.score(docID, 1f);
        }
    };
}
/**
 * Builds a SimScorer over the statistics of all terms in {@code termStates},
 * or returns {@code null} when there is nothing to score (no terms, no field,
 * or no term has any documents).
 */
private Similarity.SimScorer buildSimWeight(SpanQuery query, IndexSearcher searcher, Map<Term, TermStates> termStates, float boost) throws IOException {
    if (termStates == null || termStates.isEmpty() || query.getField() == null) {
        return null;
    }
    final TermStatistics[] collected = new TermStatistics[termStates.size()];
    int count = 0;
    for (Map.Entry<Term, TermStates> entry : termStates.entrySet()) {
        final TermStates states = entry.getValue();
        // Terms that occur in no document contribute nothing to the score.
        if (states.docFreq() > 0) {
            collected[count++] = searcher.termStatistics(entry.getKey(), states.docFreq(), states.totalTermFreq());
        }
    }
    final CollectionStatistics collectionStats = searcher.collectionStatistics(query.getField());
    if (count == 0) {
        return null; // no terms at all exist, we won't use similarity
    }
    return similarity.scorer(boost, collectionStats, ArrayUtil.copyOfSubArray(collected, 0, count));
}
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats,
TermStatistics... termStats)
{
// N: total docs, n: docs containing a term, idf: inverse document frequency,
// adl: average document length of the field.
float N, n, idf, adl;
idf = 1.0f;
N = collectionStats.maxDoc();
// N is a float, so this divide is floating-point (no integer truncation).
adl = collectionStats.sumTotalTermFreq() / N;
if (termStats.length == 1) {
// Single term: idf is plain log(N/n); the initial 1.0f is discarded.
n = termStats[0].docFreq();
idf = log(N/n);
}
else {
// Phrase: per-term idfs are summed on top of the initial 1.0f.
// NOTE(review): this asymmetry with the single-term branch (overwrite vs.
// accumulate-from-1) looks suspicious — confirm it is intentional.
for (final TermStatistics stat : termStats) {
n = stat.docFreq();
idf += log(N/n);
}
}
return new TFIDFWeight(collectionStats.field(), idf, adl);
}
@Override
public void readFrom(StreamInput in) throws IOException {
    // Mirror of writeTo: entry count, then per entry the term's field, term
    // bytes, the stats' term bytes, docFreq and totalTermFreq+1, followed by
    // the field statistics and maxDoc.
    final int size = in.readVInt();
    termStatistics = HppcMaps.newMap(size);
    for (int i = 0; i < size; i++) {
        final String field = in.readString();
        final Term term = new Term(field, in.readBytesRef());
        final BytesRef statsTerm = in.readBytesRef();
        final long docFreq = in.readVLong();
        // totalTermFreq was written shifted by one so that -1 fits in a vlong.
        final long totalTermFreq = DfsSearchResult.subOne(in.readVLong());
        termStatistics.put(term, new TermStatistics(statsTerm, docFreq, totalTermFreq));
    }
    fieldStatistics = DfsSearchResult.readFieldStats(in);
    maxDoc = in.readVLong();
}
/**
 * Serializes one {@link TermStatistics}: docFreq (narrowed to int) followed by
 * totalTermFreq, both of which may legitimately be -1 (statistic not tracked),
 * hence the "potentially negative" encodings.
 */
private void writeTermStatistics(TermStatistics termStatistics) throws IOException {
    final int docFreq = (int) termStatistics.docFreq();
    assert docFreq >= -1;
    writePotentiallyNegativeVInt(docFreq);
    final long totalTermFreq = termStatistics.totalTermFreq();
    assert totalTermFreq >= -1;
    writePotentiallyNegativeVLong(totalTermFreq);
}
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
final long df = termStats.docFreq();
final long max = collectionStats.maxDoc();
// idf comes from an externally managed source rather than being derived from
// docFreq; df and max are only reported in the explanation text.
final float idf = idfManager.getIDF(termStats.term().utf8ToString());
// NOTE(review): the Explanation constructor is deprecated/removed in modern
// Lucene in favor of Explanation.match(...), which the other idfExplain
// implementations here use — confirm the target Lucene version.
return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
}
@Test
public void testSimilarities() throws IOException {
MemoryIndex mi = new MemoryIndex();
mi.addField("f1", "a long text field that contains many many terms", analyzer);
IndexSearcher searcher = mi.createSearcher();
LeafReader reader = (LeafReader) searcher.getIndexReader();
// Read the norm computed for doc 0 under the current similarity.
NumericDocValues norms = reader.getNormValues("f1");
assertEquals(0, norms.nextDoc());
// NOTE(review): the long norm is narrowed to float; fine for the inequality
// check below, but distinct large longs could collapse — confirm acceptable.
float n1 = norms.longValue();
// Norms are re-computed when we change the Similarity
mi.setSimilarity(new Similarity() {
@Override
public long computeNorm(FieldInvertState state) {
// Arbitrary constant norm.
return 74;
}
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
// This test never scores documents, so scoring is unsupported.
throw new UnsupportedOperationException();
}
});
// Fetch the norms again: they must now reflect the new Similarity.
norms = reader.getNormValues("f1");
assertEquals(0, norms.nextDoc());
float n2 = norms.longValue();
assertTrue(n1 != n2);
// Sanity-check the reader's invariants.
TestUtil.checkReader(reader);
}
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    // Validate the Similarity contract before delegating.
    assert boost >= 0;
    assert collectionStats != null;
    assert termStats.length > 0;
    for (int i = 0; i < termStats.length; i++) {
        assert termStats[i] != null;
    }
    // TODO: check that TermStats is in bounds with respect to collection? e.g. docFreq <= maxDoc
    final SimScorer delegateScorer = delegate.scorer(boost, collectionStats, termStats);
    assert delegateScorer != null;
    return new AssertingSimScorer(delegateScorer, boost);
}
@Override
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    // Build one BasicSimScorer per term; multiple terms are combined through
    // MultiSimilarity.MultiSimScorer.
    final SimScorer[] perTerm = new SimScorer[termStats.length];
    int i = 0;
    for (TermStatistics ts : termStats) {
        final BasicStats stats = newStats(collectionStats.field(), boost);
        fillBasicStats(stats, collectionStats, ts);
        perTerm[i++] = new BasicSimScorer(stats);
    }
    return perTerm.length == 1 ? perTerm[0] : new MultiSimilarity.MultiSimScorer(perTerm);
}
/** Fills all member fields defined in {@code BasicStats} in {@code stats}.
 * Subclasses can override this method to fill additional stats. */
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
    // TODO: validate this for real, somewhere else
    assert termStats.totalTermFreq() <= collectionStats.sumTotalTermFreq();
    assert termStats.docFreq() <= collectionStats.sumDocFreq();
    // TODO: add sumDocFreq for field (numberOfFieldPostings)
    final long docCount = collectionStats.docCount();
    final long fieldTokens = collectionStats.sumTotalTermFreq();
    stats.setNumberOfDocuments(docCount);
    stats.setNumberOfFieldTokens(fieldTokens);
    stats.setAvgFieldLength(fieldTokens / (double) docCount);
    stats.setDocFreq(termStats.docFreq());
    stats.setTotalTermFreq(termStats.totalTermFreq());
}
@Override
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    final Explanation idf = (termStats.length == 1)
        ? idfExplain(collectionStats, termStats[0])
        : idfExplain(collectionStats, termStats);
    final float avgdl = avgFieldLength(collectionStats);
    // Precompute the freq-independent length-normalization factor for every
    // possible encoded norm value (one byte -> 256 entries).
    final float[] cache = new float[256];
    for (int norm = 0; norm < cache.length; ++norm) {
        cache[norm] = 1f / (k1 * ((1 - b) + b * LENGTH_TABLE[norm] / avgdl));
    }
    return new BM25Scorer(boost, k1, b, idf, avgdl, cache);
}
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    // One sub-scorer per wrapped similarity, in the same order as sims.
    final SimScorer[] parts = new SimScorer[sims.length];
    for (int i = 0; i < parts.length; ++i) {
        parts[i] = sims[i].scorer(boost, collectionStats, termStats);
    }
    return new MultiSimScorer(parts);
}
/**
 * Computes the collection probability of the current term in addition to the
 * usual statistics.
 */
@Override
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
    super.fillBasicStats(stats, collectionStats, termStats);
    // stats is expected to be an LMStats instance here (created by newStats).
    final LMStats languageModelStats = (LMStats) stats;
    languageModelStats.setCollectionProbability(collectionModel.computeProbability(stats));
}
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long docFreq = termStats.docFreq();
    final long docCount = collectionStats.docCount();
    // Explain the idf value together with the two inputs it was derived from.
    return Explanation.match(
        idf(docFreq, docCount),
        "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
        Explanation.match(docFreq, "docFreq, number of documents containing term"),
        Explanation.match(docCount, "docCount, total number of documents with field"));
}
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    // A scorer that assigns every document a score of 0, ignoring freq and norm.
    return new SimScorer() {
        @Override
        public float score(float freq, long norm) {
            return 0f;
        }
    };
}
@Override
public TermStatistics termStatistics(SolrIndexSearcher localSearcher, Term term, int docFreq, long totalTermFreq)
throws IOException {
    final TermStats global = currentGlobalTermStats.get(term.toString());
    if (global != null) {
        return global.toTermStatistics();
    }
    // Global stats are missing for this term: record the miss for diagnostics
    // and fall back to the local statistics (or null without a local searcher).
    log.debug("## Missing global termStats info: {}, using local", term);
    missingTermStats.add(term);
    metrics.missingGlobalTermStats.increment();
    if (localSearcher == null) {
        return null;
    }
    return localSearcher.localTermStatistics(term, docFreq, totalTermFreq);
}
public TermStatistics termStatistics(SolrIndexSearcher localSearcher, Term term, int docFreq, long totalTermFreq)
throws IOException {
    final TermStats cached = termStatsCache.get(term.toString());
    if (cached != null) {
        return cached.toTermStatistics();
    }
    // A null cache entry is also expected when the term has no docFreq at all:
    // see returnLocalStats — entries with docFreq == 0 are never added.
    // Not sure we need a warning here.
    log.debug("Missing global termStats info for term={}, using local stats", term);
    metrics.missingGlobalTermStats.increment();
    return localSearcher == null ? null : localSearcher.localTermStatistics(term, docFreq, totalTermFreq);
}
@Override
public TermStatistics termStatistics(Term term, int docFreq, long totalTermFreq) throws IOException {
    // Probe the stats source purely to record which terms it is missing; the
    // returned statistics always come from the superclass.
    final TermStatistics fromSource = statsSource.termStatistics(null, term, docFreq, totalTermFreq);
    if (fromSource == null) {
        missingTermStats.accept(term);
        missingTermsCount++;
    }
    return super.termStatistics(term, docFreq, totalTermFreq);
}
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
    final Explanation idf = (termStats.length == 1)
        ? idfExplain(collectionStats, termStats[0])
        : idfExplain(collectionStats, termStats);
    final float avgdl = avgFieldLength(collectionStats);
    // compute freq-independent part of bm25 equation across all norm values
    final float[] cache = new float[256];
    for (int norm = 0; norm < cache.length; ++norm) {
        cache[norm] = k1 * ((1 - b) + b * decodeNormValue((byte) norm) / avgdl);
    }
    return new BM25Stats(collectionStats.field(), idf, avgdl, cache);
}
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats,
    TermStatistics... termStats)
{
    long N, n;
    float idf_, avdl;
    idf_ = 1.0f;
    // Prefer docCount (documents that actually have this field); fall back to
    // maxDoc when the codec does not report it (-1).
    N = collectionStats.docCount();
    if (N == -1) {
        N = collectionStats.maxDoc();
    }
    // Average document length. Cast before dividing: sumTotalTermFreq() and N
    // are both long, so the original expression performed a truncating integer
    // division before widening to float.
    avdl = (float) collectionStats.sumTotalTermFreq() / N;
    if (termStats.length == 1) {
        // Single term: idf is the plain idf(n, N); the initial 1.0f is discarded.
        n = termStats[0].docFreq();
        idf_ = idf(n, N);
    } else { /* computation for a phrase */
        // NOTE(review): the phrase branch accumulates on top of the initial 1.0f
        // while the single-term branch overwrites it — confirm this asymmetry is
        // intentional before changing it.
        for (final TermStatistics stat : termStats) {
            n = stat.docFreq();
            idf_ += idf(n, N);
        }
    }
    return new TFIDFWeight(collectionStats.field(), idf_, avdl);
}
/**
 * Creates an instance holding the supplied per-term statistics, per-field
 * statistics and document count. The maps are stored by reference, not copied.
 */
public AggregatedDfs(ObjectObjectHashMap<Term, TermStatistics> termStatistics, ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics, long maxDoc) {
this.termStatistics = termStatistics;
this.fieldStatistics = fieldStatistics;
this.maxDoc = maxDoc;
}
/** Returns the per-term statistics map (live reference, not a defensive copy). */
public ObjectObjectHashMap<Term, TermStatistics> termStatistics() {
return termStatistics;
}
/**
 * Sets the terms and their parallel statistics array; returns {@code this}
 * for call chaining. No copy is made and no length check is performed here.
 */
public DfsSearchResult termsStatistics(Term[] terms, TermStatistics[] termStatistics) {
this.terms = terms;
this.termStatistics = termStatistics;
return this;
}
/** Returns the term statistics array set via {@code termsStatistics} (live reference). */
public TermStatistics[] termStatistics() {
return termStatistics;
}
/** Serializes the array as a vint count followed by one entry per statistic. */
public static void writeTermStats(StreamOutput out, TermStatistics[] termStatistics) throws IOException {
    out.writeVInt(termStatistics.length);
    for (int i = 0; i < termStatistics.length; i++) {
        writeSingleTermStats(out, termStatistics[i]);
    }
}
/** Writes docFreq and totalTermFreq; the latter is shifted by one so -1 fits in a vlong. */
public static void writeSingleTermStats(StreamOutput out, TermStatistics termStatistic) throws IOException {
    final long docFreq = termStatistic.docFreq();
    assert docFreq >= 0;
    out.writeVLong(docFreq);
    out.writeVLong(addOne(termStatistic.totalTermFreq()));
}
/**
 * Returns statistics for {@code term}: pre-aggregated stats from {@code dfs}
 * when available, otherwise the local statistics of the positioned TermsEnum.
 */
private TermStatistics getTermStatistics(TermsEnum termsEnum, Term term) throws IOException {
    if (dfs == null) {
        return new TermStatistics(termsEnum.term(), termsEnum.docFreq(), termsEnum.totalTermFreq());
    }
    return dfs.termStatistics().get(term);
}
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) {
// Constant idf of 1.0 with a placeholder description; the statistics are ignored.
return Explanation.match(1.0f, "Inexplicable");
}