下面列出了 org.apache.lucene.index.TermStates 类 API 的使用示例代码及写法,也可以点击链接到 GitHub 查看源代码。
/**
 * Creates the weight for the enclosing {@code TermAutomatonQuery}.
 *
 * <p>Term statistics are gathered for every {@code idToTerm} entry that has a non-null term and
 * whose {@link TermStates} reports a positive doc freq. If none qualify, {@code stats} is set to
 * {@code null}; otherwise it is the scorer produced by the searcher's similarity.
 *
 * @param automaton the automaton stored on this weight
 * @param searcher source of the similarity and of term/collection statistics
 * @param termStates term states keyed by term id
 * @param boost boost passed to the similarity scorer
 * @throws IOException if statistics cannot be read
 */
public TermAutomatonWeight(Automaton automaton, IndexSearcher searcher, Map<Integer,TermStates> termStates, float boost) throws IOException {
  super(TermAutomatonQuery.this);
  this.automaton = automaton;
  this.termStates = termStates;
  this.similarity = searcher.getSimilarity();

  final List<TermStatistics> collected = new ArrayList<>();
  for (Map.Entry<Integer,BytesRef> idAndTerm : idToTerm.entrySet()) {
    final BytesRef termBytes = idAndTerm.getValue();
    if (termBytes == null) {
      continue;
    }
    final TermStates state = termStates.get(idAndTerm.getKey());
    if (state.docFreq() <= 0) {
      continue;
    }
    collected.add(searcher.termStatistics(new Term(field, termBytes), state.docFreq(), state.totalTermFreq()));
  }

  // With no matching term at all the similarity is never consulted, so no scorer is built.
  stats = collected.isEmpty()
      ? null
      : similarity.scorer(boost, searcher.collectionStatistics(field),
          collected.toArray(new TermStatistics[collected.size()]));
}
/**
 * Builds the query used for a single term.
 *
 * <p>When {@code ignoreTF} is set, the term query is wrapped in a {@link ConstantScoreQuery}.
 * Otherwise the term query is given a hand-built {@link TermStates} whose doc freq and total term
 * freq are meant to add up to 1.
 *
 * @param reader reader whose leaves are probed for the term
 * @param term the term to query
 * @return the query for {@code term}
 * @throws IOException if a leaf cannot be read
 */
private Query newTermQuery(IndexReader reader, Term term) throws IOException {
  if (ignoreTF) {
    return new ConstantScoreQuery(new TermQuery(term));
  }

  // Fabricate a TermStates that reports an overall df and ttf equal to 1.
  final TermStates artificial = new TermStates(reader.getContext());
  for (LeafReaderContext leaf : reader.leaves()) {
    final Terms fieldTerms = leaf.reader().terms(term.field());
    if (fieldTerms == null) {
      continue;
    }
    final TermsEnum cursor = fieldTerms.iterator();
    if (!cursor.seekExact(term.bytes())) {
      continue;
    }
    // Register whatever is still missing to bring the running df/ttf up to 1.
    final int remaining = 1 - artificial.docFreq();
    artificial.register(cursor.termState(), leaf.ord, remaining, remaining);
  }
  return new TermQuery(term, artificial);
}
/**
 * Creates a {@link TermQuery} backed by a synthetic {@link TermStates} whose aggregated doc freq
 * and total term freq are intended to equal 1.
 *
 * @param reader reader whose leaves are searched for the term
 * @param term the term to query
 * @return a term query carrying the synthetic states
 * @throws IOException if a leaf cannot be read
 */
private Query newTermQuery(IndexReader reader, Term term) throws IOException {
  final TermStates synthetic = new TermStates(reader.getContext());
  for (LeafReaderContext segment : reader.leaves()) {
    final Terms segmentTerms = segment.reader().terms(term.field());
    if (segmentTerms != null) {
      final TermsEnum iterator = segmentTerms.iterator();
      final boolean present = iterator.seekExact(term.bytes());
      if (present) {
        // Top the running totals up to exactly 1 for both df and ttf.
        final int missing = 1 - synthetic.docFreq();
        synthetic.register(iterator.termState(), segment.ord, missing, missing);
      }
    }
  }
  return new TermQuery(term, synthetic);
}
/**
 * Collects term statistics from one node's searcher at a given version.
 *
 * <p>Only terms whose doc freq is positive end up in the result. The acquired searcher is always
 * released, even if statistics gathering fails.
 *
 * @param terms terms to look up
 * @param nodeID index into {@code nodes}
 * @param version searcher version to acquire
 * @return statistics for each term that has a positive doc freq
 * @throws SearcherExpiredException if no searcher could be acquired for {@code version}
 * @throws IOException if the index cannot be read
 */
Map<Term,TermStatistics> getNodeTermStats(Set<Term> terms, int nodeID, long version) throws IOException {
  final NodeState nodeState = nodes[nodeID];
  final Map<Term,TermStatistics> result = new HashMap<>();
  final IndexSearcher searcher = nodeState.searchers.acquire(version);
  if (searcher == null) {
    throw new SearcherExpiredException("node=" + nodeID + " version=" + version);
  }
  try {
    for (Term t : terms) {
      final TermStates states = TermStates.build(searcher.getIndexReader().getContext(), t, true);
      if (states.docFreq() <= 0) {
        continue;
      }
      result.put(t, searcher.termStatistics(t, states.docFreq(), states.totalTermFreq()));
    }
  } finally {
    nodeState.searchers.release(searcher);
  }
  return result;
}
/**
 * Create a TopTermsSpanBooleanQueryRewrite for
 * at most <code>size</code> terms.
 *
 * <p>The delegate collects one {@link SpanTermQuery} per term into a list and combines them into
 * a {@link SpanOrQuery}.
 */
public TopTermsSpanBooleanQueryRewrite(int size) {
  delegate = new TopTermsRewrite<List<SpanQuery>>(size) {
    @Override
    protected int getMaxSize() {
      return Integer.MAX_VALUE;
    }

    @Override
    protected List<SpanQuery> getTopLevelBuilder() {
      return new ArrayList<>();
    }

    @Override
    protected Query build(List<SpanQuery> builder) {
      final SpanQuery[] clauses = builder.toArray(new SpanQuery[builder.size()]);
      return new SpanOrQuery(clauses);
    }

    @Override
    protected void addClause(List<SpanQuery> topLevel, Term term, int docFreq, float boost, TermStates states) {
      // docFreq and boost are not used here; only the term and its states are kept.
      topLevel.add(new SpanTermQuery(term, states));
    }
  };
}
/**
 * Builds the similarity scorer for a span query from the statistics of its terms.
 *
 * @param query the span query; its field selects the collection statistics
 * @param searcher source of term and collection statistics
 * @param termStates states for each term of the query; may be {@code null}
 * @param boost boost passed to the similarity
 * @return the scorer, or {@code null} when {@code termStates} is null or empty, the query has no
 *     field, or no term has a positive doc freq
 * @throws IOException if statistics cannot be read
 */
private Similarity.SimScorer buildSimWeight(SpanQuery query, IndexSearcher searcher, Map<Term, TermStates> termStates, float boost) throws IOException {
  if (termStates == null || termStates.size() == 0 || query.getField() == null) {
    return null;
  }

  // Sized for the worst case; only the first `found` slots are filled.
  final TermStatistics[] perTerm = new TermStatistics[termStates.size()];
  int found = 0;
  for (Map.Entry<Term, TermStates> termAndStates : termStates.entrySet()) {
    final TermStates states = termAndStates.getValue();
    if (states.docFreq() <= 0) {
      continue;
    }
    perTerm[found] = searcher.termStatistics(termAndStates.getKey(), states.docFreq(), states.totalTermFreq());
    found++;
  }

  final CollectionStatistics collectionStats = searcher.collectionStatistics(query.getField());
  if (found == 0) {
    // None of the terms exist, so the similarity will not be used.
    return null;
  }
  return similarity.scorer(boost, collectionStats, ArrayUtil.copyOfSubArray(perTerm, 0, found));
}
/**
 * Copies the per-leaf term states of {@code ctx} into a fresh {@link TermStates} and accumulates
 * the supplied artificial statistics onto it.
 *
 * <p>The previous version set the loop bound to 1 when {@code leaves} was {@code null} and then
 * dereferenced {@code leaves} in the loop, which could only end in a NullPointerException. The
 * loop is now skipped when there are no leaves, matching the null guard used by the sibling
 * {@code adjustTTF} and {@code adjustDF} helpers.
 *
 * @param readerContext context the new states are created for
 * @param ctx existing states to copy per-leaf term states from
 * @param artificialDf doc freq to accumulate on the result
 * @param artificialTtf total term freq to accumulate on the result
 * @return the new states
 * @throws IOException if reading a term state fails
 */
private static TermStates adjustFrequencies(IndexReaderContext readerContext,
    TermStates ctx, int artificialDf, long artificialTtf) throws IOException {
  final List<LeafReaderContext> leaves = readerContext.leaves();
  final TermStates newCtx = new TermStates(readerContext);
  if (leaves != null) {
    final int len = leaves.size();
    for (int i = 0; i < len; ++i) {
      final TermState termState = ctx.get(leaves.get(i));
      if (termState != null) {
        // Registered without per-leaf stats; the totals are added once below.
        newCtx.register(termState, i);
      }
    }
  }
  newCtx.accumulateStatistics(artificialDf, artificialTtf);
  return newCtx;
}
/**
 * Rewrites this query. If the superclass already produced a different query, that one is
 * returned. Otherwise term states are built for every term, passed through {@code blend}, and
 * handed to {@code topLevelQuery} together with the doc freqs captured before blending.
 *
 * @param reader reader to rewrite against
 * @return the rewritten query
 * @throws IOException if term states cannot be built
 */
@Override
public Query rewrite(IndexReader reader) throws IOException {
  final Query fromSuper = super.rewrite(reader);
  if (fromSuper != this) {
    return fromSuper;
  }

  final IndexReaderContext topContext = reader.getContext();
  final TermStates[] states = new TermStates[terms.length];
  final int[] originalDocFreqs = new int[states.length];
  int i = 0;
  while (i < terms.length) {
    final TermStates built = TermStates.build(topContext, terms[i], true);
    states[i] = built;
    // Captured before blend() is given the states.
    originalDocFreqs[i] = built.docFreq();
    i++;
  }

  final int maxDoc = reader.maxDoc();
  blend(states, maxDoc, reader);
  return topLevelQuery(terms, states, originalDocFreqs, maxDoc);
}
/**
 * Rewrites this query. With no terms it becomes a {@link MatchNoDocsQuery}, and with a single
 * term it becomes that term's query. Otherwise term states are collected across all leaves and
 * passed to {@code buildQuery}.
 *
 * @param reader reader to rewrite against
 * @return the rewritten query
 * @throws IOException if the index cannot be read
 */
@Override
public Query rewrite(IndexReader reader) throws IOException {
  if (this.terms.isEmpty()) {
    return new MatchNoDocsQuery("CommonTermsQuery with no terms");
  }
  if (this.terms.size() == 1) {
    return newTermQuery(this.terms.get(0), null);
  }

  final List<LeafReaderContext> segments = reader.leaves();
  final int maxDoc = reader.maxDoc();
  final TermStates[] statesPerTerm = new TermStates[terms.size()];
  final Term[] termArray = this.terms.toArray(new Term[0]);
  collectTermStates(reader, segments, statesPerTerm, termArray);
  return buildQuery(maxDoc, statesPerTerm, termArray);
}
/**
 * Fills {@code contextArray} with the {@link TermStates} of each query term across the leaves.
 *
 * <p>For every leaf and every term, the term is looked up in its own field. A hit either creates
 * the slot's {@link TermStates} or registers the leaf's term state and statistics on the existing
 * one. Slots of terms that are never found are left untouched.
 *
 * @param reader reader supplying the top-level context for new states
 * @param leaves leaves to scan
 * @param contextArray output array, indexed like {@code queryTerms}
 * @param queryTerms terms to look up
 * @throws IOException if a leaf cannot be read
 */
public void collectTermStates(IndexReader reader,
    List<LeafReaderContext> leaves, TermStates[] contextArray,
    Term[] queryTerms) throws IOException {
  for (LeafReaderContext leaf : leaves) {
    for (int slot = 0; slot < queryTerms.length; slot++) {
      final Term wanted = queryTerms[slot];
      final Terms fieldTerms = leaf.reader().terms(wanted.field());
      if (fieldTerms == null) {
        // field does not exist
        continue;
      }
      final TermsEnum cursor = fieldTerms.iterator();
      assert cursor != null;
      if (cursor == TermsEnum.EMPTY || !cursor.seekExact(wanted.bytes())) {
        continue;
      }
      final TermStates existing = contextArray[slot];
      if (existing != null) {
        existing.register(cursor.termState(), leaf.ord,
            cursor.docFreq(), cursor.totalTermFreq());
      } else {
        contextArray[slot] = new TermStates(reader.getContext(),
            cursor.termState(), leaf.ord, cursor.docFreq(),
            cursor.totalTermFreq());
      }
    }
  }
}
/**
 * Creates the weight for this query. A {@link TermStates} is built for every non-null term in
 * {@code termToID}, keyed by its id, and the resulting map is handed to the weight.
 *
 * @param searcher searcher the weight is created for
 * @param scoreMode decides whether term statistics are needed
 * @param boost boost passed on to the weight
 * @return the new weight
 * @throws IOException if term states cannot be built
 */
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
  final IndexReaderContext topContext = searcher.getTopReaderContext();
  final Map<Integer,TermStates> statesById = new HashMap<>();
  for (Map.Entry<BytesRef,Integer> termAndId : termToID.entrySet()) {
    final BytesRef termBytes = termAndId.getKey();
    if (termBytes == null) {
      continue;
    }
    final Term t = new Term(field, termBytes);
    statesById.put(termAndId.getValue(), TermStates.build(topContext, t, scoreMode.needsScores()));
  }
  return new TermAutomatonWeight(det, searcher, statesById, boost);
}
/**
 * Creates the weight for the enclosing {@code TermQuery}.
 *
 * <p>When scores are needed, real collection and term statistics are fetched from the searcher.
 * If the term has no positive doc freq, no scorer is created. When scores are not needed, fixed
 * placeholder statistics are used instead.
 *
 * @param searcher source of the similarity and statistics
 * @param scoreMode decides whether real statistics are required
 * @param boost boost passed to the similarity scorer
 * @param termStates states of the term; must be non-null when scores are needed
 * @throws IllegalStateException if scores are needed but {@code termStates} is {@code null}
 * @throws IOException if statistics cannot be read
 */
public TermWeight(IndexSearcher searcher, ScoreMode scoreMode,
    float boost, TermStates termStates) throws IOException {
  super(TermQuery.this);
  if (scoreMode.needsScores() && termStates == null) {
    throw new IllegalStateException("termStates are required when scores are needed");
  }
  this.scoreMode = scoreMode;
  this.termStates = termStates;
  this.similarity = searcher.getSimilarity();

  final CollectionStatistics collectionStats;
  final TermStatistics termStats;
  if (!scoreMode.needsScores()) {
    // Real stats are irrelevant here; use fake stats with docFreq=maxDoc=ttf=1.
    collectionStats = new CollectionStatistics(term.field(), 1, 1, 1, 1);
    termStats = new TermStatistics(term.bytes(), 1, 1);
  } else {
    collectionStats = searcher.collectionStatistics(term.field());
    if (termStates.docFreq() > 0) {
      termStats = searcher.termStatistics(term, termStates.docFreq(), termStates.totalTermFreq());
    } else {
      termStats = null;
    }
  }

  // A null termStats means the term is in no segment, so the similarity is not used at all.
  this.simScorer = (termStats == null) ? null : similarity.scorer(boost, collectionStats, termStats);
}
/**
 * Creates the weight for this term query. The pre-built {@code perReaderTermState} is reused when
 * it was built for the searcher's top-level context; otherwise fresh states are built.
 *
 * @param searcher searcher the weight is created for
 * @param scoreMode decides whether statistics are needed when building states
 * @param boost boost passed on to the weight
 * @return the new weight
 * @throws IOException if term states cannot be built
 */
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
  final IndexReaderContext topContext = searcher.getTopReaderContext();
  final boolean canReuse = perReaderTermState != null && perReaderTermState.wasBuiltFor(topContext);
  final TermStates states;
  if (canReuse) {
    // The pre-built states belong to this searcher's context.
    states = this.perReaderTermState;
  } else {
    states = TermStates.build(topContext, term, scoreMode.needsScores());
  }
  return new TermWeight(searcher, scoreMode, boost, states);
}
/**
 * Creates the span weight for this term. Existing {@code termStates} are reused when they were
 * built for the searcher's top-level context; otherwise new ones are built. The term-to-states
 * map is only passed to the weight when scores are needed.
 *
 * @param searcher searcher the weight is created for
 * @param scoreMode decides whether statistics and the states map are needed
 * @param boost boost passed on to the weight
 * @return the new span weight
 * @throws IOException if term states cannot be built
 */
@Override
public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
  final IndexReaderContext topContext = searcher.getTopReaderContext();
  final boolean reusable = termStates != null && termStates.wasBuiltFor(topContext);
  final TermStates context = reusable
      ? termStates
      : TermStates.build(topContext, term, scoreMode.needsScores());
  return new SpanTermWeight(
      context,
      searcher,
      scoreMode.needsScores() ? Collections.singletonMap(term, context) : null,
      boost);
}
/**
 * Build a map of terms to {@link TermStates}, for use in constructing SpanWeights
 *
 * @param weights weights asked, in order, to add their term states to the map
 * @return a sorted map holding whatever the weights extracted
 * @lucene.internal
 */
public static Map<Term, TermStates> getTermStates(SpanWeight... weights) {
  final Map<Term, TermStates> collected = new TreeMap<>();
  for (int i = 0; i < weights.length; i++) {
    weights[i].extractTermStates(collected);
  }
  return collected;
}
/**
 * Build a map of terms to {@link TermStates}, for use in constructing SpanWeights
 *
 * @param weights weights asked, in iteration order, to add their term states to the map
 * @return a sorted map holding whatever the weights extracted
 * @lucene.internal
 */
public static Map<Term, TermStates> getTermStates(Collection<SpanWeight> weights) {
  final Map<Term, TermStates> collected = new TreeMap<>();
  weights.forEach(weight -> weight.extractTermStates(collected));
  return collected;
}
/**
 * Expert: Add a {@link Term} with the provided boost and context.
 * This method is useful if you already have a {@link TermStates}
 * object constructed for the given term.
 *
 * @param term the term to add
 * @param boost the boost stored alongside the term
 * @param context the states stored alongside the term
 * @return this builder
 * @throws IndexSearcher.TooManyClauses if the builder already holds the maximum clause count
 */
public Builder add(Term term, float boost, TermStates context) {
  final int slot = numTerms;
  if (slot >= IndexSearcher.getMaxClauseCount()) {
    throw new IndexSearcher.TooManyClauses();
  }

  // Grow the three parallel arrays so that index `slot` is valid in each.
  final int required = slot + 1;
  terms = ArrayUtil.grow(terms, required);
  boosts = ArrayUtil.grow(boosts, required);
  contexts = ArrayUtil.grow(contexts, required);

  terms[slot] = term;
  boosts[slot] = boost;
  contexts[slot] = context;
  numTerms = required;
  return this;
}
/**
 * Stores the given parallel arrays and sorts them in place by term, keeping each term's boost
 * and states aligned with it.
 *
 * @param terms the terms; reordered in place
 * @param boosts one boost per term; reordered in place
 * @param contexts one states object per term; reordered in place
 * @param rewriteMethod the rewrite method stored on this query
 */
private BlendedTermQuery(Term[] terms, float[] boosts, TermStates[] contexts,
    RewriteMethod rewriteMethod) {
  assert terms.length == boosts.length;
  assert terms.length == contexts.length;
  this.terms = terms;
  this.boosts = boosts;
  this.contexts = contexts;
  this.rewriteMethod = rewriteMethod;

  // Terms are sorted so that equals/hashcode does not rely on the order.
  final InPlaceMergeSorter byTerm = new InPlaceMergeSorter() {
    @Override
    protected int compare(int i, int j) {
      return terms[i].compareTo(terms[j]);
    }

    @Override
    protected void swap(int i, int j) {
      // Swap the same two slots in all three parallel arrays.
      final Term heldTerm = terms[i];
      terms[i] = terms[j];
      terms[j] = heldTerm;

      final float heldBoost = boosts[i];
      boosts[i] = boosts[j];
      boosts[j] = heldBoost;

      final TermStates heldStates = contexts[i];
      contexts[i] = contexts[j];
      contexts[j] = heldStates;
    }
  };
  byTerm.sort(0, terms.length);
}
/**
 * Rewrites this query into per-term queries that share blended statistics.
 *
 * <p>Works on a copy of {@code contexts}. Any entry that is missing or was built for a different
 * reader context is rebuilt. The blended doc freq is the maximum over all terms and the blended
 * total term freq is their sum. Each term's states are adjusted to those values and wrapped in a
 * {@link TermQuery}, with a {@link BoostQuery} around it when its boost is not 1.
 *
 * @param reader reader to rewrite against
 * @return the result of applying {@code rewriteMethod} to the per-term queries
 * @throws IOException if term states cannot be built
 */
@Override
public final Query rewrite(IndexReader reader) throws IOException {
  final TermStates[] states = ArrayUtil.copyOfSubArray(this.contexts, 0, this.contexts.length);

  // Ensure every entry is usable for this reader, and aggregate while doing so:
  // df is the max of all doc freqs, ttf is the sum of all total term freqs.
  int maxDocFreq = 0;
  long summedTtf = 0;
  for (int i = 0; i < states.length; ++i) {
    final TermStates cached = states[i];
    if (cached == null || !cached.wasBuiltFor(reader.getContext())) {
      states[i] = TermStates.build(reader.getContext(), terms[i], true);
    }
    maxDocFreq = Math.max(maxDocFreq, states[i].docFreq());
    summedTtf += states[i].totalTermFreq();
  }

  for (int i = 0; i < states.length; ++i) {
    states[i] = adjustFrequencies(reader.getContext(), states[i], maxDocFreq, summedTtf);
  }

  final Query[] perTerm = new Query[terms.length];
  for (int i = 0; i < terms.length; ++i) {
    final Query plain = new TermQuery(terms[i], states[i]);
    perTerm[i] = (boosts[i] != 1f) ? new BoostQuery(plain, boosts[i]) : plain;
  }
  return rewriteMethod.rewrite(perTerm);
}
/**
 * Compute a feature value that may be used as the {@code pivot} parameter of
 * the {@link #newSaturationQuery(String, String, float, float)} and
 * {@link #newSigmoidQuery(String, String, float, float, float)} factory
 * methods. The implementation takes the average of the int bits of the float
 * representation in practice before converting it back to a float. Given that
 * floats store the exponent in the higher bits, it means that the result will
 * be an approximation of the geometric mean of all feature values.
 * @param reader the {@link IndexReader} to search against
 * @param featureField the field that stores features
 * @param featureName the name of the feature
 * @return the decoded average frequency of the feature term, or {@code 1} when the term has a
 *     doc freq of zero
 * @throws IOException if the term states cannot be built
 */
static float computePivotFeatureValue(IndexReader reader, String featureField, String featureName) throws IOException {
  final Term featureTerm = new Term(featureField, featureName);
  final TermStates featureStates = TermStates.build(reader.getContext(), featureTerm, true);
  final int docFreq = featureStates.docFreq();
  if (docFreq == 0) {
    // Guards the division below. The term does not exist, so the value is never used
    // for scoring and only has to be legal.
    return 1;
  }
  final double meanFreq = (double) featureStates.totalTermFreq() / docFreq;
  return decodeFeatureValue((float) meanFreq);
}
/**
 * Checks {@link TermQuery} equality: equal terms give equal queries, different terms give unequal
 * queries, and supplying a {@link TermStates} does not affect equality.
 */
public void testEquals() throws IOException {
  final TermQuery fooBar = new TermQuery(new Term("foo", "bar"));
  final TermQuery fooBarAgain = new TermQuery(new Term("foo", "bar"));
  QueryUtils.checkEqual(fooBar, fooBarAgain);

  final TermQuery left = new TermQuery(new Term("foo", "bar"));
  final TermQuery fooBaz = new TermQuery(new Term("foo", "baz"));
  QueryUtils.checkUnequal(left, fooBaz);

  // The context is taken from an empty MultiReader, which is closed before the context is used.
  final CompositeReaderContext context;
  try (MultiReader emptyReader = new MultiReader()) {
    context = emptyReader.getContext();
  }

  final TermQuery withoutStates = new TermQuery(new Term("foo", "bar"));
  final TermQuery withStates =
      new TermQuery(new Term("foo", "bar"), TermStates.build(context, new Term("foo", "bar"), true));
  QueryUtils.checkEqual(withoutStates, withStates);
}
/**
 * Fills {@code contextArray} with the {@link TermStates} of each query term, looking every term
 * up in this query's single {@code field}.
 *
 * <p>One {@link TermsEnum} is obtained per leaf and reused for all terms. A hit either creates the
 * slot's {@link TermStates} or registers the leaf's term state and statistics on the existing one.
 *
 * @param reader reader supplying the top-level context for new states
 * @param leaves leaves to scan
 * @param contextArray output array, indexed like {@code queryTerms}
 * @param queryTerms terms to look up
 * @throws IOException if a leaf cannot be read
 */
private void collectTermStates(IndexReader reader,
                               List<LeafReaderContext> leaves,
                               TermStates[] contextArray,
                               Term[] queryTerms) throws IOException {
  for (LeafReaderContext leaf : leaves) {
    final Terms fieldTerms = leaf.reader().terms(this.field);
    if (fieldTerms == null) {
      // field does not exist
      continue;
    }
    final TermsEnum cursor = fieldTerms.iterator();
    if (cursor == TermsEnum.EMPTY) {
      continue;
    }
    for (int slot = 0; slot < queryTerms.length; slot++) {
      final TermStates existing = contextArray[slot];
      if (!cursor.seekExact(queryTerms[slot].bytes())) {
        continue;
      }
      if (existing != null) {
        existing.register(cursor.termState(), leaf.ord,
            cursor.docFreq(), cursor.totalTermFreq());
      } else {
        contextArray[slot] = new TermStates(reader.getContext(),
            cursor.termState(), leaf.ord, cursor.docFreq(),
            cursor.totalTermFreq());
      }
    }
  }
}
/**
 * Accumulates doc freq and total term freq for each query term across all leaves of the given
 * top-level context.
 *
 * <p>Only statistics are accumulated; no per-leaf term state is registered. A slot's
 * {@link TermStates} is created the first time its term is found, so terms that are never found
 * keep whatever the slot held before.
 *
 * @param topReaderContext context whose leaves are scanned and for which new states are created
 * @param contextArray output array, indexed like {@code queryTerms}
 * @param queryTerms terms to look up, each in its own field
 * @throws IOException if a leaf cannot be read
 */
private static void collectTermStates(IndexReaderContext topReaderContext, TermStates[] contextArray,
                                      Term[] queryTerms) throws IOException {
  for (LeafReaderContext leaf : topReaderContext.leaves()) {
    for (int slot = 0; slot < queryTerms.length; slot++) {
      final Term wanted = queryTerms[slot];
      final Terms fieldTerms = leaf.reader().terms(wanted.field());
      if (fieldTerms == null) {
        // field does not exist
        continue;
      }
      final TermsEnum cursor = fieldTerms.iterator();
      assert cursor != null;
      if (cursor == TermsEnum.EMPTY) {
        continue;
      }
      TermStates target = contextArray[slot];
      if (!cursor.seekExact(wanted.bytes())) {
        continue;
      }
      if (target == null) {
        target = new TermStates(topReaderContext);
        contextArray[slot] = target;
      }
      target.accumulateStatistics(cursor.docFreq(), cursor.totalTermFreq());
    }
  }
}
/**
 * Creates the weight for this query from freshly built term states, the query boost and the field
 * boost that {@code fieldBoost} reports for the term's field.
 *
 * @param searcher searcher the weight is created for
 * @param scoreMode decides whether statistics are needed when building states
 * @param boost the query boost
 * @return the new weight
 * @throws IOException if term states cannot be built
 */
@Override
public Weight createWeight(final IndexSearcher searcher, final ScoreMode scoreMode, final float boost)
    throws IOException {
  final IndexReaderContext topContext = searcher.getTopReaderContext();
  final TermStates states = TermStates.build(topContext, term, scoreMode.needsScores());
  // TODO: set boosts to 1f if needsScores is false?
  return new FieldBoostWeight(
      states,
      boost,
      fieldBoost.getBoost(term.field(), searcher.getIndexReader()));
}
/**
 * Creates the weight for the enclosing {@code FieldBoostTermQuery}. The score is the product of
 * the query boost and the field boost.
 *
 * @param termStates states of the term; asserted to be non-null
 * @param queryBoost the query-level boost
 * @param fieldBoost the field-level boost
 */
public FieldBoostWeight(final TermStates termStates, final float queryBoost, final float fieldBoost) {
  super(FieldBoostTermQuery.this);
  assert termStates != null : "TermContext must not be null";
  this.termStates = termStates;
  this.fieldBoost = fieldBoost;
  this.queryBoost = queryBoost;
  this.score = queryBoost * fieldBoost;
}
/**
 * Returns states equivalent to {@code termContext} but reporting {@code sumTTF} as the total
 * term freq.
 *
 * <p>If both {@code sumTTF} and the current total term freq are {@code -1}, the input is returned
 * unchanged. Otherwise every per-leaf term state is re-registered on a new {@link TermStates}.
 * The first registered leaf carries the whole doc freq and {@code sumTTF}, and later leaves are
 * registered with zeros.
 *
 * @param readerContext context the states were built for and the new ones are created for
 * @param termContext the existing states
 * @param sumTTF the total term freq to report
 * @return the input states or a new adjusted copy
 * @throws IOException if reading a term state fails
 */
private TermStates adjustTTF(IndexReaderContext readerContext,
                             TermStates termContext,
                             long sumTTF) throws IOException {
  assert termContext.wasBuiltFor(readerContext);
  if (sumTTF == -1 && termContext.totalTermFreq() == -1) {
    return termContext;
  }

  final TermStates adjusted = new TermStates(readerContext);
  final List<LeafReaderContext> leaves = readerContext.leaves();
  int pendingDf = termContext.docFreq();
  long pendingTtf = sumTTF;
  if (leaves == null) {
    return adjusted;
  }

  for (int ord = 0; ord < leaves.size(); ord++) {
    final TermState leafState = termContext.get(leaves.get(ord));
    if (leafState != null) {
      adjusted.register(leafState, ord, pendingDf, pendingTtf);
      // The totals have been handed over; remaining leaves contribute nothing.
      pendingDf = 0;
      pendingTtf = 0;
    }
  }
  return adjusted;
}
/**
 * Returns new states equivalent to {@code ctx} but reporting {@code newDocFreq} as the doc freq.
 *
 * <p>The total term freq stays {@code -1} if it was negative; otherwise it is raised to at least
 * {@code newDocFreq}. The first registered leaf carries the whole doc freq and total term freq,
 * and later leaves are registered with zeros.
 *
 * @param readerContext context the states were built for and the new ones are created for
 * @param ctx the existing states
 * @param newDocFreq the doc freq to report
 * @return the adjusted copy
 * @throws IOException if reading a term state fails
 */
private static TermStates adjustDF(IndexReaderContext readerContext,
                                   TermStates ctx,
                                   int newDocFreq) throws IOException {
  assert ctx.wasBuiltFor(readerContext);

  // Use a value of ttf that is consistent with the doc freq (ie. gte).
  long pendingTtf = (ctx.totalTermFreq() < 0)
      ? -1
      : Math.max(ctx.totalTermFreq(), newDocFreq);
  int pendingDf = newDocFreq;

  final List<LeafReaderContext> leaves = readerContext.leaves();
  final TermStates adjusted = new TermStates(readerContext);
  if (leaves == null) {
    return adjusted;
  }

  for (int ord = 0; ord < leaves.size(); ++ord) {
    final TermState leafState = ctx.get(leaves.get(ord));
    if (leafState != null) {
      adjusted.register(leafState, ord, pendingDf, pendingTtf);
      // The totals have been handed over; remaining leaves contribute nothing.
      pendingDf = 0;
      pendingTtf = 0;
    }
  }
  return adjusted;
}
/**
 * Creates the weight for the enclosing {@code SpanPayloadCheckQuery}. The searcher, term states
 * and boost are forwarded to the superclass constructor, and {@code matchWeight} is stored on
 * this instance.
 *
 * @param searcher searcher passed to the superclass
 * @param termStates term-to-states map passed to the superclass
 * @param matchWeight the wrapped span weight kept by this weight
 * @param boost boost passed to the superclass
 * @throws IOException declared by the constructor; presumably raised by the superclass
 */
public SpanPayloadCheckWeight(IndexSearcher searcher, Map<Term, TermStates> termStates, SpanWeight matchWeight, float boost) throws IOException {
super(SpanPayloadCheckQuery.this, searcher, termStates, boost);
this.matchWeight = matchWeight;
}
/**
 * Delegates to {@code matchWeight}, passing it the same map to add its term states to.
 *
 * @param contexts map handed through to the wrapped weight
 */
@Override
public void extractTermStates(Map<Term, TermStates> contexts) {
matchWeight.extractTermStates(contexts);
}
/**
 * Delegates to {@code innerWeight}, passing it the same map to add its term states to.
 *
 * @param contexts map handed through to the wrapped weight
 */
@Override
public void extractTermStates(Map<Term, TermStates> contexts) {
innerWeight.extractTermStates(contexts);
}