下面列出了org.apache.lucene.index.ImpactsEnum#org.apache.lucene.util.PriorityQueue 实例代码,或者点击链接到github查看源代码,也可以在右侧发表评论。
/**
 * Offers {@code path} to the corrections queue, using {@code Math.exp(score)} as its score.
 * <p>
 * Nothing happens unless the exponentiated score is greater than {@code cutoffScore}. While the
 * queue holds fewer than {@code maxNumCorrections} entries a copy of the path is added; otherwise
 * the queue's top entry is overwritten in place when {@code top.compareTo(score, path) < 0}.
 *
 * @param candidates  candidate sets; only the array length is used here, to size the copied path
 * @param path        the candidate path to offer (copied, never stored directly)
 * @param corrections queue receiving the corrections
 * @param cutoffScore exclusive lower bound for the exponentiated score
 * @param score       score of the path before {@code Math.exp} is applied
 */
private void updateTop(CandidateSet[] candidates, Candidate[] path, PriorityQueue<Correction> corrections, double cutoffScore, double score)
    throws IOException {
  final double expScore = Math.exp(score);
  assert Math.abs(expScore - score(path, candidates)) < 0.00001;
  // Written as a negated '>' so a NaN score is rejected exactly like in a plain 'score > cutoff' test.
  if (!(expScore > cutoffScore)) {
    return;
  }
  if (corrections.size() < maxNumCorrections) {
    // Still room: store a private copy of the path, since the caller keeps mutating it.
    final Candidate[] snapshot = new Candidate[candidates.length];
    System.arraycopy(path, 0, snapshot, 0, path.length);
    corrections.add(new Correction(expScore, snapshot));
    return;
  }
  final Correction head = corrections.top();
  if (head.compareTo(expScore, path) < 0) {
    // Queue is full: recycle the top entry instead of allocating a new one.
    System.arraycopy(path, 0, head.candidates, 0, path.length);
    head.score = expScore;
    corrections.updateTop();
  }
}
/**
 * Drains the given queue, adding one {@code SHOULD} term clause per popped entry to {@code query}.
 * <p>
 * When {@code boost} is set, each clause is boosted by
 * {@code boostFactor * score / bestScore}, where {@code bestScore} is the score of the first
 * entry popped. Adding stops silently as soon as the query reports too many clauses.
 *
 * @param q     queue of scored terms; it is consumed by this call
 * @param query the boolean query that receives the clauses
 */
private void addToQuery(PriorityQueue<ScoreTerm> q, BooleanQuery query) {
  float bestScore = -1;
  for (ScoreTerm entry = q.pop(); entry != null; entry = q.pop()) {
    final TermQuery clause = new TermQuery(new Term(entry.topField, entry.word));
    if (boost) {
      if (bestScore == -1) {
        // first popped entry defines the reference score
        bestScore = entry.score;
      }
      clause.setBoost(boostFactor * entry.score / bestScore);
    }
    try {
      query.add(clause, BooleanClause.Occur.SHOULD);
    } catch (BooleanQuery.TooManyClauses ignore) {
      // clause limit reached: keep what has been added so far
      break;
    }
  }
}
/**
 * Builds the iterator over the given sub-iterators.
 * <p>
 * Every sub-iterator is wrapped and placed in the DISI queue that backs the approximation, and
 * the match cost is the sum of the sub-iterators' match costs. Two interval queues are created:
 * the proximity queue (capacity {@code minShouldMatch}) and the background queue (capacity
 * {@code subs.size()}).
 *
 * @param subs           the sub-iterators to combine
 * @param minShouldMatch capacity of the proximity queue; also stored in the field of the same name
 */
MinimumShouldMatchIntervalIterator(Collection<IntervalIterator> subs, int minShouldMatch) {
  final int subCount = subs.size();
  this.disiQueue = new DisiPriorityQueue(subCount);
  float totalMatchCost = 0;
  for (IntervalIterator sub : subs) {
    this.disiQueue.add(new DisiWrapper(sub));
    totalMatchCost += sub.matchCost();
  }
  this.approximation = new DisjunctionDISIApproximation(disiQueue);
  this.matchCost = totalMatchCost;
  this.minShouldMatch = minShouldMatch;
  this.proximityQueue = new PriorityQueue<IntervalIterator>(minShouldMatch) {
    @Override
    protected boolean lessThan(IntervalIterator a, IntervalIterator b) {
      // order by start; on equal starts the interval with the larger-or-equal end sorts first
      if (a.start() == b.start()) {
        return a.end() >= b.end();
      }
      return a.start() < b.start();
    }
  };
  this.backgroundQueue = new PriorityQueue<IntervalIterator>(subCount) {
    @Override
    protected boolean lessThan(IntervalIterator a, IntervalIterator b) {
      // order by end; on equal ends the interval with the larger-or-equal start sorts first
      if (a.end() == b.end()) {
        return a.start() >= b.start();
      }
      return a.end() < b.end();
    }
  };
}
/**
 * Collects term frequencies from the supplied field values and turns them into a term queue.
 * <p>
 * Only the fields listed in {@code fieldNames} are looked up; fields without an entry in the
 * map and {@code null} values are skipped. Each remaining value is converted with
 * {@code String.valueOf} and analyzed through {@code addTermFrequencies}.
 *
 * @param field2fieldValues field name to the values supplied for that field
 * @return the queue produced by {@code createQueue} for the collected frequencies
 */
private PriorityQueue<ScoreTerm> retrieveTerms(Map<String, Collection<Object>> field2fieldValues) throws
    IOException {
  final Map<String, Map<String, Int>> termFreqsByField = new HashMap<>();
  for (String fieldName : fieldNames) {
    final Collection<Object> values = field2fieldValues.get(fieldName);
    if (values != null) {
      for (Object value : values) {
        if (value == null) {
          continue;
        }
        addTermFrequencies(new StringReader(String.valueOf(value)), termFreqsByField, fieldName);
      }
    }
  }
  return createQueue(termFreqsByField);
}
/**
 * Estimates the cost of a minimum-should-match disjunction.
 *
 * @param costs          costs of the individual clauses
 * @param numScorers     number of clauses
 * @param minShouldMatch number of clauses that must match
 * @return the sum of the costs retained in a queue of capacity
 *         {@code numScorers - minShouldMatch + 1}
 */
static long cost(LongStream costs, int numScorers, int minShouldMatch) {
  // Reasoning: a boolean query c1,c2,...cn with minShouldMatch=m can be rewritten as
  //   (c1 AND (c2..cn|msm=m-1)) OR (!c1 AND (c2..cn|msm=m))
  // Assuming the clauses are sorted by ascending cost:
  //  - the first part costs as much as c1, because a conjunction costs as much as its
  //    least costly clause;
  //  - the second part costs as much as finding m matches among the remaining c2...cn.
  // Being a disjunction overall, the total is the sum of both parts. Applying this
  // recursively shows that the cost of a msm query is the sum of the costs of the
  // num_scorers - minShouldMatch + 1 least costly scorers.
  final int retained = numScorers - minShouldMatch + 1;
  final PriorityQueue<Long> cheapest = new PriorityQueue<Long>(retained) {
    @Override
    protected boolean lessThan(Long a, Long b) {
      // inverted order, so the overflow of insertWithOverflow evicts the largest cost
      return a > b;
    }
  };
  costs.forEach(cheapest::insertWithOverflow);
  final LongStream retainedCosts =
      StreamSupport.stream(cheapest.spliterator(), false).mapToLong(Number::longValue);
  return retainedCosts.sum();
}
/**
 * Recursively fills {@code path} from position {@code ord} onwards and offers every completed
 * path to {@code updateTop}.
 * <p>
 * At each position the original term is always tried. The alternative candidates of that
 * position are tried only while {@code numMissspellingsLeft} is positive; choosing an
 * alternative at a non-final position consumes one of the remaining misspellings.
 *
 * @param candidates           candidate sets, one per position
 * @param path                 working array holding the choice per position; mutated in place
 * @param ord                  position to fill in this call
 * @param numMissspellingsLeft how many more positions may use an alternative candidate
 * @param corrections          queue collecting the corrections
 * @param cutoffScore          passed through to {@code updateTop}
 * @param pathScore            accumulated score of the positions before {@code ord}
 */
public void findCandidates(CandidateSet[] candidates, Candidate[] path, int ord, int numMissspellingsLeft,
    PriorityQueue<Correction> corrections, double cutoffScore, final double pathScore) throws IOException {
  final CandidateSet current = candidates[ord];
  final boolean alternativesAllowed = numMissspellingsLeft > 0;

  // The original term is explored first in every case.
  path[ord] = current.originalTerm;

  if (ord == candidates.length - 1) {
    // Last position: every choice completes a path.
    updateTop(candidates, path, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
    if (alternativesAllowed) {
      for (Candidate alternative : current.candidates) {
        path[ord] = alternative;
        updateTop(candidates, path, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
      }
    }
    return;
  }

  // Inner position: keeping the original term does not consume a misspelling.
  final int budgetAfterOriginal = alternativesAllowed ? numMissspellingsLeft : 0;
  findCandidates(candidates, path, ord + 1, budgetAfterOriginal, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
  if (alternativesAllowed) {
    for (Candidate alternative : current.candidates) {
      path[ord] = alternative;
      findCandidates(candidates, path, ord + 1, numMissspellingsLeft - 1, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
    }
  }
}
/**
 * Builds the term queue for an indexed document.
 * <p>
 * For every name in {@code fieldNames} the term vector of the document is used when present;
 * otherwise the string values of the document's fields with that name are analyzed instead.
 *
 * @param docNum the id of the lucene document from which to find terms
 * @return the queue produced by {@code createQueue} for the collected term frequencies
 */
private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
  final Map<String, Int> termFreqMap = new HashMap<>();
  for (String fieldName : fieldNames) {
    final Fields vectors = ir.getTermVectors(docNum);
    final Terms vector = (vectors == null) ? null : vectors.terms(fieldName);

    if (vector != null) {
      addTermFrequencies(termFreqMap, vector, fieldName);
      continue;
    }

    // field does not store term vector info: fall back to the field's string values
    final Document doc = ir.document(docNum);
    for (IndexableField field : doc.getFields(fieldName)) {
      final String text = field.stringValue();
      if (text == null) {
        continue;
      }
      addTermFrequencies(new FastStringReader(text), termFreqMap, fieldName);
    }
  }
  return createQueue(termFreqMap);
}
/**
 * Returns at most {@code maxQueryTerms} words popped from the term queue built for a document.
 *
 * @param docNum id of the document handed to {@code retrieveTerms(int)}
 * @return the words, in the order they were popped from the queue
 * @throws IOException if building the term queue fails
 * @see #retrieveInterestingTerms(java.io.Reader, String)
 */
public String[] retrieveInterestingTerms(int docNum) throws IOException {
  // Element type is String (not Object) so the compiler, rather than a runtime
  // ArrayStoreException, guards what is copied into the returned String[].
  ArrayList<String> al = new ArrayList<>(maxQueryTerms);
  PriorityQueue<ScoreTerm> pq = retrieveTerms(docNum);
  ScoreTerm scoreTerm;
  // The queue may hold more entries than the caller asked for, so cap the result here.
  int lim = maxQueryTerms;
  while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
    al.add(scoreTerm.word); // only the word itself is of interest to the caller
  }
  return al.toArray(new String[al.size()]);
}
/**
 * Convenience routine to make it easy to return the most interesting words in a document.
 * More advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly.
 * <p>
 * At most {@code maxQueryTerms} words are returned, in the order they are popped from the queue.
 *
 * @param r the source document
 * @param fieldName field passed to analyzer to use when analyzing the content
 * @return the most interesting words in the document
 * @throws IOException if building the term queue fails
 * @see #retrieveTerms(java.io.Reader, String)
 * @see #setMaxQueryTerms
 */
public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
  // Element type is String (not Object) so the compiler, rather than a runtime
  // ArrayStoreException, guards what is copied into the returned String[].
  ArrayList<String> al = new ArrayList<>(maxQueryTerms);
  PriorityQueue<ScoreTerm> pq = retrieveTerms(r, fieldName);
  ScoreTerm scoreTerm;
  // The queue may hold more entries than the caller asked for, so cap the result here.
  int lim = maxQueryTerms;
  while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
    al.add(scoreTerm.word); // only the word itself is of interest to the caller
  }
  return al.toArray(new String[al.size()]);
}
/**
 * Collects up to {@code numTerms} terms of {@code field} whose document frequency is below
 * one tenth of the index's {@code maxDoc()}, and prints each returned term to standard output.
 * <p>
 * The index reader is opened from {@code dir} and is closed via try-with-resources, so a failure
 * while closing can no longer hide an exception thrown while reading the terms.
 *
 * @param field    the field whose terms are inspected
 * @param numTerms capacity of the queue used to retain terms
 * @return the retained words, in the order they are popped from the queue
 * @throws IOException if the index cannot be opened or read
 */
private String[] bestTerms(String field, int numTerms) throws IOException {
  PriorityQueue<TermDf> pq = new TermsDfQueue(numTerms);
  try (IndexReader ir = DirectoryReader.open(dir)) {
    int threshold = ir.maxDoc() / 10; // ignore words too common.
    Terms terms = MultiTerms.getTerms(ir, field);
    if (terms != null) {
      TermsEnum termsEnum = terms.iterator();
      while (termsEnum.next() != null) {
        int df = termsEnum.docFreq();
        if (df < threshold) {
          String ttxt = termsEnum.term().utf8ToString();
          pq.insertWithOverflow(new TermDf(ttxt, df));
        }
      }
    }
  }
  String[] res = new String[pq.size()];
  int i = 0;
  while (pq.size() > 0) {
    TermDf tdf = pq.pop();
    res[i++] = tdf.word;
    // i was already incremented above, so the printed rank is 1-based
    System.out.println(i+". word: "+tdf.df+" "+tdf.word);
  }
  return res;
}
/**
 * Creates the iterator over the given sub-iterators.
 * <p>
 * Sets up a queue with room for every sub-iterator and keeps an array copy of the list.
 *
 * @param subIterators the iterators to combine; also handed to the superclass constructor
 */
UnorderedIntervalIterator(List<IntervalIterator> subIterators) {
  super(subIterators);
  final int count = subIterators.size();
  this.queue = new PriorityQueue<IntervalIterator>(count) {
    @Override
    protected boolean lessThan(IntervalIterator a, IntervalIterator b) {
      // order by start; on equal starts the interval with the larger-or-equal end sorts first
      if (a.start() == b.start()) {
        return a.end() >= b.end();
      }
      return a.start() < b.start();
    }
  };
  this.subIterators = subIterators.toArray(new IntervalIterator[count]);
}
/**
 * Builds the term queue for an indexed document, tracking term frequencies per field.
 * <p>
 * For every name in {@code fieldNames} the term vector of the document is used when present;
 * otherwise the string values of the document's fields with that name are analyzed instead.
 *
 * @param docNum the id of the lucene document from which to find terms
 * @return the queue produced by {@code createQueue} for the collected term frequencies
 */
private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
  final Map<String, Map<String, Int>> termFreqsByField = new HashMap<>();
  for (String fieldName : fieldNames) {
    final Fields vectors = ir.getTermVectors(docNum);
    final Terms vector = (vectors == null) ? null : vectors.terms(fieldName);

    if (vector != null) {
      addTermFrequencies(termFreqsByField, vector, fieldName);
      continue;
    }

    // field does not store term vector info: fall back to the field's string values
    final Document doc = ir.document(docNum);
    for (IndexableField field : doc.getFields(fieldName)) {
      final String text = field.stringValue();
      if (text == null) {
        continue;
      }
      addTermFrequencies(new StringReader(text), termFreqsByField, fieldName);
    }
  }
  return createQueue(termFreqsByField);
}
/**
 * Returns at most {@code maxQueryTerms} words popped from the term queue built for a document.
 *
 * @param docNum id of the document handed to {@code retrieveTerms(int)}
 * @return the words, in the order they were popped from the queue
 * @see #retrieveInterestingTerms(java.io.Reader, String)
 */
public String[] retrieveInterestingTerms(int docNum) throws IOException {
  final ArrayList<String> words = new ArrayList<>(maxQueryTerms);
  final PriorityQueue<ScoreTerm> terms = retrieveTerms(docNum);
  // The queue may hold more entries than the caller asked for, so cap the result here.
  int remaining = maxQueryTerms;
  ScoreTerm next;
  while (((next = terms.pop()) != null) && remaining-- > 0) {
    words.add(next.word); // only the word itself is of interest to the caller
  }
  return words.toArray(new String[words.size()]);
}
/**
 * Convenience routine returning the most interesting words of a document supplied as a reader.
 * Callers needing the full scoring details can use
 * {@link #retrieveTerms(Reader, String) retrieveTerms()} instead.
 * <p>
 * At most {@code maxQueryTerms} words are returned, in the order they are popped from the queue.
 *
 * @param r the source document
 * @param fieldName field passed to analyzer to use when analyzing the content
 * @return the most interesting words in the document
 * @see #retrieveTerms(java.io.Reader, String)
 * @see #setMaxQueryTerms
 */
public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
  final ArrayList<String> words = new ArrayList<>(maxQueryTerms);
  final PriorityQueue<ScoreTerm> terms = retrieveTerms(r, fieldName);
  // The queue may hold more entries than the caller asked for, so cap the result here.
  int remaining = maxQueryTerms;
  ScoreTerm next;
  while (((next = terms.pop()) != null) && remaining-- > 0) {
    words.add(next.word); // only the word itself is of interest to the caller
  }
  return words.toArray(new String[words.size()]);
}
/**
 * Empties the queue into a list, preserving the order in which entries are popped.
 *
 * @param queue the queue to drain; it is empty when this method returns
 * @return a new list holding every popped entry
 */
private static List<TermAndFreq> queueToList(PriorityQueue<TermAndFreq> queue) {
  final List<TermAndFreq> drained = new ArrayList<>();
  while (queue.size() > 0) {
    final TermAndFreq next = queue.pop();
    drained.add(next);
  }
  return drained;
}
/**
 * Creates the two-phase view over the given approximation.
 * <p>
 * The queue of unverified matches is sized to the enclosing scorer's number of sub-scorers
 * and is ordered by ascending {@code matchCost}.
 *
 * @param approximation passed to the superclass constructor
 * @param matchCost     stored in the {@code matchCost} field
 */
private TwoPhase(DocIdSetIterator approximation, float matchCost) {
  super(approximation);
  this.matchCost = matchCost;
  final int capacity = DisjunctionScorer.this.subScorers.size();
  unverifiedMatches = new PriorityQueue<DisiWrapper>(capacity) {
    @Override
    protected boolean lessThan(DisiWrapper a, DisiWrapper b) {
      return b.matchCost > a.matchCost;
    }
  };
}
/**
 * Creates the union over the given postings enums.
 * <p>
 * Each enum is wrapped in a {@code PostingsAndPosition}; the position queue has room for all
 * of them and is ordered by ascending {@code pos}.
 *
 * @param subs the postings enums to merge; also handed to the superclass constructor
 */
UnionFullPostingsEnum(List<PostingsEnum> subs) {
  super(subs);
  final int capacity = subs.size();
  this.posQueue = new PriorityQueue<PostingsAndPosition>(capacity) {
    @Override
    protected boolean lessThan(PostingsAndPosition a, PostingsAndPosition b) {
      return b.pos > a.pos;
    }
  };
  this.subs = new ArrayList<>();
  for (PostingsEnum sub : subs) {
    final PostingsAndPosition wrapped = new PostingsAndPosition(sub);
    this.subs.add(wrapped);
  }
}
/**
 * Creates the disjunction over the given match iterators.
 * <p>
 * Each iterator is advanced once with {@code next()}; only those that report a match are
 * queued. The queue orders by start position, then by end position, and treats fully equal
 * positions as "less than" as well.
 *
 * @param matches the iterators to combine
 * @throws IOException if advancing one of the iterators fails
 */
private DisjunctionMatchesIterator(List<MatchesIterator> matches) throws IOException {
  queue = new PriorityQueue<MatchesIterator>(matches.size()){
    @Override
    protected boolean lessThan(MatchesIterator a, MatchesIterator b) {
      if (a.startPosition() == b.startPosition()) {
        // same start: smaller or equal end wins (equal positions also count as less-than)
        return a.endPosition() <= b.endPosition();
      }
      return a.startPosition() < b.startPosition();
    }
  };
  for (MatchesIterator candidate : matches) {
    if (!candidate.next()) {
      continue;
    }
    queue.add(candidate);
  }
}
/**
 * Builds a new boolean query from the entries of the given queue.
 *
 * @param q queue of scored terms, handed to {@code addToQuery} together with the new query
 * @return the newly created query
 */
private Query createQuery(PriorityQueue<ScoreTerm> q) {
  final BooleanQuery combined = new BooleanQuery();
  addToQuery(q, combined);
  return combined;
}
/**
 * Create a PriorityQueue from a word-&gt;tf map.
 * <p>
 * A word is skipped when its term frequency is below {@code minTermFreq}, or when its largest
 * document frequency across {@code fieldNames} is below {@code minDocFreq}, above
 * {@code maxDocFreq}, or zero. Remaining words are scored as {@code tf * idf}. The queue holds
 * at most {@code min(maxQueryTerms, words.size())} entries; once full, its top entry is
 * overwritten in place whenever a higher-scoring word is found.
 *
 * @param words a map of words keyed on the word(String) with Int objects as the values.
 * @param fieldNames an array of field names to override defaults.
 * @return the queue of scored terms
 * @throws IOException if reading document frequencies from the index fails
 */
private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words, String... fieldNames) throws IOException {
  // have collected all words in doc and their freqs
  int numDocs = ir.numDocs();
  final int limit = Math.min(maxQueryTerms, words.size());
  FreqQ queue = new FreqQ(limit); // will order words by score
  // Iterate entries rather than keys to avoid a second map lookup per word.
  for (Map.Entry<String, Int> entry : words.entrySet()) { // for every word
    String word = entry.getKey();
    int tf = entry.getValue().x; // term freq in the source doc
    if (minTermFreq > 0 && tf < minTermFreq) {
      continue; // filter out words that don't occur enough times in the source
    }

    // go through all the fields and find the largest document frequency
    String topField = fieldNames[0];
    int docFreq = 0;
    for (String fieldName : fieldNames) {
      int freq = ir.docFreq(new Term(fieldName, word));
      if (freq > docFreq) {
        topField = fieldName;
        docFreq = freq;
      }
    }

    if (minDocFreq > 0 && docFreq < minDocFreq) {
      continue; // filter out words that don't occur in enough docs
    }
    if (docFreq > maxDocFreq) {
      continue; // filter out words that occur in too many docs
    }
    if (docFreq == 0) {
      continue; // index update problem?
    }

    float idf = similarity.idf(docFreq, numDocs);
    float score = tf * idf;
    if (queue.size() < limit) {
      // there is still space in the queue
      queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf));
    } else {
      ScoreTerm term = queue.top();
      if (term.score < score) { // update the smallest in the queue in place and update the queue.
        term.update(word, topField, score, idf, docFreq, tf);
        queue.updateTop();
      }
    }
  }
  return queue;
}
/**
 * Builds the queue of scored terms from per-field term frequencies.
 * <p>
 * A word is skipped when its term frequency is below {@code minTermFreq}, or when its document
 * frequency in the field is below {@code minDocFreq}, above {@code maxDocFreq}, or zero.
 * Remaining words are scored as {@code tf * idf}, where the idf uses the field's document count
 * (falling back to {@code ir.numDocs()} when the count is reported as -1). Once the queue holds
 * {@code limit} entries, its top entry is overwritten in place by any higher-scoring word.
 *
 * @param perFieldTermFrequencies a per field map of words keyed on the word(String) with Int objects as the values.
 * @return the queue of scored terms
 */
private PriorityQueue<ScoreTerm> createQueue(Map<String, Map<String, Int>> perFieldTermFrequencies) throws IOException {
  final int limit = Math.min(maxQueryTerms, this.getTermsCount(perFieldTermFrequencies));
  final FreqQ queue = new FreqQ(limit); // will order words by score
  for (Map.Entry<String, Map<String, Int>> fieldEntry : perFieldTermFrequencies.entrySet()) {
    final Map<String, Int> wordFrequencies = fieldEntry.getValue();
    final String fieldName = fieldEntry.getKey();

    long numDocs = ir.getDocCount(fieldName);
    if (numDocs == -1) {
      numDocs = ir.numDocs();
    }

    for (Map.Entry<String, Int> wordEntry : wordFrequencies.entrySet()) {
      final String word = wordEntry.getKey();
      final int tf = wordEntry.getValue().x; // term freq in the source doc
      if (minTermFreq > 0 && tf < minTermFreq) {
        continue; // not frequent enough in the source
      }

      final int docFreq = ir.docFreq(new Term(fieldName, word));
      final boolean inTooFewDocs = minDocFreq > 0 && docFreq < minDocFreq;
      final boolean inTooManyDocs = docFreq > maxDocFreq;
      // docFreq == 0 may point at an index update problem
      if (inTooFewDocs || inTooManyDocs || docFreq == 0) {
        continue;
      }

      final float idf = similarity.idf(docFreq, numDocs);
      final float score = tf * idf;

      if (queue.size() >= limit) {
        // queue is full: recycle the top entry if the new word scores higher
        final ScoreTerm head = queue.top();
        if (head.score < score) {
          head.update(word, fieldName, score, idf, docFreq, tf);
          queue.updateTop();
        }
      } else {
        // there is still space in the queue
        queue.add(new ScoreTerm(word, fieldName, score, idf, docFreq, tf));
      }
    }
  }
  return queue;
}
/**
 * Creates a collector that uses the given priority queue.
 *
 * @param pq the queue stored in this collector's {@code pq} field
 */
protected TopDocsCollector(PriorityQueue<T> pq) {
this.pq = pq;
}
/**
 * Forwards the given queue unchanged to the superclass constructor.
 *
 * @param pq the queue of {@code ScoreDoc} entries to hand to the superclass
 */
protected JustCompileTopDocsCollector(PriorityQueue<ScoreDoc> pq) {
super(pq);
}
/**
 * Forwards the given queue to the superclass constructor.
 * The parameter is a raw {@code PriorityQueue}, hence the suppressed rawtypes/unchecked warnings.
 *
 * @param pq the (raw) queue to hand to the superclass
 */
@SuppressWarnings({"unchecked"})
public TestCollector(@SuppressWarnings({"rawtypes"})PriorityQueue pq) {
super(pq);
}
/**
 * Forwards the given queue to the superclass constructor.
 * The parameter is a raw {@code PriorityQueue}, hence the suppressed rawtypes/unchecked warnings.
 *
 * @param pq the (raw) queue to hand to the superclass
 */
@SuppressWarnings({"unchecked"})
public TestCollector1(@SuppressWarnings({"rawtypes"})PriorityQueue pq) {
super(pq);
}
/**
 * Create a PriorityQueue from a word-&gt;tf map, using this instance's {@code fieldNames}.
 * Delegates to the overload that takes the field names explicitly.
 *
 * @param words a map of words keyed on the word(String) with Int objects as the values.
 * @return the queue returned by the delegate
 * @throws IOException propagated from the delegate
 */
private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words) throws IOException {
return createQueue(words, this.fieldNames);
}
/**
 * Find words for a more-like-this query former.
 * <p>
 * The content of the reader is analyzed through {@code addTermFrequencies} into a
 * word-to-frequency map, and that map is turned into a queue of {@code ScoreTerm} entries by
 * {@code createQueue}.
 * <p>
 * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
 * <p>
 * NOTE(review): earlier documentation described the result as arrays of six elements returned
 * best-first; the method returns {@code ScoreTerm} objects, and the pop order is whatever the
 * queue built by {@code createQueue} defines - confirm before relying on it.
 *
 * @param r the reader that has the content of the document
 * @param fieldName field passed to the analyzer to use when analyzing the content
 * @return the queue of scored terms for the document
 * @see #retrieveInterestingTerms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(Reader r, String fieldName) throws IOException {
  final Map<String, Int> termFrequencies = new HashMap<>();
  addTermFrequencies(r, termFrequencies, fieldName);
  return createQueue(termFrequencies);
}
/**
 * Find words for a more-like-this query former.
 * <p>
 * The content of the reader is analyzed through {@code addTermFrequencies} into a per-field
 * word-to-frequency map, and that map is turned into a queue of {@code ScoreTerm} entries by
 * {@code createQueue}.
 * <p>
 * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
 * <p>
 * NOTE(review): earlier documentation described the result as arrays of six elements returned
 * best-first; the method returns {@code ScoreTerm} objects, and the pop order is whatever the
 * queue built by {@code createQueue} defines - confirm before relying on it.
 *
 * @param r the reader that has the content of the document
 * @param fieldName field passed to the analyzer to use when analyzing the content
 * @return the queue of scored terms for the document
 * @see #retrieveInterestingTerms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(Reader r, String fieldName) throws IOException {
  final Map<String, Map<String, Int>> termFreqsByField = new HashMap<>();
  addTermFrequencies(r, termFreqsByField, fieldName);
  return createQueue(termFreqsByField);
}