org.apache.lucene.util.PriorityQueue Source Code Examples

Listed below are example usages of org.apache.lucene.util.PriorityQueue collected from open-source projects; the original sources can be viewed on GitHub.
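
All of the examples below revolve around the same abstract class: you subclass org.apache.lucene.util.PriorityQueue, define the ordering by overriding lessThan (return true when a is less competitive than b, so top() always holds the weakest entry), and then work with add, insertWithOverflow, top, updateTop, and pop. As a quick orientation before the real-world examples, here is a minimal self-contained sketch; it assumes lucene-core on the classpath, and the class name PriorityQueueSketch is purely illustrative.

import org.apache.lucene.util.PriorityQueue;

public class PriorityQueueSketch {
  public static void main(String[] args) {
    // A bounded queue of size 3; lessThan returns true when a is less
    // competitive than b, so top() is always the weakest (smallest) entry.
    PriorityQueue<Integer> topScores = new PriorityQueue<Integer>(3) {
      @Override
      protected boolean lessThan(Integer a, Integer b) {
        return a < b;
      }
    };
    for (int score : new int[] {5, 1, 9, 7, 3}) {
      // Once full, insertWithOverflow evicts the weakest entry and
      // returns it (or returns the rejected candidate itself).
      topScores.insertWithOverflow(score);
    }
    // pop() drains weakest-first: prints 5, then 7, then 9.
    while (topScores.size() > 0) {
      System.out.println(topScores.pop());
    }
  }
}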

Example 1  Project: Elasticsearch  File: CandidateScorer.java
private void updateTop(CandidateSet[] candidates, Candidate[] path, PriorityQueue<Correction> corrections, double cutoffScore, double score)
        throws IOException {
    score = Math.exp(score);
    assert Math.abs(score - score(path, candidates)) < 0.00001;
    if (score > cutoffScore) {
        if (corrections.size() < maxNumCorrections) {
            Candidate[] c = new Candidate[candidates.length];
            System.arraycopy(path, 0, c, 0, path.length);
            corrections.add(new Correction(score, c));
        } else if (corrections.top().compareTo(score, path) < 0) {
            Correction top = corrections.top();
            System.arraycopy(path, 0, top.candidates, 0, path.length);
            top.score = score;
            corrections.updateTop();
        }
    }
}
 
Example 2  Project: Elasticsearch  File: XMoreLikeThis.java
/**
 * Add to an existing boolean query the More Like This query from this PriorityQueue
 */
private void addToQuery(PriorityQueue<ScoreTerm> q, BooleanQuery query) {
    ScoreTerm scoreTerm;
    float bestScore = -1;

    while ((scoreTerm = q.pop()) != null) {
        TermQuery tq = new TermQuery(new Term(scoreTerm.topField, scoreTerm.word));

        if (boost) {
            if (bestScore == -1) {
                bestScore = (scoreTerm.score);
            }
            float myScore = (scoreTerm.score);
            tq.setBoost(boostFactor * myScore / bestScore);
        }

        try {
            query.add(tq, BooleanClause.Occur.SHOULD);
        }
        catch (BooleanQuery.TooManyClauses ignore) {
            break;
        }
    }
}
 
Example 3  Project: lucene-solr  File: MinimumShouldMatchIntervalsSource.java
MinimumShouldMatchIntervalIterator(Collection<IntervalIterator> subs, int minShouldMatch) {
  this.disiQueue = new DisiPriorityQueue(subs.size());
  float mc = 0;
  for (IntervalIterator it : subs) {
    this.disiQueue.add(new DisiWrapper(it));
    mc += it.matchCost();
  }
  this.approximation = new DisjunctionDISIApproximation(disiQueue);
  this.matchCost = mc;
  this.minShouldMatch = minShouldMatch;

  this.proximityQueue = new PriorityQueue<IntervalIterator>(minShouldMatch) {
    @Override
    protected boolean lessThan(IntervalIterator a, IntervalIterator b) {
      return a.start() < b.start() || (a.start() == b.start() && a.end() >= b.end());
    }
  };
  this.backgroundQueue = new PriorityQueue<IntervalIterator>(subs.size()) {
    @Override
    protected boolean lessThan(IntervalIterator a, IntervalIterator b) {
      return a.end() < b.end() || (a.end() == b.end() && a.start() >= b.start());
    }
  };
}
 
Example 4  Project: lucene-solr  File: MoreLikeThis.java
private PriorityQueue<ScoreTerm> retrieveTerms(Map<String, Collection<Object>> field2fieldValues) throws
    IOException {
  Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
  for (String fieldName : fieldNames) {
    Collection<Object> fieldValues = field2fieldValues.get(fieldName);
    if (fieldValues == null)
      continue;
    for (Object fieldValue : fieldValues) {
      if (fieldValue != null) {
        addTermFrequencies(new StringReader(String.valueOf(fieldValue)), field2termFreqMap,
            fieldName);
      }
    }
  }
  return createQueue(field2termFreqMap);
}
 
Example 5  Project: lucene-solr  File: MinShouldMatchSumScorer.java
static long cost(LongStream costs, int numScorers, int minShouldMatch) {
  // the idea here is the following: a boolean query c1,c2,...cn with minShouldMatch=m
  // could be rewritten to:
  // (c1 AND (c2..cn|msm=m-1)) OR (!c1 AND (c2..cn|msm=m))
  // if we assume that clauses come in ascending cost, then
  // the cost of the first part is the cost of c1 (because the cost of a conjunction is
  // the cost of the least costly clause)
  // the cost of the second part is the cost of finding m matches among the c2...cn
  // remaining clauses
  // since it is a disjunction overall, the total cost is the sum of the costs of these
  // two parts

  // If we recurse infinitely, we find out that the cost of a msm query is the sum of the
  // costs of the num_scorers - minShouldMatch + 1 least costly scorers
  final PriorityQueue<Long> pq = new PriorityQueue<Long>(numScorers - minShouldMatch + 1) {
    @Override
    protected boolean lessThan(Long a, Long b) {
      return a > b;
    }
  };
  costs.forEach(pq::insertWithOverflow);
  return StreamSupport.stream(pq.spliterator(), false).mapToLong(Number::longValue).sum();
}
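
To make the comment above concrete: with five clauses of cost {5, 3, 8, 2, 7} and minShouldMatch = 3, the queue is bounded to 5 - 3 + 1 = 3 entries, and because lessThan is inverted (a > b) insertWithOverflow evicts the most expensive entries, leaving {2, 3, 5} for a total cost of 10. Below is a standalone sketch of that computation; the class and method names are illustrative, while the PriorityQueue calls mirror the Lucene code above.

import org.apache.lucene.util.PriorityQueue;

public class MsmCostSketch {
  static long cheapestSum(long[] costs, int minShouldMatch) {
    int keep = costs.length - minShouldMatch + 1;
    // Inverting lessThan turns this into a bounded max-heap:
    // insertWithOverflow evicts the most expensive entry, so the
    // queue retains the `keep` cheapest costs.
    PriorityQueue<Long> pq = new PriorityQueue<Long>(keep) {
      @Override
      protected boolean lessThan(Long a, Long b) {
        return a > b;
      }
    };
    for (long c : costs) {
      pq.insertWithOverflow(c);
    }
    long sum = 0;
    while (pq.size() > 0) {
      sum += pq.pop();
    }
    return sum;
  }

  public static void main(String[] args) {
    System.out.println(cheapestSum(new long[] {5, 3, 8, 2, 7}, 3)); // prints 10
  }
}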
 
Example 6  Project: Elasticsearch  File: CandidateScorer.java
public void findCandidates(CandidateSet[] candidates, Candidate[] path, int ord, int numMissspellingsLeft,
        PriorityQueue<Correction> corrections, double cutoffScore, final double pathScore) throws IOException {
    CandidateSet current = candidates[ord];
    if (ord == candidates.length - 1) {
        path[ord] = current.originalTerm;
        updateTop(candidates, path, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
        if (numMissspellingsLeft > 0) {
            for (int i = 0; i < current.candidates.length; i++) {
                path[ord] = current.candidates[i];
                updateTop(candidates, path, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
            }
        }
    } else {
        if (numMissspellingsLeft > 0) {
            path[ord] = current.originalTerm;
            findCandidates(candidates, path, ord + 1, numMissspellingsLeft, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
            for (int i = 0; i < current.candidates.length; i++) {
                path[ord] = current.candidates[i];
                findCandidates(candidates, path, ord + 1, numMissspellingsLeft - 1, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
            }
        } else {
            path[ord] = current.originalTerm;
            findCandidates(candidates, path, ord + 1, 0, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
        }
    }

}
 
Example 7  Project: Elasticsearch  File: XMoreLikeThis.java
/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
    Map<String, Int> termFreqMap = new HashMap<>();
    for (String fieldName : fieldNames) {
        final Fields vectors = ir.getTermVectors(docNum);
        final Terms vector;
        if (vectors != null) {
            vector = vectors.terms(fieldName);
        } else {
            vector = null;
        }

        // field does not store term vector info
        if (vector == null) {
            Document d = ir.document(docNum);
            IndexableField[] fields = d.getFields(fieldName);
            for (IndexableField field : fields) {
                final String stringValue = field.stringValue();
                if (stringValue != null) {
                    addTermFrequencies(new FastStringReader(stringValue), termFreqMap, fieldName);
                }
            }
        } else {
            addTermFrequencies(termFreqMap, vector, fieldName);
        }
    }

    return createQueue(termFreqMap);
}
 
Example 8  Project: Elasticsearch  File: XMoreLikeThis.java
/**
 * @see #retrieveInterestingTerms(java.io.Reader, String)
 */
public String[] retrieveInterestingTerms(int docNum) throws IOException {
    ArrayList<String> al = new ArrayList<>(maxQueryTerms);
    PriorityQueue<ScoreTerm> pq = retrieveTerms(docNum);
    ScoreTerm scoreTerm;
    int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
    // we just want to return the top words
    while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
        al.add(scoreTerm.word); // the 1st entry is the interesting word
    }
    String[] res = new String[al.size()];
    return al.toArray(res);
}
 
Example 9  Project: Elasticsearch  File: XMoreLikeThis.java
/**
 * Convenience routine to make it easy to return the most interesting words in a document.
 * More advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly.
 *
 * @param r the source document
 * @param fieldName field passed to analyzer to use when analyzing the content
 * @return the most interesting words in the document
 * @see #retrieveTerms(java.io.Reader, String)
 * @see #setMaxQueryTerms
 */
public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
    ArrayList<String> al = new ArrayList<>(maxQueryTerms);
    PriorityQueue<ScoreTerm> pq = retrieveTerms(r, fieldName);
    ScoreTerm scoreTerm;
    int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
    // we just want to return the top words
    while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
        al.add(scoreTerm.word); // the 1st entry is the interesting word
    }
    String[] res = new String[al.size()];
    return al.toArray(res);
}
 
Example 10  Project: lucene-solr  File: QualityQueriesFinder.java
private String [] bestTerms(String field,int numTerms) throws IOException {
  PriorityQueue<TermDf> pq = new TermsDfQueue(numTerms);
  IndexReader ir = DirectoryReader.open(dir);
  try {
    int threshold = ir.maxDoc() / 10; // ignore words too common.
    Terms terms = MultiTerms.getTerms(ir, field);
    if (terms != null) {
      TermsEnum termsEnum = terms.iterator();
      while (termsEnum.next() != null) {
        int df = termsEnum.docFreq();
        if (df < threshold) {
          String ttxt = termsEnum.term().utf8ToString();
          pq.insertWithOverflow(new TermDf(ttxt, df));
        }
      }
    }
  } finally {
    ir.close();
  }
  String[] res = new String[pq.size()];
  int i = 0;
  while (pq.size() > 0) {
    TermDf tdf = pq.pop();
    res[i++] = tdf.word;
    System.out.println(i + ".   word:  " + tdf.df + "   " + tdf.word);
  }
  return res;
}
 
Example 11  Project: lucene-solr  File: UnorderedIntervalsSource.java
UnorderedIntervalIterator(List<IntervalIterator> subIterators) {
  super(subIterators);
  this.queue = new PriorityQueue<IntervalIterator>(subIterators.size()) {
    @Override
    protected boolean lessThan(IntervalIterator a, IntervalIterator b) {
      return a.start() < b.start() || (a.start() == b.start() && a.end() >= b.end());
    }
  };
  this.subIterators = new IntervalIterator[subIterators.size()];

  for (int i = 0; i < subIterators.size(); i++) {
    this.subIterators[i] = subIterators.get(i);
  }
}
 
Example 12  Project: lucene-solr  File: MoreLikeThis.java
/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
  Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
  for (String fieldName : fieldNames) {
    final Fields vectors = ir.getTermVectors(docNum);
    final Terms vector;
    if (vectors != null) {
      vector = vectors.terms(fieldName);
    } else {
      vector = null;
    }

    // field does not store term vector info
    if (vector == null) {
      Document d = ir.document(docNum);
      IndexableField[] fields = d.getFields(fieldName);
      for (IndexableField field : fields) {
        final String stringValue = field.stringValue();
        if (stringValue != null) {
          addTermFrequencies(new StringReader(stringValue), field2termFreqMap, fieldName);
        }
      }
    } else {
      addTermFrequencies(field2termFreqMap, vector, fieldName);
    }
  }

  return createQueue(field2termFreqMap);
}
 
Example 13  Project: lucene-solr  File: MoreLikeThis.java
/**
 * @see #retrieveInterestingTerms(java.io.Reader, String)
 */
public String[] retrieveInterestingTerms(int docNum) throws IOException {
  ArrayList<String> al = new ArrayList<>(maxQueryTerms);
  PriorityQueue<ScoreTerm> pq = retrieveTerms(docNum);
  ScoreTerm scoreTerm;
  int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
  // we just want to return the top words
  while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
    al.add(scoreTerm.word); // the 1st entry is the interesting word
  }
  String[] res = new String[al.size()];
  return al.toArray(res);
}
 
Example 14  Project: lucene-solr  File: MoreLikeThis.java
/**
 * Convenience routine to make it easy to return the most interesting words in a document.
 * More advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly.
 *
 * @param r the source document
 * @param fieldName field passed to analyzer to use when analyzing the content
 * @return the most interesting words in the document
 * @see #retrieveTerms(java.io.Reader, String)
 * @see #setMaxQueryTerms
 */
public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
  ArrayList<String> al = new ArrayList<>(maxQueryTerms);
  PriorityQueue<ScoreTerm> pq = retrieveTerms(r, fieldName);
  ScoreTerm scoreTerm;
  int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
  // we just want to return the top words
  while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
    al.add(scoreTerm.word); // the 1st entry is the interesting word
  }
  String[] res = new String[al.size()];
  return al.toArray(res);
}
 
Example 15  Project: lucene-solr  File: CommonTermsQueryTest.java
private static List<TermAndFreq> queueToList(PriorityQueue<TermAndFreq> queue) {
  List<TermAndFreq> terms = new ArrayList<>();
  while (queue.size() > 0) {
    terms.add(queue.pop());
  }
  return terms;
}
 
Example 16  Project: lucene-solr  File: DisjunctionScorer.java
private TwoPhase(DocIdSetIterator approximation, float matchCost) {
  super(approximation);
  this.matchCost = matchCost;
  unverifiedMatches = new PriorityQueue<DisiWrapper>(DisjunctionScorer.this.subScorers.size()) {
    @Override
    protected boolean lessThan(DisiWrapper a, DisiWrapper b) {
      return a.matchCost < b.matchCost;
    }
  };
}
 
Example 17  Project: lucene-solr  File: MultiPhraseQuery.java
UnionFullPostingsEnum(List<PostingsEnum> subs) {
  super(subs);
  this.posQueue = new PriorityQueue<PostingsAndPosition>(subs.size()) {
    @Override
    protected boolean lessThan(PostingsAndPosition a, PostingsAndPosition b) {
      return a.pos < b.pos;
    }
  };
  this.subs = new ArrayList<>();
  for (PostingsEnum pe : subs) {
    this.subs.add(new PostingsAndPosition(pe));
  }
}
 
Example 18  Project: lucene-solr  File: DisjunctionMatchesIterator.java
private DisjunctionMatchesIterator(List<MatchesIterator> matches) throws IOException {
  queue = new PriorityQueue<MatchesIterator>(matches.size()){
    @Override
    protected boolean lessThan(MatchesIterator a, MatchesIterator b) {
      return a.startPosition() < b.startPosition() ||
          (a.startPosition() == b.startPosition() && a.endPosition() < b.endPosition()) ||
          (a.startPosition() == b.startPosition() && a.endPosition() == b.endPosition());
    }
  };
  for (MatchesIterator mi : matches) {
    if (mi.next()) {
      queue.add(mi);
    }
  }
}
 
Example 19  Project: Elasticsearch  File: XMoreLikeThis.java
/**
 * Create the More like query from a PriorityQueue
 */
private Query createQuery(PriorityQueue<ScoreTerm> q) {
    BooleanQuery query = new BooleanQuery();
    addToQuery(q, query);
    return query;
}
 
Example 20  Project: Elasticsearch  File: XMoreLikeThis.java
/**
 * Create a PriorityQueue from a word-&gt;tf map.
 *
 * @param words a map of words keyed on the word(String) with Int objects as the values.
 * @param fieldNames an array of field names to override defaults.
 */
private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words, String... fieldNames) throws IOException {
    // have collected all words in doc and their freqs
    int numDocs = ir.numDocs();
    final int limit = Math.min(maxQueryTerms, words.size());
    FreqQ queue = new FreqQ(limit); // will order words by score

    for (String word : words.keySet()) { // for every word
        int tf = words.get(word).x; // term freq in the source doc
        if (minTermFreq > 0 && tf < minTermFreq) {
            continue; // filter out words that don't occur enough times in the source
        }

        // go through all the fields and find the largest document frequency
        String topField = fieldNames[0];
        int docFreq = 0;
        for (String fieldName : fieldNames) {
            int freq = ir.docFreq(new Term(fieldName, word));
            topField = (freq > docFreq) ? fieldName : topField;
            docFreq = (freq > docFreq) ? freq : docFreq;
        }

        if (minDocFreq > 0 && docFreq < minDocFreq) {
            continue; // filter out words that don't occur in enough docs
        }

        if (docFreq > maxDocFreq) {
            continue; // filter out words that occur in too many docs
        }

        if (docFreq == 0) {
            continue; // index update problem?
        }

        float idf = similarity.idf(docFreq, numDocs);
        float score = tf * idf;

        if (queue.size() < limit) {
            // there is still space in the queue
            queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf));
        } else {
            ScoreTerm term = queue.top();
            if (term.score < score) { // update the smallest in the queue in place and update the queue.
                term.update(word, topField, score, idf, docFreq, tf);
                queue.updateTop();
            }
        }
    }
    return queue;
}
 
Example 21  Project: lucene-solr  File: MoreLikeThis.java
/**
 * Create a PriorityQueue from a word-&gt;tf map.
 *
 * @param perFieldTermFrequencies a per field map of words keyed on the word(String) with Int objects as the values.
 */
private PriorityQueue<ScoreTerm> createQueue(Map<String, Map<String, Int>> perFieldTermFrequencies) throws IOException {
  // have collected all words in doc and their freqs
  final int limit = Math.min(maxQueryTerms, this.getTermsCount(perFieldTermFrequencies));
  FreqQ queue = new FreqQ(limit); // will order words by score
  for (Map.Entry<String, Map<String, Int>> entry : perFieldTermFrequencies.entrySet()) {
    Map<String, Int> perWordTermFrequencies = entry.getValue();
    String fieldName = entry.getKey();

    long numDocs = ir.getDocCount(fieldName);
    if (numDocs == -1) {
      numDocs = ir.numDocs();
    }

    for (Map.Entry<String, Int> tfEntry : perWordTermFrequencies.entrySet()) { // for every word
      String word = tfEntry.getKey();
      int tf = tfEntry.getValue().x; // term freq in the source doc
      if (minTermFreq > 0 && tf < minTermFreq) {
        continue; // filter out words that don't occur enough times in the source
      }

      int docFreq = ir.docFreq(new Term(fieldName, word));

      if (minDocFreq > 0 && docFreq < minDocFreq) {
        continue; // filter out words that don't occur in enough docs
      }

      if (docFreq > maxDocFreq) {
        continue; // filter out words that occur in too many docs
      }

      if (docFreq == 0) {
        continue; // index update problem?
      }

      float idf = similarity.idf(docFreq, numDocs);
      float score = tf * idf;

      if (queue.size() < limit) {
        // there is still space in the queue
        queue.add(new ScoreTerm(word, fieldName, score, idf, docFreq, tf));
      } else {
        ScoreTerm term = queue.top();
        if (term.score < score) { // update the smallest in the queue in place and update the queue.
          term.update(word, fieldName, score, idf, docFreq, tf);
          queue.updateTop();
        }
      }
    }
  }
  return queue;
}
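
Both createQueue variants (Examples 20 and 21) use the same space-bounded top-K idiom: while the queue has room, allocate and add; once it is full, compare each candidate against top(), the weakest entry, and instead of allocating a new object, mutate the weakest entry in place and call updateTop() to restore heap order. That saves one allocation per rejected candidate, which adds up when every term of a large document is scored. Here is a stripped-down sketch of the idiom; the mutable Box holder is hypothetical (standing in for ScoreTerm), while the queue calls are the same Lucene API used above.

import org.apache.lucene.util.PriorityQueue;

public class UpdateTopSketch {
  // Hypothetical mutable holder, standing in for ScoreTerm.
  static final class Box {
    float score;
    Box(float score) { this.score = score; }
  }

  public static void main(String[] args) {
    PriorityQueue<Box> queue = new PriorityQueue<Box>(2) {
      @Override
      protected boolean lessThan(Box a, Box b) {
        return a.score < b.score; // weakest score sits at top()
      }
    };
    for (float score : new float[] {0.4f, 0.9f, 0.7f, 0.1f}) {
      if (queue.size() < 2) {
        queue.add(new Box(score));   // still room: allocate and add
      } else if (queue.top().score < score) {
        queue.top().score = score;   // reuse the weakest entry in place
        queue.updateTop();           // re-heapify after the mutation
      }
    }
    while (queue.size() > 0) {
      System.out.println(queue.pop().score); // prints 0.7, then 0.9
    }
  }
}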
 
Example 22  Project: lucene-solr  File: TopDocsCollector.java
protected TopDocsCollector(PriorityQueue<T> pq) {
  this.pq = pq;
}
 
Example 23  Project: lucene-solr  File: JustCompileSearch.java
protected JustCompileTopDocsCollector(PriorityQueue<ScoreDoc> pq) {
  super(pq);
}
 
Example 24  Project: lucene-solr  File: RankQueryTestPlugin.java
@SuppressWarnings({"unchecked"})
public TestCollector(@SuppressWarnings({"rawtypes"})PriorityQueue pq) {
  super(pq);
}
 
Example 25  Project: lucene-solr  File: RankQueryTestPlugin.java
@SuppressWarnings({"unchecked"})
public TestCollector1(@SuppressWarnings({"rawtypes"})PriorityQueue pq) {
  super(pq);
}
 
Example 26  Project: Elasticsearch  File: XMoreLikeThis.java
/**
 * Create a PriorityQueue from a word-&gt;tf map.
 *
 * @param words a map of words keyed on the word(String) with Int objects as the values.
 */
private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words) throws IOException {
    return createQueue(words, this.fieldNames);
}
 
Example 27  Project: Elasticsearch  File: XMoreLikeThis.java
/**
 * Find words for a more-like-this query former.
 * The result is a priority queue of arrays with one entry for <b>every word</b> in the document.
 * Each array has 6 elements.
 * The elements are:
 * <ol>
 * <li> The word (String)
 * <li> The top field that this word comes from (String)
 * <li> The score for this word (Float)
 * <li> The IDF value (Float)
 * <li> The frequency of this word in the index (Integer)
 * <li> The frequency of this word in the source document (Integer)
 * </ol>
 * This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest.
 * This method is exposed so that you can identify the "interesting words" in a document.
 * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
 *
 * @param r the reader that has the content of the document
 * @param fieldName field passed to the analyzer to use when analyzing the content
 * @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
 * @see #retrieveInterestingTerms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(Reader r, String fieldName) throws IOException {
    Map<String, Int> words = new HashMap<>();
    addTermFrequencies(r, words, fieldName);
    return createQueue(words);
}
 
Example 28  Project: lucene-solr  File: MoreLikeThis.java
/**
 * Find words for a more-like-this query former.
 * The result is a priority queue of arrays with one entry for <b>every word</b> in the document.
 * Each array has 6 elements.
 * The elements are:
 * <ol>
 * <li> The word (String)
 * <li> The top field that this word comes from (String)
 * <li> The score for this word (Float)
 * <li> The IDF value (Float)
 * <li> The frequency of this word in the index (Integer)
 * <li> The frequency of this word in the source document (Integer)
 * </ol>
 * This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest.
 * This method is exposed so that you can identify the "interesting words" in a document.
 * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
 *
 * @param r the reader that has the content of the document
 * @param fieldName field passed to the analyzer to use when analyzing the content
 * @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
 * @see #retrieveInterestingTerms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(Reader r, String fieldName) throws IOException {
  Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
  addTermFrequencies(r, field2termFreqMap, fieldName);
  return createQueue(field2termFreqMap);
}