Listed below are example usages of org.apache.lucene.index.IndexReader#docFreq(), drawn from open-source projects.
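Before the project examples, a minimal self-contained sketch of the call itself: docFreq(Term) returns the number of documents containing the given term (deleted documents that have not been merged away may still be counted). The index path and the field/term values below are placeholders, not taken from any of the examples.

import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;

public class DocFreqDemo {
  public static void main(String[] args) throws IOException {
    // Open a reader over an existing index (placeholder path).
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/index")))) {
      // Number of documents in which the term "lucene" appears in field "body".
      int df = reader.docFreq(new Term("body", "lucene"));
      System.out.println("docFreq(body:lucene) = " + df);
    }
  }
}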
/**
* Extracts all terms texts of a given Query into an array of WeightedTerms
*
* @param query Query to extract term texts from
* @param reader used to compute IDF, which can be used to (a) score selected fragments better and
*          (b) produce graded highlights, e.g. by varying font color intensity
* @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
* @return an array of the terms used in a query, plus their weights.
*/
public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName)
{
  WeightedTerm[] terms = getTerms(query, false, fieldName);
  int totalNumDocs = reader.maxDoc();
  for (int i = 0; i < terms.length; i++)
  {
    try
    {
      int docFreq = reader.docFreq(new Term(fieldName, terms[i].term));
      // IDF algorithm taken from ClassicSimilarity class
      float idf = (float) (Math.log(totalNumDocs / (double) (docFreq + 1)) + 1.0);
      terms[i].weight *= idf;
    }
    catch (IOException e)
    {
      // ignore
    }
  }
  return terms;
}
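A hedged usage sketch: assuming the method above is (or behaves like) Lucene's org.apache.lucene.search.highlight.QueryTermExtractor#getIdfWeightedTerms, the query, field name, and reader below are placeholders.

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.QueryTermExtractor;
import org.apache.lucene.search.highlight.WeightedTerm;

class IdfWeightedTermsDemo {
  static void printWeights(IndexReader reader) {
    WeightedTerm[] weighted = QueryTermExtractor.getIdfWeightedTerms(
        new TermQuery(new Term("contents", "lucene")), reader, "contents");
    for (WeightedTerm wt : weighted) {
      // Rare terms (low docFreq) end up with higher IDF-scaled weights.
      System.out.println(wt.getTerm() + " -> " + wt.getWeight());
    }
  }
}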
/**
* Builds the TFIDF vector and its norm2
*
* @param tfidf
* - the vector containing for each term its TFIDF score, it will
* be populated by this method
* @param freq
* - the vector containing for each term its frequency
* @param field
* - the field on which to compute the inverse document frequency
*
* @return the norm of the TFIDF vector
*
*/
private double tfidfVector(Map<String, Double> tfidf,
    Map<String, Integer> freq, String field) {
  IndexReader reader = getReader();
  double norm = 0;
  for (Map.Entry<String, Integer> entry : freq.entrySet()) {
    Term t = new Term(field, entry.getKey());
    int df = 0;
    try {
      df = reader.docFreq(t);
    } catch (IOException e) {
      logger.error("computing tfidfVector ({}) ", e.toString());
      System.exit(-1);
    }
    // idf in base 2: log2(collectionSize / df + 1) + 1
    double idf = Math.log(collectionSize / (double) df + 1)
        / Math.log(2) + 1;
    double tfidfValue = entry.getValue() * idf;
    norm += tfidfValue * tfidfValue;
    tfidf.put(entry.getKey(), tfidfValue);
  }
  return Math.sqrt(norm);
}
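Note the operator precedence in the snippet above: the idf is log2(N/df + 1) + 1, i.e. collectionSize is divided by df before 1 is added, not N/(df + 1). A standalone restatement of the same arithmetic (class and method names are hypothetical):

class TfIdfMath {
  // idf as computed above: log2(collectionSize / df + 1) + 1
  static double idf(long collectionSize, int df) {
    return Math.log(collectionSize / (double) df + 1) / Math.log(2) + 1;
  }

  public static void main(String[] args) {
    // e.g. N = 1000 documents, df = 10 -> log2(101) + 1, roughly 7.66
    System.out.println(idf(1000, 10));
  }
}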
protected float calculateWeight(Term term, IndexReader reader) throws IOException {
  // if a term is not in the index, then its weight is 0
  int docFrequency = reader.docFreq(term);
  if (docFrequency != 0) {
    log.warn("Term {} doc freq. {}", term.toString(), docFrequency);
    return 1.0f / docFrequency;
  } else {
    log.warn("Couldn't find doc freq for term {}", term);
    return 0f;
  }
}
/**
* Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>. Uses a supplied
* <code>IndexReader</code> to properly weight terms (for gradient highlighting).
*
* <p>
*
* @param query
*          that caused hit
* @param boost
*          amount to multiply each term's weight by
* @param tokenStream
*          of text to be highlighted
* @param fieldName
*          restricts the Terms used, based on field name
* @param reader
*          to use for scoring
* @return Map of WeightedSpanTerms with quasi tf/idf scores
* @throws IOException If there is a low-level I/O error
*/
public Map<String,WeightedSpanTerm> getWeightedSpanTermsWithScores(Query query, float boost, TokenStream tokenStream, String fieldName,
    IndexReader reader) throws IOException {
  this.fieldName = fieldName;
  this.tokenStream = tokenStream;

  Map<String,WeightedSpanTerm> terms = new PositionCheckingMap<>();
  extract(query, boost, terms);

  int totalNumDocs = reader.maxDoc();
  Set<String> weightedTerms = terms.keySet();
  Iterator<String> it = weightedTerms.iterator();

  try {
    while (it.hasNext()) {
      WeightedSpanTerm weightedSpanTerm = terms.get(it.next());
      int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term));
      // IDF algorithm taken from ClassicSimilarity class
      float idf = (float) (Math.log(totalNumDocs / (double) (docFreq + 1)) + 1.0);
      weightedSpanTerm.weight *= idf;
    }
  } finally {
    IOUtils.close(internalReader);
  }
  return terms;
}
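A hedged usage sketch: assuming the method above is Lucene's org.apache.lucene.search.highlight.WeightedSpanTermExtractor#getWeightedSpanTermsWithScores; the analyzer, query, reader, and field name are placeholders.

import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.WeightedSpanTerm;
import org.apache.lucene.search.highlight.WeightedSpanTermExtractor;

class SpanTermScoresDemo {
  static void printScores(Analyzer analyzer, Query query, IndexReader reader) throws IOException {
    WeightedSpanTermExtractor extractor = new WeightedSpanTermExtractor();
    try (TokenStream ts = analyzer.tokenStream("contents", "text of the hit document")) {
      Map<String, WeightedSpanTerm> terms =
          extractor.getWeightedSpanTermsWithScores(query, 1.0f, ts, "contents", reader);
      // Weights here are the quasi tf/idf scores mentioned in the javadoc above.
      terms.forEach((term, wst) -> System.out.println(term + " -> " + wst.getWeight()));
    }
  }
}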
private SuggestWord generateSuggestWord(IndexReader ir, String fieldname, String text) throws IOException {
  Term term = new Term(fieldname, text);
  int freq = ir.docFreq(term);
  SuggestWord word = new SuggestWord();
  word.freq = freq;
  word.score = 1;
  word.string = text;
  return word;
}
/**
* Note: if you use a counting {@link Facets} implementation, you can amortize the
* sampled counts by calling this method. Uses the {@link FacetsConfig} and
* the {@link IndexSearcher} to determine the upper bound for each facet value.
*/
public FacetResult amortizeFacetCounts(FacetResult res, FacetsConfig config, IndexSearcher searcher) throws IOException {
  if (res == null || totalHits <= sampleSize) {
    return res;
  }

  LabelAndValue[] fixedLabelValues = new LabelAndValue[res.labelValues.length];
  IndexReader reader = searcher.getIndexReader();
  DimConfig dimConfig = config.getDimConfig(res.dim);

  // +2 to prepend the dimension and append the child label
  String[] childPath = new String[res.path.length + 2];
  childPath[0] = res.dim;
  System.arraycopy(res.path, 0, childPath, 1, res.path.length); // reused across iterations

  for (int i = 0; i < res.labelValues.length; i++) {
    childPath[res.path.length + 1] = res.labelValues[i].label;
    String fullPath = FacetsConfig.pathToString(childPath, childPath.length);
    int max = reader.docFreq(new Term(dimConfig.indexFieldName, fullPath));
    int correctedCount = (int) (res.labelValues[i].value.doubleValue() / samplingRate);
    correctedCount = Math.min(max, correctedCount);
    fixedLabelValues[i] = new LabelAndValue(res.labelValues[i].label, correctedCount);
  }

  // cap the total count at the number of non-deleted documents in the reader
  int correctedTotalCount = res.value.intValue();
  if (correctedTotalCount > 0) {
    correctedTotalCount = Math.min(reader.numDocs(), (int) (res.value.doubleValue() / samplingRate));
  }

  return new FacetResult(res.dim, res.path, correctedTotalCount, fixedLabelValues, res.childCount);
}
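This appears to be the amortization step of a sampling facets collector (it matches Lucene's RandomSamplingFacetsCollector, whose totalHits and sampleSize fields the method reads). A hedged end-to-end sketch under that assumption; searcher, taxoReader, config, and the "Author" dimension are placeholders.

import java.io.IOException;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.RandomSamplingFacetsCollector;
import org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;

class SampledFacetsDemo {
  static FacetResult sampledAuthorCounts(IndexSearcher searcher, TaxonomyReader taxoReader,
      FacetsConfig config) throws IOException {
    // Collect facets over a random sample of at most 10,000 hits.
    RandomSamplingFacetsCollector sampling = new RandomSamplingFacetsCollector(10_000);
    searcher.search(new MatchAllDocsQuery(), sampling);
    Facets facets = new FastTaxonomyFacetCounts(taxoReader, config, sampling);
    FacetResult sampled = facets.getTopChildren(10, "Author");
    // Scale the sampled counts back up, capping each label with docFreq as above.
    return sampling.amortizeFacetCounts(sampled, config, searcher);
  }
}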
public Explanation explain(IndexReader reader, int doc) throws IOException {
  ComplexExplanation result = new ComplexExplanation();
  result.setDescription("weight(" + getQuery() + " in " + doc + "), product of:");

  Explanation idfExpl =
      new Explanation(idf, "idf(docFreq=" + reader.docFreq(term) +
          ", numDocs=" + reader.numDocs() + ")");

  // explain query weight
  Explanation queryExpl = new Explanation();
  queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:");

  Explanation boostExpl = new Explanation(getBoost(), "boost");
  if (getBoost() != 1.0f)
    queryExpl.addDetail(boostExpl);
  queryExpl.addDetail(idfExpl);

  Explanation queryNormExpl = new Explanation(queryNorm, "queryNorm");
  queryExpl.addDetail(queryNormExpl);

  queryExpl.setValue(boostExpl.getValue() *
      idfExpl.getValue() *
      queryNormExpl.getValue());
  result.addDetail(queryExpl);

  // explain field weight
  String field = term.field();
  ComplexExplanation fieldExpl = new ComplexExplanation();
  fieldExpl.setDescription("fieldWeight(" + term + " in " + doc + "), product of:");

  Explanation tfExpl = scorer(reader).explain(doc);
  fieldExpl.addDetail(tfExpl);
  fieldExpl.addDetail(idfExpl);

  Explanation fieldNormExpl = new Explanation();
  byte[] fieldNorms = reader.norms(field);
  float fieldNorm =
      fieldNorms != null ? Similarity.decodeNorm(fieldNorms[doc]) : 0.0f;
  fieldNormExpl.setValue(fieldNorm);
  fieldNormExpl.setDescription("fieldNorm(field=" + field + ", doc=" + doc + ")");
  fieldExpl.addDetail(fieldNormExpl);

  fieldExpl.setMatch(Boolean.valueOf(tfExpl.isMatch()));
  fieldExpl.setValue(tfExpl.getValue() *
      idfExpl.getValue() *
      fieldNormExpl.getValue());
  result.addDetail(fieldExpl);
  result.setMatch(fieldExpl.getMatch());

  // combine them
  result.setValue(queryExpl.getValue() * fieldExpl.getValue());

  if (queryExpl.getValue() == 1.0f)
    return fieldExpl;
  return result;
}
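The snippet above is from an older Lucene version (ComplexExplanation and byte[] norms predate Lucene 4). The usual way to obtain such a breakdown is through IndexSearcher rather than calling the weight directly; a minimal hedged sketch with a placeholder query and document id:

import java.io.IOException;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;

class ExplainDemo {
  static void explainHit(IndexSearcher searcher, int docId) throws IOException {
    // Prints the nested scoring breakdown for this query/document pair.
    Explanation explanation = searcher.explain(new TermQuery(new Term("contents", "lucene")), docId);
    System.out.println(explanation);
  }
}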
/**
* <p>
* Generate suggestions by breaking the passed-in term into multiple words.
* The scores returned are equal to the number of word breaks needed, so a
* lower score is generally preferred over a higher one.
* </p>
*
* @param suggestMode
* - default = {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX}
* @param sortMethod
* - default =
* {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_MAX_FREQUENCY}
* @return one or more arrays of words formed by breaking up the original term
* @throws IOException If there is a low-level I/O error.
*/
public SuggestWord[][] suggestWordBreaks(Term term, int maxSuggestions,
    IndexReader ir, SuggestMode suggestMode,
    BreakSuggestionSortMethod sortMethod) throws IOException {
  if (maxSuggestions < 1) {
    return new SuggestWord[0][0];
  }
  if (suggestMode == null) {
    suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
  }
  if (sortMethod == null) {
    sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
  }

  int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions;
  Comparator<SuggestWordArrayWrapper> queueComparator =
      sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY
          ? new LengthThenMaxFreqComparator()
          : new LengthThenSumFreqComparator();
  Queue<SuggestWordArrayWrapper> suggestions = new PriorityQueue<>(
      queueInitialCapacity, queueComparator);

  int origFreq = ir.docFreq(term);
  if (origFreq > 0 && suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX) {
    return new SuggestWord[0][];
  }

  int useMinSuggestionFrequency = minSuggestionFrequency;
  if (suggestMode == SuggestMode.SUGGEST_MORE_POPULAR) {
    useMinSuggestionFrequency = (origFreq == 0 ? 1 : origFreq);
  }

  generateBreakUpSuggestions(term, ir, 1, maxSuggestions,
      useMinSuggestionFrequency, new SuggestWord[0], suggestions, 0,
      sortMethod);

  SuggestWord[][] suggestionArray = new SuggestWord[suggestions.size()][];
  for (int i = suggestions.size() - 1; i >= 0; i--) {
    suggestionArray[i] = suggestions.remove().suggestWords;
  }
  return suggestionArray;
}
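A hedged usage sketch: assuming the method above is Lucene's org.apache.lucene.search.spell.WordBreakSpellChecker#suggestWordBreaks; the field name and the unbroken input term are placeholders.

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.SuggestMode;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.search.spell.WordBreakSpellChecker;

class WordBreakDemo {
  static void printBreaks(IndexReader reader) throws IOException {
    WordBreakSpellChecker spellChecker = new WordBreakSpellChecker();
    SuggestWord[][] breaks = spellChecker.suggestWordBreaks(
        new Term("body", "luceneindex"), 5, reader,
        SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX,
        WordBreakSpellChecker.BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
    for (SuggestWord[] candidate : breaks) {
      StringBuilder sb = new StringBuilder();
      for (SuggestWord word : candidate) {
        sb.append(word.string).append(' ');
      }
      // e.g. "lucene index" if both parts pass the frequency threshold
      System.out.println(sb.toString().trim());
    }
  }
}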
private void mapOneVector(NamedList<Object> docNL, FieldOptions fieldOptions, IndexReader reader, int docID, TermsEnum termsEnum, String field) throws IOException {
  NamedList<Object> fieldNL = new NamedList<>();
  docNL.add(field, fieldNL);

  BytesRef text;
  PostingsEnum dpEnum = null;
  while ((text = termsEnum.next()) != null) {
    String term = text.utf8ToString();
    NamedList<Object> termInfo = new NamedList<>();
    fieldNL.add(term, termInfo);
    final int freq = (int) termsEnum.totalTermFreq();
    if (fieldOptions.termFreq) {
      termInfo.add("tf", freq);
    }

    int dpEnumFlags = 0;
    dpEnumFlags |= fieldOptions.positions ? PostingsEnum.POSITIONS : 0;
    // payloads require offsets
    dpEnumFlags |= (fieldOptions.offsets || fieldOptions.payloads) ? PostingsEnum.OFFSETS : 0;
    dpEnumFlags |= fieldOptions.payloads ? PostingsEnum.PAYLOADS : 0;
    dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);

    boolean atNextDoc = false;
    if (dpEnum != null) {
      dpEnum.nextDoc();
      atNextDoc = true;
    }

    if (atNextDoc && dpEnumFlags != 0) {
      NamedList<Integer> positionsNL = null;
      NamedList<Number> theOffsets = null;
      NamedList<String> thePayloads = null;

      for (int i = 0; i < freq; i++) {
        final int pos = dpEnum.nextPosition();
        if (fieldOptions.positions && pos >= 0) {
          if (positionsNL == null) {
            positionsNL = new NamedList<>();
            termInfo.add("positions", positionsNL);
          }
          positionsNL.add("position", pos);
        }

        int startOffset = fieldOptions.offsets ? dpEnum.startOffset() : -1;
        if (startOffset >= 0) {
          if (theOffsets == null) {
            theOffsets = new NamedList<>();
            termInfo.add("offsets", theOffsets);
          }
          theOffsets.add("start", dpEnum.startOffset());
          theOffsets.add("end", dpEnum.endOffset());
        }

        BytesRef payload = fieldOptions.payloads ? dpEnum.getPayload() : null;
        if (payload != null) {
          if (thePayloads == null) {
            thePayloads = new NamedList<>();
            termInfo.add("payloads", thePayloads);
          }
          thePayloads.add("payload", Base64.byteArrayToBase64(payload.bytes, payload.offset, payload.length));
        }
      }
    }

    int df = 0;
    if (fieldOptions.docFreq || fieldOptions.tfIdf) {
      df = reader.docFreq(new Term(field, text));
    }
    if (fieldOptions.docFreq) {
      termInfo.add("df", df);
    }
    // TODO: this is not TF/IDF by anyone's definition!
    if (fieldOptions.tfIdf) {
      double tfIdfVal = ((double) freq) / df;
      termInfo.add("tf-idf", tfIdfVal);
    }
  }
}
public static long recordFrequency(IndexReader reader, String columnFamily, String columnName, String value)
    throws IOException {
  return reader.docFreq(getTerm(columnFamily, columnName, value));
}