Listed below are example usages of org.apache.lucene.index.Terms#size(), drawn from open-source projects.
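Terms#size() returns the number of distinct terms in a field, or -1 when that count is unknown, which is why most of the snippets below guard against a negative value before using it. As a minimal, self-contained sketch of that contract (the Directory and field name here are illustrative, not taken from the snippets below):

import java.io.IOException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.Directory;

public final class TermsSizeExample {
  /**
   * Sums the per-segment term counts for a field. A term present in several
   * segments is counted once per segment, so this is an upper bound on the
   * number of distinct terms in the whole index.
   */
  public static long countTerms(Directory dir, String field) throws IOException {
    try (DirectoryReader reader = DirectoryReader.open(dir)) {
      long total = 0;
      for (LeafReaderContext leaf : reader.leaves()) {
        Terms terms = leaf.reader().terms(field);
        if (terms == null) {
          continue; // the field is not indexed in this segment
        }
        long size = terms.size(); // -1 means the codec cannot report the count cheaply
        if (size == -1) {
          return -1;
        }
        total += size;
      }
      return total;
    }
  }
}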
/**
 * Create a sparse <code>Double</code> vector given doc and field term vectors, using the local frequency of the terms in the doc.
 *
 * @param docTerms term vectors for a given document
 * @param fieldTerms field term vectors
 * @return a sparse vector of <code>Double</code>s as an array
 * @throws IOException in case accessing the underlying index fails
 */
public static Double[] toSparseLocalFreqDoubleArray(Terms docTerms, Terms fieldTerms) throws IOException {
  TermsEnum fieldTermsEnum = fieldTerms.iterator();
  Double[] freqVector = null;
  if (docTerms != null && fieldTerms.size() > -1) { // size() returns -1 when the term count is unknown
    freqVector = new Double[(int) fieldTerms.size()];
    int i = 0;
    TermsEnum docTermsEnum = docTerms.iterator();
    BytesRef term;
    while ((term = fieldTermsEnum.next()) != null) {
      TermsEnum.SeekStatus seekStatus = docTermsEnum.seekCeil(term);
      if (seekStatus.equals(TermsEnum.SeekStatus.END)) {
        // the enum is exhausted after seeking past its last term; re-open it so later field terms can still be sought
        docTermsEnum = docTerms.iterator();
      }
      if (seekStatus.equals(TermsEnum.SeekStatus.FOUND)) {
        long termFreqLocal = docTermsEnum.totalTermFreq(); // the total number of occurrences of this term in the given document
        freqVector[i] = Long.valueOf(termFreqLocal).doubleValue();
      } else {
        freqVector[i] = 0d;
      }
      i++;
    }
  }
  return freqVector;
}
/**
 * Create a dense <code>Double</code> vector given doc term vectors, using the local frequency of the terms in the doc.
 *
 * @param docTerms term vectors for a given document
 * @return a dense vector of <code>Double</code>s as an array
 * @throws IOException in case accessing the underlying index fails
 */
public static Double[] toDenseLocalFreqDoubleArray(Terms docTerms) throws IOException {
  Double[] freqVector = null;
  if (docTerms != null) {
    freqVector = new Double[(int) docTerms.size()];
    int i = 0;
    TermsEnum docTermsEnum = docTerms.iterator();
    while (docTermsEnum.next() != null) {
      long termFreqLocal = docTermsEnum.totalTermFreq(); // the total number of occurrences of this term in the given document
      freqVector[i] = Long.valueOf(termFreqLocal).doubleValue();
      i++;
    }
  }
  return freqVector;
}
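A hypothetical caller of the dense conversion above, mirroring the sparse-conversion test shown further below (the "text" field is assumed to have been indexed with term vectors enabled):

// Hypothetical usage: convert one document's "text" term vector to a dense array.
// Returns null when the document has no term vector for the field.
static Double[] denseVectorFor(IndexReader reader, int docId) throws IOException {
  Terms docTerms = reader.getTermVector(docId, "text");
  return DocToDoubleVectorUtils.toDenseLocalFreqDoubleArray(docTerms);
}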
static boolean hasHighCardinalityRatio(Supplier<Engine.Searcher> acquireSearcher, String fieldName) {
  // acquire a separate searcher:
  // we can't use sharedShardContexts() yet; if we bail out, the "getOrCreateContext" causes issues later on in the fallback logic
  try (Engine.Searcher searcher = acquireSearcher.get()) {
    for (LeafReaderContext leaf : searcher.reader().leaves()) {
      Terms terms = leaf.reader().terms(fieldName);
      if (terms == null) {
        return true;
      }
      double cardinalityRatio = terms.size() / (double) leaf.reader().numDocs();
      if (cardinalityRatio > CARDINALITY_RATIO_THRESHOLD) {
        return true;
      }
    }
  } catch (IOException e) {
    return true;
  }
  return false;
}
public WordScorer(IndexReader reader, Terms terms, String field, double realWordLikelyHood, BytesRef separator) throws IOException {
  this.field = field;
  if (terms == null) {
    throw new IllegalArgumentException("Field: [" + field + "] does not exist");
  }
  this.terms = terms;
  final long vocSize = terms.getSumTotalTermFreq();
  this.vocabluarySize = vocSize == -1 ? reader.maxDoc() : vocSize;
  this.useTotalTermFreq = vocSize != -1;
  this.numTerms = terms.size();
  this.termsEnum = new FreqTermsEnum(reader, field, !useTotalTermFreq, useTotalTermFreq, null, BigArrays.NON_RECYCLING_INSTANCE); // non recycling for now
  this.reader = reader;
  this.realWordLikelyhood = realWordLikelyHood;
  this.separator = separator;
}
private void buildField(XContentBuilder builder, final CharsRefBuilder spare, Fields theFields, Iterator<String> fieldIter) throws IOException {
  String fieldName = fieldIter.next();
  builder.startObject(fieldName);
  Terms curTerms = theFields.terms(fieldName);
  // write field statistics
  buildFieldStatistics(builder, curTerms);
  builder.startObject(FieldStrings.TERMS);
  TermsEnum termIter = curTerms.iterator();
  BoostAttribute boostAtt = termIter.attributes().addAttribute(BoostAttribute.class);
  for (int i = 0; i < curTerms.size(); i++) {
    buildTerm(builder, spare, curTerms, termIter, boostAtt);
  }
  builder.endObject();
  builder.endObject();
}
@Test
public void testSparseFreqDoubleArrayConversion() throws Exception {
  Terms fieldTerms = MultiTerms.getTerms(index, "text");
  if (fieldTerms != null && fieldTerms.size() != -1) {
    IndexSearcher indexSearcher = new IndexSearcher(index);
    for (ScoreDoc scoreDoc : indexSearcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE).scoreDocs) {
      Terms docTerms = index.getTermVector(scoreDoc.doc, "text");
      Double[] vector = DocToDoubleVectorUtils.toSparseLocalFreqDoubleArray(docTerms, fieldTerms);
      assertNotNull(vector);
      assertTrue(vector.length > 0);
    }
  }
}
/**
 * checks collection-level statistics on Terms
 */
public void assertTermsStatistics(Terms leftTerms, Terms rightTerms) throws Exception {
  assertEquals(leftTerms.getDocCount(), rightTerms.getDocCount());
  assertEquals(leftTerms.getSumDocFreq(), rightTerms.getSumDocFreq());
  if (leftTerms.hasFreqs() && rightTerms.hasFreqs()) {
    assertEquals(leftTerms.getSumTotalTermFreq(), rightTerms.getSumTotalTermFreq());
  }
  if (leftTerms.size() != -1 && rightTerms.size() != -1) {
    assertEquals(leftTerms.size(), rightTerms.size());
  }
}
private Set<String> getTerms(IndexReader ir) {
  Set<String> t = new HashSet<>();
  for (int i = 0; i < ir.leaves().size(); i++) {
    Terms termsList;
    try {
      // Get all the terms at this level of the tree.
      termsList = ir.leaves().get(i).reader().terms(Lucene4IRConstants.FIELD_ALL);
      // Note: size() returns -1 when the term count is unknown, so such fields are skipped here.
      if (termsList != null && termsList.size() > 0) {
        TermsEnum te = termsList.iterator();
        BytesRef termBytes;
        while ((termBytes = te.next()) != null) {
          t.add(termBytes.utf8ToString());
        }
      }
      // Get all the terms at the next level of the tree.
      if (ir.leaves().get(i).children() != null && ir.leaves().get(i).children().size() > 0) {
        for (IndexReaderContext c : ir.leaves().get(i).children()) {
          t.addAll(getTerms(c.reader()));
        }
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
  return t;
}
private long getTotalNumberOfRowIds(DirectoryReader reader) throws IOException {
  long total = 0;
  List<AtomicReaderContext> leaves = reader.leaves();
  for (AtomicReaderContext context : leaves) {
    AtomicReader atomicReader = context.reader();
    Terms terms = atomicReader.terms(BlurConstants.ROW_ID); // assumes every segment contains the ROW_ID field
    long expectedInsertions = terms.size();
    if (expectedInsertions < 0) {
      return -1;
    }
    total += expectedInsertions;
  }
  return total;
}
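The snippet above is written against the Lucene 4.x API, where per-segment readers were called AtomicReader; in Lucene 5 and later the same class is named LeafReader. A sketch of the equivalent logic on the newer API (also guarding against a missing field, which the original would trip over with a NullPointerException):

private long getTotalNumberOfRowIds(DirectoryReader reader) throws IOException {
  long total = 0;
  for (LeafReaderContext context : reader.leaves()) {
    Terms terms = context.reader().terms(BlurConstants.ROW_ID);
    long expectedInsertions = terms == null ? 0 : terms.size();
    if (expectedInsertions < 0) {
      return -1; // at least one segment cannot report its term count
    }
    total += expectedInsertions;
  }
  return total;
}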
@Override
protected CustomScoreProvider getCustomScoreProvider(LeafReaderContext context) throws IOException {
  return new CustomScoreProvider(context) {
    @Override
    public float customScore(int docID, float subQueryScore, float valSrcScore) throws IOException {
      float score = 0;
      double docVectorNorm = 0;
      LeafReader reader = context.reader();
      Terms terms = reader.getTermVector(docID, field);
      if (vector.size() != terms.size()) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "indexed and input vector array must have same length");
      }
      TermsEnum iter = terms.iterator();
      BytesRef text;
      while ((text = iter.next()) != null) {
        String term = text.utf8ToString();
        float payloadValue = 0f;
        PostingsEnum postings = iter.postings(null, PostingsEnum.ALL);
        while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          int freq = postings.freq();
          while (freq-- > 0) postings.nextPosition();
          BytesRef payload = postings.getPayload();
          payloadValue = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
          if (cosine)
            docVectorNorm += Math.pow(payloadValue, 2.0);
        }
        score = (float) (score + payloadValue * (vector.get(Integer.parseInt(term))));
      }
      if (cosine) {
        if ((docVectorNorm == 0) || (queryVectorNorm == 0)) return 0f;
        return (float) (score / (Math.sqrt(docVectorNorm) * Math.sqrt(queryVectorNorm)));
      }
      return score;
    }
  };
}
private long getTermsSize(LeafReaderContext leafReaderContext) throws IOException {
  Terms terms = leafReaderContext.reader().terms(field);
  return terms == null ? 0 : terms.size();
}
@Override
protected Accountable createValue(LeafReader reader, CacheKey key)
    throws IOException {
  final int maxDoc = reader.maxDoc();
  Terms terms = reader.terms(key.field);
  final float acceptableOverheadRatio = ((Float) key.custom).floatValue();
  final PagedBytes bytes = new PagedBytes(15);
  int startTermsBPV;
  // TODO: use Uninvert?
  if (terms != null) {
    // Try for coarse estimate for number of bits; this
    // should be an underestimate most of the time, which
    // is fine -- GrowableWriter will reallocate as needed
    long numUniqueTerms = terms.size();
    if (numUniqueTerms != -1L) {
      if (numUniqueTerms > maxDoc) {
        throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
      }
      startTermsBPV = PackedInts.bitsRequired(numUniqueTerms);
    } else {
      startTermsBPV = 1;
    }
  } else {
    startTermsBPV = 1;
  }
  PackedLongValues.Builder termOrdToBytesOffset = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
  final GrowableWriter docToTermOrd = new GrowableWriter(startTermsBPV, maxDoc, acceptableOverheadRatio);
  int termOrd = 0;
  // TODO: use Uninvert?
  if (terms != null) {
    final TermsEnum termsEnum = terms.iterator();
    PostingsEnum docs = null;
    while (true) {
      final BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      if (termOrd >= maxDoc) {
        throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
      }
      termOrdToBytesOffset.add(bytes.copyUsingLengthPrefix(term));
      docs = termsEnum.postings(docs, PostingsEnum.NONE);
      while (true) {
        final int docID = docs.nextDoc();
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        // Store 1+ ord into packed bits
        docToTermOrd.set(docID, 1 + termOrd);
      }
      termOrd++;
    }
  }
  // maybe an int-only impl?
  return new SortedDocValuesImpl(bytes.freeze(true), termOrdToBytesOffset.build(), docToTermOrd.getMutable(), termOrd);
}
@Override
protected Accountable createValue(LeafReader reader, CacheKey key)
    throws IOException {
  // TODO: would be nice to first check if DocTermsIndex
  // was already cached for this field and then return
  // that instead, to avoid insanity
  final int maxDoc = reader.maxDoc();
  Terms terms = reader.terms(key.field);
  final float acceptableOverheadRatio = ((Float) key.custom).floatValue();
  final int termCountHardLimit = maxDoc;
  // Holds the actual term data, expanded.
  final PagedBytes bytes = new PagedBytes(15);
  int startBPV;
  if (terms != null) {
    // Try for coarse estimate for number of bits; this
    // should be an underestimate most of the time, which
    // is fine -- GrowableWriter will reallocate as needed
    long numUniqueTerms = terms.size();
    if (numUniqueTerms != -1L) {
      if (numUniqueTerms > termCountHardLimit) {
        numUniqueTerms = termCountHardLimit;
      }
      startBPV = PackedInts.bitsRequired(numUniqueTerms * 4);
    } else {
      startBPV = 1;
    }
  } else {
    startBPV = 1;
  }
  final GrowableWriter docToOffset = new GrowableWriter(startBPV, maxDoc, acceptableOverheadRatio);
  // pointer==0 means not set
  bytes.copyUsingLengthPrefix(new BytesRef());
  if (terms != null) {
    int termCount = 0;
    final TermsEnum termsEnum = terms.iterator();
    PostingsEnum docs = null;
    while (true) {
      if (termCount++ == termCountHardLimit) {
        // app is misusing the API (there is more than
        // one term per doc); in this case we make best
        // effort to load what we can (see LUCENE-2142)
        break;
      }
      final BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      final long pointer = bytes.copyUsingLengthPrefix(term);
      docs = termsEnum.postings(docs, PostingsEnum.NONE);
      while (true) {
        final int docID = docs.nextDoc();
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        docToOffset.set(docID, pointer);
      }
    }
  }
  final PackedInts.Reader offsetReader = docToOffset.getMutable();
  Bits docsWithField = new Bits() {
    @Override
    public boolean get(int index) {
      return offsetReader.get(index) != 0;
    }

    @Override
    public int length() {
      return maxDoc;
    }
  };
  wrapper.setDocsWithField(reader, key.field, docsWithField, null);
  // maybe an int-only impl?
  return new BinaryDocValuesImpl(bytes.freeze(true), offsetReader, docsWithField);
}