The following examples show how to use the org.apache.lucene.index.Terms API class, or follow the links to view the full source code on GitHub.
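Before the examples, here is a minimal sketch of the basic access pattern they all share: a Terms instance is obtained per field and may be null when the field is absent. This is not taken from any snippet below; the Directory dir and the field name "body" are placeholders, and MultiTerms.getTerms is the merged-view helper on recent Lucene (older versions use MultiFields.getTerms, as some snippets below do).

import java.io.IOException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;

static void dumpTerms(Directory dir) throws IOException {
    try (DirectoryReader reader = DirectoryReader.open(dir)) {
        // Merged view over all segments; null if no document has the field
        Terms terms = MultiTerms.getTerms(reader, "body");
        if (terms == null) {
            return;
        }
        TermsEnum it = terms.iterator();
        BytesRef term;
        while ((term = it.next()) != null) {
            System.out.println(term.utf8ToString() + " docFreq=" + it.docFreq());
        }
    }
}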
@Override
protected FieldStatsShardResponse shardOperation(FieldStatsShardRequest request) {
    ShardId shardId = request.shardId();
    Map<String, FieldStats> fieldStats = new HashMap<>();
    IndexService indexServices = indicesService.indexServiceSafe(shardId.getIndex());
    MapperService mapperService = indexServices.mapperService();
    IndexShard shard = indexServices.shardSafe(shardId.id());
    try (Engine.Searcher searcher = shard.acquireSearcher("fieldstats")) {
        for (String field : request.getFields()) {
            MappedFieldType fieldType = mapperService.fullName(field);
            if (fieldType != null) {
                IndexReader reader = searcher.reader();
                Terms terms = MultiFields.getTerms(reader, field);
                if (terms != null) {
                    fieldStats.put(field, fieldType.stats(terms, reader.maxDoc()));
                }
            } else {
                throw new IllegalArgumentException("field [" + field + "] doesn't exist");
            }
        }
    } catch (IOException e) {
        throw ExceptionsHelper.convertToElastic(e);
    }
    return new FieldStatsShardResponse(shardId, fieldStats);
}
@Override
public void visitMatchingTerms(
        IndexReader reader,
        String fieldName,
        MatchingTermVisitor mtv) throws IOException {
    /* check term presence in index here for symmetry with other SimpleTerm's */
    Terms terms = MultiTerms.getTerms(reader, fieldName);
    if (terms != null) {
        TermsEnum termsEnum = terms.iterator();
        TermsEnum.SeekStatus status = termsEnum.seekCeil(new BytesRef(getTermText()));
        if (status == TermsEnum.SeekStatus.FOUND) {
            mtv.visitMatchingTerm(getLuceneTerm(fieldName));
        }
    }
}
/**
 * Counts the number of documents in the index that have at least one value for the 'class' field.
 *
 * @return the number of documents having a value for the 'class' field
 * @throws IOException if accessing the term vectors or search fails
 */
protected int countDocsWithClass() throws IOException {
    Terms terms = MultiTerms.getTerms(this.indexReader, this.classFieldName);
    int docCount;
    if (terms == null || terms.getDocCount() == -1) { // in case codec doesn't support getDocCount
        TotalHitCountCollector classQueryCountCollector = new TotalHitCountCollector();
        BooleanQuery.Builder q = new BooleanQuery.Builder();
        q.add(new BooleanClause(new WildcardQuery(new Term(classFieldName, String.valueOf(WildcardQuery.WILDCARD_STRING))), BooleanClause.Occur.MUST));
        if (query != null) {
            q.add(query, BooleanClause.Occur.MUST);
        }
        indexSearcher.search(q.build(), classQueryCountCollector);
        docCount = classQueryCountCollector.getTotalHits();
    } else {
        docCount = terms.getDocCount();
    }
    return docCount;
}
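The -1 fallback above exists because the per-field aggregates on Terms are codec-dependent and may be unsupported. As a hedged fragment (reader and the field name "class" are placeholders), these are the three statistics the snippets in this listing keep reading back:

Terms terms = MultiTerms.getTerms(reader, "class");
if (terms != null) {
    int docCount = terms.getDocCount();         // docs with at least one term for this field, or -1 if unsupported
    long sumDocFreq = terms.getSumDocFreq();    // sum of docFreq over all terms, or -1
    long sumTtf = terms.getSumTotalTermFreq();  // sum of totalTermFreq over all terms, or -1
}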
private Query newTermQuery(IndexReader reader, Term term) throws IOException {
    if (ignoreTF) {
        return new ConstantScoreQuery(new TermQuery(term));
    } else {
        // we build an artificial TermStates that will give an overall df and ttf
        // equal to 1
        TermStates context = new TermStates(reader.getContext());
        for (LeafReaderContext leafContext : reader.leaves()) {
            Terms terms = leafContext.reader().terms(term.field());
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator();
                if (termsEnum.seekExact(term.bytes())) {
                    int freq = 1 - context.docFreq(); // we want the total df and ttf to be 1
                    context.register(termsEnum.termState(), leafContext.ord, freq, freq);
                }
            }
        }
        return new TermQuery(term, context);
    }
}
@Override
public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException {
    Terms terms = ctx.reader().terms(field);
    if (terms == null) {
        return null;
    }
    if (terms.hasPositions() == false) {
        throw new IllegalArgumentException("Cannot create an IntervalIterator over field " + field + " because it has no indexed positions");
    }
    if (terms.hasPayloads() == false) {
        throw new IllegalArgumentException("Cannot create a payload-filtered iterator over field " + field + " because it has no indexed payloads");
    }
    TermsEnum te = terms.iterator();
    if (te.seekExact(term) == false) {
        return null;
    }
    return intervals(te);
}
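The seekExact call above is the standard way to position a TermsEnum on a single term. A minimal sketch of the follow-on step, walking the postings of one term within one segment (the leaf context ctx, the field "body", and the term text are placeholders; DocIdSetIterator is org.apache.lucene.search.DocIdSetIterator):

Terms terms = ctx.reader().terms("body");
if (terms != null) {
    TermsEnum te = terms.iterator();
    if (te.seekExact(new BytesRef("lucene"))) {
        PostingsEnum pe = te.postings(null, PostingsEnum.FREQS);
        for (int doc = pe.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = pe.nextDoc()) {
            System.out.println("doc=" + doc + " freq=" + pe.freq());
        }
    }
}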
static CodecReader wrap(CodecReader reader) throws IOException {
    final FieldInfos fieldInfos = reader.getFieldInfos();
    final FieldInfo versionInfo = fieldInfos.fieldInfo(VersionFieldMapper.NAME);
    if (versionInfo != null && versionInfo.getDocValuesType() != DocValuesType.NONE) {
        // the reader is a recent one, it has versions and they are stored
        // in a numeric doc values field
        return reader;
    }
    // The segment is an old one, look at the _uid field
    final Terms terms = reader.terms(UidFieldMapper.NAME);
    if (terms == null || !terms.hasPayloads()) {
        // The segment doesn't have an _uid field or doesn't have payloads
        // don't try to do anything clever. If any other segment has versions
        // all versions of this segment will be initialized to 0
        return reader;
    }
    // convert _uid payloads -> _version docvalues
    return new VersionFieldUpgrader(reader);
}
public DfsOnlyRequest(Fields termVectorsFields, String[] indices, String[] types, Set<String> selectedFields) throws IOException {
    super(indices);
    // build a search request with a query of all the terms
    final BoolQueryBuilder boolBuilder = boolQuery();
    for (String fieldName : termVectorsFields) {
        if ((selectedFields != null) && (!selectedFields.contains(fieldName))) {
            continue;
        }
        Terms terms = termVectorsFields.terms(fieldName);
        TermsEnum iterator = terms.iterator();
        while (iterator.next() != null) {
            String text = iterator.term().utf8ToString();
            boolBuilder.should(QueryBuilders.termQuery(fieldName, text));
        }
    }
    // wrap a search request object
    this.searchRequest = new SearchRequest(indices).types(types).source(new SearchSourceBuilder().query(boolBuilder));
}
private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter, BoostAttribute boostAtt) throws IOException {
    // start term, optimized writing
    BytesRef term = termIter.next();
    spare.copyUTF8Bytes(term);
    builder.startObject(spare.toString());
    buildTermStatistics(builder, termIter);
    // finally write the term vectors
    PostingsEnum posEnum = termIter.postings(null, PostingsEnum.ALL);
    int termFreq = posEnum.freq();
    builder.field(FieldStrings.TERM_FREQ, termFreq);
    initMemory(curTerms, termFreq);
    initValues(curTerms, posEnum, termFreq);
    buildValues(builder, curTerms, termFreq);
    buildScore(builder, boostAtt);
    builder.endObject();
}
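buildTerm above opens a PostingsEnum with PostingsEnum.ALL over a term-vector TermsEnum. A hedged sketch of that pattern in isolation (docId and the field name "content" are placeholders): a term vector behaves like a one-document index, so the enum is advanced once before reading frequency, positions, and offsets, and the position/offset calls are only meaningful if those were stored with the vector.

Terms tv = reader.getTermVector(docId, "content"); // null if no vector was stored
if (tv != null) {
    TermsEnum te = tv.iterator();
    BytesRef term;
    while ((term = te.next()) != null) {
        PostingsEnum pe = te.postings(null, PostingsEnum.ALL);
        pe.nextDoc(); // a term vector holds exactly one document
        int freq = pe.freq();
        for (int i = 0; i < freq; i++) {
            int pos = pe.nextPosition(); // requires positions in the vector
            System.out.println(term.utf8ToString() + " pos=" + pos
                    + " offsets=" + pe.startOffset() + "-" + pe.endOffset());
        }
    }
}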
private void buildFieldStatistics(XContentBuilder builder, Terms curTerms) throws IOException {
    long sumDocFreq = curTerms.getSumDocFreq();
    int docCount = curTerms.getDocCount();
    long sumTotalTermFrequencies = curTerms.getSumTotalTermFreq();
    if (docCount > 0) {
        assert sumDocFreq > 0 : "docCount > 0 but sumDocFreq is not!";
        assert sumTotalTermFrequencies > 0 : "docCount > 0 but sumTotalTermFrequencies is not!";
        builder.startObject(FieldStrings.FIELD_STATISTICS);
        builder.field(FieldStrings.SUM_DOC_FREQ, sumDocFreq);
        builder.field(FieldStrings.DOC_COUNT, docCount);
        builder.field(FieldStrings.SUM_TTF, sumTotalTermFrequencies);
        builder.endObject();
    } else if (docCount == -1) {
        // docCount should only be -1 if the field statistics were not requested
        // at all; in that case all 3 values should be -1
        assert sumDocFreq == -1 : "docCount was -1 but sumDocFreq is not!";
        assert sumTotalTermFrequencies == -1 : "docCount was -1 but sumTotalTermFrequencies is not!";
    } else {
        throw new IllegalStateException(
                "Something is wrong with the field statistics of the term vector request: Values are " + "\n"
                        + FieldStrings.SUM_DOC_FREQ + " " + sumDocFreq + "\n" + FieldStrings.DOC_COUNT + " " + docCount + "\n"
                        + FieldStrings.SUM_TTF + " " + sumTotalTermFrequencies);
    }
}
public static void addToStatistics(SolrIndexSearcher searcher, String field) throws IOException {
    // check if this field is already in the stats
    // synchronized (instance) {
    if (termstats.get(field) != null) {
        return;
    }
    // }
    // else add it to the stats
    Terms terms = searcher.getSlowAtomicReader().terms(field);
    HashMap<String, Integer> term2docFreq = new HashMap<String, Integer>(1000);
    termstats.put(field, term2docFreq);
    if (terms != null) {
        TermsEnum termsEnum = terms.iterator();
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            term2docFreq.put(term.utf8ToString(), termsEnum.docFreq());
        }
    }
}
private HashMap<String, Integer> getAllTerms() throws IOException {
    HashMap<String, Integer> allTerms = new HashMap<>();
    int pos = 0;
    for (int docId = 0; docId < getTotalDocumentInIndex(); docId++) {
        Terms vector = getIndexReader().getTermVector(docId, FIELD_CONTENT);
        if (vector == null) {
            // the document stored no term vector for this field
            continue;
        }
        TermsEnum termsEnum = vector.iterator();
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            String term = text.utf8ToString();
            allTerms.put(term, pos++);
        }
    }
    // Re-number the positions so they form a dense 0..n-1 range
    pos = 0;
    for (Entry<String, Integer> s : allTerms.entrySet()) {
        s.setValue(pos++);
    }
    return allTerms;
}
private DocVector[] getDocumentVectors() throws IOException {
    DocVector[] docVector = new DocVector[getTotalDocumentInIndex()];
    HashMap<String, Integer> allTerms = getAllTerms(); // build the vocabulary once, not per document
    for (int docId = 0; docId < getTotalDocumentInIndex(); docId++) {
        docVector[docId] = new DocVector(allTerms);
        Terms vector = getIndexReader().getTermVector(docId, FIELD_CONTENT);
        if (vector == null) {
            // no term vector stored for this document; leave its vector empty
            continue;
        }
        TermsEnum termsEnum = vector.iterator();
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            String term = text.utf8ToString();
            int freq = (int) termsEnum.totalTermFreq();
            docVector[docId].setEntry(term, freq);
        }
        docVector[docId].normalize();
    }
    getIndexReader().close();
    return docVector;
}
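Note that DocVector, FIELD_CONTENT, and getTotalDocumentInIndex in the two snippets above are the example's own helpers; only the getTermVector calls are standard Lucene. A hedged alternative for building the vocabulary in a single pass is to use the merged terms view instead of iterating every document's vector (reader and field name are placeholders; assumes the field was indexed):

Map<String, Integer> vocab = new HashMap<>();
Terms terms = MultiTerms.getTerms(reader, "content");
if (terms != null) {
    TermsEnum te = terms.iterator();
    BytesRef term;
    int ord = 0;
    while ((term = te.next()) != null) {
        vocab.put(term.utf8ToString(), ord++); // terms arrive in sorted order, so ords are already dense
    }
}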
/**
 * Combines the individual term vectors of each document into a single list.
 *
 * @param terms the per-document Terms (term vectors) to merge
 * @return a map from term text to the combined QETerm statistics
 */
public HashMap<String, QETerm> combineTerms(Vector<Terms> terms) {
    HashMap<String, QETerm> combinedTerms = new HashMap<String, QETerm>();
    int numDocs = terms.size();
    for (Terms ts : terms) {
        try {
            TermsEnum te = ts.iterator();
            BytesRef term;
            while ((term = te.next()) != null) {
                String tString = term.utf8ToString();
                QETerm qet = new QETerm(tString, te.totalTermFreq(), te.docFreq(), numDocs);
                if (combinedTerms.containsKey(tString)) {
                    QETerm mergedTerm = qet.combine(combinedTerms.get(tString));
                    combinedTerms.replace(tString, mergedTerm);
                } else {
                    combinedTerms.put(tString, qet);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return combinedTerms;
}
public void testIntFieldMinMax() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    int numDocs = atLeast(100);
    int minValue = Integer.MAX_VALUE;
    int maxValue = Integer.MIN_VALUE;
    for (int i = 0; i < numDocs; i++) {
        Document doc = new Document();
        int num = random().nextInt();
        minValue = Math.min(num, minValue);
        maxValue = Math.max(num, maxValue);
        doc.add(new LegacyIntField("field", num, Field.Store.NO));
        w.addDocument(doc);
    }
    IndexReader r = w.getReader();
    Terms terms = MultiTerms.getTerms(r, "field");
    assertEquals(Integer.valueOf(minValue), LegacyNumericUtils.getMinInt(terms));
    assertEquals(Integer.valueOf(maxValue), LegacyNumericUtils.getMaxInt(terms));
    r.close();
    w.close();
    dir.close();
}
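LegacyIntField and LegacyNumericUtils in the test above belong to the deprecated trie encoding that stored numbers as terms. On current Lucene the same min/max is usually read from points instead; a hedged sketch assuming the field was indexed as a single-dimension IntPoint (field name is a placeholder):

byte[] minPacked = PointValues.getMinPackedValue(reader, "field");
byte[] maxPacked = PointValues.getMaxPackedValue(reader, "field");
if (minPacked != null && maxPacked != null) { // null when no points were indexed
    int min = IntPoint.decodeDimension(minPacked, 0);
    int max = IntPoint.decodeDimension(maxPacked, 0);
}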
/**
 * A convenience method that tries a number of approaches to getting a token
 * stream. The cost of finding that there are no term vectors in the index is
 * minimal (1000 invocations still register 0 ms), so this "lazy" (flexible?)
 * approach to coding is probably acceptable.
 *
 * @return null if the field is not stored correctly
 * @throws IOException If there is a low-level I/O error
 */
@Deprecated // maintenance reasons LUCENE-6445
public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
        String field, Analyzer analyzer) throws IOException {
    TokenStream ts = null;
    Fields vectors = reader.getTermVectors(docId);
    if (vectors != null) {
        Terms vector = vectors.terms(field);
        if (vector != null) {
            ts = getTokenStream(vector);
        }
    }
    // No token info stored so fall back to analyzing raw content
    if (ts == null) {
        ts = getTokenStream(reader, docId, field, analyzer);
    }
    return ts;
}
public void testNoOrds() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    doc.add(new Field("foo", "this is a test", ft));
    iw.addDocument(doc);
    LeafReader ir = getOnlyLeafReader(iw.getReader());
    Terms terms = ir.getTermVector(0, "foo");
    assertNotNull(terms);
    TermsEnum termsEnum = terms.iterator();
    assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("this")));
    expectThrows(UnsupportedOperationException.class, termsEnum::ord);
    expectThrows(UnsupportedOperationException.class, () -> termsEnum.seekExact(0));
    ir.close();
    iw.close();
    dir.close();
}
@Override
public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content) throws IOException {
    Terms tvTerms = reader.getTermVector(docId, getField());
    if (tvTerms == null) {
        return OffsetsEnum.EMPTY;
    }
    LeafReader singleDocReader = new TermVectorLeafReader(getField(), tvTerms);
    return createOffsetsEnumFromReader(
            new OverlaySingleDocTermsLeafReader(
                    reader,
                    singleDocReader,
                    getField(),
                    docId),
            docId);
}
public void testFloatFieldMinMax() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    int numDocs = atLeast(100);
    float minValue = Float.POSITIVE_INFINITY;
    float maxValue = Float.NEGATIVE_INFINITY;
    for (int i = 0; i < numDocs; i++) {
        Document doc = new Document();
        float num = random().nextFloat();
        minValue = Math.min(num, minValue);
        maxValue = Math.max(num, maxValue);
        doc.add(new LegacyFloatField("field", num, Field.Store.NO));
        w.addDocument(doc);
    }
    IndexReader r = w.getReader();
    Terms terms = MultiTerms.getTerms(r, "field");
    assertEquals(minValue, NumericUtils.sortableIntToFloat(LegacyNumericUtils.getMinInt(terms)), 0.0f);
    assertEquals(maxValue, NumericUtils.sortableIntToFloat(LegacyNumericUtils.getMaxInt(terms)), 0.0f);
    r.close();
    w.close();
    dir.close();
}
public String exportTerms(String destDir, String field, String delimiter) {
    String filename = "terms_" + field + "_" + System.currentTimeMillis() + ".out";
    Path path = Paths.get(destDir, filename);
    try {
        Terms terms = MultiTerms.getTerms(reader, field);
        if (terms == null) {
            throw new LukeException(String.format(Locale.US, "Field %s does not contain any terms to be exported", field));
        }
        try (BufferedWriter writer = Files.newBufferedWriter(path, StandardCharsets.UTF_8)) {
            TermsEnum termsEnum = terms.iterator();
            BytesRef term;
            while (!Thread.currentThread().isInterrupted() && (term = termsEnum.next()) != null) {
                writer.write(String.format(Locale.US, "%s%s%d\n", term.utf8ToString(), delimiter, termsEnum.docFreq()));
            }
            return path.toString();
        }
    } catch (IOException e) {
        throw new LukeException("Terms file export for field [" + field + "] to file [" + filename + "] has failed.", e);
    }
}
@Override
public void write(Fields fields, NormsProducer norms) throws IOException {
    for (String field : fields) {
        Terms terms = fields.terms(field);
        if (terms == null) {
            continue;
        }
        TermsEnum termsEnum = terms.iterator();
        TermsWriter termsWriter = addField(fieldInfos.fieldInfo(field));
        while (true) {
            BytesRef term = termsEnum.next();
            if (term == null) {
                break;
            }
            termsWriter.write(term, termsEnum, norms);
        }
        termsWriter.finish();
    }
}
public void testLongFieldMinMax() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    int numDocs = atLeast(100);
    long minValue = Long.MAX_VALUE;
    long maxValue = Long.MIN_VALUE;
    for (int i = 0; i < numDocs; i++) {
        Document doc = new Document();
        long num = random().nextLong();
        minValue = Math.min(num, minValue);
        maxValue = Math.max(num, maxValue);
        doc.add(new LegacyLongField("field", num, Field.Store.NO));
        w.addDocument(doc);
    }
    IndexReader r = w.getReader();
    Terms terms = MultiTerms.getTerms(r, "field");
    assertEquals(Long.valueOf(minValue), LegacyNumericUtils.getMinLong(terms));
    assertEquals(Long.valueOf(maxValue), LegacyNumericUtils.getMaxLong(terms));
    r.close();
    w.close();
    dir.close();
}
private void collectTermStates(IndexReader reader,
        List<LeafReaderContext> leaves,
        TermStates[] contextArray,
        Term[] queryTerms) throws IOException {
    TermsEnum termsEnum = null;
    for (LeafReaderContext context : leaves) {
        Terms terms = context.reader().terms(this.field);
        if (terms == null) {
            // field does not exist
            continue;
        }
        termsEnum = terms.iterator();
        if (termsEnum == TermsEnum.EMPTY) {
            continue;
        }
        for (int i = 0; i < queryTerms.length; i++) {
            Term term = queryTerms[i];
            TermStates termStates = contextArray[i];
            if (termsEnum.seekExact(term.bytes())) {
                if (termStates == null) {
                    contextArray[i] = new TermStates(reader.getContext(),
                            termsEnum.termState(), context.ord, termsEnum.docFreq(),
                            termsEnum.totalTermFreq());
                } else {
                    termStates.register(termsEnum.termState(), context.ord,
                            termsEnum.docFreq(), termsEnum.totalTermFreq());
                }
            }
        }
    }
}
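collectTermStates above fills one TermStates per query term by seeking in every segment; the registered docFreq/totalTermFreq values add up across leaves into the index-wide statistics that scoring later uses. For a single term there is a hedged shortcut (reader and term are placeholders; signature as in Lucene 8.x, where TermStates.build takes the top-level reader context):

// Builds per-leaf term states plus aggregated statistics in one call
TermStates states = TermStates.build(reader.getContext(), new Term("body", "lucene"), true);
// states.docFreq() and states.totalTermFreq() now reflect the whole index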
QueryTermFilter(IndexReader reader) throws IOException {
    for (LeafReaderContext ctx : reader.leaves()) {
        for (FieldInfo fi : ctx.reader().getFieldInfos()) {
            BytesRefHash terms = termsHash.computeIfAbsent(fi.name, f -> new BytesRefHash());
            Terms t = ctx.reader().terms(fi.name);
            if (t != null) {
                TermsEnum te = t.iterator();
                BytesRef term;
                while ((term = te.next()) != null) {
                    terms.add(term);
                }
            }
        }
    }
}
@Test
public void testSparseFreqDoubleArrayConversion() throws Exception {
    Terms fieldTerms = MultiTerms.getTerms(index, "text");
    if (fieldTerms != null && fieldTerms.size() != -1) {
        IndexSearcher indexSearcher = new IndexSearcher(index);
        for (ScoreDoc scoreDoc : indexSearcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE).scoreDocs) {
            Terms docTerms = index.getTermVector(scoreDoc.doc, "text");
            Double[] vector = DocToDoubleVectorUtils.toSparseLocalFreqDoubleArray(docTerms, fieldTerms);
            assertNotNull(vector);
            assertTrue(vector.length > 0);
        }
    }
}
/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
    Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
    for (String fieldName : fieldNames) {
        final Fields vectors = ir.getTermVectors(docNum);
        final Terms vector;
        if (vectors != null) {
            vector = vectors.terms(fieldName);
        } else {
            vector = null;
        }
        if (vector == null) {
            // field does not store term vector info; analyze the stored content instead
            Document d = ir.document(docNum);
            IndexableField[] fields = d.getFields(fieldName);
            for (IndexableField field : fields) {
                final String stringValue = field.stringValue();
                if (stringValue != null) {
                    addTermFrequencies(new StringReader(stringValue), field2termFreqMap, fieldName);
                }
            }
        } else {
            addTermFrequencies(field2termFreqMap, vector, fieldName);
        }
    }
    return createQueue(field2termFreqMap);
}
public WordScorer(IndexReader reader, Terms terms, String field, double realWordLikelyHood, BytesRef separator) throws IOException {
    this.field = field;
    if (terms == null) {
        throw new IllegalArgumentException("Field: [" + field + "] does not exist");
    }
    this.terms = terms;
    final long vocSize = terms.getSumTotalTermFreq();
    this.vocabluarySize = vocSize == -1 ? reader.maxDoc() : vocSize;
    this.useTotalTermFreq = vocSize != -1;
    this.numTerms = terms.size();
    this.termsEnum = new FreqTermsEnum(reader, field, !useTotalTermFreq, useTotalTermFreq, null, BigArrays.NON_RECYCLING_INSTANCE); // non recycling for now
    this.reader = reader;
    this.realWordLikelyhood = realWordLikelyHood;
    this.separator = separator;
}
@Override
public MtasSpans getSpans(LeafReaderContext context,
        Postings requiredPostings) throws IOException {
    if (field == null) {
        return null;
    } else {
        Terms terms = context.reader().terms(field);
        if (terms == null) {
            return null; // field does not exist
        }
        List<MtasSpanSequenceQuerySpans> setSequenceSpans = new ArrayList<>(items.size());
        Spans ignoreSpans = null;
        boolean allSpansEmpty = true;
        for (MtasSpanSequenceQueryWeight w : subWeights) {
            Spans sequenceSpans = w.spanWeight.getSpans(context, requiredPostings);
            if (sequenceSpans != null) {
                setSequenceSpans.add(new MtasSpanSequenceQuerySpans(
                        MtasSpanSequenceQuery.this, sequenceSpans, w.optional));
                allSpansEmpty = false;
            } else {
                if (w.optional) {
                    setSequenceSpans.add(new MtasSpanSequenceQuerySpans(
                            MtasSpanSequenceQuery.this, null, w.optional));
                } else {
                    return null;
                }
            }
        }
        if (allSpansEmpty) {
            return null; // at least one required span must be present
        } else if (ignoreWeight != null) {
            ignoreSpans = ignoreWeight.getSpans(context, requiredPostings);
        }
        return new MtasSpanSequenceSpans(MtasSpanSequenceQuery.this,
                setSequenceSpans, ignoreSpans, maximumIgnoreLength);
    }
}
@Override
public Terms terms(String field) throws IOException {
    if (!filtered.contains(field)) {
        throw new IllegalArgumentException("The field named '" + field + "' is not accessible in the current " +
                "merge context, available ones are: " + filtered);
    }
    return in.terms(field);
}
@Override
public FieldStats stats(Terms terms, int maxDoc) throws IOException {
    long minValue = NumericUtils.getMinInt(terms);
    long maxValue = NumericUtils.getMaxInt(terms);
    return new FieldStats.Long(
            maxDoc, terms.getDocCount(), terms.getSumDocFreq(), terms.getSumTotalTermFreq(), minValue, maxValue
    );
}
private static void applyFamily(OpenBitSet bits, String family, AtomicReader atomicReader, int primeDocRowId,
        int numberOfDocsInRow, Bits liveDocs) throws IOException {
    Fields fields = atomicReader.fields();
    Terms terms = fields.terms(BlurConstants.FAMILY);
    TermsEnum iterator = terms.iterator(null);
    BytesRef text = new BytesRef(family);
    int lastDocId = primeDocRowId + numberOfDocsInRow;
    if (iterator.seekExact(text, true)) {
        DocsEnum docs = iterator.docs(liveDocs, null, DocsEnum.FLAG_NONE);
        int doc = primeDocRowId;
        while ((doc = docs.advance(doc)) < lastDocId) {
            bits.set(doc - primeDocRowId);
        }
    }
}