Listed below are example usages of org.apache.lucene.index.Terms#iterator(); you can also follow the links to view the full source code on GitHub.
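Before the project-specific snippets, here is a minimal, self-contained sketch of the basic call pattern, assuming a current (5.0+) Lucene API; the method name, reader and field below are placeholders for illustration and do not come from any of the projects shown further down. Note that several of the older snippets still call the earlier reuse-taking form iterator(null) instead of the no-argument iterator().
import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

// Enumerate every term of one field and print its document frequency.
static void printTerms(LeafReader reader, String field) throws IOException {
    Terms terms = reader.terms(field);          // null when the field has no indexed terms
    if (terms == null) {
        return;
    }
    TermsEnum termsEnum = terms.iterator();     // a fresh, unpositioned TermsEnum
    BytesRef term;
    while ((term = termsEnum.next()) != null) { // next() returns null once the enum is exhausted
        System.out.println(term.utf8ToString() + " docFreq=" + termsEnum.docFreq());
    }
}
// Example: a cache loader (pre-5.0 API, hence terms.iterator(null)) that uninverts a field to map each term's string value to the docID containing it.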
@Override
protected Accountable createValue(final AtomicReader reader, CacheKey key, boolean setDocsWithField)
throws IOException {
final Map<String, Integer> uidMap = new HashMap<>();
Uninvert u = new Uninvert() {
private String currentValue;
@Override
public void visitTerm(BytesRef term) {
currentValue = term.utf8ToString();
}
@Override
public void visitDoc(int docID) {
uidMap.put(currentValue, docID);
}
@Override
protected TermsEnum termsEnum(Terms terms) throws IOException {
return terms.iterator(null);
}
};
u.uninvert(reader, key.field, setDocsWithField);
return new PerReaderUIDMaps(reader.getContext().ord, uidMap);
}
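// Example: a range-enumeration constructor that positions the TermsEnum on the lower bound with seekCeil(), consulting includeLower() only when the bound is matched exactly.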
public RangeTermsEnum(Terms terms) throws IOException {
if (terms == null) {
positioned = true;
} else {
te = terms.iterator();
if (lower != null) {
TermsEnum.SeekStatus status = te.seekCeil(lower);
if (status == TermsEnum.SeekStatus.END) {
positioned = true;
curr = null;
} else if (status == SeekStatus.FOUND) {
positioned = includeLower();
curr = te.term();
} else {
// lower bound not found, so includeLower is irrelevant
positioned = true;
curr = te.term();
}
}
}
}
/**
* Calculate probabilities for all classes for a given input text
*
* @param inputDocument the input text as a {@code String}
* @return a {@code List} of {@code ClassificationResult}, one for each existing class
* @throws IOException if assigning probabilities fails
*/
private List<ClassificationResult<BytesRef>> assignClassNormalizedList(String inputDocument) throws IOException {
List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>();
Terms classes = MultiTerms.getTerms(indexReader, classFieldName);
TermsEnum classesEnum = classes.iterator();
BytesRef next;
String[] tokenizedText = tokenize(inputDocument);
while ((next = classesEnum.next()) != null) {
if (next.length > 0) {
Term term = new Term(this.classFieldName, next);
assignedClasses.add(new ClassificationResult<>(term.bytes(), calculateLogPrior(term) + calculateLogLikelihood(tokenizedText, term)));
}
}
return normClassificationResults(assignedClasses);
}
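// Example: a postings writer that, for each field, walks the complete TermsEnum and hands every term to a per-field TermsWriter.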
@Override
public void write(Fields fields, NormsProducer norms) throws IOException {
for(String field : fields) {
Terms terms = fields.terms(field);
if (terms == null) {
continue;
}
TermsEnum termsEnum = terms.iterator();
TermsWriter termsWriter = addField(fieldInfos.fieldInfo(field));
while (true) {
BytesRef term = termsEnum.next();
if (term == null) {
break;
}
termsWriter.write(term, termsEnum, norms);
}
termsWriter.finish();
}
}
/**
* Create a {@link DisjunctionMatchesIterator} over a list of terms extracted from a {@link BytesRefIterator}
*
* Only terms that have at least one match in the given document will be included
*/
static MatchesIterator fromTermsEnum(LeafReaderContext context, int doc, Query query, String field, BytesRefIterator terms) throws IOException {
Objects.requireNonNull(field);
Terms t = context.reader().terms(field);
if (t == null)
return null;
TermsEnum te = t.iterator();
PostingsEnum reuse = null;
for (BytesRef term = terms.next(); term != null; term = terms.next()) {
if (te.seekExact(term)) {
PostingsEnum pe = te.postings(reuse, PostingsEnum.OFFSETS);
if (pe.advance(doc) == doc) {
return new TermsEnumDisjunctionMatchesIterator(new TermMatchesIterator(query, pe), terms, te, doc, query);
}
else {
reuse = pe;
}
}
}
return null;
}
/**
* Create the coefficient to transform the weight.
*
* @param doc id of the document
* @param matchedTokens tokens found in the query
* @param prefixToken unfinished token in the query
* @return the coefficient
* @throws IOException If there are problems reading term vectors from the underlying Lucene index.
*/
private double createCoefficient(IndexSearcher searcher, int doc, Set<String> matchedTokens, String prefixToken) throws IOException {
Terms tv = searcher.getIndexReader().getTermVector(doc, TEXT_FIELD_NAME);
TermsEnum it = tv.iterator();
Integer position = Integer.MAX_VALUE;
BytesRef term;
// find the closest token position
while ((term = it.next()) != null) {
String docTerm = term.utf8ToString();
if (matchedTokens.contains(docTerm) || (prefixToken != null && docTerm.startsWith(prefixToken))) {
PostingsEnum docPosEnum = it.postings(null, PostingsEnum.OFFSETS);
docPosEnum.nextDoc();
// use the first occurrence of the term
int p = docPosEnum.nextPosition();
if (p < position) {
position = p;
}
}
}
// create corresponding coefficient based on position
return calculateCoefficient(position);
}
/**
 * Retrieves term candidates from the Solr field configured by
 * {@code uk.ac.shef.dcs.jate.JATEProperties.PROPERTY_SOLR_FIELD_CONTENT_TERMS}.
 *
 * The method assumes that the term candidates were extracted at index time and stored in the pre-configured field.
 *
 * @return a {@code Set} of term candidate surface forms
 * @throws JATEException
 * @throws IOException
 */
protected Set<String> getUniqueTerms() throws JATEException, IOException {
Terms terms = SolrUtil.getTermVector(properties.getSolrFieldNameJATECTerms(), solrIndexSearcher);
TermsEnum termsEnum = terms.iterator();
Set<String> allTermCandidates = new HashSet<>();
while (termsEnum.next() != null) {
BytesRef t = termsEnum.term();
if (t.length == 0)
continue;
allTermCandidates.add(t.utf8ToString());
}
return allTermCandidates;
}
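// Example: serializing one field of a term-vectors response into an XContentBuilder: field statistics first, then exactly curTerms.size() terms via buildTerm().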
private void buildField(XContentBuilder builder, final CharsRefBuilder spare, Fields theFields, Iterator<String> fieldIter) throws IOException {
String fieldName = fieldIter.next();
builder.startObject(fieldName);
Terms curTerms = theFields.terms(fieldName);
// write field statistics
buildFieldStatistics(builder, curTerms);
builder.startObject(FieldStrings.TERMS);
TermsEnum termIter = curTerms.iterator();
BoostAttribute boostAtt = termIter.attributes().addAttribute(BoostAttribute.class);
for (int i = 0; i < curTerms.size(); i++) {
buildTerm(builder, spare, curTerms, termIter, boostAtt);
}
builder.endObject();
builder.endObject();
}
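// Example: a test that indexes one document through a TeeSinkTokenFilter and asserts the positions and offsets recorded in the resulting term vector.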
public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
Directory dir = newDirectory();
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer));
Document doc = new Document();
TokenStream tokenStream = analyzer.tokenStream("field", "abcd ");
TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenStream);
TokenStream sink = tee.newSinkTokenStream();
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(true);
ft.setStoreTermVectorPositions(true);
Field f1 = new Field("field", tee, ft);
Field f2 = new Field("field", sink, ft);
doc.add(f1);
doc.add(f2);
w.addDocument(doc);
w.close();
IndexReader r = DirectoryReader.open(dir);
Terms vector = r.getTermVectors(0).terms("field");
assertEquals(1, vector.size());
TermsEnum termsEnum = vector.iterator();
termsEnum.next();
assertEquals(2, termsEnum.totalTermFreq());
PostingsEnum positions = termsEnum.postings(null, PostingsEnum.ALL);
assertTrue(positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
assertEquals(2, positions.freq());
positions.nextPosition();
assertEquals(0, positions.startOffset());
assertEquals(4, positions.endOffset());
positions.nextPosition();
assertEquals(8, positions.startOffset());
assertEquals(12, positions.endOffset());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.nextDoc());
r.close();
dir.close();
analyzer.close();
}
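// Example: an endless consistency check that, on every pass, hashes each term of every field across all leaves (pre-5.0 API: terms.iterator(null)).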
@Override
public Boolean execute(IndexContext context) throws IOException, InterruptedException {
try {
IndexReader indexReader = context.getIndexReader();
while (true) {
long hash = 0;
for (AtomicReaderContext atomicReaderContext : indexReader.leaves()) {
AtomicReader reader = atomicReaderContext.reader();
for (String field : reader.fields()) {
Terms terms = reader.terms(field);
BytesRef bytesRef;
TermsEnum iterator = terms.iterator(null);
while ((bytesRef = iterator.next()) != null) {
hash += bytesRef.hashCode();
}
}
}
System.out.println("hashcode = " + hash);
}
} catch (IOException e) {
e.printStackTrace();
throw e;
} catch (Throwable t) {
t.printStackTrace();
if (t instanceof InterruptedException) {
throw t;
} else if (t instanceof RuntimeException) {
throw (RuntimeException) t;
}
throw new RuntimeException(t);
}
}
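// Example: counting a field's terms after first positioning the enum with seekCeil(new BytesRef("")) (pre-5.0 API).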
private int getTermWithSeekCount(Fields fields, String field) throws IOException {
Terms terms = fields.terms(field);
TermsEnum termsEnum = terms.iterator(null);
SeekStatus seekStatus = termsEnum.seekCeil(new BytesRef(""));
if (seekStatus == SeekStatus.END) {
return 0;
}
System.out.println(termsEnum.term().utf8ToString());
int count = 1;
while (termsEnum.next() != null) {
count++;
}
return count;
}
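// Example: per-segment setup that seeks exactly to a single feature term and retains its postings (FREQS) for later use.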
@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
Terms terms = context.reader().terms(field);
if (terms == null) {
currentReaderPostingsValues = null;
} else {
TermsEnum termsEnum = terms.iterator();
if (termsEnum.seekExact(featureName) == false) {
currentReaderPostingsValues = null;
} else {
currentReaderPostingsValues = termsEnum.postings(currentReaderPostingsValues, PostingsEnum.FREQS);
}
}
}
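// Example: collecting start/end offsets, plus sentence IDs decoded from payloads, for every candidate term in a single-document term vector.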
private List<MWESentenceContext> collectTermOffsets(Terms termVectorLookup) throws IOException {
List<MWESentenceContext> result = new ArrayList<>();
TermsEnum tiRef = termVectorLookup.iterator();
BytesRef luceneTerm = tiRef.next();
while (luceneTerm != null) {
if (luceneTerm.length == 0) {
luceneTerm = tiRef.next();
continue;
}
String tString = luceneTerm.utf8ToString();
if (!allCandidates.contains(tString)) {
luceneTerm = tiRef.next();
continue;
}
PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
//PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);
int doc = postingsEnum.nextDoc(); //this should be just 1 doc, i.e., the constraint for getting this TV
if (doc != PostingsEnum.NO_MORE_DOCS) {
int totalOccurrence = postingsEnum.freq();
for (int i = 0; i < totalOccurrence; i++) {
postingsEnum.nextPosition();
int start = postingsEnum.startOffset();
int end = postingsEnum.endOffset();
BytesRef payload = postingsEnum.getPayload();
int sentenceId = -1;
if (payload != null) {
sentenceId = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString())).getSentenceId();
}
result.add(new MWESentenceContext(tString, sentenceId, start, end));
}
}
luceneTerm = tiRef.next();
}
Collections.sort(result);
return result;
}
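// Example: a classifier test that iterates all category terms to verify per-class recall, precision and F1 from a confusion matrix.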
@Test
public void testPerformance() throws Exception {
MockAnalyzer analyzer = new MockAnalyzer(random());
int numDocs = atLeast(10);
LeafReader leafReader = getRandomIndex(analyzer, numDocs);
try {
CachingNaiveBayesClassifier simpleNaiveBayesClassifier = new CachingNaiveBayesClassifier(leafReader,
analyzer, null, categoryFieldName, textFieldName);
ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(leafReader,
simpleNaiveBayesClassifier, categoryFieldName, textFieldName, -1);
assertNotNull(confusionMatrix);
double avgClassificationTime = confusionMatrix.getAvgClassificationTime();
assertTrue(avgClassificationTime >= 0);
double accuracy = confusionMatrix.getAccuracy();
assertTrue(accuracy >= 0d);
assertTrue(accuracy <= 1d);
double recall = confusionMatrix.getRecall();
assertTrue(recall >= 0d);
assertTrue(recall <= 1d);
double precision = confusionMatrix.getPrecision();
assertTrue(precision >= 0d);
assertTrue(precision <= 1d);
Terms terms = MultiTerms.getTerms(leafReader, categoryFieldName);
TermsEnum iterator = terms.iterator();
BytesRef term;
while ((term = iterator.next()) != null) {
String s = term.utf8ToString();
recall = confusionMatrix.getRecall(s);
assertTrue(recall >= 0d);
assertTrue(recall <= 1d);
precision = confusionMatrix.getPrecision(s);
assertTrue(precision >= 0d);
assertTrue(precision <= 1d);
double f1Measure = confusionMatrix.getF1Measure(s);
assertTrue(f1Measure >= 0d);
assertTrue(f1Measure <= 1d);
}
} finally {
leafReader.close();
}
}
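// Example: wrapping the raw TermsEnum (pre-5.0 API) in a custom filtering enum for full-key range queries.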
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
return new FullKeyDataRangeFilteredTermsEnum(terms.iterator(null));
}
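// Example: building a per-field report: distinct-term count, a docFreq histogram bucketed by powers of two, and the top-N terms by docFreq.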
@SuppressWarnings("unchecked")
private static void getDetailedFieldInfo(SolrQueryRequest req,
String field, SimpleOrderedMap<Object> fieldMap) throws IOException {
SolrParams params = req.getParams();
final int numTerms = params.getInt(NUMTERMS, DEFAULT_COUNT);
TopTermQueue tiq = new TopTermQueue(numTerms + 1); // Something to
// collect the top N
// terms in.
final CharsRefBuilder spare = new CharsRefBuilder();
Terms terms = MultiFields.getTerms(req.getSearcher().getIndexReader(),
field);
if (terms == null) { // field does not exist
return;
}
TermsEnum termsEnum = terms.iterator();
BytesRef text;
int[] buckets = new int[HIST_ARRAY_SIZE];
while ((text = termsEnum.next()) != null) {
++tiq.distinctTerms;
int freq = termsEnum.docFreq(); // This calculation seems odd, but
// it gives the same results as it
// used to.
int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1));
buckets[slot] = buckets[slot] + 1;
if (numTerms > 0 && freq > tiq.minFreq) {
spare.copyUTF8Bytes(text);
String t = spare.toString();
tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum
.docFreq()));
if (tiq.size() > numTerms) { // if tiq full
tiq.pop(); // remove lowest in tiq
tiq.minFreq = tiq.getTopTermInfo().docFreq;
}
}
}
tiq.histogram.add(buckets);
fieldMap.add("distinct", tiq.distinctTerms);
// Include top terms
fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema()));
// Add a histogram
fieldMap.add("histogram", tiq.histogram.toNamedList());
}
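// Example: a delegating collector's finish() that scores every term of a field by information gain over positive/negative document sets and reports the highest-scoring terms.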
@Override
public void finish() throws IOException {
NamedList<Double> analytics = new NamedList<Double>();
@SuppressWarnings({"unchecked", "rawtypes"})
NamedList<Integer> topFreq = new NamedList();
@SuppressWarnings({"unchecked", "rawtypes"})
NamedList<Integer> allFreq = new NamedList();
rb.rsp.add("featuredTerms", analytics);
rb.rsp.add("docFreq", topFreq);
rb.rsp.add("numDocs", count);
TreeSet<TermWithScore> topTerms = new TreeSet<>();
double numDocs = count;
double pc = numPositiveDocs / numDocs;
double entropyC = binaryEntropy(pc);
Terms terms = ((SolrIndexSearcher)searcher).getSlowAtomicReader().terms(field);
TermsEnum termsEnum = terms == null ? TermsEnum.EMPTY : terms.iterator();
BytesRef term;
PostingsEnum postingsEnum = null;
while ((term = termsEnum.next()) != null) {
postingsEnum = termsEnum.postings(postingsEnum);
int xc = 0;
int nc = 0;
while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
if (positiveSet.get(postingsEnum.docID())) {
xc++;
} else if (negativeSet.get(postingsEnum.docID())) {
nc++;
}
}
int docFreq = xc+nc;
double entropyContainsTerm = binaryEntropy( (double) xc / docFreq );
double entropyNotContainsTerm = binaryEntropy( (double) (numPositiveDocs - xc) / (numDocs - docFreq + 1) );
double score = entropyC - ( (docFreq / numDocs) * entropyContainsTerm + (1.0 - docFreq / numDocs) * entropyNotContainsTerm);
topFreq.add(term.utf8ToString(), docFreq);
if (topTerms.size() < numTerms) {
topTerms.add(new TermWithScore(term.utf8ToString(), score));
} else {
if (topTerms.first().score < score) {
topTerms.pollFirst();
topTerms.add(new TermWithScore(term.utf8ToString(), score));
}
}
}
for (TermWithScore topTerm : topTerms) {
analytics.add(topTerm.term, topTerm.score);
topFreq.add(topTerm.term, allFreq.get(topTerm.term));
}
if (this.delegate instanceof DelegatingCollector) {
((DelegatingCollector) this.delegate).finish();
}
}
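// Example: wrapping the TermsEnum in a prefix-filtering enum.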
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
return new SimplePrefixTermsEnum(terms.iterator(), prefix);
}
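// Example: the same confusion-matrix checks as above, run this time against a KNearestFuzzyClassifier.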
@Test
public void testPerformance() throws Exception {
MockAnalyzer analyzer = new MockAnalyzer(random());
int numDocs = atLeast(10);
LeafReader leafReader = getRandomIndex(analyzer, numDocs);
try {
Classifier<BytesRef> classifier = new KNearestFuzzyClassifier(leafReader, null, analyzer, null, 3, categoryFieldName, textFieldName);
ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(leafReader,
classifier, categoryFieldName, textFieldName, -1);
assertNotNull(confusionMatrix);
double avgClassificationTime = confusionMatrix.getAvgClassificationTime();
assertTrue(avgClassificationTime >= 0);
double accuracy = confusionMatrix.getAccuracy();
assertTrue(accuracy >= 0d);
assertTrue(accuracy <= 1d);
double recall = confusionMatrix.getRecall();
assertTrue(recall >= 0d);
assertTrue(recall <= 1d);
double precision = confusionMatrix.getPrecision();
assertTrue(precision >= 0d);
assertTrue(precision <= 1d);
Terms terms = MultiTerms.getTerms(leafReader, categoryFieldName);
TermsEnum iterator = terms.iterator();
BytesRef term;
while ((term = iterator.next()) != null) {
String s = term.utf8ToString();
recall = confusionMatrix.getRecall(s);
assertTrue(recall >= 0d);
assertTrue(recall <= 1d);
precision = confusionMatrix.getPrecision(s);
assertTrue(precision >= 0d);
assertTrue(precision <= 1d);
double f1Measure = confusionMatrix.getF1Measure(s);
assertTrue(f1Measure >= 0d);
assertTrue(f1Measure <= 1d);
}
} finally {
leafReader.close();
}
}
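// Example: a standalone utility that first counts a field's terms, then gathers each term's tf or df into an array and writes the frequencies to a file in chunks.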
private static void analyseTermFreq(String indexName, String field, String freqType, String outputFileName) {
String indexFile = elasticsearchLoc + "/data/stackoverflow/nodes/0/indices/"
+ indexName + "/0/index";
DecimalFormat df = new DecimalFormat("#.00");
int printEvery = 100000;
File outputFile = new File(outputFileName);
if (outputFile.exists()) {
if (!outputFile.delete()) {
System.out.println("ERROR: cannot delete the output file.");
System.exit(0);
}
}
/* adapted from
https://stackoverflow.com/questions/28244961/lucene-4-10-2-calculate-tf-idf-for-all-terms-in-index
*/
int count = 0;
try {
IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexFile)));
Fields fields = MultiFields.getFields(reader);
Terms terms = fields.terms(field);
TermsEnum termsEnum = terms.iterator();
int size = 0;
// TODO: is there a better solution?
// iterate to get the size
while (termsEnum.next() != null) {
size++;
}
// String[] termArr = new String[size];
long[] freqArr = new long[size];
// do the real work
termsEnum = terms.iterator();
while (termsEnum.next() != null) {
// String term = termsEnum.term().utf8ToString();
long tfreq = 0;
if (freqType.equals("tf"))
tfreq = termsEnum.totalTermFreq();
else if (freqType.equals("df"))
tfreq = termsEnum.docFreq();
else {
System.out.println("Wrong frequency. Quit!");
System.exit(0);
}
// termArr[count] = term;
freqArr[count] = tfreq;
if (count % printEvery == 0) {
System.out.println("processed: " + count + " terms "
+ " [" + df.format(((long)count * 100)/size) + "%]");
}
count++;
}
System.out.println(field + ": total = " + count);
double[] data = new double[size];
String output = "freq\n";
for (int i = 0; i < freqArr.length; i++) {
data[i] = freqArr[i];
output += freqArr[i] + "\n";
if (i > 0 && i % printEvery == 0) {
MyUtils.writeToFile("./", outputFileName, output, true);
System.out.println("written: " + i + " terms "
+ " [" + df.format(((long)i * 100)/size) + "%]");
output = "";
}
}
// write the rest to the file
MyUtils.writeToFile("./",outputFileName, output, true);
} catch (IOException e) {
e.printStackTrace();
}
}