org.apache.lucene.index.Terms#iterator() Source Code Examples

Listed below are example usages of org.apache.lucene.index.Terms#iterator(), collected from open-source projects on GitHub.

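Before the project-specific examples, here is a minimal, self-contained sketch of the pattern they all share: obtain a Terms instance for a field, call iterator(), and walk the resulting TermsEnum. This sketch is not taken from any of the projects below; the index path and field name are placeholders, and it assumes a recent Lucene API in which iterator() takes no argument.

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class TermsIteratorSketch {
  public static void main(String[] args) throws Exception {
    // Placeholder path; point this at a real index directory.
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
      for (LeafReaderContext leaf : reader.leaves()) {
        Terms terms = leaf.reader().terms("contents"); // null if the field has no indexed terms
        if (terms == null) {
          continue;
        }
        TermsEnum termsEnum = terms.iterator();
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
          System.out.println(term.utf8ToString() + " docFreq=" + termsEnum.docFreq());
        }
      }
    }
  }
}
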
Example 1 - Project: linden, File: LindenFieldCacheImpl.java
@Override
protected Accountable createValue(final AtomicReader reader, CacheKey key, boolean setDocsWithField)
    throws IOException {
  final Map<String, Integer> uidMap = new HashMap<>();

  Uninvert u = new Uninvert() {
    private String currentValue;

    @Override
    public void visitTerm(BytesRef term) {
      currentValue = term.utf8ToString();
    }

    @Override
    public void visitDoc(int docID) {
      uidMap.put(currentValue, docID);
    }

    @Override
    protected TermsEnum termsEnum(Terms terms) throws IOException {
      return terms.iterator(null);
    }
  };
  u.uninvert(reader, key.field, setDocsWithField);
  return new PerReaderUIDMaps(reader.getContext().ord, uidMap);
}
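Note: this example calls terms.iterator(null), the older Lucene 4.x signature that accepted a TermsEnum to reuse. From Lucene 5.0 onward the method takes no argument, which is the form most of the remaining examples use.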
 
Example 2 - Project: lucene-solr, File: SolrRangeQuery.java
public RangeTermsEnum(Terms terms) throws IOException {
  if (terms == null) {
    positioned = true;
  } else {
    te = terms.iterator();
    if (lower != null) {
      TermsEnum.SeekStatus status = te.seekCeil(lower);
      if (status == TermsEnum.SeekStatus.END) {
        positioned = true;
        curr = null;
      } else if (status == SeekStatus.FOUND) {
        positioned = includeLower();
        curr = te.term();
      } else {
        // lower bound not found, so includeLower is irrelevant
        positioned = true;
        curr = te.term();
      }
    }
  }
}
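The constructor above distinguishes the three TermsEnum.SeekStatus values. A condensed sketch of their meaning (not part of SolrRangeQuery; the lower-bound term is a placeholder and the usual Lucene imports are assumed):

static void seekCeilDemo(Terms terms) throws IOException {
  TermsEnum te = terms.iterator();
  TermsEnum.SeekStatus status = te.seekCeil(new BytesRef("lowerBound"));
  if (status == TermsEnum.SeekStatus.END) {
    // no term in the field is >= "lowerBound"; the enum is exhausted
  } else if (status == TermsEnum.SeekStatus.FOUND) {
    // te.term() is exactly "lowerBound"
  } else { // SeekStatus.NOT_FOUND
    // te.term() is the smallest term greater than "lowerBound"
  }
}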
 
Example 3 - Project: lucene-solr, File: BM25NBClassifier.java
/**
 * Calculate probabilities for all classes for a given input text
 *
 * @param inputDocument the input text as a {@code String}
 * @return a {@code List} of {@code ClassificationResult}, one for each existing class
 * @throws IOException if assigning probabilities fails
 */
private List<ClassificationResult<BytesRef>> assignClassNormalizedList(String inputDocument) throws IOException {
  List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>();

  Terms classes = MultiTerms.getTerms(indexReader, classFieldName);
  TermsEnum classesEnum = classes.iterator();
  BytesRef next;
  String[] tokenizedText = tokenize(inputDocument);
  while ((next = classesEnum.next()) != null) {
    if (next.length > 0) {
      Term term = new Term(this.classFieldName, next);
      assignedClasses.add(new ClassificationResult<>(term.bytes(), calculateLogPrior(term) + calculateLogLikelihood(tokenizedText, term)));
    }
  }

  return normClassificationResults(assignedClasses);
}
 
Example 4 - Project: lucene-solr, File: BlockTermsWriter.java
@Override
public void write(Fields fields, NormsProducer norms) throws IOException {

  for(String field : fields) {

    Terms terms = fields.terms(field);
    if (terms == null) {
      continue;
    }

    TermsEnum termsEnum = terms.iterator();

    TermsWriter termsWriter = addField(fieldInfos.fieldInfo(field));

    while (true) {
      BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }

      termsWriter.write(term, termsEnum, norms);
    }

    termsWriter.finish();
  }
}
 
Example 5 - Project: lucene-solr, File: DisjunctionMatchesIterator.java
/**
 * Create a {@link DisjunctionMatchesIterator} over a list of terms extracted from a {@link BytesRefIterator}
 *
 * Only terms that have at least one match in the given document will be included
 */
static MatchesIterator fromTermsEnum(LeafReaderContext context, int doc, Query query, String field, BytesRefIterator terms) throws IOException {
  Objects.requireNonNull(field);
  Terms t = context.reader().terms(field);
  if (t == null)
    return null;
  TermsEnum te = t.iterator();
  PostingsEnum reuse = null;
  for (BytesRef term = terms.next(); term != null; term = terms.next()) {
    if (te.seekExact(term)) {
      PostingsEnum pe = te.postings(reuse, PostingsEnum.OFFSETS);
      if (pe.advance(doc) == doc) {
        return new TermsEnumDisjunctionMatchesIterator(new TermMatchesIterator(query, pe), terms, te, doc, query);
      }
      else {
        reuse = pe;
      }
    }
  }
  return null;
}
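The seekExact/postings/advance combination above is the standard way to test whether a term occurs in one specific document. A condensed sketch of just that check (a hedged illustration with placeholder parameters, not project code; the usual Lucene imports are assumed):

static boolean occursInDoc(LeafReaderContext context, String field, BytesRef term, int docId) throws IOException {
  Terms t = context.reader().terms(field);
  if (t == null) {
    return false;
  }
  TermsEnum te = t.iterator();
  if (te.seekExact(term) == false) {
    return false;
  }
  PostingsEnum pe = te.postings(null, PostingsEnum.NONE); // no positions/offsets needed for a doc check
  return pe.advance(docId) == docId;
}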
 
Example 6 - Project: lucene-solr, File: BlendedInfixSuggester.java
/**
 * Create the coefficient to transform the weight.
 *
 * @param doc id of the document
 * @param matchedTokens tokens found in the query
 * @param prefixToken unfinished token in the query
 * @return the coefficient
 * @throws IOException If there are problems reading term vectors from the underlying Lucene index.
 */
private double createCoefficient(IndexSearcher searcher, int doc, Set<String> matchedTokens, String prefixToken) throws IOException {

  Terms tv = searcher.getIndexReader().getTermVector(doc, TEXT_FIELD_NAME);
  TermsEnum it = tv.iterator();

  Integer position = Integer.MAX_VALUE;
  BytesRef term;
  // find the closest token position
  while ((term = it.next()) != null) {

    String docTerm = term.utf8ToString();

    if (matchedTokens.contains(docTerm) || (prefixToken != null && docTerm.startsWith(prefixToken))) {
 
      PostingsEnum docPosEnum = it.postings(null, PostingsEnum.OFFSETS);
      docPosEnum.nextDoc();

      // use the first occurrence of the term
      int p = docPosEnum.nextPosition();
      if (p < position) {
        position = p;
      }
    }
  }

  // create corresponding coefficient based on position
  return calculateCoefficient(position);
}
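Note that IndexReader.getTermVector(doc, field) returns null when no term vector is stored for that document and field; the example above assumes term vectors are always indexed for TEXT_FIELD_NAME, so tv.iterator() is called without a null check.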
 
Example 7 - Project: jate, File: AbstractFeatureBuilder.java
/**
 * Retrieve term candidates from solr field
 *      see {@code uk.ac.shef.dcs.jate.JATEProperties.PROPERTY_SOLR_FIELD_CONTENT_TERMS}
 *
 * The method assumes that the term candidates are extracted at index-time and stored in pre-configured field
 *
 * @return Set, a set of term candidate surface form
 * @throws JATEException
 * @throws IOException
 */
protected Set<String> getUniqueTerms() throws JATEException, IOException {
    Terms terms = SolrUtil.getTermVector(properties.getSolrFieldNameJATECTerms(), solrIndexSearcher);

    TermsEnum termsEnum = terms.iterator();
    Set<String> allTermCandidates = new HashSet<>();

    while (termsEnum.next() != null) {
        BytesRef t = termsEnum.term();
        if (t.length == 0)
            continue;
        allTermCandidates.add(t.utf8ToString());
    }
    return allTermCandidates;
}
 
Example 8 - Project: Elasticsearch, File: TermVectorsResponse.java
private void buildField(XContentBuilder builder, final CharsRefBuilder spare, Fields theFields, Iterator<String> fieldIter) throws IOException {
    String fieldName = fieldIter.next();
    builder.startObject(fieldName);
    Terms curTerms = theFields.terms(fieldName);
    // write field statistics
    buildFieldStatistics(builder, curTerms);
    builder.startObject(FieldStrings.TERMS);
    TermsEnum termIter = curTerms.iterator();
    BoostAttribute boostAtt = termIter.attributes().addAttribute(BoostAttribute.class);
    for (int i = 0; i < curTerms.size(); i++) {
        buildTerm(builder, spare, curTerms, termIter, boostAtt);
    }
    builder.endObject();
    builder.endObject();
}
 
Example 9 - Project: lucene-solr, File: TestTeeSinkTokenFilter.java
public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer));
  Document doc = new Document();
  TokenStream tokenStream = analyzer.tokenStream("field", "abcd   ");
  TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenStream);
  TokenStream sink = tee.newSinkTokenStream();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorOffsets(true);
  ft.setStoreTermVectorPositions(true);
  Field f1 = new Field("field", tee, ft);
  Field f2 = new Field("field", sink, ft);
  doc.add(f1);
  doc.add(f2);
  w.addDocument(doc);
  w.close();

  IndexReader r = DirectoryReader.open(dir);
  Terms vector = r.getTermVectors(0).terms("field");
  assertEquals(1, vector.size());
  TermsEnum termsEnum = vector.iterator();
  termsEnum.next();
  assertEquals(2, termsEnum.totalTermFreq());
  PostingsEnum positions = termsEnum.postings(null, PostingsEnum.ALL);
  assertTrue(positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(2, positions.freq());
  positions.nextPosition();
  assertEquals(0, positions.startOffset());
  assertEquals(4, positions.endOffset());
  positions.nextPosition();
  assertEquals(8, positions.startOffset());
  assertEquals(12, positions.endOffset());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.nextDoc());
  r.close();
  dir.close();
  analyzer.close();
}
 
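Example 10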
@Override
public Boolean execute(IndexContext context) throws IOException, InterruptedException {
  try {
    IndexReader indexReader = context.getIndexReader();
    while (true) {
      long hash = 0;
      for (AtomicReaderContext atomicReaderContext : indexReader.leaves()) {
        AtomicReader reader = atomicReaderContext.reader();
        for (String field : reader.fields()) {
          Terms terms = reader.terms(field);
          BytesRef bytesRef;
          TermsEnum iterator = terms.iterator(null);
          while ((bytesRef = iterator.next()) != null) {
            hash += bytesRef.hashCode();
          }
        }
      }
      System.out.println("hashcode = " + hash);
    }
  } catch (IOException e) {
    e.printStackTrace();
    throw e;
  } catch (Throwable t) {
    t.printStackTrace();
    if (t instanceof InterruptedException) {
      throw t;
    } else if (t instanceof RuntimeException) {
      throw (RuntimeException) t;
    }
    throw new RuntimeException(t);
  }
}
 
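Example 11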
private int getTermWithSeekCount(Fields fields, String field) throws IOException {
  Terms terms = fields.terms(field);
  TermsEnum termsEnum = terms.iterator(null);
  SeekStatus seekStatus = termsEnum.seekCeil(new BytesRef(""));
  if (seekStatus == SeekStatus.END) {
    return 0;
  }
  System.out.println(termsEnum.term().utf8ToString());
  int count = 1;
  while (termsEnum.next() != null) {
    count++;
  }
  return count;
}
 
Example 12 - Project: lucene-solr, File: FeatureSortField.java
@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
  Terms terms = context.reader().terms(field);
  if (terms == null) {
    currentReaderPostingsValues = null;
  } else {
    TermsEnum termsEnum = terms.iterator();
    if (termsEnum.seekExact(featureName) == false) {
      currentReaderPostingsValues = null;
    } else {
      currentReaderPostingsValues = termsEnum.postings(currentReaderPostingsValues, PostingsEnum.FREQS);
    }
  }
}
 
Example 13 - Project: jate, File: FrequencyCtxSentenceBasedFBWorker.java
private List<MWESentenceContext> collectTermOffsets(Terms termVectorLookup) throws IOException {
    List<MWESentenceContext> result = new ArrayList<>();

    TermsEnum tiRef = termVectorLookup.iterator();
    BytesRef luceneTerm = tiRef.next();
    while (luceneTerm != null) {
        if (luceneTerm.length == 0) {
            luceneTerm = tiRef.next();
            continue;
        }
        String tString = luceneTerm.utf8ToString();
        if (!allCandidates.contains(tString)) {
            luceneTerm = tiRef.next();
            continue;
        }

        PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
        //PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.OFFSETS);

        int doc = postingsEnum.nextDoc(); // this should be just 1 doc, i.e., the constraint for getting this TV
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            int totalOccurrence = postingsEnum.freq();
            for (int i = 0; i < totalOccurrence; i++) {
                postingsEnum.nextPosition();
                int start = postingsEnum.startOffset();
                int end = postingsEnum.endOffset();
                BytesRef payload = postingsEnum.getPayload();
                int sentenceId = -1;
                if (payload != null) {
                    sentenceId = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString())).getSentenceId();
                }
                result.add(new MWESentenceContext(tString, sentenceId, start, end));
            }
        }
        luceneTerm = tiRef.next();
    }
    Collections.sort(result);
    return result;
}
 
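Example 14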
@Test
public void testPerformance() throws Exception {
  MockAnalyzer analyzer = new MockAnalyzer(random());
  int numDocs = atLeast(10);
  LeafReader leafReader = getRandomIndex(analyzer,  numDocs);
  try {
    CachingNaiveBayesClassifier simpleNaiveBayesClassifier = new CachingNaiveBayesClassifier(leafReader,
        analyzer, null, categoryFieldName, textFieldName);

    ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(leafReader,
        simpleNaiveBayesClassifier, categoryFieldName, textFieldName, -1);
    assertNotNull(confusionMatrix);

    double avgClassificationTime = confusionMatrix.getAvgClassificationTime();
    assertTrue(avgClassificationTime >= 0);
    double accuracy = confusionMatrix.getAccuracy();
    assertTrue(accuracy >= 0d);
    assertTrue(accuracy <= 1d);

    double recall = confusionMatrix.getRecall();
    assertTrue(recall >= 0d);
    assertTrue(recall <= 1d);

    double precision = confusionMatrix.getPrecision();
    assertTrue(precision >= 0d);
    assertTrue(precision <= 1d);

    Terms terms = MultiTerms.getTerms(leafReader, categoryFieldName);
    TermsEnum iterator = terms.iterator();
    BytesRef term;
    while ((term = iterator.next()) != null) {
      String s = term.utf8ToString();
      recall = confusionMatrix.getRecall(s);
      assertTrue(recall >= 0d);
      assertTrue(recall <= 1d);
      precision = confusionMatrix.getPrecision(s);
      assertTrue(precision >= 0d);
      assertTrue(precision <= 1d);
      double f1Measure = confusionMatrix.getF1Measure(s);
      assertTrue(f1Measure >= 0d);
      assertTrue(f1Measure <= 1d);
    }
  } finally {
    leafReader.close();
  }

}
 
Example 15 - Project: stratio-cassandra, File: ClusteringKeyQuery.java
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
    return new FullKeyDataRangeFilteredTermsEnum(terms.iterator(null));
}
 
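Example 16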
@SuppressWarnings("unchecked")
private static void getDetailedFieldInfo(SolrQueryRequest req,
		String field, SimpleOrderedMap<Object> fieldMap) throws IOException {

	SolrParams params = req.getParams();
	final int numTerms = params.getInt(NUMTERMS, DEFAULT_COUNT);

	TopTermQueue tiq = new TopTermQueue(numTerms + 1); // Something to
														// collect the top N
														// terms in.

	final CharsRefBuilder spare = new CharsRefBuilder();

	Terms terms = MultiFields.getTerms(req.getSearcher().getIndexReader(),
			field);
	if (terms == null) { // field does not exist
		return;
	}
	TermsEnum termsEnum = terms.iterator();
	BytesRef text;
	int[] buckets = new int[HIST_ARRAY_SIZE];
	while ((text = termsEnum.next()) != null) {
		++tiq.distinctTerms;
		int freq = termsEnum.docFreq(); // This calculation seems odd, but
										// it gives the same results as it
										// used to.
		int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1));
		buckets[slot] = buckets[slot] + 1;
		if (numTerms > 0 && freq > tiq.minFreq) {
			spare.copyUTF8Bytes(text);
			String t = spare.toString();

			tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum
					.docFreq()));
			if (tiq.size() > numTerms) { // if tiq full
				tiq.pop(); // remove lowest in tiq
				tiq.minFreq = tiq.getTopTermInfo().docFreq;
			}
		}
	}
	tiq.histogram.add(buckets);
	fieldMap.add("distinct", tiq.distinctTerms);

	// Include top terms
	fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema()));

	// Add a histogram
	fieldMap.add("histogram", tiq.histogram.toNamedList());
}
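The bucket computation 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1)) is effectively ceil(log2(freq)): terms with docFreq of at most 1 fall into bucket 0, and terms with docFreq in the range (2^(k-1), 2^k] fall into bucket k, producing the power-of-two histogram that is added to fieldMap.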
 
Example 17 - Project: lucene-solr, File: IGainTermsQParserPlugin.java
@Override
public void finish() throws IOException {
  NamedList<Double> analytics = new NamedList<Double>();
  @SuppressWarnings({"unchecked", "rawtypes"})
  NamedList<Integer> topFreq = new NamedList();

  @SuppressWarnings({"unchecked", "rawtypes"})
  NamedList<Integer> allFreq = new NamedList();

  rb.rsp.add("featuredTerms", analytics);
  rb.rsp.add("docFreq", topFreq);
  rb.rsp.add("numDocs", count);

  TreeSet<TermWithScore> topTerms = new TreeSet<>();

  double numDocs = count;
  double pc = numPositiveDocs / numDocs;
  double entropyC = binaryEntropy(pc);

  Terms terms = ((SolrIndexSearcher)searcher).getSlowAtomicReader().terms(field);
  TermsEnum termsEnum = terms == null ? TermsEnum.EMPTY : terms.iterator();
  BytesRef term;
  PostingsEnum postingsEnum = null;
  while ((term = termsEnum.next()) != null) {
    postingsEnum = termsEnum.postings(postingsEnum);
    int xc = 0;
    int nc = 0;
    while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      if (positiveSet.get(postingsEnum.docID())) {
        xc++;
      } else if (negativeSet.get(postingsEnum.docID())) {
        nc++;
      }
    }

    int docFreq = xc+nc;

    double entropyContainsTerm = binaryEntropy( (double) xc / docFreq );
    double entropyNotContainsTerm = binaryEntropy( (double) (numPositiveDocs - xc) / (numDocs - docFreq + 1) );
    double score = entropyC - ( (docFreq / numDocs) * entropyContainsTerm + (1.0 - docFreq / numDocs) * entropyNotContainsTerm);

    topFreq.add(term.utf8ToString(), docFreq);
    if (topTerms.size() < numTerms) {
      topTerms.add(new TermWithScore(term.utf8ToString(), score));
    } else  {
      if (topTerms.first().score < score) {
        topTerms.pollFirst();
        topTerms.add(new TermWithScore(term.utf8ToString(), score));
      }
    }
  }

  for (TermWithScore topTerm : topTerms) {
    analytics.add(topTerm.term, topTerm.score);
    topFreq.add(topTerm.term, allFreq.get(topTerm.term));
  }

  if (this.delegate instanceof DelegatingCollector) {
    ((DelegatingCollector) this.delegate).finish();
  }
}
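The score above is the standard information gain: H(C) minus the entropies of the class distribution given presence and absence of the term, weighted by document frequency. The binaryEntropy helper is not shown in this snippet; a plausible sketch of the usual definition (an assumption, not necessarily the project's exact code):

private double binaryEntropy(double prob) {
  if (prob == 0 || prob == 1) {
    return 0;
  }
  // H(p) = -p*log2(p) - (1-p)*log2(1-p)
  return (-prob * Math.log(prob) - (1 - prob) * Math.log(1 - prob)) / Math.log(2);
}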
 
Example 18 - Project: lucene-solr, File: TestPrefixRandom.java
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
  return new SimplePrefixTermsEnum(terms.iterator(), prefix);
}
 
Example 19 - Project: lucene-solr, File: KNearestFuzzyClassifierTest.java
@Test
public void testPerformance() throws Exception {
  MockAnalyzer analyzer = new MockAnalyzer(random());
  int numDocs = atLeast(10);
  LeafReader leafReader = getRandomIndex(analyzer, numDocs);
  try {
    Classifier<BytesRef> classifier = new KNearestFuzzyClassifier(leafReader, null, analyzer, null, 3, categoryFieldName, textFieldName);

    ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(leafReader,
        classifier, categoryFieldName, textFieldName, -1);
    assertNotNull(confusionMatrix);

    double avgClassificationTime = confusionMatrix.getAvgClassificationTime();
    assertTrue(avgClassificationTime >= 0);

    double accuracy = confusionMatrix.getAccuracy();
    assertTrue(accuracy >= 0d);
    assertTrue(accuracy <= 1d);

    double recall = confusionMatrix.getRecall();
    assertTrue(recall >= 0d);
    assertTrue(recall <= 1d);

    double precision = confusionMatrix.getPrecision();
    assertTrue(precision >= 0d);
    assertTrue(precision <= 1d);

    Terms terms = MultiTerms.getTerms(leafReader, categoryFieldName);
    TermsEnum iterator = terms.iterator();
    BytesRef term;
    while ((term = iterator.next()) != null) {
      String s = term.utf8ToString();
      recall = confusionMatrix.getRecall(s);
      assertTrue(recall >= 0d);
      assertTrue(recall <= 1d);
      precision = confusionMatrix.getPrecision(s);
      assertTrue(precision >= 0d);
      assertTrue(precision <= 1d);
      double f1Measure = confusionMatrix.getF1Measure(s);
      assertTrue(f1Measure >= 0d);
      assertTrue(f1Measure <= 1d);
    }
  } finally {
    leafReader.close();
  }
}
 
Example 20 - Project: Siamese, File: TermFreqAnalyser.java
private static void analyseTermFreq(String indexName, String field, String freqType, String outputFileName) {
        String indexFile = elasticsearchLoc + "/data/stackoverflow/nodes/0/indices/"
                + indexName + "/0/index";
        DecimalFormat df = new DecimalFormat("#.00");
        int printEvery = 100000;
        File outputFile = new File(outputFileName);
        if (outputFile.exists()) {
            if (!outputFile.delete()) {
                System.out.println("ERROR: cannot delete the output file.");
                System.exit(0);
            }
        }
        /* adapted from
        https://stackoverflow.com/questions/28244961/lucene-4-10-2-calculate-tf-idf-for-all-terms-in-index
         */
        int count = 0;
        try {
            IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexFile)));
            Fields fields = MultiFields.getFields(reader);
            Terms terms = fields.terms(field);
            TermsEnum termsEnum = terms.iterator();
            int size = 0;
            // TODO: is there a better solution?
            // iterate to get the size
            while (termsEnum.next() != null) {
                size++;
            }
//            String[] termArr = new String[size];
            long[] freqArr = new long[size];
            // do the real work
            termsEnum = terms.iterator();
            while (termsEnum.next() != null) {
//                String term = termsEnum.term().utf8ToString();
                long tfreq = 0;
                if (freqType.equals("tf"))
                    tfreq = termsEnum.totalTermFreq();
                else if (freqType.equals("df"))
                    tfreq = termsEnum.docFreq();
                else {
                    System.out.println("Wrong frequency. Quit!");
                    System.exit(0);
                }
//                termArr[count] = term;
                freqArr[count] = tfreq;
                if (count % printEvery == 0) {
                    System.out.println("processed: " + count + " terms "
                            + " [" + df.format(((long)count * 100)/size) + "%]");
                }
                count++;
            }
            System.out.println(field + ": total = " + count);
            double[] data = new double[size];
            String output = "freq\n";
            for (int i = 0; i < freqArr.length; i++) {
                data[i] = freqArr[i];
                output += freqArr[i] + "\n";
                if (i > 0 && i % printEvery == 0) {
                    MyUtils.writeToFile("./", outputFileName, output, true);
                    System.out.println("written: " + i + " terms "
                            + " [" + df.format(((long)i * 100)/size) + "%]");
                    output = "";
                }
            }
            // write the rest to the file
            MyUtils.writeToFile("./",outputFileName, output, true);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
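Regarding the TODO above: Terms.size() reports the number of unique terms directly, but it is allowed to return -1 when the codec does not track that statistic, so the counting loop remains the safe fallback. A small hedged sketch of that alternative:

static long countTerms(Terms terms) throws IOException {
  long size = terms.size(); // number of unique terms, or -1 if unknown
  if (size >= 0) {
    return size;
  }
  // fall back to counting by iteration, as the method above does
  TermsEnum te = terms.iterator();
  long count = 0;
  while (te.next() != null) {
    count++;
  }
  return count;
}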