org.apache.lucene.index.IndexableField#tokenStream() source code examples (Demo)

The following examples show how org.apache.lucene.index.IndexableField#tokenStream() is used in open-source projects.
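Before the project examples, here is a minimal, self-contained sketch of the call itself. It is not taken from any of the projects below; the class name TokenStreamDemo, the field name, and the choice of StandardAnalyzer are illustrative assumptions.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexableField;

public class TokenStreamDemo {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new StandardAnalyzer();
    // Any IndexableField works; a tokenized TextField is the simplest case.
    IndexableField field = new TextField("body", "Hello token stream world", Field.Store.NO);

    // Passing null as the "reuse" argument asks the field to create a fresh TokenStream.
    try (TokenStream stream = field.tokenStream(analyzer, null)) {
      CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
      stream.reset();                     // mandatory before the first incrementToken()
      while (stream.incrementToken()) {
        System.out.println(termAtt.toString());
      }
      stream.end();                       // consume end-of-stream state
    }
    analyzer.close();
  }
}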

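Source 1  Project: elasticsearch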
@Override
public void prepare(PercolateContext context, ParsedDocument parsedDocument) {
    MemoryIndex memoryIndex = cache.get();
    for (IndexableField field : parsedDocument.rootDoc().getFields()) {
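        // the _uid field is added without index options here; skip it, it has nothing to analyze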
        if (field.fieldType().indexOptions() == IndexOptions.NONE && field.name().equals(UidFieldMapper.NAME)) {
            continue;
        }
        try {
            Analyzer analyzer = context.mapperService().documentMapper(parsedDocument.type()).mappers().indexAnalyzer();
            // TODO: instead of passing null here, we can have a CTL<Map<String,TokenStream>> and pass previous,
            // like the indexer does
            try (TokenStream tokenStream = field.tokenStream(analyzer, null)) {
                if (tokenStream != null) {
                    memoryIndex.addField(field.name(), tokenStream, field.boost());
                }
            }
        } catch (Exception e) {
            throw new ElasticsearchException("Failed to create token stream for [" + field.name() + "]", e);
        }
    }
    context.initialize(new DocEngineSearcher(memoryIndex), parsedDocument);
}
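Note that field.tokenStream(analyzer, null) returns null for fields that are not indexed (indexOptions() == IndexOptions.NONE) in the Lucene versions this code targets, which is why the stream is checked before being added to the MemoryIndex.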
 
Source 2  Project: lucene-solr   File: ReadTokensTask.java
@Override
public int doLogic() throws Exception {
  List<IndexableField> fields = doc.getFields();
  Analyzer analyzer = getRunData().getAnalyzer();
  int tokenCount = 0;
  for(final IndexableField field : fields) {
    if (field.fieldType().indexOptions() == IndexOptions.NONE ||
        field.fieldType().tokenized() == false) {
      continue;
    }
    
    final TokenStream stream = field.tokenStream(analyzer, null);
    // reset the TokenStream to the first token
    stream.reset();

    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    while(stream.incrementToken()) {
      termAtt.getBytesRef();
      tokenCount++;
    }
    stream.end();
    stream.close();
  }
  totalTokenCount += tokenCount;
  return tokenCount;
}
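This task shows the full TokenStream consumption contract: reset() must be called before the first incrementToken(), and end() followed by close() once the stream is exhausted; many Lucene tokenizers enforce this workflow and fail if a step is skipped.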
 
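Source 3  Project: lucene-solr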
/**
 * This method performs the analysis of the seed document and extracts the boosts if present.
 * This is done only once for the seed document.
 *
 * @param inputDocument         the unseen seed document
 * @param fieldName2tokensArray a map that associates each field name with the list of token arrays for all of its values
 * @param fieldName2boost       a map that associates each field name with its boost
 * @throws IOException If there is a low-level I/O error
 */
private void analyzeSeedDocument(Document inputDocument, Map<String, List<String[]>> fieldName2tokensArray, Map<String, Float> fieldName2boost) throws IOException {
  for (int i = 0; i < textFieldNames.length; i++) {
    String fieldName = textFieldNames[i];
    float boost = 1;
    List<String[]> tokenizedValues = new LinkedList<>();
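    // a field name may carry an optional boost using the "field^boost" syntax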
    if (fieldName.contains("^")) {
      String[] field2boost = fieldName.split("\\^");
      fieldName = field2boost[0];
      boost = Float.parseFloat(field2boost[1]);
    }
    IndexableField[] fieldValues = inputDocument.getFields(fieldName);
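    // analyze every value of the field with the analyzer configured for it and keep the resulting terms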
    for (IndexableField fieldValue : fieldValues) {
      TokenStream fieldTokens = fieldValue.tokenStream(field2analyzer.get(fieldName), null);
      String[] fieldTokensArray = getTokenArray(fieldTokens);
      tokenizedValues.add(fieldTokensArray);
    }
    fieldName2tokensArray.put(fieldName, tokenizedValues);
    fieldName2boost.put(fieldName, boost);
    textFieldNames[i] = fieldName;
  }
}
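The method above relies on a getTokenArray(TokenStream) helper that is not part of the snippet. A plausible sketch, assuming the helper simply drains the stream into an array of terms (this is a reconstruction, not the project's actual code):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// hypothetical helper: collect every term produced by the stream into a String[]
private String[] getTokenArray(TokenStream tokenStream) throws IOException {
  List<String> tokens = new ArrayList<>();
  CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
  tokenStream.reset();
  while (tokenStream.incrementToken()) {
    tokens.add(termAtt.toString());
  }
  tokenStream.end();
  tokenStream.close();
  return tokens.toArray(new String[0]);
}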