The following are example code snippets for org.apache.lucene.analysis.StopFilter, drawn from Lucene test suites and Elasticsearch analysis plugins.
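
Most of the snippets share one pattern: build a stop set with StopFilter.makeStopSet, wrap a TokenStream with StopFilter (or SuggestStopFilter), and consume the filtered stream. Before the examples, a minimal self-contained sketch of that pattern; it assumes a recent Lucene where StopFilter takes a CharArraySet directly, and uses WhitespaceTokenizer and sample text as illustrative stand-ins for the MockTokenizer used in the tests below.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class StopFilterDemo {
  public static void main(String[] args) throws IOException {
    // Build the stop set and wrap a tokenizer with StopFilter.
    CharArraySet stopSet = StopFilter.makeStopSet("of");
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("wizard of oz"));
    TokenStream stream = new StopFilter(tokenizer, stopSet);

    // Standard TokenStream consumption loop: reset, incrementToken, end, close.
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(term); // prints "wizard" then "oz"
    }
    stream.end();
    stream.close();
  }
}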

public void testEndingHole() throws Exception {
  // Just deletes "of"
  Analyzer a = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String field) {
        Tokenizer tokenizer = new MockTokenizer();
        CharArraySet stopSet = StopFilter.makeStopSet("of");
        return new TokenStreamComponents(tokenizer, new StopFilter(tokenizer, stopSet));
      }
    };
  Iterable<Input> keys = AnalyzingSuggesterTest.shuffle(
      new Input("wizard of oz", 50));
  FreeTextSuggester sug = new FreeTextSuggester(a, a, 3, (byte) 0x20);
  sug.build(new InputArrayIterator(keys));
  assertEquals("wizard _ oz/1.00",
               toString(sug.lookup("wizard of", 10)));
  // Falls back to unigram model, with backoff 0.4 times
  // prob 0.5 (so 0.4 * 0.5 = 0.20):
  assertEquals("oz/0.20",
               toString(sug.lookup("wizard o", 10)));
  a.close();
}

public void testTwoEndingHoles() throws Exception {
  // Just deletes "of"
  Analyzer a = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String field) {
        Tokenizer tokenizer = new MockTokenizer();
        CharArraySet stopSet = StopFilter.makeStopSet("of");
        return new TokenStreamComponents(tokenizer, new StopFilter(tokenizer, stopSet));
      }
    };
  Iterable<Input> keys = AnalyzingSuggesterTest.shuffle(
      new Input("wizard of of oz", 50));
  FreeTextSuggester sug = new FreeTextSuggester(a, a, 3, (byte) 0x20);
  sug.build(new InputArrayIterator(keys));
  // Two trailing holes left by the deleted stopwords: no suggestions come back.
  assertEquals("",
               toString(sug.lookup("wizard of of", 10)));
  a.close();
}

public void testEndNotStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  // The trailing "to" is kept (the user may still be typing a longer word)
  // and is marked via KeywordAttribute, hence {false, true} below.
  assertTokenStreamContents(filter,
                            new String[] {"go", "to"},   // terms
                            new int[] {0, 3},            // start offsets
                            new int[] {2, 5},            // end offsets
                            null,                        // types
                            new int[] {1, 1},            // position increments
                            null,                        // position lengths
                            5,                           // final offset
                            new boolean[] {false, true}, // keyword attributes
                            true);
}

public void testEndIsStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to "));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  // The trailing space shows "to" was fully typed, so it is dropped.
  assertTokenStreamContents(filter,
                            new String[] {"go"},
                            new int[] {0},
                            new int[] {2},
                            null,
                            new int[] {1},
                            null,
                            6,
                            new boolean[] {false},
                            true);
}

public void testMidStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to school"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "school"},
                            new int[] {0, 6},
                            new int[] {2, 12},
                            null,
                            new int[] {1, 2},
                            null,
                            12,
                            new boolean[] {false, false},
                            true);
}

public void testMultipleStopWords() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to", "the", "a");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to a the school"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "school"},
                            new int[] {0, 12},
                            new int[] {2, 18},
                            null,
                            new int[] {1, 4},
                            null,
                            18,
                            new boolean[] {false, false},
                            true);
}

public void testMultipleStopWordsEnd() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to", "the", "a");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to a the"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "the"},
                            new int[] {0, 8},
                            new int[] {2, 11},
                            null,
                            new int[] {1, 3},
                            null,
                            11,
                            new boolean[] {false, true},
                            true);
}

public void testMultipleStopWordsEnd2() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to", "the", "a");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to a the "));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go"},
                            new int[] {0},
                            new int[] {2},
                            null,
                            new int[] {1},
                            null,
                            12,
                            new boolean[] {false},
                            true);
}

@Test
public void testWithStopword() throws Exception {
  for (boolean preservePosInc : new boolean[] {true, false}) {
    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    String input = "a mykeyword a keyword"; // LUCENE-8344 add "a"
    tokenStream.setReader(new StringReader(input));
    TokenFilter tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("a"));
    ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, SEP_LABEL, preservePosInc, 10);
    CharsRefBuilder builder = new CharsRefBuilder();
    if (preservePosInc) {
      builder.append(SEP_LABEL);
    }
    builder.append("mykeyword");
    builder.append(SEP_LABEL);
    if (preservePosInc) {
      builder.append(SEP_LABEL);
    }
    builder.append("keyword");
    // if (preservePosInc) { LUCENE-8344 uncomment
    //   builder.append(SEP_LABEL);
    // }
    assertTokenStreamContents(concatStream, new String[] {builder.toCharsRef().toString()});
  }
}

// Legacy (Lucene 3.x) stream-reuse pattern; later versions handle reuse
// inside Analyzer via createComponents.
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
  class SavedStreams {
    StandardTokenizer tokenStream;
    TokenStream filteredTokenStream;
  }
  SavedStreams streams = (SavedStreams) getPreviousTokenStream();
  if (streams == null) {
    streams = new SavedStreams();
    setPreviousTokenStream(streams);
    streams.tokenStream = new StandardTokenizer(LUCENE_VERSION, reader);
    streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
    streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
    streams.filteredTokenStream = new StopFilter(true, streams.filteredTokenStream, STOP_WORDS_SET);
    streams.filteredTokenStream = new ASCIIFoldingFilter(streams.filteredTokenStream);
  } else {
    streams.tokenStream.reset(reader);
  }
  streams.tokenStream.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);
  return streams.filteredTokenStream;
}
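
For comparison, a hedged sketch of the same filter chain on the modern Analyzer API, which manages reuse internally. Assumptions: EnglishAnalyzer.ENGLISH_STOP_WORDS_SET stands in for the original STOP_WORDS_SET constant, StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH for DEFAULT_MAX_TOKEN_LENGTH, and StandardFilter is dropped because it became a no-op and was later removed.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;

public class ModernStopAnalyzer extends Analyzer {
  private static final CharArraySet STOP_WORDS_SET = EnglishAnalyzer.ENGLISH_STOP_WORDS_SET;

  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    StandardTokenizer source = new StandardTokenizer();
    source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
    // StandardFilter omitted: it was a no-op in later Lucene versions.
    TokenStream result = new LowerCaseFilter(source);
    result = new StopFilter(result, STOP_WORDS_SET);
    result = new ASCIIFoldingFilter(result);
    return new TokenStreamComponents(source, result);
  }
}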

@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
  Set<String> stopWords = stopWordsPerField.get(fieldName);
  if (stopWords == null) {
    return components;
  }
  StopFilter stopFilter = new StopFilter(components.getTokenStream(),
                                         new CharArraySet(stopWords, false));
  return new TokenStreamComponents(components.getSource(), stopFilter);
}

@Test
public void testSeparatorWithStopWords() throws IOException {
  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  String input = "A B C D E F J H";
  tokenStream.setReader(new StringReader(input));
  TokenStream tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("A", "D", "E", "J"));
  ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenFilter, '-', false, 100);
  assertTokenStreamContents(stream, new String[] {"B-C-F-H"}, null, null, new int[] {1});
}

@Test
public void testSeparatorWithStopWordsAndPreservePositionIncrements() throws IOException {
  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  String input = "A B C D E F J H";
  tokenStream.setReader(new StringReader(input));
  TokenStream tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("A", "D", "E", "J"));
  ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenFilter, '-', true, 100);
  // Preserving position increments keeps a separator for each removed stopword,
  // so the holes show up as runs of '-' in the concatenated output.
  assertTokenStreamContents(stream, new String[] {"-B-C---F--H"}, null, null, new int[] {1});
}

@Before
public void up() {
  analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
          new DefaultIcuTokenizerConfig(false, true));
      TokenStream result = new CJKBigramFilter(source);
      return new TokenStreamComponents(source, new StopFilter(result, CharArraySet.EMPTY_SET));
    }
  };
}

@Override
public TokenStream create(TokenStream tokenStream) {
  if (removeTrailing) {
    // Plain StopFilter drops every stopword, including a trailing one.
    return new StopFilter(tokenStream, stopWords);
  } else {
    // SuggestStopFilter keeps a trailing stopword that may be a partially
    // typed word, which suits suggest-as-you-type fields.
    return new SuggestStopFilter(tokenStream, stopWords);
  }
}

@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
  List<PreConfiguredTokenFilter> filters = new ArrayList<>();
  filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton(
      "common_grams",
      false,
      input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
  filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false, input ->
      new DelimitedPayloadTokenFilter(input,
          DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
          DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
  filters.add(PreConfiguredTokenFilter.singleton("delimited_payload", false, input ->
      new DelimitedPayloadTokenFilter(input,
          DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
          DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
  filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer())));
  filters.add(PreConfiguredTokenFilter.singleton(
      "edge_ngram",
      false,
      input -> new EdgeNGramTokenFilter(input, EdgeNGramTokenFilterFactory.SIDE_FRONT, EdgeNGramTokenFilterFactory.SIDE_BACK, EdgeNGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL)));
  filters.add(PreConfiguredTokenFilter.singleton(
      "elision",
      true,
      input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES)));
  filters.add(PreConfiguredTokenFilter.singleton("french_stem", false, input -> new SnowballFilter(input, new FrenchStemmer())));
  filters.add(PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("length", false, input ->
      new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless
  filters.add(PreConfiguredTokenFilter.singleton(
      "limit",
      false,
      input -> new LimitTokenCountFilter(
          input,
          LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
          LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
  filters.add(PreConfiguredTokenFilter.singleton("ngram", false, reader -> new NGramTokenFilter(reader, 1, 2, false)));
  filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian")));
  filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> {
    TokenStream ts = new ShingleFilter(input);
    /*
     * Disable graph analysis on this token stream: it produces shingles of
     * different sizes, which are not aligned in terms of positions, so graph
     * analysis is useless here and dangerous because it may create too many paths.
     */
    ts.addAttribute(DisableGraphAttribute.class);
    return ts;
  }));
  filters.add(PreConfiguredTokenFilter.singleton("snowball", false, input -> new SnowballFilter(input, "English")));
  filters.add(PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
  // The stop filter is in lucene-core, but the English stop words set is in lucene-analyzers-common
  filters.add(PreConfiguredTokenFilter.singleton("stop", false, input -> new StopFilter(input, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)));
  filters.add(PreConfiguredTokenFilter.singleton("trim", true, TrimFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10)));
  filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
  filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, input ->
      new WordDelimiterFilter(input,
          WordDelimiterFilter.GENERATE_WORD_PARTS
              | WordDelimiterFilter.GENERATE_NUMBER_PARTS
              | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
              | WordDelimiterFilter.SPLIT_ON_NUMERICS
              | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null)));
  filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false, input ->
      new WordDelimiterGraphFilter(input,
          WordDelimiterGraphFilter.GENERATE_WORD_PARTS
              | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
              | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
              | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
              | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
  return filters;
}

@Override
public TokenStream create(TokenStream tokenStream) {
  return new StopFilter(tokenStream, VietnameseAnalyzer.getDefaultStopSet());
}

/**
 * Creates
 * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * used to tokenize all the text in the provided {@link Reader}.
 *
 * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link LetterTokenizer} filtered with
 *         {@link LowerCaseFilter} and {@link StopFilter}
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer source = new LetterTokenizer();
  return new TokenStreamComponents(source, new StopFilter(new LowerCaseFilter(source), stopwords));
}
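
As a closing usage sketch, here is how an analyzer built from this last method might be exercised end to end. The stop set is hypothetical (the original class reads its stopwords from a field), and recent-Lucene package locations are assumed.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class LetterStopAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    // Hypothetical stop set standing in for the original 'stopwords' field.
    CharArraySet stopwords = StopFilter.makeStopSet("the", "and");

    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        // Same chain as the method above: LetterTokenizer -> LowerCaseFilter -> StopFilter.
        Tokenizer source = new LetterTokenizer();
        return new TokenStreamComponents(source,
            new StopFilter(new LowerCaseFilter(source), stopwords));
      }
    };

    // Analyze a string and print the surviving terms.
    try (TokenStream ts = analyzer.tokenStream("body", "The Quick and the Dead")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term); // prints "quick" then "dead"
      }
      ts.end();
    }
    analyzer.close();
  }
}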