下面列出了org.apache.lucene.search.AutomatonQuery#org.apache.lucene.util.automaton.Automata 实例代码,或者点击链接到github查看源代码,也可以在右侧发表评论。
private Automaton toAutomaton() {
Automaton a = null;
if (include != null) {
a = include.toAutomaton();
} else if (includeValues != null) {
a = Automata.makeStringUnion(includeValues);
} else {
a = Automata.makeAnyString();
}
if (exclude != null) {
a = Operations.minus(a, exclude.toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
} else if (excludeValues != null) {
a = Operations.minus(a, Automata.makeStringUnion(excludeValues), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
}
return a;
}
/**
* Create a automaton for a given context query this automaton will be used
* to find the matching paths with the fst
*
* @param preserveSep set an additional char (<code>XAnalyzingSuggester.SEP_LABEL</code>) between each context query
* @param queries list of {@link ContextQuery} defining the lookup context
*
* @return Automaton matching the given Query
*/
public static Automaton toAutomaton(boolean preserveSep, Iterable<ContextQuery> queries) {
Automaton a = Automata.makeEmptyString();
Automaton gap = Automata.makeChar(ContextMapping.SEPARATOR);
if (preserveSep) {
// if separators are preserved the fst contains a SEP_LABEL
// behind each gap. To have a matching automaton, we need to
// include the SEP_LABEL in the query as well
gap = Operations.concatenate(gap, Automata.makeChar(XAnalyzingSuggester.SEP_LABEL));
}
for (ContextQuery query : queries) {
a = Operations.concatenate(Arrays.asList(query.toAutomaton(), gap, a));
}
// TODO: should we limit this? Do any of our ContextQuery impls really create exponential regexps? GeoQuery looks safe (union
// of strings).
return Operations.determinize(a, Integer.MAX_VALUE);
}
private static Automaton toContextAutomaton(final Map<IntsRef, ContextMetaData> contexts, final boolean matchAllContexts) {
final Automaton matchAllAutomaton = Operations.repeat(Automata.makeAnyString());
final Automaton sep = Automata.makeChar(ContextSuggestField.CONTEXT_SEPARATOR);
if (matchAllContexts || contexts.size() == 0) {
return Operations.concatenate(matchAllAutomaton, sep);
} else {
Automaton contextsAutomaton = null;
for (Map.Entry<IntsRef, ContextMetaData> entry : contexts.entrySet()) {
final ContextMetaData contextMetaData = entry.getValue();
final IntsRef ref = entry.getKey();
Automaton contextAutomaton = Automata.makeString(ref.ints, ref.offset, ref.length);
if (contextMetaData.exact == false) {
contextAutomaton = Operations.concatenate(contextAutomaton, matchAllAutomaton);
}
contextAutomaton = Operations.concatenate(contextAutomaton, sep);
if (contextsAutomaton == null) {
contextsAutomaton = contextAutomaton;
} else {
contextsAutomaton = Operations.union(contextsAutomaton, contextAutomaton);
}
}
return contextsAutomaton;
}
}
public void testBoost() throws Exception {
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(Automata.makeString("on"));
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
PrecedenceQueryParser qp = new PrecedenceQueryParser();
qp.setAnalyzer(oneStopAnalyzer);
Query q = qp.parse("on^1.0", "field");
assertNotNull(q);
q = qp.parse("\"hello\"^2.0", "field");
assertNotNull(q);
assertEquals(((BoostQuery) q).getBoost(), (float) 2.0, (float) 0.5);
q = qp.parse("hello^2.0", "field");
assertNotNull(q);
assertEquals(((BoostQuery) q).getBoost(), (float) 2.0, (float) 0.5);
q = qp.parse("\"on\"^1.0", "field");
assertNotNull(q);
q = getParser(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)).parse("the^3",
"field");
assertNotNull(q);
}
public void testBoost() throws Exception {
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(Automata.makeString("on"));
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
StandardQueryParser qp = new StandardQueryParser();
qp.setAnalyzer(oneStopAnalyzer);
Query q = qp.parse("on^1.0", "field");
assertNotNull(q);
q = qp.parse("\"hello\"^2.0", "field");
assertNotNull(q);
assertEquals(((BoostQuery) q).getBoost(), (float) 2.0, (float) 0.5);
q = qp.parse("hello^2.0", "field");
assertNotNull(q);
assertEquals(((BoostQuery) q).getBoost(), (float) 2.0, (float) 0.5);
q = qp.parse("\"on\"^1.0", "field");
assertNotNull(q);
StandardQueryParser qp2 = new StandardQueryParser();
qp2.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET));
q = qp2.parse("the^3", "field");
// "the" is a stop word so the result is an empty query:
assertNotNull(q);
assertMatchNoDocsQuery(q);
assertFalse(q instanceof BoostQuery);
}
public void testBoost()
throws Exception {
CharacterRunAutomaton stopWords = new CharacterRunAutomaton(Automata.makeString("on"));
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
CommonQueryParserConfiguration qp = getParserConfig(oneStopAnalyzer);
Query q = getQuery("on^1.0",qp);
assertNotNull(q);
q = getQuery("\"hello\"^2.0",qp);
assertNotNull(q);
assertEquals(((BoostQuery) q).getBoost(), (float) 2.0, (float) 0.5);
q = getQuery("hello^2.0",qp);
assertNotNull(q);
assertEquals(((BoostQuery) q).getBoost(), (float) 2.0, (float) 0.5);
q = getQuery("\"on\"^1.0",qp);
assertNotNull(q);
Analyzer a2 = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
CommonQueryParserConfiguration qp2 = getParserConfig(a2);
q = getQuery("the^3", qp2);
// "the" is a stop word so the result is an empty query:
assertNotNull(q);
assertMatchNoDocsQuery(q);
assertFalse(q instanceof BoostQuery);
}
public void testCustomProvider() throws IOException {
AutomatonProvider myProvider = new AutomatonProvider() {
// automaton that matches quick or brown
private Automaton quickBrownAutomaton = Operations.union(Arrays
.asList(Automata.makeString("quick"),
Automata.makeString("brown"),
Automata.makeString("bob")));
@Override
public Automaton getAutomaton(String name) {
if (name.equals("quickBrown")) return quickBrownAutomaton;
else return null;
}
};
RegexpQuery query = new RegexpQuery(newTerm("<quickBrown>"), RegExp.ALL,
myProvider, DEFAULT_MAX_DETERMINIZED_STATES);
assertEquals(1, searcher.search(query, 5).totalHits.value);
}
/**
* Test some very simple automata.
*/
public void testAutomata() throws IOException {
assertAutomatonHits(0, Automata.makeEmpty());
assertAutomatonHits(0, Automata.makeEmptyString());
assertAutomatonHits(2, Automata.makeAnyChar());
assertAutomatonHits(3, Automata.makeAnyString());
assertAutomatonHits(2, Automata.makeString("doc"));
assertAutomatonHits(1, Automata.makeChar('a'));
assertAutomatonHits(2, Automata.makeCharRange('a', 'b'));
assertAutomatonHits(2, Automata.makeDecimalInterval(1233, 2346, 0));
assertAutomatonHits(1, Automata.makeDecimalInterval(0, 2000, 0));
assertAutomatonHits(2, Operations.union(Automata.makeChar('a'),
Automata.makeChar('b')));
assertAutomatonHits(0, Operations.intersection(Automata
.makeChar('a'), Automata.makeChar('b')));
assertAutomatonHits(1, Operations.minus(Automata.makeCharRange('a', 'b'),
Automata.makeChar('a'), DEFAULT_MAX_DETERMINIZED_STATES));
}
@Override
public Automaton toAutomaton() {
List<Automaton> automatons = new ArrayList<>();
for (CharSequence value : values) {
automatons.add(Automata.makeString(value.toString()));
}
return Operations.union(automatons);
}
@Override
public Automaton toAutomaton() {
Automaton automaton;
if(precisions == null || precisions.length == 0) {
automaton = Automata.makeString(location);
} else {
automaton = Automata.makeString(location.substring(0, Math.max(1, Math.min(location.length(), precisions[0]))));
for (int i = 1; i < precisions.length; i++) {
final String cell = location.substring(0, Math.max(1, Math.min(location.length(), precisions[i])));
automaton = Operations.union(automaton, Automata.makeString(cell));
}
}
return automaton;
}
private static CharArrayMatcher[] convertTermsToMatchers(BytesRef[] terms, CharArrayMatcher[] matchers) {
CharArrayMatcher[] newAutomata = new CharArrayMatcher[terms.length + matchers.length];
for (int i = 0; i < terms.length; i++) {
String termString = terms[i].utf8ToString();
CharacterRunAutomaton a = new CharacterRunAutomaton(Automata.makeString(termString));
newAutomata[i] = LabelledCharArrayMatcher.wrap(termString, a::run);
}
// Append existing automata (that which is used for MTQs)
System.arraycopy(matchers, 0, newAutomata, terms.length, matchers.length);
return newAutomata;
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
// If an empty regex is provided, we return an automaton that matches nothing. This ensures
// consistency with PrefixCompletionQuery, which returns no results for an empty term.
Automaton automaton = getTerm().text().isEmpty()
? Automata.makeEmpty()
: new RegExp(getTerm().text(), flags).toAutomaton(maxDeterminizedStates);
return new CompletionWeight(this, automaton);
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
final CompletionWeight innerWeight = ((CompletionWeight) innerQuery.createWeight(searcher, scoreMode, boost));
final Automaton innerAutomaton = innerWeight.getAutomaton();
// If the inner automaton matches nothing, then we return an empty weight to avoid
// traversing all contexts during scoring.
if (innerAutomaton.getNumStates() == 0) {
return new CompletionWeight(this, innerAutomaton);
}
// if separators are preserved the fst contains a SEP_LABEL
// behind each gap. To have a matching automaton, we need to
// include the SEP_LABEL in the query as well
Automaton optionalSepLabel = Operations.optional(Automata.makeChar(ConcatenateGraphFilter.SEP_LABEL));
Automaton prefixAutomaton = Operations.concatenate(optionalSepLabel, innerAutomaton);
Automaton contextsAutomaton = Operations.concatenate(toContextAutomaton(contexts, matchAllContexts), prefixAutomaton);
contextsAutomaton = Operations.determinize(contextsAutomaton, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
final Map<IntsRef, Float> contextMap = new HashMap<>(contexts.size());
final TreeSet<Integer> contextLengths = new TreeSet<>();
for (Map.Entry<IntsRef, ContextMetaData> entry : contexts.entrySet()) {
ContextMetaData contextMetaData = entry.getValue();
contextMap.put(entry.getKey(), contextMetaData.boost);
contextLengths.add(entry.getKey().length);
}
int[] contextLengthArray = new int[contextLengths.size()];
final Iterator<Integer> iterator = contextLengths.descendingIterator();
for (int i = 0; iterator.hasNext(); i++) {
contextLengthArray[i] = iterator.next();
}
return new ContextCompletionWeight(this, contextsAutomaton, innerWeight, contextMap, contextLengthArray);
}
@BeforeClass
public static void beforeClass() throws Exception {
Random random = random();
directory = newDirectory();
stopword = "" + randomChar();
CharacterRunAutomaton stopset = new CharacterRunAutomaton(Automata.makeString(stopword));
analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset);
RandomIndexWriter iw = new RandomIndexWriter(random, directory, analyzer);
Document doc = new Document();
Field id = new StringField("id", "", Field.Store.NO);
Field field = new TextField("field", "", Field.Store.NO);
doc.add(id);
doc.add(field);
// index some docs
int numDocs = TEST_NIGHTLY ? atLeast(1000) : atLeast(100);
for (int i = 0; i < numDocs; i++) {
id.setStringValue(Integer.toString(i));
field.setStringValue(randomFieldContents());
iw.addDocument(doc);
}
// delete some docs
int numDeletes = numDocs/20;
for (int i = 0; i < numDeletes; i++) {
Term toDelete = new Term("id", Integer.toString(random.nextInt(numDocs)));
if (random.nextBoolean()) {
iw.deleteDocuments(toDelete);
} else {
iw.deleteDocuments(new TermQuery(toDelete));
}
}
reader = iw.getReader();
s1 = newSearcher(reader);
s2 = newSearcher(reader);
iw.close();
}
/** Test a configuration that behaves a lot like KeepWordFilter */
public void testKeep() throws Exception {
CharacterRunAutomaton keepWords =
new CharacterRunAutomaton(
Operations.complement(
Operations.union(
Arrays.asList(Automata.makeString("foo"), Automata.makeString("bar"))),
DEFAULT_MAX_DETERMINIZED_STATES));
Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords);
assertAnalyzesTo(a, "quick foo brown bar bar fox foo",
new String[] { "foo", "bar", "bar", "foo" },
new int[] { 2, 2, 1, 2 });
}
public static Automaton toAutomaton(BytesRef lowerTerm, BytesRef upperTerm, boolean includeLower, boolean includeUpper) {
if (lowerTerm == null) {
// makeBinaryInterval is more picky than we are:
includeLower = true;
}
if (upperTerm == null) {
// makeBinaryInterval is more picky than we are:
includeUpper = true;
}
return Automata.makeBinaryInterval(lowerTerm, includeLower, upperTerm, includeUpper);
}
private ByteRunAutomaton asByteRunAutomaton() {
TermIterator iterator = termData.iterator();
List<Automaton> automata = new ArrayList<>();
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
automata.add(Automata.makeBinary(term));
}
return new CompiledAutomaton(Operations.union(automata)).runAutomaton;
}
/**
* Test that a nondeterministic automaton works correctly. (It should will be
* determinized)
*/
public void testNFA() throws IOException {
// accept this or three, the union is an NFA (two transitions for 't' from
// initial state)
Automaton nfa = Operations.union(Automata.makeString("this"),
Automata.makeString("three"));
assertAutomatonHits(2, nfa);
}
public void testEquals() {
AutomatonQuery a1 = new AutomatonQuery(newTerm("foobar"), Automata
.makeString("foobar"));
// reference to a1
AutomatonQuery a2 = a1;
// same as a1 (accepts the same language, same term)
AutomatonQuery a3 = new AutomatonQuery(newTerm("foobar"),
Operations.concatenate(
Automata.makeString("foo"),
Automata.makeString("bar")));
// different than a1 (same term, but different language)
AutomatonQuery a4 = new AutomatonQuery(newTerm("foobar"),
Automata.makeString("different"));
// different than a1 (different term, same language)
AutomatonQuery a5 = new AutomatonQuery(newTerm("blah"),
Automata.makeString("foobar"));
assertEquals(a1.hashCode(), a2.hashCode());
assertEquals(a1, a2);
assertEquals(a1.hashCode(), a3.hashCode());
assertEquals(a1, a3);
// different class
AutomatonQuery w1 = new WildcardQuery(newTerm("foobar"));
// different class
AutomatonQuery w2 = new RegexpQuery(newTerm("foobar"));
assertFalse(a1.equals(w1));
assertFalse(a1.equals(w2));
assertFalse(w1.equals(w2));
assertFalse(a1.equals(a4));
assertFalse(a1.equals(a5));
assertFalse(a1.equals(null));
}
/**
* Test that rewriting to a single term works as expected, preserves
* MultiTermQuery semantics.
*/
public void testRewriteSingleTerm() throws IOException {
AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), Automata.makeString("piece"));
Terms terms = MultiTerms.getTerms(searcher.getIndexReader(), FN);
assertTrue(aq.getTermsEnum(terms) instanceof SingleTermsEnum);
assertEquals(1, automatonQueryNrHits(aq));
}
/**
* Test that rewriting to a prefix query works as expected, preserves
* MultiTermQuery semantics.
*/
public void testRewritePrefix() throws IOException {
Automaton pfx = Automata.makeString("do");
Automaton prefixAutomaton = Operations.concatenate(pfx, Automata.makeAnyString());
AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), prefixAutomaton);
assertEquals(3, automatonQueryNrHits(aq));
}
/**
* Test handling of the empty language
*/
public void testEmptyOptimization() throws IOException {
AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), Automata.makeEmpty());
// not yet available: assertTrue(aq.getEnum(searcher.getIndexReader())
// instanceof EmptyTermEnum);
Terms terms = MultiTerms.getTerms(searcher.getIndexReader(), FN);
assertSame(TermsEnum.EMPTY, aq.getTermsEnum(terms));
assertEquals(0, automatonQueryNrHits(aq));
}
public void testBiggishAutomaton() {
int numTerms = TEST_NIGHTLY ? 3000 : 500;
List<BytesRef> terms = new ArrayList<>();
while (terms.size() < numTerms) {
terms.add(new BytesRef(TestUtil.randomUnicodeString(random())));
}
Collections.sort(terms);
new AutomatonQuery(new Term("foo", "bar"), Automata.makeStringUnion(terms), Integer.MAX_VALUE);
}
public void testStopwordsPosIncHole2() throws Exception {
// use two stopfilters for testing here
Directory dir = newDirectory();
final Automaton secondSet = Automata.makeString("foobar");
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer();
TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET);
stream = new MockTokenFilter(stream, new CharacterRunAutomaton(secondSet));
return new TokenStreamComponents(tokenizer, stream);
}
};
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, a);
Document doc = new Document();
doc.add(new TextField("body", "just a foobar", Field.Store.NO));
doc.add(new TextField("body", "test of gaps", Field.Store.NO));
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher is = newSearcher(ir);
PhraseQuery.Builder builder = new PhraseQuery.Builder();
builder.add(new Term("body", "just"), 0);
builder.add(new Term("body", "test"), 3);
PhraseQuery pq = builder.build();
// body:"just ? ? test"
assertEquals(1, is.search(pq, 5).totalHits.value);
ir.close();
dir.close();
}
@Override
@SuppressWarnings("unchecked")
Filter makeFilter(String fname, Iterator<BytesRef> it) {
Automaton union = Automata.makeStringUnion(IteratorUtils.toList(it));
return new MultiTermQueryWrapperFilter<AutomatonQuery>(new AutomatonQuery(new Term(fname), union)) {
};
}
/** Return an {@link Automaton} that matches the given pattern. */
public static Automaton simpleMatchToAutomaton(String pattern) {
List<Automaton> automata = new ArrayList<>();
int previous = 0;
for (int i = pattern.indexOf('*'); i != -1; i = pattern.indexOf('*', i + 1)) {
automata.add(Automata.makeString(pattern.substring(previous, i)));
automata.add(Automata.makeAnyString());
previous = i + 1;
}
automata.add(Automata.makeString(pattern.substring(previous)));
return Operations.concatenate(automata);
}
/**
* Returns a function that filters a document map based on the given include and exclude rules.
* @see #filter(Map, String[], String[]) for details
*/
public static Function<Map<String, ?>, Map<String, Object>> filter(String[] includes, String[] excludes) {
CharacterRunAutomaton matchAllAutomaton = new CharacterRunAutomaton(Automata.makeAnyString());
CharacterRunAutomaton include;
if (includes == null || includes.length == 0) {
include = matchAllAutomaton;
} else {
Automaton includeA = Regex.simpleMatchToAutomaton(includes);
includeA = makeMatchDotsInFieldNames(includeA);
include = new CharacterRunAutomaton(includeA);
}
Automaton excludeA;
if (excludes == null || excludes.length == 0) {
excludeA = Automata.makeEmpty();
} else {
excludeA = Regex.simpleMatchToAutomaton(excludes);
excludeA = makeMatchDotsInFieldNames(excludeA);
}
CharacterRunAutomaton exclude = new CharacterRunAutomaton(excludeA);
// NOTE: We cannot use Operations.minus because of the special case that
// we want all sub properties to match as soon as an object matches
return (map) -> filter(map,
include, 0,
exclude, 0,
matchAllAutomaton);
}
static CharArrayMatcher fromTerms(List<BytesRef> terms) {
CharacterRunAutomaton a = new CharacterRunAutomaton(Automata.makeStringUnion(terms));
return a::run;
}
public void testMaxSizeHighlightTruncates() throws Exception {
TestHighlightRunner helper = new TestHighlightRunner() {
@Override
public void run() throws Exception {
String goodWord = "goodtoken";
CharacterRunAutomaton stopWords = new CharacterRunAutomaton(Automata.makeString("stoppedtoken"));
// we disable MockTokenizer checks because we will forcefully limit the
// tokenstream and call end() before incrementToken() returns false.
final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
analyzer.setEnableChecks(false);
TermQuery query = new TermQuery(new Term("data", goodWord));
String match;
StringBuilder sb = new StringBuilder();
sb.append(goodWord);
for (int i = 0; i < 10000; i++) {
sb.append(" ");
// only one stopword
sb.append("stoppedtoken");
}
SimpleHTMLFormatter fm = new SimpleHTMLFormatter();
Highlighter hg = getHighlighter(query, "data", fm);// new Highlighter(fm,
// new
// QueryTermScorer(query));
hg.setTextFragmenter(new NullFragmenter());
hg.setMaxDocCharsToAnalyze(100);
match = hg.getBestFragment(analyzer, "data", sb.toString());
assertTrue("Matched text should be no more than 100 chars in length ", match.length() < hg
.getMaxDocCharsToAnalyze());
// add another tokenized word to the overrall length - but set way
// beyond
// the length of text under consideration (after a large slug of stop
// words
// + whitespace)
sb.append(" ");
sb.append(goodWord);
match = hg.getBestFragment(analyzer, "data", sb.toString());
assertTrue("Matched text should be no more than 100 chars in length ", match.length() < hg
.getMaxDocCharsToAnalyze());
}
};
helper.start();
}
public void testInvalidAutomatonTermsEnum() throws Exception {
expectThrows(IllegalArgumentException.class,
() -> {
new AutomatonTermsEnum(TermsEnum.EMPTY, new CompiledAutomaton(Automata.makeString("foo")));
});
}