Listed below are example usages of org.apache.lucene.util.automaton.Automaton with org.apache.lucene.search.AutomatonQuery, collected from open-source projects; the full source of each example can be viewed on GitHub.
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
// TODO: is there a Reader from a CharSequence?
// Turn tokenstream into automaton:
Automaton automaton = null;
try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
automaton = getTokenStreamToAutomaton().toAutomaton(ts);
}
automaton = replaceSep(automaton);
// TODO: we can optimize this somewhat by determinizing
// while we convert
// This automaton should not blow up during determinize:
automaton = Operations.determinize(automaton, Integer.MAX_VALUE);
return automaton;
}
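For reference, the analyze-then-determinize pattern above can be reproduced standalone. A minimal sketch, where a plain StandardAnalyzer stands in for the suggester's configured queryAnalyzer:
import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;

static Automaton keyToAutomaton(String key) throws IOException {
  Automaton automaton;
  // StandardAnalyzer is an assumption; any Analyzer works the same way
  try (StandardAnalyzer analyzer = new StandardAnalyzer();
       TokenStream ts = analyzer.tokenStream("", key)) {
    // TokenStreamToAutomaton resets and fully consumes the stream itself
    automaton = new TokenStreamToAutomaton().toAutomaton(ts);
  }
  // determinize so downstream intersection code can assume a DFA
  return Operations.determinize(automaton, Integer.MAX_VALUE);
}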
private Automaton toAutomaton() {
Automaton a = null;
if (include != null) {
a = include.toAutomaton();
} else if (includeValues != null) {
a = Automata.makeStringUnion(includeValues);
} else {
a = Automata.makeAnyString();
}
if (exclude != null) {
a = Operations.minus(a, exclude.toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
} else if (excludeValues != null) {
a = Operations.minus(a, Automata.makeStringUnion(excludeValues), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
}
return a;
}
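The include/exclude branches above reduce to Operations.minus over string-union automata. A minimal self-contained sketch with hard-coded term lists (Automata.makeStringUnion requires its input in sorted order, hence the TreeSet):
import java.util.TreeSet;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;

TreeSet<BytesRef> includeValues = new TreeSet<>();
includeValues.add(new BytesRef("apple"));
includeValues.add(new BytesRef("banana"));
TreeSet<BytesRef> excludeValues = new TreeSet<>();
excludeValues.add(new BytesRef("banana"));
// accepts exactly {"apple"}: the include union minus the exclude union
Automaton a = Operations.minus(
    Automata.makeStringUnion(includeValues),
    Automata.makeStringUnion(excludeValues),
    Operations.DEFAULT_MAX_DETERMINIZED_STATES);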
/**
* Create an automaton for a given context query. This automaton will be used
* to find the matching paths within the FST.
*
* @param preserveSep whether to insert an additional char (<code>XAnalyzingSuggester.SEP_LABEL</code>) between each context query
* @param queries list of {@link ContextQuery} defining the lookup context
*
* @return Automaton matching the given Query
*/
public static Automaton toAutomaton(boolean preserveSep, Iterable<ContextQuery> queries) {
Automaton a = Automata.makeEmptyString();
Automaton gap = Automata.makeChar(ContextMapping.SEPARATOR);
if (preserveSep) {
// if separators are preserved the fst contains a SEP_LABEL
// after each gap. To have a matching automaton, we need to
// include the SEP_LABEL in the query as well
gap = Operations.concatenate(gap, Automata.makeChar(XAnalyzingSuggester.SEP_LABEL));
}
for (ContextQuery query : queries) {
a = Operations.concatenate(Arrays.asList(query.toAutomaton(), gap, a));
}
// TODO: should we limit this? Do any of our ContextQuery impls really create exponential regexps? GeoQuery looks safe (union
// of strings).
return Operations.determinize(a, Integer.MAX_VALUE);
}
@Override
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
Automaton lookupAutomaton,
FST<Pair<Long,BytesRef>> fst)
throws IOException {
// TODO: right now there's no penalty for fuzzy/edits,
// ie a completion whose prefix matched exactly what the
// user typed gets no boost over completions that
// required an edit, which get no boost over completions
// requiring two edits. I suspect a multiplicative
// factor is appropriate (eg, say a fuzzy match must be at
// least 2X better weight than the non-fuzzy match to
// "compete") ... in which case I think the wFST needs
// to be log weights or something ...
Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
/*
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), StandardCharsets.UTF_8);
w.write(levA.toDot());
w.close();
System.out.println("Wrote LevA to out.dot");
*/
return FSTUtil.intersectPrefixPaths(levA, fst);
}
final Automaton toAutomaton(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
// Analyze surface form:
Automaton automaton;
try (TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString())) {
// Create corresponding automaton: labels are bytes
// from each analyzed token, with byte 0 used as
// separator between tokens:
automaton = ts2a.toAutomaton(ts);
}
automaton = replaceSep(automaton);
automaton = convertAutomaton(automaton);
// TODO: LUCENE-5660 re-enable this once we disallow massive suggestion strings
// assert SpecialOperations.isFinite(automaton);
// Get all paths from the automaton (there can be
// more than one path, eg if the analyzer created a
// graph using SynFilter or WDF):
return automaton;
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
final Automaton originalAutomata;
try (CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text()) ) {
originalAutomata = stream.toAutomaton(unicodeAware);
}
Set<IntsRef> refs = new HashSet<>();
Automaton automaton = toLevenshteinAutomata(originalAutomata, refs);
if (unicodeAware) {
Automaton utf8automaton = new UTF32ToUTF8().convert(automaton);
utf8automaton = Operations.determinize(utf8automaton, maxDeterminizedStates);
automaton = utf8automaton;
}
// TODO Accumulating all refs is bad, because the resulting set may be very big.
// TODO Better iterate over automaton again inside FuzzyCompletionWeight?
return new FuzzyCompletionWeight(this, automaton, refs);
}
private static Automaton toContextAutomaton(final Map<IntsRef, ContextMetaData> contexts, final boolean matchAllContexts) {
final Automaton matchAllAutomaton = Operations.repeat(Automata.makeAnyString());
final Automaton sep = Automata.makeChar(ContextSuggestField.CONTEXT_SEPARATOR);
if (matchAllContexts || contexts.size() == 0) {
return Operations.concatenate(matchAllAutomaton, sep);
} else {
Automaton contextsAutomaton = null;
for (Map.Entry<IntsRef, ContextMetaData> entry : contexts.entrySet()) {
final ContextMetaData contextMetaData = entry.getValue();
final IntsRef ref = entry.getKey();
Automaton contextAutomaton = Automata.makeString(ref.ints, ref.offset, ref.length);
if (contextMetaData.exact == false) {
contextAutomaton = Operations.concatenate(contextAutomaton, matchAllAutomaton);
}
contextAutomaton = Operations.concatenate(contextAutomaton, sep);
if (contextsAutomaton == null) {
contextsAutomaton = contextAutomaton;
} else {
contextsAutomaton = Operations.union(contextsAutomaton, contextAutomaton);
}
}
return contextsAutomaton;
}
}
public TermAutomatonWeight(Automaton automaton, IndexSearcher searcher, Map<Integer,TermStates> termStates, float boost) throws IOException {
super(TermAutomatonQuery.this);
this.automaton = automaton;
this.termStates = termStates;
this.similarity = searcher.getSimilarity();
List<TermStatistics> allTermStats = new ArrayList<>();
for(Map.Entry<Integer,BytesRef> ent : idToTerm.entrySet()) {
Integer termID = ent.getKey();
if (ent.getValue() != null) {
TermStates ts = termStates.get(termID);
if (ts.docFreq() > 0) {
allTermStats.add(searcher.termStatistics(new Term(field, ent.getValue()), ts.docFreq(), ts.totalTermFreq()));
}
}
}
if (allTermStats.isEmpty()) {
stats = null; // no terms matched at all, will not use sim
} else {
stats = similarity.scorer(boost, searcher.collectionStatistics(field),
allTermStats.toArray(new TermStatistics[allTermStats.size()]));
}
}
/** Just creates a side path from startState to endState with the provided tokens. */
private static void addSidePath(Automaton.Builder a, int startState, int endState, char[] tokens, List<Integer> flatStates) {
int lastState = startState;
for(int i=0;i<tokens.length;i++) {
int nextState;
if (i == tokens.length-1) {
nextState = endState;
} else if (flatStates == null || i >= flatStates.size()) {
nextState = a.createState();
if (flatStates != null) {
assert i == flatStates.size();
flatStates.add(nextState);
}
} else {
nextState = flatStates.get(i);
}
a.addTransition(lastState, nextState, tokens[i]);
lastState = nextState;
}
}
@BeforeClass
public static void beforeClass() throws Exception {
Automaton single = new Automaton();
int initial = single.createState();
int accept = single.createState();
single.setAccept(accept, true);
// build an automaton matching this jvm's letter definition
for (int i = 0; i <= 0x10FFFF; i++) {
if (Character.isLetter(i)) {
single.addTransition(initial, accept, i);
}
}
Automaton repeat = Operations.repeat(single);
jvmLetter = new CharacterRunAutomaton(repeat);
}
/**
* Creates a map from each value to an automaton matching the prefixed value.
*
* @param prefix the prefix prepended (with <code>MtasToken.DELIMITER</code>) to each value
* @param valueList the values to build automata for
* @param filter whether to escape regular-expression metacharacters in the values
* @return map from each value to its automaton
*/
public static Map<String, Automaton> createAutomatonMap(String prefix,
List<String> valueList, Boolean filter) {
HashMap<String, Automaton> automatonMap = new HashMap<>();
if (valueList != null) {
for (String item : valueList) {
if (filter) {
item = item.replaceAll("([\\\"\\)\\(\\<\\>\\.\\@\\#\\]\\[\\{\\}])",
"\\\\$1");
}
automatonMap.put(item,
new RegExp(prefix + MtasToken.DELIMITER + item + "\u0000*")
.toAutomaton());
}
}
return automatonMap;
}
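Each entry of the returned map is an ordinary automaton, so it can be wrapped directly in an AutomatonQuery. A short usage sketch, where the prefix "t" and field name "mtas" are purely illustrative:
import java.util.Arrays;
import java.util.Map;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.util.automaton.Automaton;

Map<String, Automaton> map = createAutomatonMap("t", Arrays.asList("noun", "verb"), true);
// query for terms of the form t<DELIMITER>noun followed by optional \u0000 padding
AutomatonQuery query = new AutomatonQuery(new Term("mtas"), map.get("noun"));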
/** Build an automaton accepting all terms with the specified prefix. */
public static Automaton toAutomaton(BytesRef prefix) {
final int numStatesAndTransitions = prefix.length+1;
final Automaton automaton = new Automaton(numStatesAndTransitions, numStatesAndTransitions);
int lastState = automaton.createState();
for(int i=0;i<prefix.length;i++) {
int state = automaton.createState();
automaton.addTransition(lastState, state, prefix.bytes[prefix.offset+i]&0xff);
lastState = state;
}
automaton.setAccept(lastState, true);
automaton.addTransition(lastState, lastState, 0, 255);
automaton.finishState();
assert automaton.isDeterministic();
return automaton;
}
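A usage sketch for the prefix automaton, assuming a Lucene version that exposes the four-argument AutomatonQuery constructor; isBinary is true here because the transitions above are raw term bytes, not code points:
import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;

// matches every term starting with "luc"; the field name "title" is illustrative
Automaton a = toAutomaton(new BytesRef("luc"));
Query q = new AutomatonQuery(new Term("title", "luc"), a, Integer.MAX_VALUE, true);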
/**
* Returns the articulation points (or cut vertices) of the graph:
* https://en.wikipedia.org/wiki/Biconnected_component
*/
public int[] articulationPoints() {
if (det.getNumStates() == 0) {
return new int[0];
}
// make an undirected view of the automaton by adding a reverse transition for each edge
Automaton.Builder undirect = new Automaton.Builder();
undirect.copy(det);
for (int i = 0; i < det.getNumStates(); i++) {
int numT = det.initTransition(i, transition);
for (int j = 0; j < numT; j++) {
det.getNextTransition(transition);
undirect.addTransition(transition.dest, i, transition.min);
}
}
int numStates = det.getNumStates();
BitSet visited = new BitSet(numStates);
int[] depth = new int[det.getNumStates()];
int[] low = new int[det.getNumStates()];
int[] parent = new int[det.getNumStates()];
Arrays.fill(parent, -1);
List<Integer> points = new ArrayList<>();
articulationPointsRecurse(undirect.finish(), 0, 0, depth, low, parent, visited, points);
Collections.reverse(points);
return points.stream().mapToInt(p -> p).toArray();
}
public void testCustomProvider() throws IOException {
AutomatonProvider myProvider = new AutomatonProvider() {
// automaton that matches quick, brown or bob
private Automaton quickBrownAutomaton = Operations.union(Arrays
.asList(Automata.makeString("quick"),
Automata.makeString("brown"),
Automata.makeString("bob")));
@Override
public Automaton getAutomaton(String name) {
if (name.equals("quickBrown")) return quickBrownAutomaton;
else return null;
}
};
RegexpQuery query = new RegexpQuery(newTerm("<quickBrown>"), RegExp.ALL,
myProvider, DEFAULT_MAX_DETERMINIZED_STATES);
assertEquals(1, searcher.search(query, 5).totalHits.value);
}
/**
* Creates a new suggester.
*
* @param indexAnalyzer Analyzer that will be used for
* analyzing suggestions while building the index.
* @param queryAnalyzer Analyzer that will be used for
* analyzing query text during lookup
* @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
* @param maxSurfaceFormsPerAnalyzedForm Maximum number of
* surface forms to keep for a single analyzed form.
* When there are too many surface forms we discard the
* lowest weighted ones.
* @param maxGraphExpansions Maximum number of graph paths
* to expand from the analyzed form. Set this to -1 for
* no limit.
*/
public XAnalyzingSuggester(Analyzer indexAnalyzer, Automaton queryPrefix, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
boolean preservePositionIncrements, FST<Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput,
int sepLabel, int payloadSep, int endByte, int holeCharacter) {
// SIMON EDIT: I added fst, hasPayloads and maxAnalyzedPathsForOneInput
this.indexAnalyzer = indexAnalyzer;
this.queryAnalyzer = queryAnalyzer;
this.fst = fst;
this.hasPayloads = hasPayloads;
if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0) {
throw new IllegalArgumentException("options should only contain EXACT_FIRST and PRESERVE_SEP; got " + options);
}
this.exactFirst = (options & EXACT_FIRST) != 0;
this.preserveSep = (options & PRESERVE_SEP) != 0;
// FLORIAN EDIT: I added <code>queryPrefix</code> for context dependent suggestions
this.queryPrefix = queryPrefix;
// NOTE: this is just an implementation limitation; if
// somehow this is a problem we could fix it by using
// more than one byte to disambiguate ... but 256 seems
// like it should be way more than enough.
if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256) {
throw new IllegalArgumentException("maxSurfaceFormsPerAnalyzedForm must be > 0 and < 256 (got: " + maxSurfaceFormsPerAnalyzedForm + ")");
}
this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
if (maxGraphExpansions < 1 && maxGraphExpansions != -1) {
throw new IllegalArgumentException("maxGraphExpansions must -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")");
}
this.maxGraphExpansions = maxGraphExpansions;
this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput;
this.preservePositionIncrements = preservePositionIncrements;
this.sepLabel = sepLabel;
this.payloadSep = payloadSep;
this.endByte = endByte;
this.holeCharacter = holeCharacter;
}
public void testSynOverHole2() throws Exception {
final TokenStream ts = new CannedTokenStream(
new Token[] {
token("xyz", 1, 1),
token("abc", 0, 3),
token("def", 2, 1),
});
final Automaton expected = Operations.union(
join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")), s2a("abc"));
assertSameLanguage(expected, ts);
}
/** Returns all completion paths to initialize the search. */
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
Automaton lookupAutomaton,
FST<Pair<Long,BytesRef>> fst)
throws IOException {
return prefixPaths;
}
public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
final TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
Automaton automaton;
try (TokenStream ts = stream) {
automaton = toAutomaton(ts, ts2a);
}
LimitedFiniteStringsIterator finiteStrings =
new LimitedFiniteStringsIterator(automaton, maxGraphExpansions);
Set<IntsRef> set = new HashSet<>();
for (IntsRef string = finiteStrings.next(); string != null; string = finiteStrings.next()) {
set.add(IntsRef.deepCopyOf(string));
}
return Collections.unmodifiableSet(set);
}
@Override
@SuppressWarnings("unchecked")
Filter makeFilter(String fname, Iterator<BytesRef> it) {
Automaton union = Automata.makeStringUnion(IteratorUtils.toList(it));
return new MultiTermQueryWrapperFilter<AutomatonQuery>(new AutomatonQuery(new Term(fname), union)) {
};
}
/** Build an automaton to represent the frontier query */
private Automaton buildAutomaton(BytesRefHash termBytesHash) {
// need to pass a sorted set of terms to the automaton builder (maybe a better way to avoid this?)
final TreeSet<BytesRef> terms = new TreeSet<BytesRef>();
for (int i = 0 ; i < termBytesHash.size(); i++) {
BytesRef ref = new BytesRef();
termBytesHash.get(i, ref);
terms.add(ref);
}
final Automaton a = DaciukMihovAutomatonBuilder.build(terms);
return a;
}
@Override
public Automaton toAutomaton() {
List<Automaton> automatons = new ArrayList<>();
for (CharSequence value : values) {
automatons.add(Automata.makeString(value.toString()));
}
return Operations.union(automatons);
}
@Override
public Automaton toAutomaton() {
Automaton automaton;
if(precisions == null || precisions.length == 0) {
automaton = Automata.makeString(location);
} else {
automaton = Automata.makeString(location.substring(0, Math.max(1, Math.min(location.length(), precisions[0]))));
for (int i = 1; i < precisions.length; i++) {
final String cell = location.substring(0, Math.max(1, Math.min(location.length(), precisions[i])));
automaton = Operations.union(automaton, Automata.makeString(cell));
}
}
return automaton;
}
@Override
protected Automaton convertAutomaton(Automaton a) {
if (unicodeAware) {
Automaton utf8automaton = new UTF32ToUTF8().convert(a);
utf8automaton = Operations.determinize(utf8automaton, DEFAULT_MAX_DETERMINIZED_STATES);
return utf8automaton;
} else {
return a;
}
}
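UTF32ToUTF8 rewrites a code-point-labelled automaton into an equivalent byte-labelled one, which is what byte-oriented FSTs and term dictionaries consume. A minimal sketch of the conversion in isolation:
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.UTF32ToUTF8;

// code-point automaton for a single non-ASCII string
Automaton codePoints = Automata.makeString("héllo");
// equivalent automaton whose transition labels are UTF-8 bytes
Automaton bytes = new UTF32ToUTF8().convert(codePoints);
// the conversion can introduce nondeterminism, so determinize afterwards
bytes = Operations.determinize(bytes, Operations.DEFAULT_MAX_DETERMINIZED_STATES);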
/** Returns all prefix paths to initialize the search. */
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
Automaton lookupAutomaton,
FST<Pair<Long,BytesRef>> fst)
throws IOException {
return prefixPaths;
}
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
// TODO: is there a Reader from a CharSequence?
// Turn tokenstream into automaton:
Automaton automaton = null;
try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
automaton = getTokenStreamToAutomaton().toAutomaton(ts);
}
automaton = replaceSep(automaton);
// TODO: we can optimize this somewhat by determinizing
// while we convert
automaton = Operations.determinize(automaton, DEFAULT_MAX_DETERMINIZED_STATES);
return automaton;
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
// If an empty regex is provided, we return an automaton that matches nothing. This ensures
// consistency with PrefixCompletionQuery, which returns no results for an empty term.
Automaton automaton = getTerm().text().isEmpty()
? Automata.makeEmpty()
: new RegExp(getTerm().text(), flags).toAutomaton(maxDeterminizedStates);
return new CompletionWeight(this, automaton);
}
/**
* Creates a scorer for a field-specific <code>suggester</code> scoped by <code>filterDocs</code>
*/
protected CompletionScorer(final CompletionWeight weight, final NRTSuggester suggester,
final LeafReader reader, final Bits filterDocs,
final boolean filtered, final Automaton automaton) throws IOException {
this.weight = weight;
this.suggester = suggester;
this.reader = reader;
this.automaton = automaton;
this.filtered = filtered;
this.filterDocs = filterDocs;
}
public ContextCompletionWeight(CompletionQuery query, Automaton automaton, CompletionWeight innerWeight,
Map<IntsRef, Float> contextMap,
int[] contextLengths) throws IOException {
super(query, automaton);
this.contextMap = contextMap;
this.contextLengths = contextLengths;
this.innerWeight = innerWeight;
}
public void testOverlappedTokensLattice2() throws Exception {
final TokenStream ts = new CannedTokenStream(
new Token[] {
token("abc", 1, 1),
token("xyz", 0, 3),
token("def", 1, 1),
token("ghi", 1, 1),
});
final Automaton a1 = s2a("xyz");
final Automaton a2 = join("abc", "def", "ghi");
assertSameLanguage(Operations.union(a1, a2), ts);
}
@Override
public Query parse() throws SyntaxError {
if (id == null) {
throw new SyntaxError("no " + MTAS_JOIN_QPARSER_COLLECTION);
} else if (fields == null) {
throw new SyntaxError("no " + MTAS_JOIN_QPARSER_FIELD);
} else {
BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder();
MtasSolrCollectionCache mtasSolrJoinCache = null;
for (PluginHolder<SearchComponent> item : req.getCore()
.getSearchComponents().getRegistry().values()) {
if (item.get() instanceof MtasSolrSearchComponent) {
mtasSolrJoinCache = ((MtasSolrSearchComponent) item.get())
.getCollectionCache();
}
}
if (mtasSolrJoinCache != null) {
Automaton automaton;
try {
automaton = mtasSolrJoinCache.getAutomatonById(id);
if (automaton != null) {
for (String field : fields) {
booleanQueryBuilder.add(
new AutomatonQuery(new Term(field), automaton), Occur.SHOULD);
}
} else {
throw new IOException("no data for collection '" + id + "'");
}
} catch (IOException e) {
throw new SyntaxError(
"could not construct automaton: " + e.getMessage(), e);
}
return booleanQueryBuilder.build();
} else {
throw new SyntaxError("no MtasSolrSearchComponent found");
}
}
}