The following are code examples of org.apache.lucene.analysis.tokenattributes.OffsetAttribute, drawn from a variety of open-source projects.
public LuceneToken(TokenStream stream) {
this.stream = stream;
this.term = stream.getAttribute(CharTermAttribute.class);
this.offset = stream.getAttribute(OffsetAttribute.class);
try {
this.flag = this.stream.incrementToken();
if (!flag) {
this.stream.close();
}
} catch (Exception exception) {
try {
this.stream.close();
} catch (Exception ignored) {
// Ignore failures on close; the original exception is rethrown below.
}
throw new RuntimeException(exception);
}
}
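For orientation, the wrapper above consumes the first token eagerly in its constructor; a minimal sketch of companion accessors follows (the method names here are hypothetical, not part of the original class):

    // Hypothetical companions to the constructor above; names are illustrative only.
    public boolean hasToken() {
        return flag;
    }

    public String getTerm() {
        return term.toString();
    }

    public void next() throws IOException {
        // Advance the underlying stream, closing it once exhausted,
        // mirroring the constructor's handling of the first token.
        flag = stream.incrementToken();
        if (!flag) {
            stream.close();
        }
    }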
@Test
public void testSegmenter() throws Exception {
Tokenizer segmenter = getSegmenter();
String text = "中华人民共和国(People's Republic of China),简称'中国'";
segmenter.setReader(new StringReader(text));
segmenter.reset();
while (segmenter.incrementToken()) {
// term text
CharTermAttribute term = segmenter.getAttribute(CharTermAttribute.class);
// offsets
OffsetAttribute offset = segmenter.getAttribute(OffsetAttribute.class);
// position increment
PositionIncrementAttribute position = segmenter.getAttribute(PositionIncrementAttribute.class);
// token type
TypeAttribute type = segmenter.getAttribute(TypeAttribute.class);
LOGGER.debug(StringUtility.format("segmenter:term is {}, begin is {}, end is {}", term, offset.startOffset(), offset.endOffset()));
Assert.assertEquals(term.toString().toLowerCase(), text.substring(offset.startOffset(), offset.endOffset()).toLowerCase());
}
}
@Test
public void testCreate() throws Exception {
Map<String, String> args = new TreeMap<>();
args.put("enableTraditionalChineseMode", "true");
TokenizerFactory factory = new HanLpTokenizerFactory(args);
Tokenizer tokenizer = factory.create(null);
tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前" + "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有" + "突出外表、百變髮型及正面的形象,以至自己" + "品牌的男士香水等商品,及長期擔任運動品牌" + "Adidas的代言人,因此對大眾傳播媒介和時尚界" + "等方面都具很大的影響力,在足球圈外所獲得的" + "認受程度可謂前所未見。"));
tokenizer.reset();
while (tokenizer.incrementToken()) {
CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
// offsets
OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
// position increment
PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
// token type
TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
}
}
@Test
public void testCreateComponents() throws Exception {
String text = "中华人民共和国很辽阔";
for (int i = 0; i < text.length(); ++i) {
System.out.print(text.charAt(i) + "" + i + " ");
}
System.out.println();
try (Analyzer analyzer = new HanLpQueryAnalyzer("viterbi")) {
TokenStream tokenStream = analyzer.tokenStream("field", text);
tokenStream.reset();
while (tokenStream.incrementToken()) {
CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
// offsets
OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
// position increment
PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
// token type
TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
}
}
}
@Test
public void testIssue() throws Exception {
Map<String, String> args = new TreeMap<>();
args.put("enableTraditionalChineseMode", "true");
args.put("enableNormalization", "true");
HanLpTokenizerFactory factory = new HanLpTokenizerFactory(args);
Tokenizer tokenizer = factory.create();
String text = "會辦台星保證最低價的原因?";
tokenizer.setReader(new StringReader(text));
tokenizer.reset();
while (tokenizer.incrementToken()) {
CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
// offsets
OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
// position increment
PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
// token type
TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
}
}
public Tagger(Terms terms, Bits liveDocs, TokenStream tokenStream,
TagClusterReducer tagClusterReducer, boolean skipAltTokens,
boolean ignoreStopWords) throws IOException {
this.terms = terms;
this.liveDocs = liveDocs;
this.tokenStream = tokenStream;
this.skipAltTokens = skipAltTokens;
this.ignoreStopWords = ignoreStopWords;
byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class);
posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
taggingAtt = tokenStream.addAttribute(TaggingAttribute.class);
tokenStream.reset();
this.tagClusterReducer = tagClusterReducer;
}
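Given the attributes registered in this constructor, a consume loop might look like the following sketch (the method name is assumed; the real Tagger's lookup logic against `terms` and `liveDocs` is far more involved):

    // Sketch only: drain the stream using the attributes captured above.
    private void consumeSketch() throws IOException {
        while (tokenStream.incrementToken()) {
            int start = offsetAtt.startOffset();
            int end = offsetAtt.endOffset();
            BytesRef termBytes = byteRefAtt.getBytesRef(); // org.apache.lucene.util.BytesRef
            // The real implementation advances terms-dictionary lookups here.
        }
        tokenStream.end();
    }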
/**
* Tokenize the given input using a {@link URLTokenizer}. Settings which have been set on this {@link URLTokenFilter}
* will be passed along to the tokenizer.
* @param input a string to be tokenized
* @return a list of tokens extracted from the input string
* @throws IOException if an I/O error occurs while tokenizing the input
*/
private List<Token> tokenize(String input) throws IOException {
List<Token> tokens = new ArrayList<>();
URLTokenizer tokenizer = new URLTokenizer();
// create a copy of the parts list to avoid ConcurrentModificationException when sorting
tokenizer.setParts(new ArrayList<>(parts));
tokenizer.setUrlDecode(urlDecode);
tokenizer.setTokenizeHost(tokenizeHost);
tokenizer.setTokenizePath(tokenizePath);
tokenizer.setTokenizeQuery(tokenizeQuery);
tokenizer.setAllowMalformed(allowMalformed || passthrough);
tokenizer.setTokenizeMalformed(tokenizeMalformed);
tokenizer.setReader(new StringReader(input));
tokenizer.reset();
String term;
URLPart part;
OffsetAttribute offset;
while (tokenizer.incrementToken()) {
term = tokenizer.getAttribute(CharTermAttribute.class).toString();
part = URLPart.fromString(tokenizer.getAttribute(TypeAttribute.class).type());
offset = tokenizer.getAttribute(OffsetAttribute.class);
tokens.add(new Token(term, part, offset.startOffset(), offset.endOffset()));
}
return tokens;
}
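An illustrative call to the helper above (the URL value is assumed):

    // Each returned Token carries its term text, URLPart, and start/end offsets,
    // exactly as constructed in the loop above.
    List<Token> tokens = tokenize("http://www.example.com/foo?bar=1");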
public static void main(String[] args) throws IOException {
String str = "<body>perchééééééééé";
Analyzer anal = new DexterAnalyzer();
TokenStream ts = anal.tokenStream("content", new StringReader(str));
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
System.out.println(termAtt.toString());
System.out.println("token start offset: " + offsetAtt.startOffset());
System.out.println(" token end offset: " + offsetAtt.endOffset());
}
ts.end();
ts.close();
}
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer, String fieldName, String contents) throws IOException {
try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
// Can't split on term boundaries without offsets
return -1;
}
int end = -1;
tokenStream.reset();
while (tokenStream.incrementToken()) {
OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
if (attr.endOffset() >= noMatchSize) {
// Jump to the end of this token if it wouldn't put us past the boundary
if (attr.endOffset() == noMatchSize) {
end = noMatchSize;
}
return end;
}
end = attr.endOffset();
}
tokenStream.end();
// We've exhausted the token stream so we should just highlight everything.
return end;
}
}
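An illustrative call, assuming a StandardAnalyzer and example values:

    // Hypothetical usage: find a token boundary at or before offset 20.
    int end = findGoodEndForNoHighlightExcerpt(20, new StandardAnalyzer(), "body",
            "The quick brown fox jumps over the lazy dog");
    // Returns -1 if the analyzer emits no offsets; otherwise the end offset of the
    // last token that fits within the first 20 characters.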
public static void main(String[] args) throws IOException {
Analyzer az = CustomAnalyzer.builder()
//.withTokenizer("Standard")
.withTokenizer("Name")
.addTokenFilter("EdgeNGram", "minGramSize", "1", "maxGramSize", "20")
//.addTokenFilter("ICUTransform", "id", "Han-Latin;NFD;[[:NonspacingMark:][:Space:]] Remove")
//.addTokenFilter("EdgeNGram", "minGramSize", "1", "maxGramSize", "20")
.build();
StringReader sr = new StringReader(args[0]);
TokenStream ts = az.tokenStream("", sr);
OffsetAttribute oa = ts.addAttribute(OffsetAttribute.class);
CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
try {
ts.reset(); // Resets this stream to the beginning. (Required)
while (ts.incrementToken()) {
System.out.println(ta.toString() + "|" + ta.length()
+ "[" + oa.startOffset() + "," + oa.endOffset() + "]");
}
ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
} finally {
ts.close(); // Release resources associated with this stream.
}
}
private void emit(char[] tokenChars) {
char[] token = tokenChars;
if (replaceWhitespaceWith != null) {
token = replaceWhiteSpace(token);
}
CharTermAttribute termAttr = getTermAttribute();
if (termAttr != null) {
termAttr.setEmpty();
termAttr.append(new String(token));
}
OffsetAttribute offAttr = getOffsetAttribute();
if (offAttr != null && offAttr.endOffset() >= token.length) {
int start = offAttr.endOffset() - token.length;
offAttr.setOffset(start, offAttr.endOffset());
}
PositionIncrementAttribute pia = getPositionIncrementAttribute();
if (pia != null) {
pia.setPositionIncrement(++positionIncr);
}
lastEmitted = token;
}
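The replaceWhiteSpace helper is not shown in this snippet; a plausible sketch, assuming replaceWhitespaceWith is a Character field:

    // Hypothetical reconstruction; the real helper may differ.
    private char[] replaceWhiteSpace(char[] token) {
        char[] copy = new char[token.length];
        for (int i = 0; i < token.length; i++) {
            copy[i] = Character.isWhitespace(token[i]) ? replaceWhitespaceWith : token[i];
        }
        return copy;
    }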
public void testTokenizerReuse() throws IOException
{
// We should be able to use the same Tokenizer twice.
final String path = "uri1:one";
StringReader reader = new StringReader(path);
PathTokenFilter ts = new PathTokenFilter(PathTokenFilter.PATH_SEPARATOR,
PathTokenFilter.SEPARATOR_TOKEN_TEXT, PathTokenFilter.NO_NS_TOKEN_TEXT,
PathTokenFilter.NAMESPACE_START_DELIMITER, PathTokenFilter.NAMESPACE_END_DELIMITER, true);
ts.setReader(reader);
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
// First use
tokenise(ts, new String[]{"uri1", "one"});
assertEquals(path.length(), offsetAtt.startOffset());
assertEquals(path.length(), offsetAtt.endOffset());
// Second use
final String path2 = "/{uri1}one/uri2:two/";
StringReader reader2 = new StringReader(path2);
ts.setReader(reader2);
tokenise(ts, new String[]{"uri1", "one", "uri2", "two"});
assertEquals(path2.length(), offsetAtt.startOffset());
assertEquals(path2.length(), offsetAtt.endOffset());
}
public void testAttributesAfterStreamEnd() throws IOException
{
final String path = "uri1:one";
StringReader reader = new StringReader(path);
PathTokenFilter ts = new PathTokenFilter(PathTokenFilter.PATH_SEPARATOR,
PathTokenFilter.SEPARATOR_TOKEN_TEXT, PathTokenFilter.NO_NS_TOKEN_TEXT,
PathTokenFilter.NAMESPACE_START_DELIMITER, PathTokenFilter.NAMESPACE_END_DELIMITER, true);
ts.setReader(reader);
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
// PathTokenFilter.end() will be called after all tokens consumed.
tokenise(ts, new String[]{"uri1", "one"});
// Check attributes cleaned up
assertEquals("", termAtt.toString());
assertEquals("word", typeAtt.type()); // the default
assertEquals(0, posIncAtt.getPositionIncrement());
// Final offset...
assertEquals(path.length(), offsetAtt.startOffset());
assertEquals(path.length(), offsetAtt.endOffset());
}
@Test
public void testSearch() throws IOException {
LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
TokenStream tokenStream = analyzer.tokenStream("lc", "重qing");
CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
PositionIncrementAttribute positionIncrementAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
tokenStream.reset();
Assert.assertTrue(tokenStream.incrementToken());
Assert.assertEquals(charTermAttribute.toString(), "重");
Assert.assertEquals(offsetAttribute.startOffset(), 0);
Assert.assertEquals(offsetAttribute.endOffset(), 1);
Assert.assertEquals(positionIncrementAttribute.getPositionIncrement(), 1);
Assert.assertTrue(tokenStream.incrementToken());
Assert.assertEquals(charTermAttribute.toString(), "qing");
Assert.assertEquals(offsetAttribute.startOffset(), 1);
Assert.assertEquals(offsetAttribute.endOffset(), 5);
Assert.assertEquals(positionIncrementAttribute.getPositionIncrement(), 1);
tokenStream.close();
}
public void testFullPinyinFilter() throws IOException {
LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
TokenStream tokenStream = analyzer.tokenStream("lc", "作者 : 陈楠");
LcPinyinTokenFilter lcPinyinTokenFilter = new LcPinyinTokenFilter(tokenStream, PinyinFilterSetting.full_pinyin);
CharTermAttribute charTermAttribute = lcPinyinTokenFilter.getAttribute(CharTermAttribute.class);
OffsetAttribute offsetAttribute = lcPinyinTokenFilter.getAttribute(OffsetAttribute.class);
PositionIncrementAttribute positionIncrementAttribute = lcPinyinTokenFilter.getAttribute(PositionIncrementAttribute.class);
lcPinyinTokenFilter.reset();
while (lcPinyinTokenFilter.incrementToken()) {
System.out.println(charTermAttribute.toString() + ":" + offsetAttribute.startOffset() + "," + offsetAttribute.endOffset() + ":" + positionIncrementAttribute.getPositionIncrement());
}
lcPinyinTokenFilter.close();
}
public void testFirstLetterFilter() throws IOException {
LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
TokenStream tokenStream = analyzer.tokenStream("lc", "作者 : 陈楠");
LcPinyinTokenFilter lcPinyinTokenFilter = new LcPinyinTokenFilter(tokenStream, PinyinFilterSetting.first_letter);
CharTermAttribute charTermAttribute = lcPinyinTokenFilter.getAttribute(CharTermAttribute.class);
OffsetAttribute offsetAttribute = lcPinyinTokenFilter.getAttribute(OffsetAttribute.class);
PositionIncrementAttribute positionIncrementAttribute = lcPinyinTokenFilter.getAttribute(PositionIncrementAttribute.class);
lcPinyinTokenFilter.reset();
while (lcPinyinTokenFilter.incrementToken()) {
System.out.println(charTermAttribute.toString() + ":" + offsetAttribute.startOffset() + "," + offsetAttribute.endOffset() + ":" + positionIncrementAttribute.getPositionIncrement());
}
lcPinyinTokenFilter.close();
}
@Override
public TokenStreamComponents createComponents(String fieldName) {
Tokenizer ts = new Tokenizer() {
final char[] cbuf = new char[maxChars];
final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
final BytesTermAttribute bytesAtt = isPointField() ? addAttribute(BytesTermAttribute.class) : null;
final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@Override
public boolean incrementToken() throws IOException {
clearAttributes();
int n = input.read(cbuf, 0, maxChars);
if (n <= 0) return false;
if (isPointField()) {
BytesRef b = ((PointField)FieldType.this).toInternalByteRef(new String(cbuf, 0, n));
bytesAtt.setBytesRef(b);
} else {
String s = toInternal(new String(cbuf, 0, n));
termAtt.setEmpty().append(s);
}
offsetAtt.setOffset(correctOffset(0), correctOffset(n));
return true;
}
};
return new TokenStreamComponents(ts);
}
public void testCreateComponents() throws Exception
{
String text = "中华人民共和国很辽阔";
for (int i = 0; i < text.length(); ++i)
{
System.out.print(text.charAt(i) + "" + i + " ");
}
System.out.println();
Analyzer analyzer = new HanLPAnalyzer();
TokenStream tokenStream = analyzer.tokenStream("field", text);
tokenStream.reset();
while (tokenStream.incrementToken())
{
CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
// offsets
OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
// position increment
PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
// token type
TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
}
}
@Override
public boolean incrementToken() throws IOException {
boolean newSource = false;
while (sources[currentSource].incrementToken() == false) {
if (currentSource >= sources.length - 1)
return false;
sources[currentSource].end();
initialPositionIncrement = sourceIncrements[currentSource].getPositionIncrement();
OffsetAttribute att = sourceOffsets[currentSource];
if (att != null)
offsetIncrement += att.endOffset();
currentSource++;
newSource = true;
}
clearAttributes();
sources[currentSource].copyTo(this);
offsetAtt.setOffset(offsetAtt.startOffset() + offsetIncrement, offsetAtt.endOffset() + offsetIncrement);
if (newSource) {
int posInc = posIncAtt.getPositionIncrement();
posIncAtt.setPositionIncrement(posInc + initialPositionIncrement);
}
return true;
}
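The fields this incrementToken() relies on are not shown; a plausible declaration sketch for the stream-concatenating filter (names taken from the method body, initial values assumed):

    // Hypothetical field declarations for the concatenating filter above.
    private TokenStream[] sources;                         // streams to concatenate
    private OffsetAttribute[] sourceOffsets;               // per-source offset attributes
    private PositionIncrementAttribute[] sourceIncrements; // per-source increment attributes
    private int currentSource = 0;
    private int offsetIncrement = 0;                       // shift applied to later sources
    private int initialPositionIncrement = 0;
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);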
protected String displayTokens(String text, String elementId) throws IOException {
Analyzer analyzer = new LuceneSimpleAnalyzer(isCaseSensitive, removeAccents);
StringBuilder sb = new StringBuilder();
sb.append(elementId).append(": ").append(text).append(": ");
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
int startOffset = offsetAttribute.startOffset();
int endOffset = offsetAttribute.endOffset();
String term = charTermAttribute.toString();
sb.append("[" + term + "](" + startOffset + "," + endOffset + ") ");
}
tokenStream.end();
tokenStream.close();
return sb.toString();
}
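An illustrative call (element id assumed; the output format follows the appends above):

    // Prints something like: title: Hello World: [hello](0,5) [world](6,11)
    // assuming the analyzer lowercases its tokens.
    System.out.println(displayTokens("Hello World", "title"));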
public void assertEquals(String s, TokenStream left, TokenStream right) throws Exception {
left.reset();
right.reset();
CharTermAttribute leftTerm = left.addAttribute(CharTermAttribute.class);
CharTermAttribute rightTerm = right.addAttribute(CharTermAttribute.class);
OffsetAttribute leftOffset = left.addAttribute(OffsetAttribute.class);
OffsetAttribute rightOffset = right.addAttribute(OffsetAttribute.class);
PositionIncrementAttribute leftPos = left.addAttribute(PositionIncrementAttribute.class);
PositionIncrementAttribute rightPos = right.addAttribute(PositionIncrementAttribute.class);
while (left.incrementToken()) {
assertTrue("wrong number of tokens for input: " + s, right.incrementToken());
assertEquals("wrong term text for input: " + s, leftTerm.toString(), rightTerm.toString());
assertEquals("wrong position for input: " + s, leftPos.getPositionIncrement(), rightPos.getPositionIncrement());
assertEquals("wrong start offset for input: " + s, leftOffset.startOffset(), rightOffset.startOffset());
assertEquals("wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
}
assertFalse("wrong number of tokens for input: " + s, right.incrementToken());
left.end();
right.end();
assertEquals("wrong final offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
left.close();
right.close();
}
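An illustrative use of this assertion helper, assuming two analyzers that should agree (WhitespaceAnalyzer is just a stand-in):

    String s = "foo bar baz";
    assertEquals(s,
            new WhitespaceAnalyzer().tokenStream("f", s),
            new WhitespaceAnalyzer().tokenStream("f", s));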
/**
* Analyzes the given TokenStream, collecting the Tokens it produces.
*
* @param tokenStream TokenStream to analyze
*
* @return List of tokens produced from the TokenStream
*/
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
final List<AttributeSource> tokens = new ArrayList<>();
final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
// for backwards compatibility, add all "common" attributes
tokenStream.addAttribute(OffsetAttribute.class);
tokenStream.addAttribute(TypeAttribute.class);
try {
tokenStream.reset();
int position = 0;
while (tokenStream.incrementToken()) {
position += posIncrAtt.getPositionIncrement();
trackerAtt.setActPosition(position);
tokens.add(tokenStream.cloneAttributes());
}
tokenStream.end(); // TODO should we capture?
} catch (IOException ioe) {
throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
} finally {
IOUtils.closeWhileHandlingException(tokenStream);
}
return tokens;
}
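An illustrative call (the analyzer is assumed; any TokenStream works):

    // Collect one cloned AttributeSource snapshot per token of "hello world".
    List<AttributeSource> tokens = analyzeTokenStream(
            new WhitespaceAnalyzer().tokenStream("f", "hello world"));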
protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
TokenStream stream = analyzer.tokenStream("", text);
// TODO: support custom attributes
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
stream.reset();
while (stream.incrementToken()) {
Token token = new Token();
token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
token.setOffset(offset + offsetAtt.startOffset(),
offset + offsetAtt.endOffset());
token.setFlags(flagsAttValue); //overwriting any flags already set...
token.setType(typeAtt.type());
token.setPayload(payloadAtt.getPayload());
token.setPositionIncrement(posIncAtt.getPositionIncrement());
result.add(token);
}
stream.end();
stream.close();
}
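An illustrative call (values assumed): collect tokens for a phrase whose offsets should be shifted by 10 within the enclosing document, with no flags set:

    List<Token> out = new ArrayList<>();
    analyze(out, "hello world", 10, 0);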
public static void main(String[] args) {
// SynonymsLibrary.put(SynonymsLibrary.DEFAULT, "../../library/synonyms.dic");
//
// DicLibrary.insert(DicLibrary.DEFAULT, "清华", "n", 2000);
// DicLibrary.insert(DicLibrary.DEFAULT, "大学", "n", 2000);
Map<String, String> map = new HashMap<String, String>();
map.put("type", "base_ansj");
// map.put(SynonymsLibrary.DEFAULT, SynonymsLibrary.DEFAULT);
Analyzer ca = new AnsjAnalyzer(map);
String content = "我爱北京天安门天安门上太阳升我美丽的清华大学";
try {
TokenStream tokenStream = ca.tokenStream(content, new StringReader(content));
tokenStream.reset(); // reset() is required before the first incrementToken() call
while (tokenStream.incrementToken()) {
System.out.print(tokenStream.getAttribute(CharTermAttribute.class));
System.out.print("\t");
System.out.print(tokenStream.getAttribute(OffsetAttribute.class).startOffset());
System.out.print("\t");
System.out.print(tokenStream.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
System.out.print("\t");
System.out.println(tokenStream.getAttribute(TypeAttribute.class).type());
}
tokenStream.end();
tokenStream.close();
} catch (IOException e) {
e.printStackTrace();
}
ca.close();
}
/**
* Get a list of {@link Token}s from the given {@link Tokenizer}
* @param part the url part which should be used in {@link Token} creation
* @param tokenizer the tokenizer from which tokens will be gleaned
* @param start the offset added to each token's start and end offsets
* @return a list of tokens
* @throws IOException if an error occurs while incrementing the tokenizer
*/
private List<Token> tokenize(URLPart part, Tokenizer tokenizer, int start) throws IOException {
tokenizer.reset();
List<Token> tokens = new ArrayList<>();
OffsetAttribute offset;
String token;
while (tokenizer.incrementToken()) {
token = tokenizer.getAttribute(CharTermAttribute.class).toString();
offset = tokenizer.getAttribute(OffsetAttribute.class);
tokens.add(new Token(token, part, start + offset.startOffset(), start + offset.endOffset()));
}
return tokens;
}
@Test
public void testIncrementToken() throws Exception {
while (tokenizer.incrementToken()) {
CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
// offsets
OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
// position increment
PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
// token type
TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
}
}
public void testCreate() throws Exception
{
Map<String, String> args = new TreeMap<>();
args.put("enableTraditionalChineseMode", "true");
TokenizerFactory factory = new HanLPTokenizerFactory(args);
Tokenizer tokenizer = factory.create(null);
tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前" +
"辣妹合唱團成員維多利亞·碧咸,亦由於他擁有" +
"突出外表、百變髮型及正面的形象,以至自己" +
"品牌的男士香水等商品,及長期擔任運動品牌" +
"Adidas的代言人,因此對大眾傳播媒介和時尚界" +
"等方面都具很大的影響力,在足球圈外所獲得的" +
"認受程度可謂前所未見。"));
tokenizer.reset();
while (tokenizer.incrementToken())
{
CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
// offsets
OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
// position increment
PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
// token type
TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
}
}
@Test
public void testTokenize() throws Exception {
String[] texts = {
// whitespace
" ",
// sentence
"中华人民共和国(People's Republic of China),简称'中国'",
// long text
"JStarCraft AI 1.0的目标是提供一个完整的Java机器学习(Machine Learning/ML)框架,作为人工智能在学术界与工业界的桥梁. 让相关领域的研发人员能够在各种软硬件环境/数据结构/算法/模型之间无缝切换. 涵盖了从数据处理到模型的训练与评估各个环节,支持硬件加速和并行计算,是最快最全的Java机器学习库." };
for (String text : texts) {
// test tokenization via the Tokenizer
NlpTokenizer<? extends NlpToken> tokenizer = getTokenizer();
Iterable<? extends NlpToken> tokens = tokenizer.tokenize(text);
for (NlpToken token : tokens) {
LOGGER.debug(StringUtility.format("tokenizer:term is {}, begin is {}, end is {}", token.getTerm(), token.getBegin(), token.getEnd()));
Assert.assertEquals(token.getTerm().toLowerCase(), text.substring(token.getBegin(), token.getEnd()).toLowerCase());
}
// test tokenization via the Segmenter
try (Tokenizer segmenter = new NlpSegmenter(BreakIterator.getSentenceInstance(), tokenizer)) {
segmenter.setReader(new StringReader(text));
segmenter.reset();
while (segmenter.incrementToken()) {
// term text
CharTermAttribute term = segmenter.getAttribute(CharTermAttribute.class);
// offsets
OffsetAttribute offset = segmenter.getAttribute(OffsetAttribute.class);
// position increment
PositionIncrementAttribute position = segmenter.getAttribute(PositionIncrementAttribute.class);
// token type
TypeAttribute type = segmenter.getAttribute(TypeAttribute.class);
LOGGER.debug(StringUtility.format("segmenter:term is {}, begin is {}, end is {}", term, offset.startOffset(), offset.endOffset()));
Assert.assertEquals(term.toString().toLowerCase(), text.substring(offset.startOffset(), offset.endOffset()).toLowerCase());
}
}
}
}
/**
* Constructor for the Lucene 4.0 Tokenizer adapter class
*/
public IKTokenizer(Configuration configuration) {
super();
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(CharTermAttribute.class);
typeAtt = addAttribute(TypeAttribute.class);
posIncrAtt = addAttribute(PositionIncrementAttribute.class);
_IKImplement = new IKSegmenter(input, configuration);
}
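For context, the matching incrementToken() in IK-style adapters typically follows the sketch below; the Lexeme accessor names reflect commonly published IK Analyzer sources and should be treated as an approximation, not the exact source:

    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        Lexeme nextLexeme = _IKImplement.next();
        if (nextLexeme != null) {
            termAtt.append(nextLexeme.getLexemeText());
            offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
            typeAtt.setType(nextLexeme.getLexemeTypeString());
            return true;
        }
        return false;
    }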
private OffsetAttribute getOffsetAttribute() {
Iterator<AttributeImpl> attrIt = getAttributeImplsIterator();
while (attrIt != null && attrIt.hasNext()) {
AttributeImpl attrImp = attrIt.next();
if (attrImp instanceof OffsetAttribute) {
return (OffsetAttribute) attrImp;
}
}
return null;
}
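When the attribute is registered through the standard AttributeSource API, the lookup above can be written more simply; a sketch assuming equivalent registration:

    // Equivalent under the assumption that OffsetAttribute was added via addAttribute().
    private OffsetAttribute getOffsetAttributeSimple() {
        return hasAttribute(OffsetAttribute.class) ? getAttribute(OffsetAttribute.class) : null;
    }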