下面列出了 java.text.BreakIterator 类的 API 使用示例代码及写法；也可以点击链接到 GitHub 查看源代码。
/**
* Returns the type of the break.
* @return {@code none}, {@code number}, {@code letter}, {@code kana}, {@code ideo} or {@code unknown}
*/
@JsxFunction
public String breakType() {
if (!typeAlwaysNone_) {
final int current = current();
final int previous = breakIterator_.previous();
if (previous == BreakIterator.DONE) {
first();
}
else {
next();
}
if (current != BreakIterator.DONE && previous != BreakIterator.DONE) {
final String token = text_.substring(previous, current);
if (token.matches(".*[a-zA-Z]+.*")) {
return "letter";
}
if (token.matches("[0-9]+")) {
return "number";
}
}
}
return "none";
}
/**
* Needed to unify forward and backward searching.
* The method assumes that s is the text assigned to words.
*/
private int findWordLimit(int index, BreakIterator words, boolean direction,
String s) {
// Fix for 4256660 and 4256661.
// Words iterator is different from character and sentence iterators
// in that end of one word is not necessarily start of another word.
// Please see java.text.BreakIterator JavaDoc. The code below is
// based on nextWordStartAfter example from BreakIterator.java.
int last = (direction == NEXT) ? words.following(index)
: words.preceding(index);
int current = (direction == NEXT) ? words.next()
: words.previous();
while (current != BreakIterator.DONE) {
for (int p = Math.min(last, current); p < Math.max(last, current); p++) {
if (Character.isLetter(s.charAt(p))) {
return last;
}
}
last = current;
current = (direction == NEXT) ? words.next()
: words.previous();
}
return BreakIterator.DONE;
}
@Override
public void execute(Tuple tuple, BasicOutputCollector collector) {
//Get the sentence content from the tuple
String sentence = tuple.getString(0);
//An iterator to get each word
BreakIterator boundary=BreakIterator.getWordInstance();
//Give the iterator the sentence
boundary.setText(sentence);
//Find the beginning first word
int start=boundary.first();
//Iterate over each word and emit it to the output stream
for (int end = boundary.next(); end != BreakIterator.DONE; start=end, end=boundary.next()) {
//get the word
String word=sentence.substring(start,end);
//If a word is whitespace characters, replace it with empty
word=word.replaceAll("\\s+","");
//if it's an actual word, emit it
if (!word.equals("")) {
collector.emit(new Values(word));
}
}
}
public static void usingBreakIterator() {
Locale currentLocale = new Locale("en", "US");
BreakIterator sentenceIterator = BreakIterator.getSentenceInstance();
sentenceIterator.setText(paragraph);
int boundary = sentenceIterator.first();
while (boundary != BreakIterator.DONE) {
int begin = boundary;
System.out.print(boundary + "-");
boundary = sentenceIterator.next();
int end = boundary;
if (end == BreakIterator.DONE) {
break;
}
System.out.println(boundary + " ["
+ paragraph.substring(begin, end) + "]");
}
}
@Override
public void selectWord(int wordPositionInArea) {
if(area.getLength() == 0) {
return;
}
BreakIterator breakIterator = BreakIterator.getWordInstance( getArea().getLocale() );
breakIterator.setText(area.getText());
breakIterator.preceding(wordPositionInArea);
breakIterator.next();
int wordStart = breakIterator.current();
breakIterator.following(wordPositionInArea);
breakIterator.next();
int wordEnd = breakIterator.current();
selectRange(wordStart, wordEnd);
}
@Override
public @Nullable int[] preceding(int offset) {
final int textLegth = getIteratorText().length();
if (textLegth <= 0) {
return null;
}
if (offset <= 0) {
return null;
}
int end = offset;
if (end > textLegth) {
end = textLegth;
}
while (!breakIterator.isBoundary(end)) {
end = breakIterator.preceding(end);
if (end == BreakIterator.DONE) {
return null;
}
}
final int start = breakIterator.preceding(end);
if (start == BreakIterator.DONE) {
return null;
}
return getRange(start, end);
}
void TestNext() {
iter = BreakIterator.getWordInstance(Locale.US);
for (int i = 0; i < given.length; i++) {
iter.setText(given[i]);
start = iter.first();
int j = expected[i].length - 1;
start = iter.next(j);
end = iter.next();
if (!expected[i][j].equals(given[i].substring(start, end))) {
errln("Word break failure: printEachForward() expected:<" +
expected[i][j] + ">, got:<" +
given[i].substring(start, end) +
"> start=" + start + " end=" + end);
}
}
}
private void testBreakIterator(BreakIterator bi, String text, String boundaries) {
bi.setText(text);
//Test first & last
testFirstAndLast(bi, text, boundaries);
//Test if expected boundaries are consistent with reading them from next() in a loop:
assertEquals(boundaries, readBoundariesToString(bi, text));
//Test following() and preceding():
// get each index, randomized in case their is a sequencing bug:
List<Integer> indexes = randomIntsBetweenInclusive(text.length() + 1);
testFollowing(bi, text, boundaries, indexes);
testPreceding(bi, text, boundaries, indexes);
//Test previous():
testPrevious(bi, text, boundaries);
}
private Vector testLastAndPrevious(BreakIterator bi, String text) {
int p = bi.last();
int lastP = p;
Vector<String> result = new Vector<String>();
if (p != text.length())
errln("last() returned " + p + " instead of " + text.length());
while (p != BreakIterator.DONE) {
p = bi.previous();
if (p != BreakIterator.DONE) {
if (p >= lastP)
errln("previous() failed to move backward: previous() on position "
+ lastP + " yielded " + p);
result.insertElementAt(text.substring(p, lastP), 0);
}
else {
if (lastP != 0)
errln("previous() returned DONE prematurely: offset was "
+ lastP + " instead of 0");
}
lastP = p;
}
return result;
}
private void testPreceding(BreakIterator bi, String text, int[] boundaries) {
logln("testPreceding():");
int p = 0;
int i = 0;
try {
for (i = 0; i <= text.length(); i++) { // change to <= when new BI code goes in
int b = bi.preceding(i);
logln("bi.preceding(" + i + ") -> " + b);
if (b != boundaries[p])
errln("Wrong result from preceding() for " + i + ": expected " + boundaries[p]
+ ", got " + b);
if (i == boundaries[p + 1])
++p;
}
} catch (IllegalArgumentException illargExp) {
errln("IllegalArgumentException caught from preceding() for offset: " + i);
}
}
private void testIsBoundary(BreakIterator bi, String text, int[] boundaries) {
logln("testIsBoundary():");
int p = 1;
boolean isB;
for (int i = 0; i <= text.length(); i++) { // change to <= when new BI code goes in
isB = bi.isBoundary(i);
logln("bi.isBoundary(" + i + ") -> " + isB);
if (i == boundaries[p]) {
if (!isB)
errln("Wrong result from isBoundary() for " + i + ": expected true, got false");
++p;
}
else {
if (isB)
errln("Wrong result from isBoundary() for " + i + ": expected false, got true");
}
}
}
private void testPreceding(BreakIterator bi, String text, int[] boundaries) {
logln("testPreceding():");
int p = 0;
int i = 0;
try {
for (i = 0; i <= text.length(); i++) { // change to <= when new BI code goes in
int b = bi.preceding(i);
logln("bi.preceding(" + i + ") -> " + b);
if (b != boundaries[p])
errln("Wrong result from preceding() for " + i + ": expected " + boundaries[p]
+ ", got " + b);
if (i == boundaries[p + 1])
++p;
}
} catch (IllegalArgumentException illargExp) {
errln("IllegalArgumentException caught from preceding() for offset: " + i);
}
}
/**
* Tries to find the word at the given offset.
*
* @param line
* the line
* @param offset
* the offset
* @return the word or <code>null</code> if none
*/
protected static IRegion findWordRegion(String line, int offset)
{
BreakIterator breakIter = BreakIterator.getWordInstance();
breakIter.setText(line);
int start = breakIter.preceding(offset);
if (start == BreakIterator.DONE)
start = 0;
int end = breakIter.following(offset);
if (end == BreakIterator.DONE)
end = line.length();
if (breakIter.isBoundary(offset))
{
if (end - offset > offset - start)
{
start = offset;
}
else
{
end = offset;
}
}
if (end == start)
{
return new Region(start, 0);
}
return new Region(start, end - start);
}
/**
* Make sure highlighter returns whole text when there
* are no hits and BreakIterator is null.
*/
public void testEmptyHighlightsWhole() throws Exception {
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer);
Document doc = new Document();
Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", fieldType);
doc.add(body);
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer) {
@Override
protected BreakIterator getBreakIterator(String field) {
return new WholeBreakIterator();
}
};
Query query = new TermQuery(new Term("body", "highlighting"));
int[] docIDs = new int[]{0};
String snippets[] = highlighter.highlightFields(new String[]{"body"}, query, docIDs, new int[]{2}).get("body");
assertEquals(1, snippets.length);
assertEquals("test this is. another sentence this test has. far away is that planet.", snippets[0]);
ir.close();
}
private String preprocessQuery(String aQuery)
{
String result;
if (!(aQuery.contains("\"") || aQuery.contains("[") || aQuery.contains("]")
|| aQuery.contains("{") || aQuery.contains("}") || aQuery.contains("<")
|| aQuery.contains(">"))) {
// Convert raw words query to a Mtas CQP query
result = "";
BreakIterator words = BreakIterator.getWordInstance();
words.setText(aQuery);
int start = words.first();
int end = words.next();
while (end != BreakIterator.DONE) {
String word = aQuery.substring(start, end);
if (!word.trim().isEmpty()) {
// Add the word to the query
result += "\"" + word + "\"";
}
start = end;
end = words.next();
if (end != BreakIterator.DONE) {
result += " ";
}
}
}
else {
result = aQuery;
}
return result;
}
/**
* Implements the "Final_Cased" condition
*
* Specification: Within the closest word boundaries containing C, there is a cased
* letter before C, and there is no cased letter after C.
*
* Regular Expression:
* Before C: [{cased==true}][{wordBoundary!=true}]*
* After C: !([{wordBoundary!=true}]*[{cased}])
*/
private static boolean isFinalCased(String src, int index, Locale locale) {
BreakIterator wordBoundary = BreakIterator.getWordInstance(locale);
wordBoundary.setText(src);
int ch;
// Look for a preceding 'cased' letter
for (int i = index; (i >= 0) && !wordBoundary.isBoundary(i);
i -= Character.charCount(ch)) {
ch = src.codePointBefore(i);
if (isCased(ch)) {
int len = src.length();
// Check that there is no 'cased' letter after the index
for (i = index + Character.charCount(src.codePointAt(index));
(i < len) && !wordBoundary.isBoundary(i);
i += Character.charCount(ch)) {
ch = src.codePointAt(i);
if (isCased(ch)) {
return false;
}
}
return true;
}
}
return false;
}
void TestPrintAt_2() {
iter = BreakIterator.getWordInstance(Locale.US);
int[][] index = {
{2, 9, 10, 15, 17},
{1, 9, 10, 13, 16, 18, 20},
{4, 9, 10, 13, 16, 18, 20},
{6, 7, 10, 11, 15},
};
for (int i = 0; i < given.length; i++) {
iter.setText(given[i]);
// Check preceding(0)'s return value - should equals BreakIterator.DONE.
if (iter.preceding(0) != BreakIterator.DONE) {
errln("Word break failure: printAt_2() expected:-1(BreakIterator.DONE), got:" +
iter.preceding(0));
}
for (int j = 0; j < index[i].length; j++) {
start = iter.preceding(index[i][j]);
end = iter.next();
if (!expected[i][j].equals(given[i].substring(start, end))) {
errln("Word break failure: printAt_2() expected:<" +
expected[i][j] + ">, got:<" +
given[i].substring(start, end) +
"> start=" + start + " end=" + end);
}
}
// Check next()'s return value - should equals BreakIterator.DONE.
end = iter.last();
start = iter.next();
if (start != BreakIterator.DONE) {
errln("Word break failure: printAt_2() expected:-1(BreakIterator.DONE), got:" + start);
}
}
}
private void makeLayoutWindow(int localStart) {
int compStart = localStart;
int compLimit = fChars.length;
// If we've already gone past the layout window, format to end of paragraph
if (layoutCount > 0 && !haveLayoutWindow) {
float avgLineLength = Math.max(layoutCharCount / layoutCount, 1);
compLimit = Math.min(localStart + (int)(avgLineLength*EST_LINES), fChars.length);
}
if (localStart > 0 || compLimit < fChars.length) {
if (charIter == null) {
charIter = new CharArrayIterator(fChars);
}
else {
charIter.reset(fChars);
}
if (fLineBreak == null) {
fLineBreak = BreakIterator.getLineInstance();
}
fLineBreak.setText(charIter);
if (localStart > 0) {
if (!fLineBreak.isBoundary(localStart)) {
compStart = fLineBreak.preceding(localStart);
}
}
if (compLimit < fChars.length) {
if (!fLineBreak.isBoundary(compLimit)) {
compLimit = fLineBreak.following(compLimit);
}
}
}
ensureComponents(compStart, compLimit);
haveLayoutWindow = true;
}
private void doMultipleSelectionTest(BreakIterator iterator, String testText)
{
logln("Multiple selection test...");
BreakIterator testIterator = (BreakIterator)iterator.clone();
int offset = iterator.first();
int testOffset;
int count = 0;
do {
testOffset = testIterator.first();
testOffset = testIterator.next(count);
logln("next(" + count + ") -> " + testOffset);
if (offset != testOffset)
errln("next(n) and next() not returning consistent results: for step " + count + ", next(n) returned " + testOffset + " and next() had " + offset);
if (offset != BreakIterator.DONE) {
count++;
offset = iterator.next();
}
} while (offset != BreakIterator.DONE);
// now do it backwards...
offset = iterator.last();
count = 0;
do {
testOffset = testIterator.last();
testOffset = testIterator.next(count);
logln("next(" + count + ") -> " + testOffset);
if (offset != testOffset)
errln("next(n) and next() not returning consistent results: for step " + count + ", next(n) returned " + testOffset + " and next() had " + offset);
if (offset != BreakIterator.DONE) {
count--;
offset = iterator.previous();
}
} while (offset != BreakIterator.DONE);
}
/**
* Sets the iterator to refer to the first boundary position following
* the specified position.
*
* <p>The result is also stored in {@code cachedLastKnownBreak}, which a later call
* uses as a starting point when it lies before the requested offset.
*
* @param offset The position from which to begin searching for a break position.
* @return The position of the first break after the specified offset, or
* {@code BreakIterator.DONE} if the scan finds none.
*/
@Override
public int following(int offset) {
CharacterIterator text = getText();
// NOTE(review): presumably rejects offsets outside the text; checkOffset is not visible here.
checkOffset(offset, text);
// Set our internal iteration position (temporarily)
// to the position passed in. If this is the _beginning_ position,
// then we can just use next() to get our return value
text.setIndex(offset);
if (offset == text.getBeginIndex()) {
cachedLastKnownBreak = handleNext();
return cachedLastKnownBreak;
}
// otherwise, we have to sync up first. Use handlePrevious() to back
// us up to a known break position before the specified position (if
// we can determine that the specified position is a break position,
// we don't back up at all). This may or may not be the last break
// position at or before our starting position. Advance forward
// from here until we've passed the starting position. The position
// we stop on will be the first break position after the specified one.
int result = cachedLastKnownBreak;
// The cached break is only usable when it lies strictly before the offset
// and is a real position (not DONE or lower).
if (result >= offset || result <= BreakIterator.DONE) {
result = handlePrevious();
} else {
// It might be better to check whether handlePrevious() gives us a closer
// safe value, but handlePrevious() is slow too,
// so this has to be done carefully.
text.setIndex(result);
}
// Step forward until the break is past the offset or there are no more breaks.
while (result != BreakIterator.DONE && result <= offset) {
result = handleNext();
}
cachedLastKnownBreak = result;
return result;
}
public BreakIteratorTest()
{
characterBreak = BreakIterator.getCharacterInstance();
wordBreak = BreakIterator.getWordInstance();
lineBreak = BreakIterator.getLineInstance();
sentenceBreak = BreakIterator.getSentenceInstance();
}
public void testSliceMiddle() throws Exception {
NLPSentenceDetectorOp sentenceDetectorOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
BreakIterator bi = new OpenNLPSentenceBreakIterator(sentenceDetectorOp);
bi.setText(getCharArrayIterator(PADDING + SENTENCES[0] + PADDING, PADDING.length(), SENTENCES[0].length()));
test1Sentence(bi, SENTENCES[0]);
}
@Override
public void run() {
// Check whether we are in a java code partition and the preference is enabled
final IPreferenceStore store = getPreferenceStore();
if (store.getString(SubWordPreferences.WORD_NAVIGATION_STYLE)
.equals(SubWordPreferences.WORD_NAVIGATION_STYLE_NATIVE)) {
super.run();
return;
}
final ISourceViewer viewer = getSourceViewer();
final IDocument document = viewer.getDocument();
try {
fIterator.setText((CharacterIterator) new DocumentCharacterIterator(document));
int position = widgetOffset2ModelOffset(viewer, viewer.getTextWidget().getCaretOffset());
if (position == -1) {
return;
}
int next = findNextPosition(position);
if (isBlockSelectionModeEnabled()
&& document.getLineOfOffset(next) != document.getLineOfOffset(position)) {
super.run(); // may navigate into virtual white space
} else if (next != BreakIterator.DONE) {
setCaretPosition(next);
getTextWidget().showSelection();
fireSelectionChanged();
}
} catch (BadLocationException x) {
// ignore
}
}
public static void main(String[] args) {
BreakIterator b = BreakIterator.getWordInstance();
b.setText("abc");
if (b.equals(null)) {
throw new RuntimeException("BreakIterator.equals(null) should return false.");
}
}
void TestPrintEachBackward() {
iter = BreakIterator.getWordInstance(Locale.US);
for (int i = 0; i < given.length; i++) {
iter.setText(given[i]);
end = iter.last();
// Check current()'s return value - should be same as last()'s.
current = iter.current();
if (end != current) {
errln("Word break failure: printEachBackward() Unexpected current value: current()=" +
current + ", expected(=last())=" + end);
}
int j;
for (start = iter.previous(), j = expected[i].length-1;
start != BreakIterator.DONE;
end = start, start = iter.previous(), j--) {
// Check current()'s return value - should be same as previous()'s.
current = iter.current();
if (start != current) {
errln("Word break failure: printEachBackward() Unexpected current value: current()=" +
current + ", expected(=previous())=" + start);
}
if (!expected[i][j].equals(given[i].substring(start, end))) {
errln("Word break failure: printEachBackward() expected:<" +
expected[i][j] + ">, got:<" +
given[i].substring(start, end) +
"> start=" + start + " end=" + end);
}
}
}
}
private BreakIterator getBreakInstance(Locale locale,
int type,
String dataName,
String dictionaryName) {
if (locale == null) {
throw new NullPointerException();
}
LocaleResources lr = LocaleProviderAdapter.forJRE().getLocaleResources(locale);
String[] classNames = (String[]) lr.getBreakIteratorInfo("BreakIteratorClasses");
String dataFile = (String) lr.getBreakIteratorInfo(dataName);
try {
switch (classNames[type]) {
case "RuleBasedBreakIterator":
return new RuleBasedBreakIterator(dataFile);
case "DictionaryBasedBreakIterator":
String dictionaryFile = (String) lr.getBreakIteratorInfo(dictionaryName);
return new DictionaryBasedBreakIterator(dataFile, dictionaryFile);
default:
throw new IllegalArgumentException("Invalid break iterator class \"" +
classNames[type] + "\"");
}
} catch (IOException | MissingResourceException | IllegalArgumentException e) {
throw new InternalError(e.toString(), e);
}
}
/**
* Constructor
*/
DocLocale(DocEnv docenv, String localeName, boolean useBreakIterator) {
this.docenv = docenv;
this.localeName = localeName;
this.useBreakIterator = useBreakIterator;
locale = getLocale();
if (locale == null) {
docenv.exit();
} else {
Locale.setDefault(locale); // NOTE: updating global state
}
collator = Collator.getInstance(locale);
sentenceBreaker = BreakIterator.getSentenceInstance(locale);
}
public DocSplitterFallbackImpl()
{
buffer = "";
bdry = BreakIterator.getSentenceInstance(Locale.US);
bdry.setText("");
start = bdry.first();
}
@Override
protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
if (returnNonHighlightedSnippets) {
//we want to return the first sentence of the first snippet only
return super.getEmptyHighlight(fieldName, bi, 1);
}
return EMPTY_PASSAGE;
}
/**
* Constructor
*/
DocLocale(DocEnv docenv, String localeName, boolean useBreakIterator) {
this.docenv = docenv;
this.localeName = localeName;
this.useBreakIterator = useBreakIterator;
locale = getLocale();
if (locale == null) {
docenv.exit();
} else {
Locale.setDefault(locale); // NOTE: updating global state
}
collator = Collator.getInstance(locale);
sentenceBreaker = BreakIterator.getSentenceInstance(locale);
}