下面列出了java.text.BreakIterator#setText ( ) 实例代码,或者点击链接到github查看源代码,也可以在右侧发表评论。
public void TestBug4153072() {
BreakIterator iter = BreakIterator.getWordInstance();
String str = "...Hello, World!...";
int begin = 3;
int end = str.length() - 3;
boolean gotException = false;
boolean dummy;
iter.setText(new StringCharacterIterator(str, begin, end, begin));
for (int index = -1; index < begin + 1; ++index) {
try {
dummy = iter.isBoundary(index);
if (index < begin)
errln("Didn't get exception with offset = " + index +
" and begin index = " + begin);
}
catch (IllegalArgumentException e) {
if (index >= begin)
errln("Got exception with offset = " + index +
" and begin index = " + begin);
}
}
}
public void TestBug4153072() {
BreakIterator iter = BreakIterator.getWordInstance();
String str = "...Hello, World!...";
int begin = 3;
int end = str.length() - 3;
boolean gotException = false;
boolean dummy;
iter.setText(new StringCharacterIterator(str, begin, end, begin));
for (int index = -1; index < begin + 1; ++index) {
try {
dummy = iter.isBoundary(index);
if (index < begin)
errln("Didn't get exception with offset = " + index +
" and begin index = " + begin);
}
catch (IllegalArgumentException e) {
if (index >= begin)
errln("Got exception with offset = " + index +
" and begin index = " + begin);
}
}
}
private static void usingTheBreakIterator() {
Locale currentLocale = new Locale("en", "US");
BreakIterator wordIterator
= BreakIterator.getWordInstance();
String text = "Let's pause, and then reflect.";
wordIterator.setText(text);
int boundary = wordIterator.first();
while (boundary != BreakIterator.DONE) {
int begin = boundary;
System.out.print(boundary + "-");
boundary = wordIterator.next();
int end = boundary;
if(end == BreakIterator.DONE) break;
System.out.println(boundary + " [" +
text.substring(begin, end) + "]");
}
}
/**
* Adds HTML line breaks and indentation to a string so it wraps for things like long tooltips.
*
* <pre>
* string part 1
* string part 2
* ...
* string part X
* </pre>
*/
public static String addHtmlBreaksAndIndents(
final String target, final int firstLineMaxLength, final int maxLength) {
final StringBuilder sb = new StringBuilder();
final BreakIterator breakIterator = BreakIterator.getLineInstance();
breakIterator.setText(target);
int start = breakIterator.first();
int end = breakIterator.next();
int lineLength = 0;
int currentMaxLength = firstLineMaxLength;
while (end != BreakIterator.DONE) {
final String word = target.substring(start, end);
lineLength = lineLength + word.length();
if (lineLength >= currentMaxLength) {
sb.append("<br /> ");
lineLength = word.length() + 5; // Add 5 for the indent
currentMaxLength = maxLength;
}
sb.append(word);
start = end;
end = breakIterator.next();
}
return sb.toString();
}
/**
* Implements the "Final_Cased" condition
*
* Specification: Within the closest word boundaries containing C, there is a cased
* letter before C, and there is no cased letter after C.
*
* Regular Expression:
* Before C: [{cased==true}][{wordBoundary!=true}]*
* After C: !([{wordBoundary!=true}]*[{cased}])
*/
private static boolean isFinalCased(String src, int index, Locale locale) {
BreakIterator wordBoundary = BreakIterator.getWordInstance(locale);
wordBoundary.setText(src);
int ch;
// Look for a preceding 'cased' letter
for (int i = index; (i >= 0) && !wordBoundary.isBoundary(i);
i -= Character.charCount(ch)) {
ch = src.codePointBefore(i);
if (isCased(ch)) {
int len = src.length();
// Check that there is no 'cased' letter after the index
for (i = index + Character.charCount(src.codePointAt(index));
(i < len) && !wordBoundary.isBoundary(i);
i += Character.charCount(ch)) {
ch = src.codePointAt(i);
if (isCased(ch)) {
return false;
}
}
return true;
}
}
return false;
}
public void testSliceEnd() throws Exception {
NLPSentenceDetectorOp sentenceDetectorOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
BreakIterator bi = new OpenNLPSentenceBreakIterator(sentenceDetectorOp);
bi.setText(getCharArrayIterator(SENTENCES[0] + PADDING, 0, SENTENCES[0].length()));
test1Sentence(bi, SENTENCES[0]);
}
/**
* Returns the Segment at <code>index</code> representing either
* the paragraph or sentence as identified by <code>part</code>, or
* null if a valid paragraph/sentence can't be found. The offset
* will point to the start of the word/sentence in the array, and
* the modelOffset will point to the location of the word/sentence
* in the model.
*/
private IndexedSegment getSegmentAt(int part, int index)
throws BadLocationException {
IndexedSegment seg = getParagraphElementText(index);
if (seg == null) {
return null;
}
BreakIterator iterator;
switch (part) {
case AccessibleText.WORD:
iterator = BreakIterator.getWordInstance(getLocale());
break;
case AccessibleText.SENTENCE:
iterator = BreakIterator.getSentenceInstance(getLocale());
break;
default:
return null;
}
seg.first();
iterator.setText(seg);
int end = iterator.following(index - seg.modelOffset + seg.offset);
if (end == BreakIterator.DONE) {
return null;
}
if (end > seg.offset + seg.count) {
return null;
}
int begin = iterator.previous();
if (begin == BreakIterator.DONE ||
begin >= seg.offset + seg.count) {
return null;
}
seg.modelOffset = seg.modelOffset + begin - seg.offset;
seg.offset = begin;
seg.count = end - begin;
return seg;
}
/**
* Implements the "Final_Cased" condition
*
* Specification: Within the closest word boundaries containing C, there is a cased
* letter before C, and there is no cased letter after C.
*
* Regular Expression:
* Before C: [{cased==true}][{wordBoundary!=true}]*
* After C: !([{wordBoundary!=true}]*[{cased}])
*/
private static boolean isFinalCased(String src, int index, Locale locale) {
BreakIterator wordBoundary = BreakIterator.getWordInstance(locale);
wordBoundary.setText(src);
int ch;
// Look for a preceding 'cased' letter
for (int i = index; (i >= 0) && !wordBoundary.isBoundary(i);
i -= Character.charCount(ch)) {
ch = src.codePointBefore(i);
if (isCased(ch)) {
int len = src.length();
// Check that there is no 'cased' letter after the index
for (i = index + Character.charCount(src.codePointAt(index));
(i < len) && !wordBoundary.isBoundary(i);
i += Character.charCount(ch)) {
ch = src.codePointAt(i);
if (isCased(ch)) {
return false;
}
}
return true;
}
}
return false;
}
@Override
protected IRegion findWord(IDocument document, int offset) {
try {
IRegion line = document.getLineInformationOfOffset(offset);
if (offset == line.getOffset() + line.getLength())
return null;
BreakIterator breakIter = createBreakIterator();
CharacterIterator characterIterator = new DocumentCharacterIterator(document);
breakIter.setText(characterIterator);
int start = breakIter.preceding(offset);
if (start == BreakIterator.DONE)
start = line.getOffset();
int end = breakIter.following(offset);
if (end == BreakIterator.DONE)
end = line.getOffset() + line.getLength();
if (breakIter.isBoundary(offset)) {
if (end - offset > offset - start)
start = offset;
else
end = offset;
}
if (end == start)
return null;
return new Region(start, end - start);
} catch (BadLocationException e) {
return null;
}
}
/**
* Returns the Segment at <code>index</code> representing either
* the paragraph or sentence as identified by <code>part</code>, or
* null if a valid paragraph/sentence can't be found. The offset
* will point to the start of the word/sentence in the array, and
* the modelOffset will point to the location of the word/sentence
* in the model.
*/
private IndexedSegment getSegmentAt(int part, int index)
throws BadLocationException {
IndexedSegment seg = getParagraphElementText(index);
if (seg == null) {
return null;
}
BreakIterator iterator;
switch (part) {
case AccessibleText.WORD:
iterator = BreakIterator.getWordInstance(getLocale());
break;
case AccessibleText.SENTENCE:
iterator = BreakIterator.getSentenceInstance(getLocale());
break;
default:
return null;
}
seg.first();
iterator.setText(seg);
int end = iterator.following(index - seg.modelOffset + seg.offset);
if (end == BreakIterator.DONE) {
return null;
}
if (end > seg.offset + seg.count) {
return null;
}
int begin = iterator.previous();
if (begin == BreakIterator.DONE ||
begin >= seg.offset + seg.count) {
return null;
}
seg.modelOffset = seg.modelOffset + begin - seg.offset;
seg.offset = begin;
seg.count = end - begin;
return seg;
}
public static void main(String[] args) {
BreakIterator b = BreakIterator.getWordInstance();
b.setText("abc");
if (b.equals(null)) {
throw new RuntimeException("BreakIterator.equals(null) should return false.");
}
}
private static int nextWordStartAfter(int pos, String text) {
BreakIterator wb = BreakIterator.getWordInstance();
wb.setText(text);
int last = wb.following(pos);
int current = wb.next();
while (current != BreakIterator.DONE) {
for (int p = last; p < current; p++) {
if (Character.isLetter(text.codePointAt(p)))
return last;
}
last = current;
current = wb.next();
}
return BreakIterator.DONE;
}
@Override
public void updateEndByBreaksForward(int numOfBreaks, BreakIterator breakIterator) {
if (getAreaLength() == 0) {
return;
}
breakIterator.setText(getArea().getText());
int position = calculatePositionViaBreakingForwards(numOfBreaks, breakIterator, getStartPosition());
updateEndTo(position);
}
/**
* Returns the Segment at <code>index</code> representing either
* the paragraph or sentence as identified by <code>part</code>, or
* null if a valid paragraph/sentence can't be found. The offset
* will point to the start of the word/sentence in the array, and
* the modelOffset will point to the location of the word/sentence
* in the model.
*/
private IndexedSegment getSegmentAt(int part, int index)
throws BadLocationException {
IndexedSegment seg = getParagraphElementText(index);
if (seg == null) {
return null;
}
BreakIterator iterator;
switch (part) {
case AccessibleText.WORD:
iterator = BreakIterator.getWordInstance(getLocale());
break;
case AccessibleText.SENTENCE:
iterator = BreakIterator.getSentenceInstance(getLocale());
break;
default:
return null;
}
seg.first();
iterator.setText(seg);
int end = iterator.following(index - seg.modelOffset + seg.offset);
if (end == BreakIterator.DONE) {
return null;
}
if (end > seg.offset + seg.count) {
return null;
}
int begin = iterator.previous();
if (begin == BreakIterator.DONE ||
begin >= seg.offset + seg.count) {
return null;
}
seg.modelOffset = seg.modelOffset + begin - seg.offset;
seg.offset = begin;
seg.count = end - begin;
return seg;
}
/**
* Returns the Segment at <code>index</code> representing either
* the paragraph or sentence as identified by <code>part</code>, or
* null if a valid paragraph/sentence can't be found. The offset
* will point to the start of the word/sentence in the array, and
* the modelOffset will point to the location of the word/sentence
* in the model.
*/
private IndexedSegment getSegmentAt(int part, int index)
throws BadLocationException {
IndexedSegment seg = getParagraphElementText(index);
if (seg == null) {
return null;
}
BreakIterator iterator;
switch (part) {
case AccessibleText.WORD:
iterator = BreakIterator.getWordInstance(getLocale());
break;
case AccessibleText.SENTENCE:
iterator = BreakIterator.getSentenceInstance(getLocale());
break;
default:
return null;
}
seg.first();
iterator.setText(seg);
int end = iterator.following(index - seg.modelOffset + seg.offset);
if (end == BreakIterator.DONE) {
return null;
}
if (end > seg.offset + seg.count) {
return null;
}
int begin = iterator.previous();
if (begin == BreakIterator.DONE ||
begin >= seg.offset + seg.count) {
return null;
}
seg.modelOffset = seg.modelOffset + begin - seg.offset;
seg.offset = begin;
seg.count = end - begin;
return seg;
}
/**
* Returns a location to break at in the passed in region, or
* BreakIterator.DONE if there isn't a good location to break at
* in the specified region.
*/
private int getBreakSpot(int p0, int p1) {
if (breakSpots == null) {
// Re-calculate breakpoints for the whole view
int start = getStartOffset();
int end = getEndOffset();
int[] bs = new int[end + 1 - start];
int ix = 0;
// Breaker should work on the parent element because there may be
// a valid breakpoint at the end edge of the view (space, etc.)
Element parent = getElement().getParentElement();
int pstart = (parent == null ? start : parent.getStartOffset());
int pend = (parent == null ? end : parent.getEndOffset());
Segment s = getText(pstart, pend);
s.first();
BreakIterator breaker = getBreaker();
breaker.setText(s);
// Backward search should start from end+1 unless there's NO end+1
int startFrom = end + (pend > end ? 1 : 0);
for (;;) {
startFrom = breaker.preceding(s.offset + (startFrom - pstart))
+ (pstart - s.offset);
if (startFrom > start) {
// The break spot is within the view
bs[ix++] = startFrom;
} else {
break;
}
}
SegmentCache.releaseSharedSegment(s);
breakSpots = new int[ix];
System.arraycopy(bs, 0, breakSpots, 0, ix);
}
int breakSpot = BreakIterator.DONE;
for (int i = 0; i < breakSpots.length; i++) {
int bsp = breakSpots[i];
if (bsp <= p1) {
if (bsp > p0) {
breakSpot = bsp;
}
break;
}
}
return breakSpot;
}
/**
* Returns a location to break at in the passed in region, or
* BreakIterator.DONE if there isn't a good location to break at
* in the specified region.
*/
private int getBreakSpot(int p0, int p1) {
if (breakSpots == null) {
// Re-calculate breakpoints for the whole view
int start = getStartOffset();
int end = getEndOffset();
int[] bs = new int[end + 1 - start];
int ix = 0;
// Breaker should work on the parent element because there may be
// a valid breakpoint at the end edge of the view (space, etc.)
Element parent = getElement().getParentElement();
int pstart = (parent == null ? start : parent.getStartOffset());
int pend = (parent == null ? end : parent.getEndOffset());
Segment s = getText(pstart, pend);
s.first();
BreakIterator breaker = getBreaker();
breaker.setText(s);
// Backward search should start from end+1 unless there's NO end+1
int startFrom = end + (pend > end ? 1 : 0);
for (;;) {
startFrom = breaker.preceding(s.offset + (startFrom - pstart))
+ (pstart - s.offset);
if (startFrom > start) {
// The break spot is within the view
bs[ix++] = startFrom;
} else {
break;
}
}
SegmentCache.releaseSharedSegment(s);
breakSpots = new int[ix];
System.arraycopy(bs, 0, breakSpots, 0, ix);
}
int breakSpot = BreakIterator.DONE;
for (int i = 0; i < breakSpots.length; i++) {
int bsp = breakSpots[i];
if (bsp <= p1) {
if (bsp > p0) {
breakSpot = bsp;
}
break;
}
}
return breakSpot;
}
static void formatText(PrintWriter writer, String target,
int initialLength) {
BreakIterator boundary = BreakIterator.getLineInstance();
boundary.setText(target);
int start = boundary.first();
int end = boundary.next();
int lineLength = initialLength;
while (end != BreakIterator.DONE) {
// Look at the end and only accept whitespace breaks
char endChar = target.charAt(end-1);
while (!Character.isWhitespace(endChar)) {
int lastEnd = end;
end = boundary.next();
if (end == BreakIterator.DONE) {
// give up. We are at the end of the string
end = lastEnd;
break;
}
endChar = target.charAt(end-1);
}
int wordEnd = end;
if (endChar == '\n') {
// trim off the \n since println will do it for us
wordEnd--;
if (wordEnd > 0 && target.charAt(wordEnd-1) == '\r') {
wordEnd--;
}
} else if (endChar == '\t') {
// figure tabs use 8 characters
lineLength += 7;
}
String word = target.substring(start, wordEnd);
lineLength += word.length();
writer.print(word);
if (endChar == '\n' || endChar == '\r') {
// force end of line
writer.println();
writer.print(" ");
lineLength = 2;
}
start = end;
end = boundary.next();
}
if (lineLength != 0) {
writer.println();
}
}
/**
* Gets the number of words in the string assuming the string is in English.
*
* This implementation doesn't know about placeholders. They are counted as
* word. Later, we can do something more clever using an
* Okapi step later to exclude them.
*
* @param string
* @return number of word
*/
public int getEnglishWordCount(String string) {
int wordCount = 0;
BreakIterator wordBreakIterator = BreakIterator.getWordInstance(Locale.ENGLISH);
wordBreakIterator.setText(string);
int start = wordBreakIterator.first();
int end = wordBreakIterator.next();
while (end != BreakIterator.DONE) {
if (Character.isLetterOrDigit(string.charAt(start))) {
wordCount += 1;
}
start = end;
end = wordBreakIterator.next();
}
return wordCount;
}
/**
* Returns a location to break at in the passed in region, or
* BreakIterator.DONE if there isn't a good location to break at
* in the specified region.
*/
private int getBreakSpot(int p0, int p1) {
if (breakSpots == null) {
// Re-calculate breakpoints for the whole view
int start = getStartOffset();
int end = getEndOffset();
int[] bs = new int[end + 1 - start];
int ix = 0;
// Breaker should work on the parent element because there may be
// a valid breakpoint at the end edge of the view (space, etc.)
Element parent = getElement().getParentElement();
int pstart = (parent == null ? start : parent.getStartOffset());
int pend = (parent == null ? end : parent.getEndOffset());
Segment s = getText(pstart, pend);
s.first();
BreakIterator breaker = getBreaker();
breaker.setText(s);
// Backward search should start from end+1 unless there's NO end+1
int startFrom = end + (pend > end ? 1 : 0);
for (;;) {
startFrom = breaker.preceding(s.offset + (startFrom - pstart))
+ (pstart - s.offset);
if (startFrom > start) {
// The break spot is within the view
bs[ix++] = startFrom;
} else {
break;
}
}
SegmentCache.releaseSharedSegment(s);
breakSpots = new int[ix];
System.arraycopy(bs, 0, breakSpots, 0, ix);
}
int breakSpot = BreakIterator.DONE;
for (int i = 0; i < breakSpots.length; i++) {
int bsp = breakSpots[i];
if (bsp <= p1) {
if (bsp > p0) {
breakSpot = bsp;
}
break;
}
}
return breakSpot;
}