下面列出了 java.text.BreakIterator#next() 的实例代码,或者点击链接到 GitHub 查看源代码,也可以在右侧发表评论。
/**
 * Shared implementation for forward and backward word searching.
 * <p>
 * Starting from the boundary that {@code words} reports after (or before)
 * {@code index}, successive boundaries are visited in the requested
 * direction. For every pair of neighbouring boundaries the enclosed text of
 * {@code s} is scanned; as soon as a span containing a letter is found, the
 * boundary that was visited first (the one nearer to {@code index}) is
 * returned.
 * <p>
 * The method assumes that {@code s} is the text assigned to {@code words}.
 *
 * @param index     position in {@code s} from which the search starts
 * @param words     word iterator already set to {@code s}
 * @param direction compared against {@code NEXT}; equal means search forward,
 *                  otherwise search backward
 * @param s         the text being searched
 * @return the boundary described above, or {@link BreakIterator#DONE} if the
 *         iterator is exhausted before a span containing a letter is found
 */
private int findWordLimit(int index, BreakIterator words, boolean direction,
                          String s) {
    // Fix for 4256660 and 4256661.
    // A word iterator differs from character and sentence iterators: the end
    // of one word is not necessarily the start of the next one (see the
    // java.text.BreakIterator JavaDoc). This follows the nextWordStartAfter
    // example from BreakIterator.java.
    final boolean forward = (direction == NEXT);

    int nearBoundary = forward ? words.following(index) : words.preceding(index);
    int farBoundary = forward ? words.next() : words.previous();

    while (farBoundary != BreakIterator.DONE) {
        // Normalise the pair so the scan always runs left to right.
        final int from = Math.min(nearBoundary, farBoundary);
        final int to = Math.max(nearBoundary, farBoundary);
        for (int pos = from; pos < to; pos++) {
            if (Character.isLetter(s.charAt(pos))) {
                return nearBoundary;
            }
        }
        nearBoundary = farBoundary;
        farBoundary = forward ? words.next() : words.previous();
    }
    return BreakIterator.DONE;
}
/**
 * Shared implementation for forward and backward word searching.
 * <p>
 * Starting from the boundary that {@code words} reports after (or before)
 * {@code index}, successive boundaries are visited in the requested
 * direction. For every pair of neighbouring boundaries the enclosed text of
 * {@code s} is scanned; as soon as a span containing a letter is found, the
 * boundary that was visited first (the one nearer to {@code index}) is
 * returned.
 * <p>
 * The method assumes that {@code s} is the text assigned to {@code words}.
 *
 * @param index     position in {@code s} from which the search starts
 * @param words     word iterator already set to {@code s}
 * @param direction compared against {@code NEXT}; equal means search forward,
 *                  otherwise search backward
 * @param s         the text being searched
 * @return the boundary described above, or {@link BreakIterator#DONE} if the
 *         iterator is exhausted before a span containing a letter is found
 */
private int findWordLimit(int index, BreakIterator words, boolean direction,
                          String s) {
    // Fix for 4256660 and 4256661.
    // A word iterator differs from character and sentence iterators: the end
    // of one word is not necessarily the start of the next one (see the
    // java.text.BreakIterator JavaDoc). This follows the nextWordStartAfter
    // example from BreakIterator.java.
    final boolean forward = (direction == NEXT);

    int nearBoundary = forward ? words.following(index) : words.preceding(index);
    int farBoundary = forward ? words.next() : words.previous();

    while (farBoundary != BreakIterator.DONE) {
        // Normalise the pair so the scan always runs left to right.
        final int from = Math.min(nearBoundary, farBoundary);
        final int to = Math.max(nearBoundary, farBoundary);
        for (int pos = from; pos < to; pos++) {
            if (Character.isLetter(s.charAt(pos))) {
                return nearBoundary;
            }
        }
        nearBoundary = farBoundary;
        farBoundary = forward ? words.next() : words.previous();
    }
    return BreakIterator.DONE;
}
/**
 * Shared implementation for forward and backward word searching.
 * <p>
 * Starting from the boundary that {@code words} reports after (or before)
 * {@code index}, successive boundaries are visited in the requested
 * direction. For every pair of neighbouring boundaries the enclosed text of
 * {@code s} is scanned; as soon as a span containing a letter is found, the
 * boundary that was visited first (the one nearer to {@code index}) is
 * returned.
 * <p>
 * The method assumes that {@code s} is the text assigned to {@code words}.
 *
 * @param index     position in {@code s} from which the search starts
 * @param words     word iterator already set to {@code s}
 * @param direction compared against {@code NEXT}; equal means search forward,
 *                  otherwise search backward
 * @param s         the text being searched
 * @return the boundary described above, or {@link BreakIterator#DONE} if the
 *         iterator is exhausted before a span containing a letter is found
 */
private int findWordLimit(int index, BreakIterator words, boolean direction,
                          String s) {
    // Fix for 4256660 and 4256661.
    // A word iterator differs from character and sentence iterators: the end
    // of one word is not necessarily the start of the next one (see the
    // java.text.BreakIterator JavaDoc). This follows the nextWordStartAfter
    // example from BreakIterator.java.
    final boolean forward = (direction == NEXT);

    int nearBoundary = forward ? words.following(index) : words.preceding(index);
    int farBoundary = forward ? words.next() : words.previous();

    while (farBoundary != BreakIterator.DONE) {
        // Normalise the pair so the scan always runs left to right.
        final int from = Math.min(nearBoundary, farBoundary);
        final int to = Math.max(nearBoundary, farBoundary);
        for (int pos = from; pos < to; pos++) {
            if (Character.isLetter(s.charAt(pos))) {
                return nearBoundary;
            }
        }
        nearBoundary = farBoundary;
        farBoundary = forward ? words.next() : words.previous();
    }
    return BreakIterator.DONE;
}
/**
 * Creates one annotation for every span between consecutive boundaries of
 * {@code b} over {@code input}, skipping spans that consist solely of
 * whitespace.
 *
 * @param m factory used to create the annotation for a {@code [start, end)} span
 * @param b break iterator that defines the spans; its text is replaced with
 *          {@code input}
 */
void makeAnnotations(Maker m, BreakIterator b) {
    b.setText(input);
    // Read the first boundary BEFORE advancing. Calling first() after next()
    // would rewind the iterator, so the following next() would hand back the
    // same boundary again and produce a redundant zero-length span.
    int start = b.first();
    for (int end = b.next(); end != BreakIterator.DONE; start = end, end = b.next()) {
        // eliminate all-whitespace tokens
        boolean hasContent = false;
        for (int i = start; i < end && !hasContent; i++) {
            hasContent = !Character.isWhitespace(input.charAt(i));
        }
        if (hasContent) {
            m.newAnnotation(jcas, start, end).addToIndexes();
        }
    }
}
/**
 * Renders the boundaries of {@code text} as a marker line: the result has
 * {@code text.length() + 1} characters, each of which is {@code '^'} at a
 * boundary reported by {@code bi} and a space everywhere else.
 *
 * @param bi   iterator whose text is replaced with {@code text}
 * @param text the text to analyse
 * @return the marker string
 */
private String readBoundariesToString(BreakIterator bi, String text) {
    // One slot per possible boundary position, including the end of the text.
    char[] slots = new char[text.length() + 1];
    java.util.Arrays.fill(slots, ' ');

    bi.setText(text);
    int position = bi.current();
    while (position != BreakIterator.DONE) {
        slots[position] = '^';
        position = bi.next();
    }
    return new String(slots);
}
/**
 * Advances {@code iter} by one boundary and, when a boundary was found,
 * passes it through {@code checkDot} and then {@code checkUnder} before
 * returning it.
 *
 * @see com.mulgasoft.emacsplus.commands.SexpHandler#getNextPosition(org.eclipse.jface.text.IDocument, java.text.BreakIterator)
 */
@Override
protected int getNextPosition(IDocument document, BreakIterator iter) {
    final int origin = iter.current();
    final int boundary = iter.next();
    if (boundary == BreakIterator.DONE) {
        // Nothing further to move to; hand the sentinel back unchanged.
        return boundary;
    }
    final int afterDot = checkDot(document, origin, boundary);
    return checkUnder(document, afterDot);
}
/**
 * Checks if a String is a multi word unit.
 * <p>
 * The decision is based on the number of boundaries a word
 * {@link BreakIterator} for {@code locale} reports for the string (the
 * initial boundary included): that number is halved with integer division
 * and the string counts as multi word when the result is greater than one.
 *
 * @param t
 *          the string to examine
 * @return true, if is multi word
 */
private boolean isMultiWord(String t) {
    BreakIterator wordBoundaries = BreakIterator.getWordInstance(locale);
    wordBoundaries.setText(t);

    // count boundaries, starting with the one at the beginning of the text
    int boundaryCount = 0;
    for (int b = wordBoundaries.first(); b != BreakIterator.DONE; b = wordBoundaries.next()) {
        boundaryCount++;
    }

    final int tokenEstimate = boundaryCount / 2;
    return tokenEstimate > 1;
}
/**
 * Snapshots every boundary of {@code bi}, from its first boundary up to (but
 * not including) {@code DONE}, into an unmodifiable list, and initialises
 * {@code charIndex} to the first boundary.
 *
 * @param bi the iterator to read boundaries from; it is left exhausted
 */
MirroredBreakIterator(BreakIterator bi) {
    final int firstBoundary = bi.first();
    charIndex = firstBoundary;

    final List<Integer> collected = new ArrayList<Integer>();
    int boundary = firstBoundary;
    while (boundary != DONE) {
        collected.add(boundary);
        boundary = bi.next();
    }
    boundaries = Collections.unmodifiableList(collected);
}
/**
 * Skips n number of word boundaries forward.
 * <p>
 * Does nothing when {@code getLength()} is zero. Otherwise the first
 * boundary is the one following the caret position, {@code n - 1} further
 * boundaries are stepped over, and the position the iterator ends on is
 * passed to {@code moveTo}.
 *
 * @param n               number of word boundaries to skip
 * @param selectionPolicy forwarded unchanged to {@code moveTo}
 */
default void wordBreaksForwards(int n, SelectionPolicy selectionPolicy) {
    if (getLength() != 0) {
        BreakIterator boundaries = BreakIterator.getWordInstance();
        boundaries.setText(getText());

        // following() already lands on the first boundary, so only the
        // remaining n - 1 steps are taken below.
        boundaries.following(getCaretPosition());
        int step = 1;
        while (step < n) {
            boundaries.next();
            step++;
        }

        moveTo(boundaries.current(), selectionPolicy);
    }
}
/**
 * Turns a plain-words query into a sequence of quoted tokens.
 * <p>
 * A query that contains any of the characters {@code " [ ] { } < >} is
 * returned unchanged. Otherwise the query is split at word boundaries (word
 * {@link BreakIterator} for the default locale); every segment that is not
 * blank after trimming is wrapped in double quotes, and a single space is
 * appended after every segment except the last one, blank segments included.
 *
 * @param aQuery the query as entered
 * @return the query unchanged, or its quoted-token form
 */
private String preprocessQuery(String aQuery)
{
    // Any of these characters means the query is not a raw-words query.
    for (String marker : new String[] { "\"", "[", "]", "{", "}", "<", ">" }) {
        if (aQuery.contains(marker)) {
            return aQuery;
        }
    }

    // Convert raw words query to a Mtas CQP query
    StringBuilder cqp = new StringBuilder();
    BreakIterator words = BreakIterator.getWordInstance();
    words.setText(aQuery);

    int segmentStart = words.first();
    int segmentEnd = words.next();
    while (segmentEnd != BreakIterator.DONE) {
        String segment = aQuery.substring(segmentStart, segmentEnd);
        if (!segment.trim().isEmpty()) {
            // Add the word to the query
            cqp.append("\"").append(segment).append("\"");
        }

        segmentStart = segmentEnd;
        segmentEnd = words.next();
        // Separator after every segment except the final one.
        if (segmentEnd != BreakIterator.DONE) {
            cqp.append(" ");
        }
    }
    return cqp.toString();
}
/**
 * Set / update the text of the day labels. These are the week column headers
 * above the days on the calendar part of the <code>CDateTime</code>.
 * <p>
 * Nothing happens while {@code dayPanel} is {@code null}. Each label receives
 * the date formatted with pattern {@code "E"}, starting at the calendar's
 * first day of the week and advancing one day per label. A label flagged
 * {@code CDT.Key.Compact} shows a single character segment of that name: the
 * first one when the locale is left-to-right and its language is not
 * {@code "zh"}, the last one otherwise.
 */
private void updateDaysOfWeek() {
    if (dayPanel == null) {
        return;
    }

    Calendar day = cdt.getCalendarInstance();
    day.set(Calendar.DAY_OF_WEEK, day.getFirstDayOfWeek());

    Locale locale = cdt.getLocale();
    boolean useLeadingSegment = ComponentOrientation.getOrientation(locale).isLeftToRight()
            && !locale.getLanguage().equals("zh"); //$NON-NLS-1$
    BreakIterator segments = BreakIterator.getCharacterInstance(locale);

    for (VLabel dayLabel : dayLabels) {
        String dayName = getFormattedDate("E", day.getTime()); //$NON-NLS-1$
        String shown = dayName;
        if (dayLabel.getData(CDT.Key.Compact, Boolean.class)) {
            segments.setText(dayName);
            int from;
            int to;
            if (useLeadingSegment) {
                from = segments.first();
                to = segments.next();
            } else {
                to = segments.last();
                from = segments.previous();
            }
            shown = dayName.substring(from, to);
        }
        dayLabel.setText(shown);
        day.add(Calendar.DAY_OF_WEEK, 1);
    }
}
/**
 * Snapshots every boundary of {@code bi}, from its first boundary up to (but
 * not including) {@code DONE}, into an unmodifiable list, and initialises
 * {@code charIndex} to the first boundary.
 *
 * @param bi the iterator to read boundaries from; it is left exhausted
 */
MirroredBreakIterator(BreakIterator bi) {
    final int firstBoundary = bi.first();
    charIndex = firstBoundary;

    final List<Integer> collected = new ArrayList<Integer>();
    int boundary = firstBoundary;
    while (boundary != DONE) {
        collected.add(boundary);
        boundary = bi.next();
    }
    boundaries = Collections.unmodifiableList(collected);
}
/**
 * Bug 4068137
 * <p>
 * Checks that a default-locale word iterator over {@code "boo."} reports
 * boundaries at the start of the string, before the period and at the end of
 * the string.
 */
public void TestEndBehavior()
{
    final String text = "boo.";
    BreakIterator words = BreakIterator.getWordInstance();
    words.setText(text);

    final int atStart = words.first();
    if (atStart != 0) {
        errln("Didn't get break at beginning of string.");
    }

    final int beforePeriod = words.next();
    if (beforePeriod != 3) {
        errln("Didn't get break before period in \"boo.\"");
    }

    // Only advance once more when the iterator is not already at the end.
    if (words.current() != 4 && words.next() != 4) {
        errln("Didn't get break at end of string.");
    }
}
/**
 * Break the paragraph into individual lines.
 * <p>
 * {@code textContent} is cut at the positions reported by a line
 * {@link BreakIterator}. Segments are appended to the current line while
 * their accumulated width stays below {@code width}; once the limit is
 * reached the current line is closed and the segment opens a new one. A
 * single trailing whitespace character of the segment is not counted when
 * checking whether the limit has been reached.
 *
 * @param font the font used for rendering the text.
 * @param fontSize the fontSize used for rendering the text.
 * @param width the width of the box holding the content.
 * @return the individual lines.
 * @throws IOException declared by this method; presumably raised when the
 *         font cannot provide a string width
 */
List<Line> getLines(PDFont font, float fontSize, float width) throws IOException
{
    final float scale = fontSize/FONTSCALE;

    BreakIterator breaks = BreakIterator.getLineInstance();
    breaks.setText(textContent);

    List<Line> finished = new ArrayList<Line>();
    Line current = new Line();
    float usedWidth = 0;

    int from = breaks.first();
    for (int to = breaks.next(); to != BreakIterator.DONE; from = to, to = breaks.next())
    {
        String segment = textContent.substring(from, to);
        float segmentWidth = font.getStringWidth(segment) * scale;
        usedWidth = usedWidth + segmentWidth;

        // check if the last word would fit without the whitespace ending it
        int lastIndex = segment.length() - 1;
        if (usedWidth >= width && Character.isWhitespace(segment.charAt(lastIndex)))
        {
            float trailingWidth = font.getStringWidth(segment.substring(lastIndex)) * scale;
            usedWidth = usedWidth - trailingWidth;
        }

        if (usedWidth >= width)
        {
            // Close the current line; the segment becomes the first entry of the next one.
            current.setWidth(current.calculateWidth(font, fontSize));
            finished.add(current);
            current = new Line();
            usedWidth = font.getStringWidth(segment) * scale;
        }

        AttributedString attributes = new AttributedString(segment);
        attributes.addAttribute(TextAttribute.WIDTH, segmentWidth);
        Word entry = new Word(segment);
        entry.setAttributes(attributes);
        current.addWord(entry);
    }

    // The line still being filled is always emitted, even when it holds no words.
    current.setWidth(current.calculateWidth(font, fontSize));
    finished.add(current);
    return finished;
}
/**
 * Demo entry point: stems a few sample words and then prints the sentences
 * of a sample paragraph as found by four different approaches (a plain
 * {@code String.split}, a commented regular expression, a sentence
 * {@link BreakIterator} and a {@code SentenceDetectorME} loaded from a model
 * file).
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args){
    String[] words = {"bank", "banking", "banks", "banker", "banked",
        "bankart"};
    PorterStemmer ps = new PorterStemmer();
    for(String w : words){
        String stem = ps.stem(w);
        System.out.println("Word : " + w + " Stem : " + stem);
    }

    String paragraph = "When determining the end of sentences "
        + "we need to consider several factors. Sentences may end with "
        + "exclamation marks! Or possibly questions marks? Within "
        + "sentences we may find numbers like 3.14159, abbreviations "
        + "such as found in Mr. Smith, and possibly ellipses either "
        + "within a sentence …, or at the end of a sentence…";

    // Naive approach: split on every terminator character.
    String simple = "[.?!]";
    String[] splitString = (paragraph.split(simple));
    for (String string : splitString) {
        System.out.println(string);
    }

    System.out.println("-------------Using Pattern and Matcher-------------");
    Pattern sentencePattern = Pattern.compile(
        "# Match a sentence ending in punctuation or EOS.\n"
        + "[^.!?\\s] # First char is non-punct, non-ws\n"
        + "[^.!?]* # Greedily consume up to punctuation.\n"
        + "(?: # Group for unrolling the loop.\n"
        + " [.!?] # (special) inner punctuation ok if\n"
        + " (?!['\"]?\\s|$) # not followed by ws or EOS.\n"
        + " [^.!?]* # Greedily consume up to punctuation.\n"
        + ")* # Zero or more (special normal*)\n"
        + "[.!?]? # Optional ending punctuation.\n"
        + "['\"]? # Optional closing quote.\n"
        + "(?=\\s|$)",
        Pattern.MULTILINE | Pattern.COMMENTS);
    Matcher matcher = sentencePattern.matcher(paragraph);
    while (matcher.find()) {
        System.out.println(matcher.group());
    }

    System.out.println("-------------Using BreakIterator-------------");
    BreakIterator si = BreakIterator.getSentenceInstance();
    si.setText(paragraph);
    int boundary = si.first();
    while(boundary!=BreakIterator.DONE){
        int begin = boundary;
        System.out.println(boundary + " - ");
        boundary = si.next();
        int end = boundary;
        if(end == BreakIterator.DONE){
            break;
        }
        System.out.println(boundary + " [ " + paragraph.substring(begin,end) + " ] ");
    }

    System.out.println("-------------Using SentenceDetectorME-------------");
    // try-with-resources guarantees the model stream is closed, including
    // when loading the model fails part-way through.
    try (InputStream is = new FileInputStream(new File("/home/ashish/Downloads/" + "en-sent.bin"))) {
        SentenceModel sm = new SentenceModel(is);
        SentenceDetectorME detector = new SentenceDetectorME(sm);
        String[] sentences = detector.sentDetect(paragraph);
        for(String s : sentences){
            System.out.println(s);
        }
    }
    catch(IOException e){
        System.out.println("Error Detected" + e);
        e.printStackTrace();
    }
}
/**
 * Checks two "no break here" invariants of {@code tb} against every
 * combination of characters taken from {@code testChars}: no boundary may
 * fall between CR and LF, and no boundary may fall directly before a
 * non-spacing or enclosing mark. Every violation is reported through
 * {@code errln}; the method returns early once 75 violations were reported.
 *
 * @param tb        the break iterator under test; its text is replaced repeatedly
 * @param testChars the characters to combine into test strings
 */
private void doOtherInvariantTest(BreakIterator tb, String testChars)
{
    final int charCount = testChars.length();
    StringBuffer work = new StringBuffer("a\r\na");
    int errorCount = 0;

    // a break should never occur between CR and LF
    for (int lead = 0; lead < charCount; lead++) {
        work.setCharAt(0, testChars.charAt(lead));
        for (int trail = 0; trail < charCount; trail++) {
            work.setCharAt(3, testChars.charAt(trail));
            tb.setText(work.toString());

            int boundary = tb.first();
            while (boundary != BreakIterator.DONE) {
                if (boundary == 2) {
                    errln("Break between CR and LF in string U+" + Integer.toHexString(
                            (int)(work.charAt(0))) + ", U+d U+a U+" + Integer.toHexString(
                            (int)(work.charAt(3))));
                    errorCount++;
                    if (errorCount >= 75) {
                        return;
                    }
                }
                boundary = tb.next();
            }
        }
    }

    // a break should never occur before a non-spacing mark, unless it's preceded
    // by a line terminator
    work.setLength(0);
    work.append("aaaa");
    for (int lead = 0; lead < charCount; lead++) {
        char c = testChars.charAt(lead);
        boolean excludedLead = (c == '\n' || c == '\r' || c == '\u2029'
                || c == '\u2028' || c == '\u0003');
        if (excludedLead) {
            continue;
        }
        work.setCharAt(1, c);

        for (int trail = 0; trail < charCount; trail++) {
            c = testChars.charAt(trail);
            boolean isMark = (Character.getType(c) == Character.NON_SPACING_MARK
                    || Character.getType(c) == Character.ENCLOSING_MARK);
            if (!isMark) {
                continue;
            }
            work.setCharAt(2, c);

            // CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored
            // for breaking purposes as per UTR14
            int leadType = Character.getType(work.charAt(1));
            int markType = Character.getType(work.charAt(2));
            if (leadType == Character.CONTROL || leadType == Character.FORMAT ||
                markType == Character.CONTROL || markType == Character.FORMAT) {
                continue;
            }

            tb.setText(work.toString());
            int boundary = tb.first();
            while (boundary != BreakIterator.DONE) {
                if (boundary == 2) {
                    errln("Break between U+" + Integer.toHexString((int)(work.charAt(1)))
                            + " and U+" + Integer.toHexString((int)(work.charAt(2))));
                    errorCount++;
                    if (errorCount >= 75) {
                        return;
                    }
                }
                boundary = tb.next();
            }
        }
    }
}
/**
 * Checks the "must break here" invariant of {@code tb}: in a three-character
 * string whose middle character is one of the tested break characters, a
 * boundary is expected at index 2. The outer characters are taken from every
 * combination of {@code testChars}. Every missing boundary is reported
 * through {@code errln}; the method returns early once 75 failures were
 * reported.
 *
 * @param tb        the break iterator under test; its text is replaced repeatedly
 * @param testChars the characters to combine into test strings
 */
private void doBreakInvariantTest(BreakIterator tb, String testChars)
{
    final int charCount = testChars.length();
    StringBuffer work = new StringBuffer("aaa");
    int errorCount = 0;

    // a break should always occur after CR (unless followed by LF), LF, PS, and LS
    // CR is currently left out of the list; per the original note it is to be
    // restored when new BI code is added.
    String breaks = "\n\u2029\u2028";

    for (int b = 0; b < breaks.length(); b++) {
        work.setCharAt(1, breaks.charAt(b));
        for (int lead = 0; lead < charCount; lead++) {
            work.setCharAt(0, testChars.charAt(lead));
            for (int trail = 0; trail < charCount; trail++) {
                char c = testChars.charAt(trail);

                // if a cr is followed by lf, don't do the check (they stay together)
                if (work.charAt(1) == '\r' && (c == '\n')) {
                    continue;
                }

                // CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored
                // for breaking purposes as per UTR14
                int breakType = Character.getType(work.charAt(1));
                int trailType = Character.getType(c);
                if (breakType == Character.CONTROL || breakType == Character.FORMAT ||
                    trailType == Character.CONTROL || trailType == Character.FORMAT) {
                    continue;
                }

                work.setCharAt(2, c);
                tb.setText(work.toString());

                // Walk all boundaries and remember whether index 2 was among them.
                boolean brokeAfterMiddle = false;
                int boundary = tb.first();
                while (boundary != BreakIterator.DONE) {
                    if (boundary == 2) {
                        brokeAfterMiddle = true;
                    }
                    boundary = tb.next();
                }

                if (!brokeAfterMiddle) {
                    errln("No break between U+" + Integer.toHexString((int)(work.charAt(1)))
                            + " and U+" + Integer.toHexString((int)(work.charAt(2))));
                    errorCount++;
                    if (errorCount >= 75) {
                        return;
                    }
                }
            }
        }
    }
}
/**
 * Breaks up the string, if wider than "columns" characters.
 * <p>
 * The input is first split at newline characters and each piece is handled
 * separately. Within a piece, word segments (word {@link BreakIterator} for
 * the default locale) are appended to the current line; whenever the current
 * line already holds at least {@code columns} characters when the next
 * segment arrives, the line is emitted and a new one is started. A
 * single-character segment that is a space or one of {@code . , ; : ! ? ' "}
 * is kept at the end of the line being emitted instead of starting the next
 * one. Empty lines are not emitted.
 *
 * @param s		the string to process
 * @param columns	the width in columns
 * @return		the processed string
 */
public static String[] breakUp(String s, int columns) {
    final String punctuation = " .,;:!?'\"";
    java.util.List<String> emitted = new java.util.ArrayList<String>();

    for (String piece : s.split("\n")) {
        BreakIterator boundary = BreakIterator.getWordInstance();
        boundary.setText(piece);

        StringBuilder line = new StringBuilder();
        int from = boundary.first();
        for (int to = boundary.next(); to != BreakIterator.DONE; from = to, to = boundary.next()) {
            String word = piece.substring(from, to);

            if (line.length() >= columns) {
                // Trailing punctuation or a space stays with the line it follows.
                boolean sticksToLine = word.length() == 1
                        && punctuation.indexOf(word.charAt(0)) > -1;
                if (sticksToLine) {
                    line.append(word);
                    word = "";
                }
                emitted.add(line.toString());
                line.setLength(0);
            }
            line.append(word);
        }

        if (line.length() > 0) {
            emitted.add(line.toString());
        }
    }

    return emitted.toArray(new String[emitted.size()]);
}
/**
 * Bug 4638433
 * <p>
 * Verifies the positions of the first boundaries that word and line break
 * iterators report for short strings of characters whose handling is tied
 * to Unicode 3.0.0, and for a Thai-locale line iterator. Each unexpected
 * position is reported through {@code errln}.
 */
public void TestLineBreakBasedOnUnicode3_0_0()
{
    int boundary;

    /* Latin Extend-B characters
     * 0x0218-0x0233 which have been added since Unicode 3.0.0.
     */
    BreakIterator wordIter = BreakIterator.getWordInstance(Locale.US);
    wordIter.setText("\u0216\u0217\u0218\u0219\u021A");
    wordIter.first();
    boundary = wordIter.next();
    if (boundary != 5) {
        errln("Word break failure: failed to stop at 5 and bounded at " + boundary);
    }

    BreakIterator lineIter = BreakIterator.getLineInstance(Locale.US);

    /* <Three(Nd)><Two(Nd)><Low Double Prime Quotation Mark(Pe)><One(Nd)>
     * \u301f has changed its category from Ps to Pe since Unicode 2.1.
     */
    lineIter.setText("32\u301f1");
    lineIter.first();
    boundary = lineIter.next();
    if (boundary != 3) {
        errln("Line break failure: failed to skip before \\u301F(Pe) at 3 and bounded at " + boundary);
    }

    /* Mongolian <Letter A(Lo)><Todo Soft Hyphen(Pd)><Letter E(Lo)>
     * which have been added since Unicode 3.0.0.
     */
    lineIter.setText("\u1820\u1806\u1821");
    lineIter.first();
    boundary = lineIter.next();
    if (boundary != 2) {
        errln("Mongolian line break failure: failed to skip position before \\u1806(Pd) at 2 and bounded at " + boundary);
    }

    /* Khmer <ZERO(Nd)><Currency Symbol(Sc)><ONE(Nd)> which have
     * been added since Unicode 3.0.0.
     */
    lineIter.setText("\u17E0\u17DB\u17E1");
    lineIter.first();
    boundary = lineIter.next();
    if (boundary != 1) {
        errln("Khmer line break failure: failed to stop before \\u17DB(Sc) at 1 and bounded at " + boundary);
    }
    boundary = lineIter.next();
    if (boundary != 3) {
        errln("Khmer line break failure: failed to skip position after \\u17DB(Sc) at 3 and bounded at " + boundary);
    }

    /* Ogham <Letter UR(Lo)><Space Mark(Zs)><Letter OR(Lo)> which have
     * been added since Unicode 3.0.0.
     */
    lineIter.setText("\u1692\u1680\u1696");
    lineIter.first();
    boundary = lineIter.next();
    if (boundary != 2) {
        errln("Ogham line break failure: failed to skip postion before \\u1680(Zs) at 2 and bounded at " + boundary);
    }

    // Confirm changes in BreakIteratorRules_th.java have been reflected.
    BreakIterator thaiLineIter = BreakIterator.getLineInstance(new Locale("th", ""));

    /* Thai <Seven(Nd)>
     *      <Left Double Quotation Mark(Pi)>
     *      <Five(Nd)>
     *      <Right Double Quotation Mark(Pf)>
     *      <Three(Nd)>
     */
    thaiLineIter.setText("\u0E57\u201C\u0E55\u201D\u0E53");
    thaiLineIter.first();
    boundary = thaiLineIter.next();
    if (boundary != 1) {
        errln("Thai line break failure: failed to stop before \\u201C(Pi) at 1 and bounded at " + boundary);
    }
    boundary = thaiLineIter.next();
    if (boundary != 4) {
        errln("Thai line break failure: failed to stop after \\u201D(Pf) at 4 and bounded at " + boundary);
    }
}
/** Wrap multi-line strings (and get the individual lines).
 * <p>
 * An empty string, a string no longer than {@code width}, and input whose
 * lines are all shorter than {@code width} are returned without wrapping.
 * A {@code width} below 1 is treated as 1.
 *
 * @param original the original string to wrap
 * @param width the maximum width of lines
 * @param breakIterator breaks original to chars, words, sentences, depending on what instance you provide.
 * @param removeNewLines if <code>true</code>, any newlines in the original string are ignored
 * @return the lines after wrapping
 */
public static String[] wrapStringToArray(
    String original, int width, BreakIterator breakIterator, boolean removeNewLines
) {
    if (original.length() == 0) {
        return new String[] { original };
    }

    final String[] paragraphs;
    if (removeNewLines) {
        // substitute original newlines with spaces,
        // remove newlines from head and tail
        original = trimString(original);
        original = original.replace('\n', ' ');
        paragraphs = new String[] { original };
    } else {
        StringTokenizer tokenizer = new StringTokenizer(original, "\n"); // NOI18N
        paragraphs = new String[tokenizer.countTokens()];
        for (int i = 0; i < paragraphs.length; i++) {
            paragraphs[i] = tokenizer.nextToken();
        }
    }

    width = Math.max(width, 1);

    if (original.length() <= width) {
        return paragraphs;
    }

    // Nothing to wrap when every paragraph is already shorter than the width.
    boolean everythingFits = true;
    for (String paragraph : paragraphs) {
        if (paragraph.length() >= width) {
            everythingFits = false;
            break;
        }
    }
    if (everythingFits) {
        return paragraphs;
    }

    java.util.List<String> wrapped = new java.util.ArrayList<String>();
    for (String paragraph : paragraphs) {
        if (paragraph.length() < width) {
            wrapped.add(paragraph);
            continue;
        }

        breakIterator.setText(paragraph);
        int candidate = breakIterator.next();
        int lineStart = 0; // start of the line currently being built, within paragraph
        int lastFit = 0;   // last boundary that still fits; 0 means "none found yet"

        do {
            while (((candidate - lineStart) < width) && (candidate != BreakIterator.DONE)) {
                lastFit = candidate;
                candidate = breakIterator.next();
            }

            if (candidate == BreakIterator.DONE) {
                // Out of boundaries: the rest of the paragraph forms the last line.
                candidate = paragraph.length();
                lastFit = candidate;
            }

            if (lastFit == 0) {
                // No boundary fits within the width; cut at the next boundary anyway.
                lastFit = candidate;
            }

            wrapped.add(paragraph.substring(lineStart, lastFit));
            lineStart = lastFit;
            lastFit = 0;
        } while (lineStart < paragraph.length());
    }

    return wrapped.toArray(new String[wrapped.size()]);
}