下面列出了怎么用java.text.Normalizer.Form的API类实例代码及写法,或者点击链接到github查看源代码。
/**
 * Converts native transfer bytes into an object for the requested flavor.
 *
 * <p>Two formats get special treatment before the superclass is consulted:
 * <ul>
 * <li>{@code CF_URL} with a {@link URL} representation class: the bytes are decoded
 *     to text and returned as a {@code URL}. The charset is the default text charset
 *     unless the transferable supplies one through {@code javaTextEncodingFlavor}
 *     (its payload is read as a UTF-8 encoded charset name).</li>
 * <li>{@code CF_STRING}: the bytes are read as UTF-8, normalized to NFC and
 *     re-encoded as UTF-8.</li>
 * </ul>
 *
 * @param bytes        raw data received for the native format
 * @param flavor       flavor the caller wants the data translated to
 * @param format       native format identifier of {@code bytes}
 * @param transferable source transferable, may be {@code null}
 * @return the translated object
 * @throws IOException if decoding fails or the superclass translation fails
 */
@Override
public Object translateBytes(byte[] bytes, DataFlavor flavor,
        long format, Transferable transferable) throws IOException {
    final boolean wantsUrl = format == CF_URL
            && URL.class.equals(flavor.getRepresentationClass());

    if (wantsUrl) {
        String charsetName = getDefaultTextCharset();
        final boolean encodingProvided = transferable != null
                && transferable.isDataFlavorSupported(javaTextEncodingFlavor);
        if (encodingProvided) {
            try {
                byte[] rawName = (byte[]) transferable.getTransferData(javaTextEncodingFlavor);
                charsetName = new String(rawName, "UTF-8");
            } catch (UnsupportedFlavorException cannotHappen) {
                // Support for the flavor was checked just above; keep the default charset.
            }
        }
        String spec = new String(bytes, charsetName);
        return new URL(spec);
    }

    if (format == CF_STRING) {
        String decoded = new String(bytes, "UTF8");
        String composed = Normalizer.normalize(decoded, Form.NFC);
        bytes = composed.getBytes("UTF8");
    }

    return super.translateBytes(bytes, flavor, format, transferable);
}
/**
 * Reduces a text to a simplified, lower-case comparison form.
 *
 * <p>Steps, in order: lower-casing with the default locale, hyphen normalization
 * (via {@code MCRHyphenNormalizer}) with hyphens turned into spaces, removal of
 * combining marks after canonical decomposition, collapsing of "ue"/"oe"/"ae"/"ß"/"ss",
 * replacement of punctuation by spaces, and whitespace collapsing.</p>
 *
 * @param text the text to normalize, must not be {@code null}
 * @return the normalized text
 */
public static String normalizeText(String text) {
    String result = text.toLowerCase(Locale.getDefault());
    result = new MCRHyphenNormalizer().normalize(result).replace("-", " ");

    // Canonical decomposition splits base letters from their accents; then drop the accents.
    result = Normalizer.normalize(result, Form.NFD);
    result = result.replaceAll("\\p{M}", "");

    // Collapse transliterated umlauts and sharp s so that spelling variants compare equal.
    result = result.replace("ue", "u");
    result = result.replace("oe", "o");
    result = result.replace("ae", "a");
    result = result.replace("ß", "s");
    result = result.replace("ss", "s");

    // NOTE(review): as written this pattern only matches the three-character sequence
    // "<non-alphanumeric><whitespace>]"; presumably "[^a-z0-9\\s]" was intended.
    // Left unchanged because fixing it would alter the generated keys - confirm first.
    result = result.replaceAll("[^a-z0-9]\\s]", "");

    // Punctuation becomes a space, then runs of whitespace shrink to a single space.
    result = result.replaceAll("\\p{Punct}", " ").trim();
    return result.replaceAll("\\s+", " ");
}
/**
 * Computes an iterated, optionally salted digest of a text and returns it as hex.
 *
 * <p>The text is NFC-normalized and encoded as UTF-8. The salt, when present, is fed
 * into the digest before the text. The first digest counts as one iteration; every
 * further iteration re-digests the previous result (without the salt).</p>
 *
 * @param iterations total number of digest rounds; values below 1 behave like 1
 * @param salt       bytes prepended to the first round, or {@code null} for none
 * @param text       the text to hash
 * @param algorithm  {@link MessageDigest} algorithm name
 * @return the final digest rendered by {@code toHexString}
 * @throws NoSuchAlgorithmException if the algorithm is not available
 */
private static String getHash(int iterations, byte[] salt, String text, String algorithm)
        throws NoSuchAlgorithmException {
    // One round is always performed below, so only the remainder is looped.
    final int extraRounds = Math.max(iterations - 1, 0);

    final MessageDigest digest = MessageDigest.getInstance(algorithm);
    final String normalized = Normalizer.normalize(text, Form.NFC);

    if (salt != null) {
        digest.update(salt);
    }
    byte[] data = digest.digest(normalized.getBytes(StandardCharsets.UTF_8));

    for (int remaining = extraRounds; remaining > 0; remaining--) {
        data = digest.digest(data);
    }
    return toHexString(data);
}
/**
 * Joins words made of text positions into a single NFKC-normalized string.
 *
 * <p>The Unicode text of every position in a word is concatenated, and words are
 * separated by a single space.</p>
 *
 * @param words the words, each given as a list of text positions
 * @return the normalized text of all words
 */
String toString(List<List<TextPosition>> words)
{
    StringBuilder joined = new StringBuilder();
    String separator = "";
    for (List<TextPosition> word : words)
    {
        // Empty before the first word, a single space afterwards.
        joined.append(separator);
        separator = " ";
        for (TextPosition position : word)
        {
            joined.append(position.getUnicode());
        }
    }
    // Compatibility composition, cf. http://stackoverflow.com/a/7171932/1729265
    return Normalizer.normalize(joined, Form.NFKC);
}
/**
 * Produces a salted, iterated hash of the given data, Base64 encoded.
 *
 * <p>The data is NFC-normalized and UTF-8 encoded. The first round digests
 * {@code salt + data}; each of the following 1000 rounds digests
 * {@code salt + previousDigest}.</p>
 *
 * @param salt the salt, must not be {@code null}
 * @param data the data to hash, must not be {@code null}
 * @return the Base64 encoding of the final digest
 */
public static String hashIt ( final String salt, String data )
{
    final byte[] payload = Normalizer.normalize ( data, Form.NFC ).getBytes ( StandardCharsets.UTF_8 );
    final byte[] saltBytes = salt.getBytes ( StandardCharsets.UTF_8 );

    // Initial input: salt followed by the payload.
    final byte[] seed = new byte[saltBytes.length + payload.length];
    System.arraycopy ( saltBytes, 0, seed, 0, saltBytes.length );
    System.arraycopy ( payload, 0, seed, saltBytes.length, payload.length );

    final MessageDigest md = createDigest ();
    byte[] digest = md.digest ( seed );

    // Work buffer: the salt prefix never changes, only the digest part is refreshed per round.
    final byte[] round = new byte[saltBytes.length + digest.length];
    System.arraycopy ( saltBytes, 0, round, 0, saltBytes.length );

    int remaining = 1000;
    while ( remaining-- > 0 )
    {
        System.arraycopy ( digest, 0, round, saltBytes.length, digest.length );
        digest = md.digest ( round );
    }

    return Base64.getEncoder ().encodeToString ( digest );
}
/**
 * Translates native transfer bytes to the requested flavor.
 *
 * <p>{@code CF_URL} data requested as a {@link URL} is decoded to text and wrapped in a
 * {@code URL}; the charset comes from {@code javaTextEncodingFlavor} when the
 * transferable supports it, otherwise the default text charset is used.
 * {@code CF_STRING} data is re-encoded as NFC-normalized UTF-8 before it is handed to
 * the superclass.</p>
 *
 * @param bytes        raw data in the native format
 * @param flavor       requested target flavor
 * @param format       native format identifier
 * @param transferable source transferable, may be {@code null}
 * @return the translated object
 * @throws IOException if the bytes cannot be decoded or the superclass fails
 */
@Override
public Object translateBytes(byte[] bytes, DataFlavor flavor,
        long format, Transferable transferable) throws IOException {
    if (format == CF_URL && URL.class.equals(flavor.getRepresentationClass())) {
        String urlCharset = getDefaultTextCharset();
        if (transferable != null && transferable.isDataFlavorSupported(javaTextEncodingFlavor)) {
            try {
                final Object encoding = transferable.getTransferData(javaTextEncodingFlavor);
                urlCharset = new String((byte[]) encoding, "UTF-8");
            } catch (UnsupportedFlavorException cannotHappen) {
                // The flavor was reported as supported right before; fall back to the default.
            }
        }
        final String urlText = new String(bytes, urlCharset);
        return new URL(urlText);
    }

    byte[] payload = bytes;
    if (format == CF_STRING) {
        final String text = new String(payload, "UTF8");
        payload = Normalizer.normalize(text, Form.NFC).getBytes("UTF8");
    }
    return super.translateBytes(payload, flavor, format, transferable);
}
/**
 * Turns bytes received in a native format into an object of the requested flavor.
 *
 * <p>URL requests ({@code CF_URL} with a {@link URL} representation class) are answered
 * directly: the bytes are decoded with the charset named by
 * {@code javaTextEncodingFlavor}, or with the default text charset when that flavor is
 * not available. Plain strings ({@code CF_STRING}) are normalized to NFC (UTF-8 in,
 * UTF-8 out) and then translated by the superclass, as is every other format.</p>
 *
 * @param bytes        raw native data
 * @param flavor       target flavor
 * @param format       native format identifier
 * @param transferable source transferable, may be {@code null}
 * @return the translated object
 * @throws IOException on decoding or translation failure
 */
@Override
public Object translateBytes(byte[] bytes, DataFlavor flavor,
        long format, Transferable transferable) throws IOException {
    final boolean urlRequested = format == CF_URL
            && URL.class.equals(flavor.getRepresentationClass());
    if (!urlRequested) {
        if (format == CF_STRING) {
            final String utf8Text = new String(bytes, "UTF8");
            bytes = Normalizer.normalize(utf8Text, Form.NFC).getBytes("UTF8");
        }
        return super.translateBytes(bytes, flavor, format, transferable);
    }

    String charset = getDefaultTextCharset();
    if (transferable != null && transferable.isDataFlavorSupported(javaTextEncodingFlavor)) {
        try {
            final byte[] charsetBytes = (byte[]) transferable.getTransferData(javaTextEncodingFlavor);
            charset = new String(charsetBytes, "UTF-8");
        } catch (UnsupportedFlavorException cannotHappen) {
            // Guarded by isDataFlavorSupported above; the default charset stays in effect.
        }
    }
    return new URL(new String(bytes, charset));
}
/**
 * Builds the slug of a string.
 *
 * <p>The input is trimmed, every match of {@code WHITESPACE} is replaced by a dash,
 * the result is URL-encoded as UTF-8 (so non-ASCII characters become percent
 * escapes), NFD-normalized and finally lower-cased.</p>
 *
 * <p>Lower-casing uses a fixed locale: the encoded text is pure ASCII, and the
 * default-locale variant would turn {@code I} into a dotless {@code ı} on Turkish
 * systems, producing a non-ASCII, machine-dependent slug.</p>
 *
 * @param str the string to build the slug for, may be {@code null}
 * @return the slug, or an empty string if {@code str} is {@code null}
 */
public static String getSlug(String str) {
    if ( str == null ) {
        return "";
    }
    // Get rid of white space
    String noWhiteSpace = WHITESPACE.matcher(str.trim()).replaceAll("-");
    // Percent-encode non-ASCII (and reserved) characters
    try {
        noWhiteSpace = URLEncoder.encode(noWhiteSpace, "UTF-8");
    } catch (UnsupportedEncodingException ignored) {
        // Cannot happen: every Java platform is required to support UTF-8.
    }
    // Slugify
    String normalized = Normalizer.normalize(noWhiteSpace, Form.NFD);
    return normalized.toLowerCase(java.util.Locale.ENGLISH);
}
/**
 * <p>Applies the requested Unicode normalization form to the input.</p>
 *
 * <p>Supported normalization forms:
 * <ul>
 * <li>NFD: canonical Unicode decomposition</li>
 * <li>NFC: canonical Unicode decomposition followed by canonical composition</li>
 * <li>NFKD: compatibility decomposition</li>
 * <li>NFKC: compatibility decomposition followed by canonical composition</li>
 * </ul>
 * </p>
 * <p>A null input is passed through as null; the form is still validated in that case.</p>
 *
 * @param context function call context.
 * @param input string to normalize. May be null.
 * @param form name of the normalization form, case insensitive. Must not be null.
 *
 * @return the normalized string, or null if the input is null.
 *
 * @throws NullPointerException if form is null.
 * @throws IllegalArgumentException if form does not name a supported normalization form.
 *
 * @see Normalizer#normalize(CharSequence, Form)
 */
@TLFunctionAnnotation("Perform Unicode normalization of given string.")
@CTL2FunctionDeclaration(impl = UnicodeNormalizeFunction.class)
public static final String unicodeNormalize(TLFunctionCallContext context, String input, String form) {
    if (form == null) {
        throw new NullPointerException("Null form is not allowed.");
    }

    // Resolve the form before looking at the input so that a bad form is always reported.
    Form resolved;
    try {
        resolved = Form.valueOf(form.toUpperCase());
    } catch (IllegalArgumentException iae) {
        throw new IllegalArgumentException("Unsupported normalization form '" + form + "'.", iae);
    }

    return input == null ? null : Normalizer.normalize(input, resolved);
}
/**
 * <p>Tells whether the input is already normalized according to the given Unicode form.</p>
 *
 * <p>Supported normalization forms:
 * <ul>
 * <li>NFD: canonical Unicode decomposition</li>
 * <li>NFC: canonical Unicode decomposition followed by canonical composition</li>
 * <li>NFKD: compatibility decomposition</li>
 * <li>NFKC: compatibility decomposition followed by canonical composition</li>
 * </ul>
 * </p>
 * <p>A null input is considered normalized; the form is still validated in that case.</p>
 *
 * @param context function call context.
 * @param input string to check. May be null.
 * @param form name of the normalization form, case insensitive. Must not be null.
 *
 * @return true if the input is normalized with respect to the selected form or if the input is null,
 *         false otherwise.
 *
 * @throws NullPointerException if form is null.
 * @throws IllegalArgumentException if form does not name a supported normalization form.
 *
 * @see Normalizer#isNormalized(CharSequence, Form)
 */
@TLFunctionAnnotation("Determine if given string is Unicode normalized.")
@CTL2FunctionDeclaration(impl = IsUnicodeNormalizedFunction.class)
public static final boolean isUnicodeNormalized(TLFunctionCallContext context, String input, String form) {
    if (form == null) {
        throw new NullPointerException("Null form is not allowed.");
    }

    // Resolve the form before looking at the input so that a bad form is always reported.
    Form resolved;
    try {
        resolved = Form.valueOf(form.toUpperCase());
    } catch (IllegalArgumentException iae) {
        throw new IllegalArgumentException("Unsupported normalization form '" + form + "'.", iae);
    }

    return input == null || Normalizer.isNormalized(input, resolved);
}
/**
 * Appends one subfield (delimiter, one-character name, NFD-normalized value) to the builder.
 *
 * <p>While {@code idnControlSubField} is set, the raw value must equal the expected
 * record id; the flag is cleared after that single check.</p>
 *
 * @param name  subfield name, exactly one character
 * @param value subfield value
 * @throws FormatException    if the name is not one character long or no entity is open
 * @throws MissingIdException if the id check is pending and the value differs from the record id
 */
@Override
public void literal(final String name, final String value) {
    // A subfield name is exactly one character or digit, and literals are only
    // allowed inside an open entity.
    if (name.length() != 1 || !entityOpen) {
        throw new FormatException(name);
    }

    final String decomposed = Normalizer.normalize(value, Form.NFD);

    if (idnControlSubField) {
        // The id delivered inside the record has to match the id the record was announced with.
        // NOTE(review): the comparison uses the raw value, not the normalized one.
        final boolean sameId = this.id.equals(value);
        if (!sameId) {
            throw new MissingIdException(value);
        }
        // Only one record id is checked.
        idnControlSubField = false;
    }

    builder.append(SUB_DELIMITER);
    builder.append(name);
    builder.append(decomposed);
}
/**
 * Normalizes a string to be used as a search parameter value. The value is
 * canonically decomposed, characters of the Combining Diacritical Marks block
 * (accents and diacritics) are removed, and the result is converted to lower case.
 *
 * <p>Lower-casing is locale independent ({@code Locale.ROOT}), so the same input
 * yields the same search key on every machine; with the default locale, for example,
 * {@code "TITLE"} would become {@code "tıtle"} on a Turkish system.</p>
 *
 * @param value the value to normalize, may be {@code null}
 * @return the normalized value, or {@code null} if {@code value} is {@code null}
 */
public static String normalizeForSearch(String value) {
    if (value == null) {
        return null;
    }
    String withoutDiacritics = Normalizer.normalize(value, Form.NFD)
            .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
    return withoutDiacritics.toLowerCase(java.util.Locale.ROOT);
}
/**
 * Compares two Strings according to the given option mask.
 *
 * <p>A {@code null} String orders before any non-null String. With
 * {@code NSNumericSearch} both Strings are converted with
 * {@code stringToRelaxedDouble} and compared as numbers; otherwise they are compared
 * lexicographically, optionally after lower-casing ({@code NSCaseInsensitiveSearch})
 * and after stripping combining diacritical marks
 * ({@code NSDiacriticInsensitiveSearch}).</p>
 *
 * @param from                   The first String to be compared.
 * @param with                   The second String to be compared.
 * @param NSStringCompareOptions The option mask controlling the comparison.
 * @return The result of comparing the two Strings.
 * @see crossmobile.ios.foundation.NSOrdered
 */
@CMSelector(value = "- (NSComparisonResult)compare:(NSString *)aString options:(NSStringCompareOptions)mask", staticMapping = true)
public static int compare(String from, String with, int NSStringCompareOptions) {
    if (from == null)
        return with == null ? NSOrdered.Same : NSOrdered.Ascending;
    if (with == null)
        return NSOrdered.Descending;

    final boolean numeric =
            (NSStringCompareOptions & crossmobile.ios.foundation.NSStringCompareOptions.NSNumericSearch) != 0;
    final int order;
    if (numeric) {
        final double left = stringToRelaxedDouble(from);
        final double right = stringToRelaxedDouble(with);
        if (left == right)
            order = 0;
        else
            order = left < right ? -1 : 1;
    } else {
        final boolean ignoreCase =
                (NSStringCompareOptions & crossmobile.ios.foundation.NSStringCompareOptions.NSCaseInsensitiveSearch) != 0;
        final boolean ignoreDiacritics =
                (NSStringCompareOptions & crossmobile.ios.foundation.NSStringCompareOptions.NSDiacriticInsensitiveSearch) != 0;
        if (ignoreCase) {
            from = from.toLowerCase();
            with = with.toLowerCase();
        }
        if (ignoreDiacritics) {
            // Decompose, then drop the combining marks so only base characters are compared.
            from = Normalizer.normalize(from, Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
            with = Normalizer.normalize(with, Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
        }
        order = from.compareTo(with);
    }

    if (order < 0)
        return NSOrdered.Ascending;
    if (order > 0)
        return NSOrdered.Descending;
    return NSOrdered.Same;
}
/**
 * Writes an additional label fact for a name whose diacritics-free form differs from the name itself.
 *
 * <p>Nothing is written when stripping the combining diacritical marks leaves the
 * name unchanged.</p>
 *
 * @param entity the entity the label belongs to
 * @param name   the original name
 * @param source the source recorded for the written fact
 * @throws IOException if writing the fact fails
 */
private void writeNormalized(String entity, String name, String source) throws IOException {
    String decomposed = Normalizer.normalize(name, Form.NFD);
    String stripped = decomposed.replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
    if (stripped.equals(name)) {
        return;
    }
    Fact label = new Fact(entity, RDFS.label, FactComponent.forStringWithLanguage(stripped, "eng"));
    write(PERSONNAMEHEURISTICS, label, PERSONNAMESOURCES, source, "PersonNameExtractor_normalized");
}
/**
 * Derives a URL/file-safe slug from a string.
 *
 * <p>Matches of {@code WHITESPACE} become dashes, the text is NFD-decomposed, matches
 * of {@code NONLATIN} are removed and the result is lower-cased (English locale).</p>
 *
 * @param input the text to convert, must not be {@code null}
 * @return the slug
 */
protected String getSlug(String input) {
    final String dashed = WHITESPACE.matcher(input).replaceAll("-");
    final String decomposed = Normalizer.normalize(dashed, Form.NFD);
    return NONLATIN.matcher(decomposed)
            .replaceAll("")
            .toLowerCase(Locale.ENGLISH);
}
/**
 * Translates native transfer bytes to the requested flavor.
 *
 * <ul>
 * <li>{@code CF_URL} requested as {@link URL}: the bytes are decoded to text (charset
 *     from {@code javaTextEncodingFlavor} if supported, else the platform default) and
 *     the URL is extracted from that text with {@code extractURL}.</li>
 * <li>URI-list flavor with {@code CF_FILE}: the entries returned by
 *     {@code dragQueryFile} are joined with the line separator and the data is then
 *     treated as {@code CF_STRING} by the superclass; {@code null} is returned if no
 *     entries are available.</li>
 * <li>{@code CF_STRING}: the UTF-8 text is normalized to NFC.</li>
 * </ul>
 *
 * @param bytes        raw native data
 * @param flavor       target flavor
 * @param format       native format identifier
 * @param transferable source transferable, may be {@code null}
 * @return the translated object, possibly {@code null}
 * @throws IOException on decoding or translation failure
 */
@Override
public Object translateBytes(byte[] bytes, DataFlavor flavor,
        long format, Transferable transferable) throws IOException {
    final boolean urlRequested = format == CF_URL
            && URL.class.equals(flavor.getRepresentationClass());
    if (urlRequested) {
        String charset = Charset.defaultCharset().name();
        if (transferable != null && transferable.isDataFlavorSupported(javaTextEncodingFlavor)) {
            try {
                final byte[] charsetName = (byte[]) transferable.getTransferData(javaTextEncodingFlavor);
                charset = new String(charsetName, StandardCharsets.UTF_8);
            } catch (UnsupportedFlavorException cannotHappen) {
                // Support was verified just above; keep the default charset.
            }
        }
        // The macOS pasteboard hands over a property list holding a single URL; pull it out.
        final String propertyList = new String(bytes, charset);
        return new URL(extractURL(propertyList));
    }

    if (isUriListFlavor(flavor) && format == CF_FILE) {
        // dragQueryFile copes with both files and URLs: it parses the property list
        // that macOS returns for CF_URL and CF_FILE and extracts the values.
        final String[] entries = dragQueryFile(bytes);
        if (entries == null) {
            return null;
        }
        final String separator = System.getProperty("line.separator");
        bytes = String.join(separator, entries).getBytes();
        // The URIs are now plain text, so let the base method translate them as a
        // regular string into the target representation class.
        format = CF_STRING;
    } else if (format == CF_STRING) {
        final String text = new String(bytes, "UTF8");
        bytes = Normalizer.normalize(text, Form.NFC).getBytes("UTF8");
    }

    return super.translateBytes(bytes, flavor, format, transferable);
}
/**
 * Builds a lower-case slug from the input.
 *
 * <p>Pipeline: trim, {@code urlDecode}, optional {@code transliterate}, whitespace to
 * dashes, NFD decomposition, removal of {@code NONLATIN} matches,
 * {@code normalizeDashes}, {@code validateResult} against the original input, and
 * lower-casing with the English locale.</p>
 *
 * @param input         the text to slugify; {@code null} is logged and returned as {@code null}
 * @param transliterate whether to transliterate the text before slugifying
 * @return the slug, or {@code null} if the input is {@code null}
 */
public static String makeSlug(String input, boolean transliterate) {
    // Nothing to do for a missing input; report it and hand it back.
    if (input == null) {
        ProjectLogger.log("Provided input value is null");
        return null;
    }

    // Strip surrounding spaces, then undo URL encoding.
    String slug = urlDecode(input.trim());

    // Transliterate and clean up, if requested.
    if (transliterate) {
        slug = transliterate(slug);
    }

    // Whitespace becomes dashes.
    slug = WHITESPACE.matcher(slug).replaceAll("-");
    // Decompose accented characters so the accents can be filtered out below.
    slug = Normalizer.normalize(slug, Form.NFD);
    // Drop all non-latin special characters.
    slug = NONLATIN.matcher(slug).replaceAll("");
    // Collapse consecutive dashes.
    slug = normalizeDashes(slug);

    // Validate against the untouched input before returning.
    validateResult(slug, input);

    // A slug is always lower case.
    return slug.toLowerCase(Locale.ENGLISH);
}
/**
 * Translates native transfer data, supplied either as a stream or as a byte array,
 * into an object of the requested flavor.
 *
 * <p>{@code CF_HTML} text data is routed through an {@code HTMLDecodingInputStream}.
 * {@code CF_URL} data requested as a {@link URL} is decoded to text and returned as a
 * {@code URL}. {@code CF_STRING} data is normalized to NFC (UTF-8 in, UTF-8 out).
 * Everything that is not returned directly is delegated to
 * {@code super.translateBytes}.</p>
 *
 * @param stream       the data as a stream, or {@code null} if {@code bytes} is used
 * @param bytes        the data as bytes, or {@code null} if {@code stream} is used
 * @param flavor       target flavor
 * @param format       native format identifier
 * @param transferable source transferable, may be {@code null}
 * @return the translated object
 * @throws IOException on decoding or translation failure
 */
protected Object translateBytesOrStream(InputStream stream, byte[] bytes, DataFlavor flavor, long format,
Transferable transferable) throws IOException
{
// 5-28-03 VL: [Radar 3266030]
// We need to do like Windows does here.
if (format == CF_HTML && flavor.isFlavorTextType()) {
// Make sure the data is available as a stream so it can be wrapped by the decoder.
if (stream == null) {
stream = new ByteArrayInputStream(bytes);
bytes = null;
}
stream = new HTMLDecodingInputStream(stream);
}
if (format == CF_URL && URL.class.equals(flavor.getRepresentationClass()))
{
// The URL text is built from bytes, so drain the stream if only a stream was given.
if (bytes == null) {
bytes = inputStreamToByteArray(stream);
stream = null;
}
String charset = getDefaultTextCharset();
// The transferable may name the charset itself (as UTF-8 encoded text).
if (transferable != null && transferable.isDataFlavorSupported(javaTextEncodingFlavor)) {
try {
charset = new String((byte[])transferable.getTransferData(javaTextEncodingFlavor), "UTF-8");
} catch (UnsupportedFlavorException cannotHappen) {
// Support was checked just above; the default charset stays in effect.
}
}
return new URL(new String(bytes, charset));
}
if (format == CF_STRING) {
// NOTE(review): bytes is used unconditionally here; if the caller supplied only a
// stream, bytes is still null at this point - confirm callers always pass bytes for CF_STRING.
bytes = Normalizer.normalize(new String(bytes, "UTF8"), Form.NFC).getBytes("UTF8");
}
// NOTE(review): only bytes is forwarded; the (possibly HTML-decoding) stream prepared
// above is not passed on - confirm this is intended.
return super.translateBytes(bytes, flavor, format, transferable);
}
/**
 * Reduces a name fragment to a simplified, lower-case comparison form.
 *
 * <p>Steps, in order: lower-casing with the default locale, hyphen normalization
 * (via {@code MCRHyphenNormalizer}) with hyphens turned into spaces, removal of
 * combining marks after canonical decomposition, collapsing of "ue"/"oe"/"ae"/"ß"/"ss",
 * replacement of punctuation by spaces, and whitespace collapsing.</p>
 *
 * @param nameFragment the fragment to normalize, must not be {@code null}
 * @return the normalized, trimmed fragment
 */
private String normalize(String nameFragment) {
    String result = nameFragment.toLowerCase(Locale.getDefault());
    result = new MCRHyphenNormalizer().normalize(result).replace("-", " ");

    // Canonical decomposition, then strip the accents (combining marks).
    result = Normalizer.normalize(result, Form.NFD);
    result = result.replaceAll("\\p{M}", "");

    // Collapse transliterated umlauts and sharp s.
    result = result.replace("ue", "u");
    result = result.replace("oe", "o");
    result = result.replace("ae", "a");
    result = result.replace("ß", "s");
    result = result.replace("ss", "s");

    // NOTE(review): as written this pattern only matches the three-character sequence
    // "<non-alphanumeric><whitespace>]"; presumably "[^a-z0-9\\s]" was intended.
    // Left unchanged because fixing it would alter the generated keys - confirm first.
    result = result.replaceAll("[^a-z0-9]\\s]", "");

    // Punctuation becomes a space, then runs of whitespace shrink to a single space.
    result = result.replaceAll("\\p{Punct}", " ").trim();
    result = result.replaceAll("\\s+", " ");
    return result.trim();
}
/**
 * Converts a string into a slug.
 *
 * <p>Matches of {@code Constant.WHITESPACE} become dashes, the text is NFD-decomposed,
 * matches of {@code Constant.NONLATIN} are removed and the result is lower-cased
 * (English locale).</p>
 *
 * @param input string to convert into a slug, must not be {@code null}
 * @return slug string
 */
public static String toSlug(String input) {
    final String dashed = Constant.WHITESPACE.matcher(input).replaceAll("-");
    final String decomposed = Normalizer.normalize(dashed, Form.NFD);
    final String filtered = Constant.NONLATIN.matcher(decomposed).replaceAll("");
    return filtered.toLowerCase(Locale.ENGLISH);
}
/**
 * Translates native transfer data, supplied either as a stream or as a byte array,
 * into an object of the requested flavor.
 *
 * <p>{@code CF_HTML} text data is routed through an {@code HTMLDecodingInputStream}.
 * {@code CF_URL} data requested as a {@link URL} is decoded to text and returned as a
 * {@code URL}. {@code CF_STRING} data is normalized to NFC (UTF-8 in, UTF-8 out).
 * Everything that is not returned directly is delegated to
 * {@code super.translateBytes}.</p>
 *
 * @param stream       the data as a stream, or {@code null} if {@code bytes} is used
 * @param bytes        the data as bytes, or {@code null} if {@code stream} is used
 * @param flavor       target flavor
 * @param format       native format identifier
 * @param transferable source transferable, may be {@code null}
 * @return the translated object
 * @throws IOException on decoding or translation failure
 */
protected Object translateBytesOrStream(InputStream stream, byte[] bytes, DataFlavor flavor, long format,
Transferable transferable) throws IOException
{
// 5-28-03 VL: [Radar 3266030]
// We need to do like Windows does here.
if (format == CF_HTML && flavor.isFlavorTextType()) {
// Make sure the data is available as a stream so it can be wrapped by the decoder.
if (stream == null) {
stream = new ByteArrayInputStream(bytes);
bytes = null;
}
stream = new HTMLDecodingInputStream(stream);
}
if (format == CF_URL && URL.class.equals(flavor.getRepresentationClass()))
{
// The URL text is built from bytes, so drain the stream if only a stream was given.
if (bytes == null) {
bytes = inputStreamToByteArray(stream);
stream = null;
}
String charset = getDefaultTextCharset();
// The transferable may name the charset itself (as UTF-8 encoded text).
if (transferable != null && transferable.isDataFlavorSupported(javaTextEncodingFlavor)) {
try {
charset = new String((byte[])transferable.getTransferData(javaTextEncodingFlavor), "UTF-8");
} catch (UnsupportedFlavorException cannotHappen) {
// Support was checked just above; the default charset stays in effect.
}
}
return new URL(new String(bytes, charset));
}
if (format == CF_STRING) {
// NOTE(review): bytes is used unconditionally here; if the caller supplied only a
// stream, bytes is still null at this point - confirm callers always pass bytes for CF_STRING.
bytes = Normalizer.normalize(new String(bytes, "UTF8"), Form.NFC).getBytes("UTF8");
}
// NOTE(review): only bytes is forwarded; the (possibly HTML-decoding) stream prepared
// above is not passed on - confirm this is intended.
return super.translateBytes(bytes, flavor, format, transferable);
}
/**
 * Converts a string into an underscore-separated slug.
 *
 * <p>Matches of {@code WHITESPACE} become dashes, the text is NFD-decomposed, matches
 * of {@code NONLATIN} are removed, and finally every dash is turned into an
 * underscore. Case is left untouched.</p>
 *
 * @param input the text to slugify, must not be {@code null}
 * @return the slug
 */
public static String slugify(String input) {
    final String dashed = WHITESPACE.matcher(input).replaceAll("-");
    final String decomposed = Normalizer.normalize(dashed, Form.NFD);
    return NONLATIN.matcher(decomposed)
            .replaceAll("")
            .replace('-', '_');
}
/**
 * Normalizes the input to NFKC and only then checks it for angle brackets.
 *
 * <p>Normalizing first matters: compatibility characters that NFKC maps to
 * {@code <} or {@code >} are caught by the check as well.</p>
 *
 * @param input the untrusted input
 * @return the normalized input
 * @throws IllegalStateException if the normalized input contains {@code <} or {@code >}
 */
private static String NormalizeThenValidate(String input) {
    // Normalize
    final String normalized = Normalizer.normalize(input, Form.NFKC);

    // Validate: angle brackets are black-listed
    final boolean hasAngleBracket = normalized.indexOf('<') >= 0 || normalized.indexOf('>') >= 0;
    if (hasAngleBracket) {
        throw new IllegalStateException();
    }

    System.out.println("valid input");
    return normalized;
}
/**
 * Normalizes a string to NFKC, replaces unassigned code points and rejects script tags.
 *
 * @param str the untrusted input
 * @return the normalized string with every code point of category {@code Cn}
 *         replaced by U+FFFD
 * @throws IllegalArgumentException if the filtered string contains {@code <script>}
 */
private static String filterString(String str) {
    final String normalized = Normalizer.normalize(str, Form.NFKC);

    // Replace every unassigned/noncharacter code point (category Cn) with U+FFFD.
    final String filtered = normalized.replaceAll("[\\p{Cn}]", "\uFFFD");

    // Validate only after filtering, so the check sees the final text.
    if (filtered.contains("<script>")) {
        throw new IllegalArgumentException("Invalid input");
    }
    return filtered;
}
/**
 * Slugifies a name.
 *
 * <p>Matches of {@code WHITESPACE} become underscores, the text is NFD-decomposed,
 * matches of {@code NONLATIN} are removed and the result is lower-cased (English
 * locale).</p>
 *
 * @param name name to be changed, must not be {@code null}.
 * @return Changed name.
 */
public static String slugify(String name) {
    final String underscored = WHITESPACE.matcher(name).replaceAll("_");
    final String decomposed = Normalizer.normalize(underscored, Form.NFD);
    return NONLATIN.matcher(decomposed)
            .replaceAll("")
            .toLowerCase(Locale.ENGLISH);
}
/**
 * Strips accents from a string by NFD-decomposing it and removing every match of
 * {@code ASCII_REPLACEMENT}.
 *
 * <p>If stripping leaves nothing of a non-empty input, the input is returned
 * unchanged instead.</p>
 *
 * @param string the text to process, must not be {@code null}
 * @return the text without accents, or the original text if everything was removed
 */
public static String replaceAccents(String string) {
    final String decomposed = Normalizer.normalize(string, Form.NFD);
    final String stripped = decomposed.replaceAll(ASCII_REPLACEMENT, EMPTY_STRING);

    //FIXME accent removal fails for russian. This is a quick fix
    final boolean everythingRemoved = stripped.isEmpty() && !string.isEmpty();
    return everythingRemoved ? string : stripped;
}
/**
 * Static helper method for normalizing a title: the title is canonically decomposed,
 * all non-ASCII characters (including the separated accents) are removed, and the
 * rest is converted to lower case.
 *
 * @param title the title to normalize, may be {@code null}
 * @return the normalized title, or {@code null} if {@code title} is {@code null}
 */
public static String normalizeTitle(final String title) {
    if (title == null) {
        return null;
    }
    final String decomposed = Normalizer.normalize(title, Form.NFD);
    final String asciiOnly = decomposed.replaceAll("[^\\p{ASCII}]", "");
    return asciiOnly.toLowerCase(Locale.US);
}
/**
 * Strongly normalizes a string: diacritics are stripped and everything except ASCII
 * letters and digits (so also spaces and punctuation) is removed; the result is then
 * passed to {@code uuid}.
 *
 * @param value the string to be normalized; for {@code null} a random UUID string is returned.
 * @return the value produced by {@code uuid} for the stripped string, or a random
 *         UUID string if {@code value} is {@code null}.
 */
public String normalizeStrong(final String value) {
    if (value == null) {
        return UUID.randomUUID().toString();
    }
    final String decomposed = Normalizer.normalize(value, Form.NFD);
    final String withoutMarks = decomposed.replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
    final String alphanumeric = withoutMarks.replaceAll("[^A-Za-z0-9]", "");
    return uuid(alphanumeric);
}
/**
 * Converts the given value to a string that can be used as local name in URIs.
 *
 * <p>Diacritics are stripped (canonical decomposition, then removal of combining
 * diacritical marks); space characters, the copyright sign, ASCII punctuation and
 * the replacement character U+FFFD are removed; the result is trimmed.</p>
 *
 * @param value the source string, may be {@code null}.
 * @return a string that can be used as local name in URIs, or {@code null} if
 *         {@code value} is {@code null}.
 */
public static String toURILocalName(final String value) {
    if (value == null) {
        return null;
    }
    String localName = Normalizer.normalize(value, Form.NFD);
    localName = localName.replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
    // Literal removals: plain spaces and the copyright sign.
    localName = localName.replace(" ", "");
    localName = localName.replace("©", "");
    // Pattern removals: ASCII punctuation and the Unicode replacement character.
    localName = localName.replaceAll("\\p{Punct}", "");
    localName = localName.replaceAll("\\uFFFD", "");
    return localName.trim();
}
/**
 * Converts a string into a lower-case slug.
 *
 * <p>Each whitespace character becomes a dash, the text is NFD-decomposed, every
 * character that is neither a word character ({@code [a-zA-Z_0-9]}) nor a dash is
 * removed, and the result is lower-cased with the English locale.</p>
 *
 * @param input the text to convert, may be {@code null}
 * @return the slug; an empty string for {@code null} or empty input
 */
public static String toSlug(String input) {
    if (input == null || input.isEmpty()) {
        return "";
    }
    final Pattern nonSlugChars = Pattern.compile("[^\\w-]");
    final Pattern whitespace = Pattern.compile("[\\s]");

    final String dashed = whitespace.matcher(input).replaceAll("-");
    final String decomposed = Normalizer.normalize(dashed, Form.NFD);
    return nonSlugChars.matcher(decomposed)
            .replaceAll("")
            .toLowerCase(Locale.ENGLISH);
}