下面列出了 org.apache.hadoop.io.Text#append() 的实例代码，或者点击链接到 GitHub 查看源代码，也可以在右侧发表评论。
/**
 * Fills {@code key} and {@code fragment} from one FASTA sequence line.
 *
 * The key is the current chromosome/fragment description followed by the start
 * position of this sequence line, with every tab replaced by a colon. The
 * fragment receives the same position and description plus the bytes of the line.
 *
 * @param line     the sequence line that was just read
 * @param key      output: cleared and rebuilt by this call
 * @param fragment output: cleared and refilled by this call
 */
private void scanFastaLine(Text line, Text key, ReferenceFragment fragment)
{
    // Build the key. We concatenate the chromosome/fragment description and
    // the start position of the FASTA sequence line, replacing the tabs with colons.
    key.clear();

    byte[] indexSeqBytes = current_split_indexseq.getBytes(UTF8);
    key.append(indexSeqBytes, 0, indexSeqBytes.length);

    byte[] positionBytes = Integer.toString(current_split_pos).getBytes(UTF8);
    key.append(positionBytes, 0, positionBytes.length);

    // replace tabs with :
    byte[] bytes = key.getBytes();
    int temporaryEnd = key.getLength();
    for (int i = 0; i < temporaryEnd; ++i) {
        if (bytes[i] == '\t') {
            bytes[i] = ':';
        }
    }

    fragment.clear();
    fragment.setPosition(current_split_pos);
    fragment.setIndexSequence(current_split_indexseq);
    // Only the first getLength() bytes of a Text backing array are valid; the
    // array itself may be longer, so its length must not be used here.
    fragment.getSequence().append(line.getBytes(), 0, line.getLength());
}
/**
 * Splits the key "123_456789" at the underscore by copying from the key's
 * backing array, and checks both halves.
 */
@Test
public void testSplit()
    throws Exception
{
    Text key = new Text("123_456789");
    // hard-split using array copy
    int i = key.find("_", 0);
    Text outputKey = new Text("");
    byte[] bytes = key.getBytes();
    // Only the first getLength() bytes of the backing array are valid, so the
    // suffix length is derived from getLength() rather than from bytes.length
    // (which depends on how much spare capacity the array happens to have).
    outputKey.append(bytes, i + 1, key.getLength() - i - 1);
    String fileName = new String(bytes, 0, i);
    assertEquals("123", fileName);
    assertEquals("456789", outputKey.toString());
}
/**
 * Appends one entry to {@code data}. The entry's column qualifier is
 * {@code intToBytes(chunkSize)} followed by the first four bytes of
 * {@code intToBytes(chunkCount)}; its value is the bytes of {@code value}.
 */
private static void addData(List<Entry<Key,Value>> data, String row, String cf, int chunkSize,
    int chunkCount, String vis, String value) {
  final Text qualifier = new Text(FileDataIngest.intToBytes(chunkSize));
  final byte[] countBytes = FileDataIngest.intToBytes(chunkCount);
  qualifier.append(countBytes, 0, 4);

  final Key entryKey = new Key(new Text(row), new Text(cf), qualifier, new Text(vis));
  data.add(new KeyValue(entryKey, value.getBytes()));
}
/**
 * Recursively builds a trie over a sorted array of cut points so that the
 * matching partition can be located quickly.
 *
 * Each inner node has 256 children. Child {@code ch} is built from the cut
 * points in the current range that compare less than {@code prefix + (ch + 1)};
 * child 255 takes whatever remains.
 *
 * @param splits the sorted cut points
 * @param lower the lower bound of partitions 0..numPartitions-1
 * @param upper the upper bound of partitions 0..numPartitions-1
 * @param prefix the prefix that has already been checked against
 * @param maxDepth the maximum depth to build the trie to
 * @return the trie node that divides the given range of splits
 */
private static TrieNode buildTrie(Text[] splits, int lower, int upper,
                                  Text prefix, int maxDepth) {
  final int depth = prefix.getLength();
  if (lower == upper || depth >= maxDepth) {
    return new LeafTrieNode(depth, splits, lower, upper);
  }

  final InnerTrieNode node = new InnerTrieNode(depth);

  // Working copy of the prefix, one byte longer; the extra byte is rewritten
  // for every child.
  final Text trial = new Text(prefix);
  trial.append(new byte[1], 0, 1);

  int boundary = lower;
  for (int ch = 0; ch < 255; ch++) {
    final int childLower = boundary;

    // Advance past every split that compares below prefix + (ch + 1).
    trial.getBytes()[depth] = (byte) (ch + 1);
    while (boundary < upper && splits[boundary].compareTo(trial) < 0) {
      boundary++;
    }

    // The child itself is built with prefix + ch.
    trial.getBytes()[depth] = (byte) ch;
    node.child[ch] = buildTrie(splits, childLower, boundary, trial, maxDepth);
  }

  // Everything left over belongs to the last child.
  trial.getBytes()[depth] = (byte) 255;
  node.child[255] = buildTrie(splits, boundary, upper, trial, maxDepth);
  return node;
}
/**
 * Formats a row id as text that is right-aligned in a 10-byte field.
 *
 * The id is narrowed to {@code int} and rendered in decimal. If the result is
 * shorter than 10 bytes it is preceded by bytes taken from {@code spaces}
 * (presumably space characters; the field is declared elsewhere). At most 10
 * bytes of the rendered id are kept.
 *
 * NOTE(review): the {@code (int)} cast wraps for ids above
 * {@code Integer.MAX_VALUE}; confirm callers never pass such ids.
 *
 * @param rowId the row id to format
 * @return a new {@code Text} holding the padded id
 */
private Text getRowIdString(long rowId) {
  final Text padded = new Text();
  final byte[] digits = Integer.toString((int) rowId).getBytes();

  final int padding = 10 - digits.length;
  if (padding > 0) {
    padded.append(spaces, 0, padding);
  }

  final int kept = Math.min(digits.length, 10);
  padded.append(digits, 0, kept);
  return padded;
}
/**
 * Builds, by recursion on the prefix length, a trie over a sorted array of
 * cut points that finds the correct partition quickly.
 *
 * @param splits the sorted cut points
 * @param lower the lower bound of partitions 0..numPartitions-1
 * @param upper the upper bound of partitions 0..numPartitions-1
 * @param prefix the prefix that has already been checked against
 * @param maxDepth the maximum depth to build the trie to
 * @return the trie node that divides the given range of splits
 */
private static TrieNode buildTrie(Text[] splits, int lower, int upper,
                                  Text prefix, int maxDepth) {
  final int depth = prefix.getLength();
  final boolean atLeaf = depth >= maxDepth || lower == upper;
  if (atLeaf) {
    return new LeafTrieNode(depth, splits, lower, upper);
  }

  final InnerTrieNode inner = new InnerTrieNode(depth);

  // Extend a copy of the prefix by one byte; that byte is overwritten below.
  final Text candidate = new Text(prefix);
  candidate.append(new byte[1], 0, 1);
  // The backing array is not reallocated after the append above, so it can be
  // fetched once and written through directly.
  final byte[] candidateBytes = candidate.getBytes();

  int cursor = lower;
  int ch = 0;
  while (ch < 255) {
    final int rangeStart = cursor;

    // Upper fence for this child: prefix + (ch + 1).
    candidateBytes[depth] = (byte) (ch + 1);
    for (; cursor < upper; cursor += 1) {
      if (splits[cursor].compareTo(candidate) >= 0) {
        break;
      }
    }

    // Prefix for this child: prefix + ch.
    candidateBytes[depth] = (byte) ch;
    inner.child[ch] = buildTrie(splits, rangeStart, cursor, candidate, maxDepth);
    ++ch;
  }

  // The last child picks up the rest of the range.
  candidateBytes[depth] = (byte) 255;
  inner.child[255] = buildTrie(splits, cursor, upper, candidate, maxDepth);
  return inner;
}
/**
 * Given a sorted set of cut points, build a trie that will find the correct
 * partition quickly.
 *
 * Each inner node has 256 children. Child {@code ch} covers the cut points in
 * the current range that compare less than {@code prefix + (ch + 1)} and is
 * built with the prefix {@code prefix + ch}; child 255 takes whatever remains.
 *
 * @param splits the list of cut points
 * @param lower the lower bound of partitions 0..numPartitions-1
 * @param upper the upper bound of partitions 0..numPartitions-1
 * @param prefix the prefix that we have already checked against
 * @param maxDepth the maximum depth we will build a trie for
 * @return the trie node that will divide the splits correctly
 */
private static TrieNode buildTrie(Text[] splits, int lower, int upper,
                                  Text prefix, int maxDepth) {
  int depth = prefix.getLength();
  if (depth >= maxDepth || lower == upper) {
    return new LeafTrieNode(depth, splits, lower, upper);
  }
  InnerTrieNode result = new InnerTrieNode(depth);
  Text trial = new Text(prefix);
  // append an extra byte on to the prefix
  trial.append(new byte[1], 0, 1);
  int currentBound = lower;
  for (int ch = 0; ch < 255; ++ch) {
    // fence for this child: everything below prefix + (ch + 1)
    trial.getBytes()[depth] = (byte) (ch + 1);
    lower = currentBound;
    while (currentBound < upper) {
      if (splits[currentBound].compareTo(trial) >= 0) {
        break;
      }
      currentBound += 1;
    }
    // the child is built with its own byte as the last prefix byte
    trial.getBytes()[depth] = (byte) ch;
    result.child[ch] = buildTrie(splits, lower, currentBound, trial,
                                 maxDepth);
  }
  // pick up the rest: child 255 must be built with prefix byte 255, exactly as
  // child ch is built with prefix byte ch above. A prefix byte of 127 here
  // would make the subtree compare against the wrong prefix.
  trial.getBytes()[depth] = (byte) 255;
  result.child[255] = buildTrie(splits, currentBound, upper, trial,
                                maxDepth);
  return result;
}
/**
 * Adds a single entry to {@code data}: the key is built from {@code row},
 * {@code cf}, a column qualifier and {@code vis}; the value is the bytes of
 * {@code value}. The qualifier is {@code intToBytes(chunkSize)} with the first
 * four bytes of {@code intToBytes(chunkCount)} appended.
 */
static void addData(List<Entry<Key,Value>> data, String row, String cf, int chunkSize,
    int chunkCount, String vis, String value) {
  // Qualifier = encoded chunk size, then encoded chunk count.
  Text sizeAndCount = new Text(FileDataIngest.intToBytes(chunkSize));
  sizeAndCount.append(FileDataIngest.intToBytes(chunkCount), 0, 4);

  Text rowText = new Text(row);
  Text familyText = new Text(cf);
  Text visibilityText = new Text(vis);
  Key key = new Key(rowText, familyText, sizeAndCount, visibilityText);

  KeyValue entry = new KeyValue(key, value.getBytes());
  data.add(entry);
}
/**
 * Returns a new {@code Text} consisting of {@code prefix} followed by the
 * encoded bytes of {@code str}. {@code prefix} itself is not modified.
 *
 * @param prefix the leading bytes; copied, never mutated
 * @param str the string to encode (via {@code Text.encode(str, false)}) and append
 * @return the concatenation as a fresh {@code Text}
 * @throws IllegalArgumentException if encoding {@code str} fails; the
 *         {@code CharacterCodingException} is kept as the cause
 */
private static Text concat(Text prefix, String str) {
  final Text combined = new Text(prefix);

  final ByteBuffer encoded;
  try {
    encoded = Text.encode(str, false);
  } catch (CharacterCodingException cce) {
    throw new IllegalArgumentException(cce);
  }

  // Only the bytes up to limit() belong to the encoded string.
  combined.append(encoded.array(), 0, encoded.limit());
  return combined;
}
/**
 * Given a sorted set of cut points, build a trie that will find the correct
 * partition quickly.
 *
 * Every inner node fans out into 256 children. For {@code ch} in 0..254 the
 * child receives the cut points of the current range that compare less than
 * {@code prefix + (ch + 1)}, and is built with {@code prefix + ch} as its
 * prefix. The final child (index 255) receives the remainder.
 *
 * @param splits the list of cut points
 * @param lower the lower bound of partitions 0..numPartitions-1
 * @param upper the upper bound of partitions 0..numPartitions-1
 * @param prefix the prefix that we have already checked against
 * @param maxDepth the maximum depth we will build a trie for
 * @return the trie node that will divide the splits correctly
 */
private static TrieNode buildTrie(Text[] splits, int lower, int upper,
                                  Text prefix, int maxDepth) {
  int depth = prefix.getLength();
  if (depth >= maxDepth || lower == upper) {
    return new LeafTrieNode(depth, splits, lower, upper);
  }
  InnerTrieNode result = new InnerTrieNode(depth);
  Text trial = new Text(prefix);
  // append an extra byte on to the prefix
  trial.append(new byte[1], 0, 1);
  int currentBound = lower;
  for (int ch = 0; ch < 255; ++ch) {
    // upper fence for child ch
    trial.getBytes()[depth] = (byte) (ch + 1);
    lower = currentBound;
    while (currentBound < upper) {
      if (splits[currentBound].compareTo(trial) >= 0) {
        break;
      }
      currentBound += 1;
    }
    // prefix handed to child ch
    trial.getBytes()[depth] = (byte) ch;
    result.child[ch] = buildTrie(splits, lower, currentBound, trial,
                                 maxDepth);
  }
  // pick up the rest. The prefix byte must be 255 to match the child index,
  // in the same way the loop pairs child[ch] with (byte) ch; using 127 would
  // build the last subtree against an unrelated prefix.
  trial.getBytes()[depth] = (byte) 255;
  result.child[255] = buildTrie(splits, currentBound, upper, trial,
                                maxDepth);
  return result;
}
/**
 * Given a path, construct an accumulo row prepended with the {@link #FORWARD_PREFIX} for the
 * index table.
 *
 * Only the last path component (the text after the final {@code "/"}) is used.
 *
 * @param path
 *          the full path of a file or directory
 * @return the accumulo row associated with this path, or {@code null} when the path has no last
 *         component (it is empty or ends in {@code "/"})
 */
public static Text getForwardIndex(String path) {
  String part = path.substring(path.lastIndexOf("/") + 1);
  if (part.length() == 0) {
    return null;
  }
  Text row = new Text(FORWARD_PREFIX);
  // Append by byte count, not character count: the two differ as soon as the name contains a
  // character that encodes to more than one byte, and the tail would be silently dropped.
  // NOTE(review): getBytes() uses the platform default charset, as before.
  byte[] partBytes = part.getBytes();
  row.append(partBytes, 0, partBytes.length);
  return row;
}
/**
 * Given a path, construct an accumulo row prepended with the {@link #REVERSE_PREFIX} with the
 * path reversed for the index table.
 *
 * Only the last path component (the text after the final {@code "/"}) is used, and it is
 * reversed byte by byte.
 *
 * @param path
 *          the full path of a file or directory
 * @return the accumulo row associated with this path, or {@code null} when the path has no last
 *         component (it is empty or ends in {@code "/"})
 */
public static Text getReverseIndex(String path) {
  String part = path.substring(path.lastIndexOf("/") + 1);
  if (part.length() == 0) {
    return null;
  }
  // Size the reversed buffer from the encoded bytes, not from the character count: a name
  // containing multi-byte characters has more bytes than characters and would otherwise run
  // off the front of the array.
  // NOTE(review): getBytes() uses the platform default charset, as before.
  byte[] forward = part.getBytes();
  byte[] rev = new byte[forward.length];
  for (int i = 0; i < forward.length; i++) {
    rev[forward.length - 1 - i] = forward[i];
  }
  Text row = new Text(REVERSE_PREFIX);
  row.append(rev, 0, rev.length);
  return row;
}
/**
 * Reads one LF-terminated line from the input into {@code str}. Reaching the end of the input
 * also ends a line that has no terminator.
 *
 * @param str
 *          receives the line; it is cleared first. The LF is stored only when
 *          {@code isNewLineIncluded()} returns true.
 * @param maxLineLength
 *          the most bytes that will be stored in {@code str}; any further bytes of the line are
 *          consumed but dropped.
 * @param maxBytesToConsume
 *          a soft limit on the bytes consumed by this call. It is only checked after each
 *          buffer's worth of data has been processed, so a long line can exceed it.
 *
 * @return the number of bytes consumed, including the newline if one was found.
 *
 * @throws IOException
 *           if the underlying stream throws, or if more than {@code Integer.MAX_VALUE} bytes
 *           were consumed
 */
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
  // The start of the stream may already be sitting in the buffer. Either the buffer holds no LF,
  // in which case all of it is copied and the buffer is refilled, or it holds a complete line,
  // which is copied up to the LF.
  str.clear();
  int stored = 0; // mirrors str.getLength()
  int terminatorLength = 0; // becomes 1 once an LF has been seen
  long consumed = 0;

  while (true) {
    int from = bufferPosn; // resume where the previous call stopped
    if (bufferPosn >= bufferLength) {
      bufferPosn = 0;
      from = 0;
      bufferLength = in.read(buffer);
      if (bufferLength <= 0) {
        break; // EOF
      }
    }

    // Scan for an LF; bufferPosn ends up just past the last byte examined.
    while (bufferPosn < bufferLength) {
      final boolean atLineFeed = buffer[bufferPosn] == LF;
      bufferPosn++;
      if (atLineFeed) {
        terminatorLength = 1;
        break;
      }
    }

    final int scanned = bufferPosn - from;
    consumed += scanned;

    int toAppend = isNewLineIncluded() ? scanned : scanned - terminatorLength;
    toAppend = Math.min(toAppend, maxLineLength - stored);
    if (toAppend > 0) {
      str.append(buffer, from, toAppend);
      stored += toAppend;
    }

    if (terminatorLength != 0 || consumed >= maxBytesToConsume) {
      break;
    }
  }

  if (consumed > Integer.MAX_VALUE) {
    throw new IOException("Too many bytes before newline: " + consumed);
  }
  return (int) consumed;
}
/**
 * Appends the first byte of {@code nullByte} to {@code t}, followed by the first eight bytes of
 * {@code s} as encoded by {@code LongCombiner.FIXED_LEN_ENCODER}.
 */
public static void textAppend(Text t, long s) {
  // separator first ...
  t.append(nullByte, 0, 1);
  // ... then the encoded long
  final byte[] encodedLong = LongCombiner.FIXED_LEN_ENCODER.encode(s);
  t.append(encodedLong, 0, 8);
}
/**
 * Read a line terminated by one of CR, LF, or CRLF.
 *
 * @param str the object that receives the line; it is cleared first and the
 *        terminator is never stored in it
 * @param maxLineLength the maximum number of bytes stored into str; bytes of
 *        the line beyond that are consumed but not stored
 * @param maxBytesToConsume soft limit on the bytes consumed by this call; it
 *        is only tested once per loop iteration, so it can be exceeded
 * @return the number of bytes consumed, including the terminator
 * @throws IOException if more than Integer.MAX_VALUE bytes were consumed
 *         (and, presumably, if fillBuffer fails - it is defined elsewhere)
 */
private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume)
throws IOException {
/* We're reading data from in, but the head of the stream may be
 * already buffered in buffer, so we have several cases:
 * 1. No newline characters are in the buffer, so we need to copy
 *    everything and read another buffer from the stream.
 * 2. An unambiguously terminated line is in buffer, so we just
 *    copy to str.
 * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
 *    in CR. In this case we copy everything up to CR to str, but
 *    we also need to see what follows CR: if it's LF, then we
 *    need to consume LF as well, so the next call to readLine will
 *    read from after that.
 * We use a flag prevCharCR to signal if the previous character was CR
 * and, if it happens to be at the end of the buffer, delay
 * consuming it until we have a chance to look at the char that
 * follows.
 */
str.clear();
int txtLength = 0; //tracks str.getLength(), as an optimization
int newlineLength = 0; //length of terminating newline
boolean prevCharCR = false; //true if prev char was CR
long bytesConsumed = 0;
do {
int startPosn = bufferPosn; //starting from where we left off the last time
if (bufferPosn >= bufferLength) {
// buffer exhausted: rewind and refill it
startPosn = bufferPosn = 0;
if (prevCharCR) {
++bytesConsumed; //account for CR from previous read
}
bufferLength = fillBuffer(in, buffer, prevCharCR);
if (bufferLength <= 0) {
break; // EOF
}
}
for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline
if (buffer[bufferPosn] == LF) {
// LF ends the line; together with a preceding CR it is a 2-byte CRLF
newlineLength = (prevCharCR) ? 2 : 1;
++bufferPosn; // at next invocation proceed from following byte
break;
}
if (prevCharCR) { //CR + notLF, we are at notLF
// the lone CR was the terminator; the current byte is left unconsumed
newlineLength = 1;
break;
}
prevCharCR = (buffer[bufferPosn] == CR);
}
int readLength = bufferPosn - startPosn;
if (prevCharCR && newlineLength == 0) {
--readLength; //CR at the end of the buffer
}
bytesConsumed += readLength;
// never store the terminator, and never exceed maxLineLength in total
int appendLength = readLength - newlineLength;
if (appendLength > maxLineLength - txtLength) {
appendLength = maxLineLength - txtLength;
}
if (appendLength > 0) {
str.append(buffer, startPosn, appendLength);
txtLength += appendLength;
}
} while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);
// the count is tracked as a long but returned as an int
if (bytesConsumed > (long) Integer.MAX_VALUE) {
throw new IOException("Too many bytes before newline: " + bytesConsumed);
}
return (int) bytesConsumed;
}
/**
 * Parses one tab-separated qseq line into {@code key} and {@code fragment}.
 *
 * The key is fields 0 through 5 followed by field 7, joined with colons.
 * The fragment fields are decoded from the positions and lengths that
 * {@code setFieldPositionsAndLengths} records for the line.
 *
 * @param line     the raw line
 * @param key      output: cleared and rebuilt by this call
 * @param fragment output: cleared and refilled by this call
 */
private void scanQseqLine(Text line, Text key, SequencedFragment fragment)
{
    setFieldPositionsAndLengths(line);
    final byte[] raw = line.getBytes();

    // Key, part 1: everything up to and including field 5, tabs turned into colons.
    key.clear();
    final int endOfField5 = fieldPositions[5] + fieldLengths[5];
    key.append(raw, 0, endOfField5);

    final byte[] keyBytes = key.getBytes();
    final int firstPartLength = key.getLength();
    for (int k = 0; k < firstPartLength; ++k)
    {
        if (keyBytes[k] == '\t')
            keyBytes[k] = ':';
    }

    // Key, part 2: field 7 together with the tab in front of it, which then
    // becomes a colon. The backing array is re-fetched because append may
    // have replaced it.
    key.append(raw, fieldPositions[7] - 1, fieldLengths[7] + 1);
    key.getBytes()[firstPartLength] = ':';

    // now the fragment
    try
    {
        fragment.clear();

        final String instrument = Text.decode(raw, fieldPositions[0], fieldLengths[0]);
        fragment.setInstrument(instrument);

        final String runNumber = Text.decode(raw, fieldPositions[1], fieldLengths[1]);
        fragment.setRunNumber(Integer.parseInt(runNumber));

        final String lane = Text.decode(raw, fieldPositions[2], fieldLengths[2]);
        fragment.setLane(Integer.parseInt(lane));

        final String tile = Text.decode(raw, fieldPositions[3], fieldLengths[3]);
        fragment.setTile(Integer.parseInt(tile));

        final String xpos = Text.decode(raw, fieldPositions[4], fieldLengths[4]);
        fragment.setXpos(Integer.parseInt(xpos));

        final String ypos = Text.decode(raw, fieldPositions[5], fieldLengths[5]);
        fragment.setYpos(Integer.parseInt(ypos));

        final String read = Text.decode(raw, fieldPositions[7], fieldLengths[7]);
        fragment.setRead(Integer.parseInt(read));

        fragment.setFilterPassed(raw[fieldPositions[10]] != '0');

        // A field 6 that starts with '0' stands for "no index sequence".
        final boolean nullIndexSequence = fieldLengths[6] > 0 && raw[fieldPositions[6]] == '0';
        if (nullIndexSequence)
        {
            fragment.setIndexSequence(null);
        }
        else
        {
            final String indexSequence = Text.decode(raw, fieldPositions[6], fieldLengths[6]);
            fragment.setIndexSequence(indexSequence.replace('.', 'N'));
        }
    }
    catch (CharacterCodingException e)
    {
        throw new FormatException("Invalid character format at " + makePositionMessage(this.pos - line.getLength()) + "; line: " + line);
    }

    fragment.getSequence().append(raw, fieldPositions[8], fieldLengths[8]);
    fragment.getQuality().append(raw, fieldPositions[9], fieldLengths[9]);
}
/**
 * Read a line terminated by a custom delimiter.
 *
 * @param str the object that receives the record; it is cleared first and
 *        the delimiter itself is not stored in it
 * @param maxLineLength the maximum number of bytes taken from the buffer into
 *        str; further bytes of the record are consumed but not stored
 * @param maxBytesToConsume soft limit on the bytes consumed by this call; it
 *        is only tested once per loop iteration, so it can be exceeded
 * @return the number of bytes consumed
 * @throws IOException if more than Integer.MAX_VALUE bytes were consumed
 *         (and, presumably, if fillBuffer fails - it is defined elsewhere)
 */
private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume)
throws IOException {
/* We're reading data from inputStream, but the head of the stream may be
 * already captured in the previous buffer, so we have several cases:
 *
 * 1. The buffer tail does not contain any character sequence which
 *    matches the head of the delimiter. We count this as
 *    ambiguous byte count = 0.
 *
 * 2. The buffer tail contains X characters that form a sequence
 *    matching the head of the delimiter.
 *    We count ambiguous byte count = X.
 *
 *    // *** eg: A segment of input file is as follows
 *
 *    " record 1792: I found this bug very interesting and
 *    I have completely read about it. record 1793: This bug
 *    can be solved easily record 1794: This ."
 *
 *    delimiter = "record";
 *
 *    supposing:- String at the end of buffer =
 *    "I found this bug very interesting and I have completely re"
 *    Therefore next buffer = "ad about it. record 179 ...."
 *
 *    The matching characters in the input
 *    buffer tail and delimiter head = "re"
 *    Therefore, ambiguous byte count = 2 **** //
 *
 *    2.1 If the following bytes are the remaining characters of
 *        the delimiter, then we have to capture only up to the starting
 *        position of the delimiter. That means we need not include the
 *        ambiguous characters in str.
 *
 *    2.2 If the following bytes are not the remaining characters of
 *        the delimiter (as in the example above),
 *        then we have to include the ambiguous characters in str.
 */
str.clear();
int txtLength = 0; // tracks str.getLength(), as an optimization
long bytesConsumed = 0;
int delPosn = 0; // number of leading delimiter bytes matched so far
int ambiguousByteCount=0; // To capture the ambiguous characters count
do {
int startPosn = bufferPosn; // Start from previous end position
if (bufferPosn >= bufferLength) {
// buffer exhausted: rewind and refill it
startPosn = bufferPosn = 0;
bufferLength = fillBuffer(in, buffer, ambiguousByteCount > 0);
if (bufferLength <= 0) {
// EOF: a pending partial delimiter match turned out to be ordinary data
str.append(recordDelimiterBytes, 0, ambiguousByteCount);
break; // EOF
}
}
for (; bufferPosn < bufferLength; ++bufferPosn) {
if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) {
delPosn++;
if (delPosn >= recordDelimiterBytes.length) {
// whole delimiter matched; resume after it on the next call
bufferPosn++;
break;
}
} else if (delPosn != 0) {
// partial match failed: step back one byte so the current byte is
// re-examined as a possible start of the delimiter.
// NOTE(review): this backs up by one byte rather than by delPosn, so a
// delimiter whose prefix repeats (e.g. "aab" in the input "aaab") looks
// like it can be missed - confirm.
bufferPosn--;
delPosn = 0;
}
}
int readLength = bufferPosn - startPosn;
bytesConsumed += readLength;
// exclude the (possibly partial) delimiter match and respect maxLineLength
int appendLength = readLength - delPosn;
if (appendLength > maxLineLength - txtLength) {
appendLength = maxLineLength - txtLength;
}
if (appendLength > 0) {
if (ambiguousByteCount > 0) {
str.append(recordDelimiterBytes, 0, ambiguousByteCount);
//appending the ambiguous characters (refer case 2.2)
bytesConsumed += ambiguousByteCount;
ambiguousByteCount=0;
}
str.append(buffer, startPosn, appendLength);
txtLength += appendLength;
}
if (bufferPosn >= bufferLength) {
// the buffer ended inside a partial delimiter match: remember how many
// bytes are undecided until the next buffer has been inspected
if (delPosn > 0 && delPosn < recordDelimiterBytes.length) {
ambiguousByteCount = delPosn;
bytesConsumed -= ambiguousByteCount; //to be consumed in next
}
}
} while (delPosn < recordDelimiterBytes.length
&& bytesConsumed < maxBytesToConsume);
// the count is tracked as a long but returned as an int
if (bytesConsumed > Integer.MAX_VALUE) {
throw new IOException("Too many bytes before delimiter: " + bytesConsumed);
}
return (int) bytesConsumed;
}
/**
 * Appends a one-byte delimiter and then the MD5 hash of the serialized
 * statement to the key.
 *
 * Because the hash is computed from the statement alone, identical statements
 * always extend a key with the same bytes, while different statements are
 * very unlikely to. Should MD5 collisions ever become a concern, a stronger
 * hash such as SHA-1 could be substituted.
 *
 * @param statement the statement whose serialized form is hashed
 * @param keyText the key to extend in place
 */
public static void appendUniqueness(final Statement statement, final Text keyText) {
  // Delimiter: the first byte of HASH_PREFIX.
  keyText.append(HASH_PREFIX, 0, 1);

  final String serialized = StatementSerializer.writeStatement(statement);
  final byte[] serializedBytes = StringUtils.getBytesUtf8(serialized);
  final byte[] digest = Md5Hash.md5Binary(new Value(serializedBytes));

  keyText.append(digest, 0, digest.length);
}
/**
 * Extends the given text in place with the full contents of {@code nullByte}.
 *
 * @param text
 *          the text to which the null byte is appended
 */
public static void appendNullByte(Text text) {
  final byte[] terminator = nullByte;
  final int count = terminator.length;
  text.append(terminator, 0, count);
}
/**
 * Given a path, construct an accumulo row prepended with the path's depth for the directory
 * table.
 *
 * The depth, as returned by {@code getDepth(path)}, is formatted as at least three zero-padded
 * decimal digits and is followed by the bytes of the path.
 *
 * @param path
 *          the full path of a file or directory
 * @return the accumulo row associated with this path
 */
public static Text getRow(String path) {
  Text row = new Text(String.format("%03d", getDepth(path)));
  // Append by byte count, not character count: the two differ as soon as the path contains a
  // character that encodes to more than one byte, and the tail would be silently dropped.
  // NOTE(review): getBytes() uses the platform default charset, as before.
  byte[] pathBytes = path.getBytes();
  row.append(pathBytes, 0, pathBytes.length);
  return row;
}