下面列出了 org.apache.hadoop.io.Text#getBytes() 的实例代码，或者点击链接到 GitHub 查看源代码，也可以在右侧发表评论。
/**
 * Emits one line of TSV text keyed by the row key that the parser locates in it.
 *
 * <p>The row key wraps a region of the line's backing byte array; the region is described by
 * the first and second members of the pair returned by {@code parser.parseRowKey}. The line
 * itself is written unchanged as the output value.
 *
 * @param offset value reported in the bad-line diagnostic
 * @param value the TSV line
 * @param context receives the (row key, line) pair
 * @throws IOException wrapping the parse failure when {@code skipBadLines} is off
 */
@Override
public void map(LongWritable offset, Text value, Context context) throws IOException {
    try {
        final byte[] lineBytes = value.getBytes();
        final Pair<Integer,Integer> keySpan = parser.parseRowKey(lineBytes, value.getLength());
        final ImmutableBytesWritable rowKey =
            new ImmutableBytesWritable(lineBytes, keySpan.getFirst(), keySpan.getSecond());
        context.write(rowKey, value);
    } catch (ImportTsv.TsvParser.BadTsvLineException|IllegalArgumentException badLine) {
        // Optionally echo the offending line, then always report where and why it failed.
        if (logBadLines) {
            System.err.println(value);
        }
        System.err.println("Bad line at offset: " + offset.get() + ":\n" + badLine.getMessage());
        if (!skipBadLines) {
            throw new IOException(badLine);
        }
        incrementBadLineCount(1);
    } catch (InterruptedException e) {
        LOG.error("Interrupted while emitting TSV text", e);
        // Restore the interrupt flag for the caller.
        Thread.currentThread().interrupt();
    }
}
/**
 * Runs the regression tree identified by {@code modelId} against {@code features}.
 *
 * <p>The tree is rebuilt from {@code script} (Base91-decoded, then deserialized) only when
 * {@code modelId} differs from the previously seen id; otherwise the cached tree is reused.
 *
 * @param modelId identifier used to decide whether the cached tree is still current
 * @param script Base91-encoded serialized tree
 * @param features input handed to the tree's {@code predict}
 * @return whatever {@code handler.getResult()} yields after the prediction
 * @throws HiveException declared for the decoding and deserialization steps
 */
@Nonnull
public List<String> evaluate(@Nonnull final String modelId, @Nonnull final Text script,
        @Nonnull final Vector features) throws HiveException {
    final boolean cachedTreeIsCurrent = modelId.equals(prevModelId);
    if (!cachedTreeIsCurrent) {
        this.prevModelId = modelId;
        // Only the first getLength() bytes of the Text buffer are valid.
        final byte[] serialized = Base91.decode(script.getBytes(), 0, script.getLength());
        this.rNode = RegressionTree.deserialize(serialized, serialized.length, true);
    }
    Preconditions.checkNotNull(rNode);

    handler.init();
    rNode.predict(features, handler);
    return handler.getResult();
}
/**
 * Convert a Text object holding a tileId back to a long.
 *
 * <p>Up to the first eight valid bytes of the Text are copied into an eight-byte buffer;
 * when the Text is shorter, the remaining trailing bytes stay zero. The buffer is then read
 * as a long through {@link ByteBuffer}.
 *
 * @param rowId Text object to convert.
 * @return the long value from the Text object.
 */
public static long toLong(Text rowId)
{
    // A freshly allocated array is already zero-filled, which supplies the padding.
    final byte[] padded = new byte[8];
    final int copyLen = Math.min(rowId.getLength(), padded.length);
    System.arraycopy(rowId.getBytes(), 0, padded, 0, copyLen);
    return ByteBuffer.wrap(padded).getLong();
}
/**
 * Runs the decision tree identified by {@code modelId} against {@code features}.
 *
 * <p>The tree is rebuilt from {@code script} (Base91-decoded, then deserialized) only when
 * {@code modelId} differs from the previously seen id. The shared {@code result} array is
 * cleared on every call and filled by the leaf visitor: slot 0 receives the output wrapped
 * in an {@code IntWritable}, slot 1 the posteriori converted by {@code WritableUtils}.
 *
 * @param modelId identifier used to decide whether the cached tree is still current
 * @param script Base91-encoded serialized tree
 * @param features input handed to the tree's {@code predict}
 * @return the shared {@code result} array
 * @throws HiveException declared for the decoding and deserialization steps
 */
@Nonnull
public Object[] evaluate(@Nonnull final String modelId, @Nonnull final Text script,
        @Nonnull final Vector features) throws HiveException {
    final boolean cachedTreeIsCurrent = modelId.equals(prevModelId);
    if (!cachedTreeIsCurrent) {
        this.prevModelId = modelId;
        // Only the first getLength() bytes of the Text buffer are valid.
        final byte[] serialized = Base91.decode(script.getBytes(), 0, script.getLength());
        this.cNode = DecisionTree.deserialize(serialized, serialized.length, true);
    }

    // Clear the previous call's output before validating the tree.
    Arrays.fill(result, null);
    Preconditions.checkNotNull(cNode);

    final PredictionHandler leafCollector = new PredictionHandler() {
        public void visitLeaf(int output, double[] posteriori) {
            result[0] = new IntWritable(output);
            result[1] = WritableUtils.toWritableList(posteriori);
        }
    };
    cNode.predict(features, leafCollector);
    return result;
}
/**
 * Chooses the partition for a {@code SelfDefineSortableKey}.
 *
 * <p>On first use, {@code init()} is run under the {@code SparkFactDistinct.class} lock. Keys
 * whose first byte equals {@code MARK_FOR_HLL_COUNTER} are routed through the reducer mapping
 * using the long read from the bytes starting at index 1; every other key is routed by its
 * first byte read as an unsigned value.
 *
 * @param o the key; must be a {@code SelfDefineSortableKey}
 * @return the partition index
 */
@Override
public int getPartition(Object o) {
    if (!initialized) {
        synchronized (SparkFactDistinct.class) {
            // Re-check under the lock so init() runs once.
            if (!initialized) {
                init();
            }
        }
    }

    final byte[] keyBytes = ((SelfDefineSortableKey) o).getText().getBytes();
    final boolean isHllCounterKey =
        keyBytes[0] == FactDistinctColumnsReducerMapping.MARK_FOR_HLL_COUNTER;
    if (!isHllCounterKey) {
        return BytesUtil.readUnsigned(keyBytes, 0, 1);
    }
    Long cuboidId = Bytes.toLong(keyBytes, 1, Bytes.SIZEOF_LONG);
    return reducerMapping.getReducerIdForCuboidRowCount(cuboidId);
}
/**
 * Checks {@code TextUtil.textAppend} with a string containing multi-byte characters: the
 * Text must grow by one byte plus the UTF-8 length of the string, the first appended byte
 * must be zero, and the remaining appended bytes must equal the string's UTF-8 encoding.
 */
@Test
public void testAppend_multiByteChars() throws UnsupportedEncodingException {
    final byte[] prefixBytes = "prefix\u6C34".getBytes("UTF-8");
    final int prefixLength = prefixBytes.length;
    final Text text = new Text(prefixBytes);

    // Mix of one-, three- and four-byte UTF-8 characters (the last one is a surrogate pair).
    final String appended = "\u007A\u6C34\uD834\uDD1E";
    TextUtil.textAppend(text, appended);

    final byte[] expectedTail = appended.getBytes("UTF-8");
    final int grownBy = text.getLength() - prefixLength;
    Assert.assertEquals("Length was wrong", 1 + expectedTail.length, grownBy);

    final byte[] textBytes = text.getBytes();
    Assert.assertEquals("First byte was wrong", (byte) 0, textBytes[prefixLength]);

    // Everything after the single zero byte should be the appended string itself.
    final byte[] actualTail = new byte[grownBy - 1];
    System.arraycopy(textBytes, prefixLength + 1, actualTail, 0, actualTail.length);
    Assert.assertArrayEquals("Contents were wrong", expectedTail, actualTail);
}
/**
 * Convert a line of TSV text into an HBase table row after transforming the
 * values by multiplying them by 3.
 *
 * <p>The line is split on the ESC character (0x1B). Token 0 becomes the row key; every later
 * token is expected to look like {@code VALUE<n>} and is stored as {@code VALUE<3n>} under
 * family {@code FAM}, token 1 under qualifier {@code A} and token 2 under qualifier {@code B}.
 *
 * @param offset unused
 * @param value the input line; only its first {@code getLength()} bytes are read
 * @param context receives the (row key, Put) pair
 * @throws IOException if the context fails to write
 */
@Override
public void map(LongWritable offset, Text value, Context context)
        throws IOException {
    byte[] family = Bytes.toBytes("FAM");
    final byte[][] qualifiers = { Bytes.toBytes("A"), Bytes.toBytes("B") };

    // Text.getBytes() exposes the backing array, which may be longer than the current
    // record when the Text instance is reused; decode only the valid prefix.
    String line = new String(value.getBytes(), 0, value.getLength(), StandardCharsets.UTF_8);
    String[] valueTokens = line.split("\u001b");

    // create the rowKey and Put
    ImmutableBytesWritable rowKey =
        new ImmutableBytesWritable(Bytes.toBytes(valueTokens[0]));
    Put put = new Put(rowKey.copyBytes());
    put.setDurability(Durability.SKIP_WAL);

    // The value should look like this: VALUE1 or VALUE2. Let's multiply
    // the integer by 3
    for (int i = 1; i < valueTokens.length; i++) {
        String prefix = valueTokens[i].substring(0, "VALUE".length());
        String suffix = valueTokens[i].substring("VALUE".length());
        String newValue = prefix + Integer.parseInt(suffix) * 3;

        KeyValue kv = new KeyValue(rowKey.copyBytes(), family,
            qualifiers[i - 1], Bytes.toBytes(newValue));
        put.add(kv);
    }

    try {
        context.write(rowKey, put);
    } catch (InterruptedException e) {
        e.printStackTrace();
        // Restore the interrupt flag so the caller can still observe the interruption.
        Thread.currentThread().interrupt();
    }
}
/**
 * Runs the mapper on a record containing {@code \N} markers and checks the emitted row key:
 * its 26-byte header (8-byte cuboid id followed by an 18-byte seller id), the 22 bytes that
 * follow, the decoded column values, and the measures.
 */
@Test
public void testMapperWithNull() throws Exception {
    String cubeName = "test_kylin_cube_with_slr_1_new_segment";
    String segmentName = "20130331080000_20131212080000";
    mapDriver.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
    mapDriver.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_NAME, segmentName);
    // mapDriver.getConfiguration().set(BatchConstants.CFG_METADATA_URL,
    // metadata);
    mapDriver.withInput(new Text("key"), new Text("2012-12-15118480Health & BeautyFragrances\\NAuction15123456789\\N"));
    List<Pair<Text, Text>> result = mapDriver.run();
    CubeManager cubeMgr = CubeManager.getInstance(getTestConfig());
    CubeInstance cube = cubeMgr.getCube(cubeName);
    assertEquals(1, result.size());

    Text rowkey = result.get(0).getFirst();
    // Text.getBytes() returns the backing array, which is only valid up to getLength();
    // trim it once so every slice below works on exactly the row key bytes.
    byte[] key = Bytes.head(rowkey.getBytes(), rowkey.getLength());
    byte[] header = Bytes.head(key, 26);
    byte[] sellerId = Bytes.tail(header, 18);
    byte[] cuboidId = Bytes.head(header, 8);
    byte[] restKey = Bytes.tail(key, key.length - 26);

    RowKeyDecoder decoder = new RowKeyDecoder(cube.getFirstSegment());
    decoder.decode(key);
    assertEquals("[123456789, 2012-12-15, 11848, Health & Beauty, Fragrances, null, Auction, 0, 15]", decoder.getValues().toString());
    assertTrue(Bytes.toString(sellerId).startsWith("123456789"));
    assertEquals(511, Bytes.toLong(cuboidId));
    assertEquals(22, restKey.length);
    verifyMeasures(cube.getDescriptor().getMeasures(), result.get(0).getSecond(), "0", "0", "0");
}
/**
 * Write a String as a VInt n, followed by n bytes as in Text format.
 *
 * <p>A {@code null} string is written as the single VInt {@code -1} with no payload.
 *
 * @param out destination to write to
 * @param s string to write; may be {@code null}
 * @throws IOException if writing to {@code out} fails
 */
public static void writeString(DataOutput out, String s) throws IOException {
    if (s == null) {
        writeVInt(out, -1);
        return;
    }
    final Text encoded = new Text(s);
    final int byteCount = encoded.getLength();
    writeVInt(out, byteCount);
    // Only the first byteCount bytes of the Text buffer belong to the string.
    out.write(encoded.getBytes(), 0, byteCount);
}
/**
 * Copies a Hive varchar field into slot {@code outputIndex} of a {@code VarCharVector}.
 *
 * <p>The field's Text value is obtained through the {@code HiveVarcharObjectInspector}, its
 * byte length is passed to {@code checkSizeLimit}, and then the first {@code getLength()}
 * bytes of its buffer are stored.
 */
@Override
public void setSafeValue(ObjectInspector oi, Object hiveFieldValue, ValueVector outputVV, int outputIndex) {
    final HiveVarcharObjectInspector inspector = (HiveVarcharObjectInspector) oi;
    final Text text = inspector.getPrimitiveWritableObject(hiveFieldValue).getTextValue();
    final int byteCount = text.getLength();
    checkSizeLimit(byteCount);
    final VarCharVector target = (VarCharVector) outputVV;
    target.setSafe(outputIndex, text.getBytes(), 0, byteCount);
}
/**
 * Copies a Hive string field into slot {@code outputIndex} of a {@code VarCharVector}.
 *
 * <p>The field's Text value is obtained through the {@code StringObjectInspector}, its byte
 * length is passed to {@code checkSizeLimit}, and then the first {@code getLength()} bytes
 * of its buffer are stored.
 */
@Override
public void setSafeValue(ObjectInspector oi, Object hiveFieldValue, ValueVector outputVV, int outputIndex) {
    final StringObjectInspector inspector = (StringObjectInspector) oi;
    final Text text = inspector.getPrimitiveWritableObject(hiveFieldValue);
    final int byteCount = text.getLength();
    checkSizeLimit(byteCount);
    final VarCharVector target = (VarCharVector) outputVV;
    target.setSafe(outputIndex, text.getBytes(), 0, byteCount);
}
/**
 * Feeds three cardinality records (subject, predicate, subject-predicate) to
 * {@code CardinalityMapper} and checks the composite key and {@code TripleCard} emitted for
 * each one.
 */
@Test
public void testOutput() throws InterruptedException, IOException {
    String s = "urn:gem:etype#1234";
    String p = "urn:gem#pred";
    String ts = "798497748386999999";

    Text t1 = new Text(TripleValueType.subject.name() + DELIM + s + DELIM + 1);
    Text t2 = new Text(TripleValueType.predicate.name() + DELIM + p + DELIM + 2);
    Text t3 = new Text(TripleValueType.subjectpredicate.name() + DELIM + s + DELIM + p + DELIM + ts);

    byte[] c = "25".getBytes();
    byte[] d = "47".getBytes();
    byte[] e = "15".getBytes();

    // Build the keys from the Text objects rather than from Text.getBytes(): the backing
    // array is only valid up to getLength(), and the Text-based constructor copies exactly
    // that many bytes into the row. Family, qualifier and visibility stay empty.
    Text empty = new Text();
    Key key1 = new Key(t1, empty, empty, empty, 1);
    Key key2 = new Key(t2, empty, empty, empty, 1);
    Key key3 = new Key(t3, empty, empty, empty, 1);
    Value val1 = new Value(c);
    Value val2 = new Value(d);
    Value val3 = new Value(e);

    // System.out.println("Keys are " + key1 + " and " + key2);

    new MapDriver<Key, Value, CompositeType, TripleCard>()
        .withMapper(new JoinSelectProspectOutput.CardinalityMapper())
        .withInput(key1, val1)
        .withInput(key2, val2)
        .withInput(key3, val3)
        .withOutput(new CompositeType(s, 1), new TripleCard(new CardinalityType(25, "subject", 1)))
        .withOutput(new CompositeType(p, 1), new TripleCard(new CardinalityType(47, "predicate", 2)))
        .withOutput(new CompositeType(s + DELIM + p, 1),
            new TripleCard(new CardinalityType(15, "subjectpredicate", Long.parseLong(ts)))).runTest();
}
/**
 * Initializes the filter and derives the cutoff date from the mandatory {@code ttl} option.
 *
 * <p>{@code ttl} is parsed as an integer and handed to {@code getCutoffDate}; the formatted
 * result is kept as a byte buffer ({@code cutoffDate}) together with its valid length
 * ({@code cutoffDateLen}).
 *
 * @throws IllegalArgumentException if {@code options} is {@code null} or has no {@code ttl}
 * @throws IOException declared by the overridden method
 */
@Override
public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> options, IteratorEnvironment env) throws IOException {
    super.init(source, options, env);

    // A missing options map and a missing "ttl" entry are reported identically.
    final String ttl = (options == null) ? null : options.get("ttl");
    if (ttl == null) {
        throw new IllegalArgumentException("ttl must be set for DateBasedAgeOffFilter");
    }

    final Text formattedCutoff = new Text(DateHelper.format(getCutoffDate(Integer.parseInt(ttl))));
    cutoffDate = formattedCutoff.getBytes();
    cutoffDateLen = formattedCutoff.getLength();
}
/**
 * Copies a Hive char field into slot {@code outputIndex} of a {@code VarCharVector}.
 *
 * <p>The field's stripped Text value is obtained through the {@code HiveCharObjectInspector},
 * its byte length is passed to {@code checkSizeLimit}, and then the first {@code getLength()}
 * bytes of its buffer are stored.
 */
@Override
public void setSafeValue(ObjectInspector oi, Object hiveFieldValue, ValueVector outputVV, int outputIndex) {
    final HiveCharObjectInspector inspector = (HiveCharObjectInspector) oi;
    final Text text = inspector.getPrimitiveWritableObject(hiveFieldValue).getStrippedValue();
    final int byteCount = text.getLength();
    checkSizeLimit(byteCount);
    final VarCharVector target = (VarCharVector) outputVV;
    target.setSafe(outputIndex, text.getBytes(), 0, byteCount);
}
/**
 * Writes one HBase {@code Put} per key, built from the first value of the group.
 *
 * <p>The first value is split with {@code split_str}; field 0 is stored in the year-rate
 * column and field 1 in the repay-limit-time column of the configured column family.
 *
 * @param key row key; only its first {@code getLength()} bytes are used
 * @param values values for the key; only the first one is read
 * @param context receives the Put with a {@code NullWritable} key
 * @throws IOException if the context fails to write
 * @throws InterruptedException if the write is interrupted
 */
public void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    Text text = values.iterator().next();
    String[] columns = text.toString().split(split_str);

    // Text.getBytes() returns the backing array, which can be longer than the key when the
    // Text instance is reused between calls; copy only the valid prefix for the row key.
    byte[] rowKey = Bytes.head(key.getBytes(), key.getLength());
    Put put = new Put(rowKey);
    put.add(Bytes.toBytes(Constants.hbase_column_family),
        Bytes.toBytes(Constants.hbase_column_yearrate),
        Bytes.toBytes(columns[0]));
    put.add(Bytes.toBytes(Constants.hbase_column_family),
        Bytes.toBytes(Constants.hbase_column_repaylimittime),
        Bytes.toBytes(columns[1]));
    context.write(NullWritable.get(), put);
}
/**
 * Runs the mapper on a fully populated record and checks the emitted row key: its 26-byte
 * header (8-byte cuboid id followed by an 18-byte seller id), the 22 bytes that follow, the
 * decoded column values, and the measures.
 */
@Test
public void testMapperWithHeader() throws Exception {
    String cubeName = "test_kylin_cube_with_slr_1_new_segment";
    String segmentName = "20130331080000_20131212080000";
    mapDriver.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
    mapDriver.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_NAME, segmentName);
    // mapDriver.getConfiguration().set(BatchConstants.CFG_METADATA_URL,
    // metadata);
    mapDriver.withInput(new Text("key"), new Text("2012-12-15118480Health & BeautyFragrancesWomenAuction15123456789132.33"));
    List<Pair<Text, Text>> result = mapDriver.run();
    CubeManager cubeMgr = CubeManager.getInstance(getTestConfig());
    CubeInstance cube = cubeMgr.getCube(cubeName);
    assertEquals(1, result.size());

    Text rowkey = result.get(0).getFirst();
    // Text.getBytes() returns the backing array, which is only valid up to getLength();
    // trim it once so every slice below works on exactly the row key bytes.
    byte[] key = Bytes.head(rowkey.getBytes(), rowkey.getLength());
    byte[] header = Bytes.head(key, 26);
    byte[] sellerId = Bytes.tail(header, 18);
    byte[] cuboidId = Bytes.head(header, 8);
    byte[] restKey = Bytes.tail(key, key.length - 26);

    RowKeyDecoder decoder = new RowKeyDecoder(cube.getFirstSegment());
    decoder.decode(key);
    assertEquals("[123456789, 2012-12-15, 11848, Health & Beauty, Fragrances, Women, Auction, 0, 15]", decoder.getValues().toString());
    assertTrue(Bytes.toString(sellerId).startsWith("123456789"));
    assertEquals(511, Bytes.toLong(cuboidId));
    assertEquals(22, restKey.length);
    verifyMeasures(cube.getDescriptor().getMeasures(), result.get(0).getSecond(), "132.33", "132.33", "132.33");
}
/**
 * Predicts a value for {@code features} with the regression tree identified by
 * {@code modelId}.
 *
 * <p>The tree is rebuilt from {@code script} (Base91-decoded, then deserialized) only when
 * {@code modelId} differs from the previously seen id; otherwise the cached tree is reused.
 *
 * @param modelId identifier used to decide whether the cached tree is still current
 * @param compressed forwarded to {@code deserializeRegressionTree}
 * @param script Base91-encoded serialized tree
 * @param features input handed to the tree's {@code predict}
 * @return the prediction wrapped in a {@code DoubleWritable}
 * @throws HiveException declared for the decoding and deserialization steps
 */
private DoubleWritable evaluateRegression(@Nonnull String modelId, boolean compressed,
        @Nonnull Text script, double[] features) throws HiveException {
    final boolean cachedTreeIsCurrent = modelId.equals(prevModelId);
    if (!cachedTreeIsCurrent) {
        this.prevModelId = modelId;
        // Only the first getLength() bytes of the Text buffer are valid.
        final byte[] serialized = Base91.decode(script.getBytes(), 0, script.getLength());
        this.rNode = deserializeRegressionTree(serialized, serialized.length, compressed);
    }
    assert (rNode != null);
    return new DoubleWritable(rNode.predict(features));
}
/**
 * Reads the next record from {@code in} and splits it into fields, honouring double-quoted
 * fields.
 *
 * <p>The record's bytes are scanned once. Outside quotes, {@code FIELD_DEL} ends the current
 * field. A {@code DOUBLE_QUOTE} seen outside quotes starts a quoted field; inside a quoted
 * field a second consecutive quote emits one literal quote, and a {@code FIELD_DEL} or
 * {@code RECORD_DEL} that follows an unpaired quote ends the field. Completed fields are
 * handed to {@code readField} together with a running field index.
 *
 * @return a tuple built from {@code mProtoTuple}, or {@code null} when {@code in} has no
 *         more records
 * @throws IOException as an {@code ExecException} (code 6018) when reading is interrupted
 */
@Override
public Tuple getNext() throws IOException {
mProtoTuple = new ArrayList<Object>();
// Scanner state: inField = bytes seen since the last field was flushed;
// inQuotedField = currently inside a quoted field;
// evenQuotesSeen = the quotes seen so far inside the quoted field pair up.
boolean inField = false;
boolean inQuotedField = false;
boolean evenQuotesSeen = true;
// One-time load of the required-column flags from the UDF properties stored under `signature`.
if (!mRequiredColumnsInitialized) {
if (signature != null) {
Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
mRequiredColumns = (boolean[])ObjectSerializer.deserialize(p.getProperty(signature));
}
mRequiredColumnsInitialized = true;
}
try {
if (!in.nextKeyValue()) {
return null;
}
Text value = (Text) in.getCurrentValue();
// Only the first `len` bytes of the Text backing array are scanned.
byte[] buf = value.getBytes();
int len = value.getLength();
int fieldID = 0;
// A field can never be longer than the record, so `len` bytes is enough capacity.
ByteBuffer fieldBuffer = ByteBuffer.allocate(len);
for (int i = 0; i < len; i++) {
byte b = buf[i];
inField = true;
if (inQuotedField) {
if (b == DOUBLE_QUOTE) {
// Each quote flips the parity; the second quote of a pair is emitted as a literal quote.
evenQuotesSeen = !evenQuotesSeen;
if (evenQuotesSeen) {
fieldBuffer.put(DOUBLE_QUOTE);
}
} else
if (!evenQuotesSeen &&
(b == FIELD_DEL || b == RECORD_DEL)) {
// A delimiter right after an unpaired quote closes the quoted field.
inQuotedField = false;
inField = false;
readField(fieldBuffer, fieldID++);
} else {
fieldBuffer.put(b);
}
} else if (b == DOUBLE_QUOTE) {
// Opening quote: switch to quoted mode; the quote itself is not stored.
inQuotedField = true;
evenQuotesSeen = true;
} else if (b == FIELD_DEL) {
inField = false;
readField(fieldBuffer, fieldID++); // end of the field
} else {
evenQuotesSeen = true;
fieldBuffer.put(b);
}
}
// Flush whatever is pending when the record does not end on a delimiter.
if (inField) readField(fieldBuffer, fieldID++);
} catch (InterruptedException e) {
int errCode = 6018;
String errMsg = "Error while reading input";
throw new ExecException(errMsg, errCode,
PigException.REMOTE_ENVIRONMENT, e);
}
// NOTE(review): presumably readField appends each field to mProtoTuple — confirm.
Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
return t;
}
/**
 * Merges all dictionaries received for one column and emits the serialized
 * {@code DictionaryInfo} of the result.
 *
 * <p>Each value is deserialized into a dictionary. Several dictionaries are combined through
 * a {@code MultipleDictionaryValueEnumerator}; a single one is used as is. Columns missing
 * from {@code colNeedDictMap} are skipped with a warning.
 *
 * @param key column name
 * @param values serialized dictionaries for the column
 * @param context receives the (column, serialized dictionary info) pair
 * @throws IllegalArgumentException if no dictionary was received or the merge yields
 *         {@code null}
 * @throws IOException if serialization or the write fails
 * @throws InterruptedException if the write is interrupted
 */
@Override
protected void doReduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    String col = key.toString();
    logger.info("merge dictionary for column:{}", col);
    TblColRef tblColRef = colNeedDictMap.get(col);

    if (tblColRef == null) {
        logger.warn("column:{} not found in the columns need dictionary map: {}", col, colNeedDictMap.keySet());
        return;
    }

    DataType dataType = tblColRef.getType();
    List<Dictionary<String>> dicts = Lists.newLinkedList();
    for (Text value : values) {
        // Text.getBytes() returns the backing array, which may still hold bytes of a longer
        // earlier value when the Text instance is reused; expose only the valid prefix.
        ByteArray byteArray = new ByteArray(value.getBytes(), 0, value.getLength());
        Dictionary<String> dict = (Dictionary<String>) DictionarySerializer.deserialize(byteArray);
        dicts.add(dict);
    }

    Dictionary mergedDict;
    if (dicts.size() > 1) {
        MultipleDictionaryValueEnumerator multipleDictionaryValueEnumerator = new MultipleDictionaryValueEnumerator(
            dataType, dicts);
        mergedDict = DictionaryGenerator.buildDictionary(dataType, multipleDictionaryValueEnumerator);
    } else if (dicts.size() == 1) {
        mergedDict = dicts.get(0);
    } else {
        throw new IllegalArgumentException("Dictionary missing for column " + col);
    }
    if (mergedDict == null) {
        throw new IllegalArgumentException("Merge dictionaries error for column " + col);
    }

    // Describe where the dictionary came from; the signature size is intentionally left unset.
    TableDesc tableDesc = tblColRef.getColumnDesc().getTable();
    IReadableTable.TableSignature signature = new IReadableTable.TableSignature();
    signature.setLastModifiedTime(System.currentTimeMillis());
    signature.setPath(tableDesc.getResourcePath());
    //TODO: Table signature size?
    // signature.setSize(mergedDict.getSize());

    DictionaryInfo dictionaryInfo = new DictionaryInfo(tblColRef.getTable(), tblColRef.getName(), tblColRef
        .getColumnDesc().getZeroBasedIndex(), tblColRef.getDatatype(), signature);
    dictionaryInfo.setDictionaryObject(mergedDict);
    dictionaryInfo.setDictionaryClass(mergedDict.getClass().getName());
    dictionaryInfo.setCardinality(mergedDict.getSize());

    ByteArrayOutputStream fulBuf = new ByteArrayOutputStream();
    DataOutputStream fulDout = new DataOutputStream(fulBuf);
    DictionaryInfoSerializer.FULL_SERIALIZER.serialize(dictionaryInfo, fulDout);

    Text outValue = new Text(fulBuf.toByteArray());
    context.write(key, outValue);
    logger.debug("output dict info of column {} to path: {}", col,
        context.getConfiguration().get(FileOutputFormat.OUTDIR));
}
/**
 * Reads the next record from {@code in} and splits it on {@code fieldDel}.
 *
 * <p>When source tagging is enabled, the source file name ({@code tagFile}) or full path
 * ({@code tagPath}) is added as the first tuple element. A field is added only if no
 * required-column mask is set or the mask marks its index as required. The schema is applied
 * to the tuple unless {@code dontLoadSchema} is set.
 *
 * @return the next tuple, or {@code null} when {@code in} has no more records
 * @throws IOException as an {@code ExecException} (code 6018) when reading is interrupted
 */
@Override
public Tuple getNext() throws IOException {
    mProtoTuple = new ArrayList<Object>();

    // One-time load of the required-column mask from the UDF properties.
    if (!mRequiredColumnsInitialized) {
        if (signature != null) {
            Properties udfProps = UDFContext.getUDFContext().getUDFProperties(this.getClass());
            mRequiredColumns = (boolean[]) ObjectSerializer.deserialize(udfProps.getProperty(signature));
        }
        mRequiredColumnsInitialized = true;
    }

    // Prepend input source path if source tagging is enabled
    if (tagFile) {
        mProtoTuple.add(new DataByteArray(sourcePath.getName()));
    } else if (tagPath) {
        mProtoTuple.add(new DataByteArray(sourcePath.toString()));
    }

    try {
        if (!in.nextKeyValue()) {
            return null;
        }

        final Text line = (Text) in.getCurrentValue();
        final byte[] raw = line.getBytes();
        final int end = line.getLength();   // only the first `end` bytes are valid
        int fieldStart = 0;
        int column = 0;

        for (int pos = 0; pos < end; pos++) {
            if (raw[pos] != fieldDel) {
                continue;
            }
            final boolean wanted = mRequiredColumns == null
                || (column < mRequiredColumns.length && mRequiredColumns[column]);
            if (wanted) {
                addTupleValue(mProtoTuple, raw, fieldStart, pos);
            }
            fieldStart = pos + 1;
            column++;
        }

        // pick up the last field
        final boolean lastWanted = mRequiredColumns == null
            || (column < mRequiredColumns.length && mRequiredColumns[column]);
        if (fieldStart <= end && lastWanted) {
            addTupleValue(mProtoTuple, raw, fieldStart, end);
        }

        final Tuple tuple = mTupleFactory.newTupleNoCopy(mProtoTuple);
        if (dontLoadSchema) {
            return tuple;
        }
        return applySchema(tuple);
    } catch (InterruptedException e) {
        final int errCode = 6018;
        final String errMsg = "Error while reading input";
        throw new ExecException(errMsg, errCode,
            PigException.REMOTE_ENVIRONMENT, e);
    }
}