The following lists example usages and invocation patterns for the org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitIndex API class; the original source files can also be viewed on GitHub.
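Before the individual examples, here is a minimal, hypothetical sketch of how TaskSplitIndex is typically driven. The class simply pairs the location of a serialized split file with the byte offset at which one task's split begins; the helpers shown below rebuild the actual InputSplit from that pair. The file path, the offset, and the null counter are placeholders, and the MRInputUtils import assumes the Tez helper class whose source appears in the first example.

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitIndex;
// Assumed location of the Tez helper shown in the first example below.
import org.apache.tez.mapreduce.lib.MRInputUtils;

public class TaskSplitIndexSketch {
  public static void main(String[] args) throws Exception {
    // A TaskSplitIndex is just (split file location, start offset of this
    // task's serialized split); both values here are hypothetical.
    TaskSplitIndex splitIndex = new TaskSplitIndex("file:/tmp/job.split", 0L);
    JobConf jobConf = new JobConf();
    // Rebuild the new-API split exactly as the first example does. Passing
    // null for the counter is tolerated: the helper only increments it when
    // it is non-null.
    org.apache.hadoop.mapreduce.InputSplit split =
        MRInputUtils.getNewSplitDetailsFromDisk(splitIndex, jobConf, null);
    System.out.println("Rebuilt split: " + split);
  }
}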
@SuppressWarnings("unchecked")
public static org.apache.hadoop.mapreduce.InputSplit getNewSplitDetailsFromDisk(
    TaskSplitIndex splitMetaInfo, JobConf jobConf, TezCounter splitBytesCounter)
    throws IOException {
  Path file = new Path(splitMetaInfo.getSplitLocation());
  long offset = splitMetaInfo.getStartOffset();
  // Split information read from local filesystem.
  FileSystem fs = FileSystem.getLocal(jobConf);
  file = fs.makeQualified(file);
  LOG.info("Reading input split file from : " + file);
  FSDataInputStream inFile = fs.open(file);
  inFile.seek(offset);
  String className = Text.readString(inFile);
  Class<org.apache.hadoop.mapreduce.InputSplit> cls;
  try {
    cls = (Class<org.apache.hadoop.mapreduce.InputSplit>) jobConf.getClassByName(className);
  } catch (ClassNotFoundException ce) {
    IOException wrap = new IOException("Split class " + className + " not found");
    wrap.initCause(ce);
    throw wrap;
  }
  SerializationFactory factory = new SerializationFactory(jobConf);
  Deserializer<org.apache.hadoop.mapreduce.InputSplit> deserializer =
      (Deserializer<org.apache.hadoop.mapreduce.InputSplit>) factory.getDeserializer(cls);
  deserializer.open(inFile);
  org.apache.hadoop.mapreduce.InputSplit split = deserializer.deserialize(null);
  long pos = inFile.getPos();
  if (splitBytesCounter != null) {
    splitBytesCounter.increment(pos - offset);
  }
  inFile.close();
  return split;
}
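The method above implies a simple on-disk layout: at the recorded offset, the split file holds the split class name in Hadoop's Text string encoding, immediately followed by the split object serialized through the job's SerializationFactory, and the optional counter is incremented by exactly the bytes consumed. The sketch below mirrors that layout from the write side; it is an illustrative stand-in, not Hadoop's actual split-writing code (which lives in JobSplitWriter).

import java.io.IOException;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.io.serializer.Serializer;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.InputSplit;

public class SplitFileLayoutSketch {
  // Writes one split entry in the layout the reader above expects and
  // returns the offset at which the entry starts, i.e. the value a
  // TaskSplitIndex would record as its start offset. The caller owns the
  // stream; closing the serializer would close it too, so we do not.
  @SuppressWarnings("unchecked")
  static long writeOneSplit(FSDataOutputStream out, InputSplit split, JobConf conf)
      throws IOException {
    long offset = out.getPos();
    // Class name first, in the same encoding Text.readString() expects...
    Text.writeString(out, split.getClass().getName());
    // ...then the split body via the framework's serialization plumbing.
    SerializationFactory factory = new SerializationFactory(conf);
    Serializer<InputSplit> serializer =
        (Serializer<InputSplit>) factory.getSerializer((Class<InputSplit>) split.getClass());
    serializer.open(out);
    serializer.serialize(split);
    return offset;
  }
}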
@SuppressWarnings("unchecked")
public static InputSplit getOldSplitDetailsFromDisk(TaskSplitIndex splitMetaInfo,
    JobConf jobConf, TezCounter splitBytesCounter) throws IOException {
  Path file = new Path(splitMetaInfo.getSplitLocation());
  FileSystem fs = FileSystem.getLocal(jobConf);
  file = fs.makeQualified(file);
  LOG.info("Reading input split file from : " + file);
  long offset = splitMetaInfo.getStartOffset();
  FSDataInputStream inFile = fs.open(file);
  inFile.seek(offset);
  String className = Text.readString(inFile);
  Class<org.apache.hadoop.mapred.InputSplit> cls;
  try {
    cls = (Class<org.apache.hadoop.mapred.InputSplit>) jobConf.getClassByName(className);
  } catch (ClassNotFoundException ce) {
    IOException wrap = new IOException("Split class " + className + " not found");
    wrap.initCause(ce);
    throw wrap;
  }
  SerializationFactory factory = new SerializationFactory(jobConf);
  Deserializer<org.apache.hadoop.mapred.InputSplit> deserializer =
      (Deserializer<org.apache.hadoop.mapred.InputSplit>) factory.getDeserializer(cls);
  deserializer.open(inFile);
  org.apache.hadoop.mapred.InputSplit split = deserializer.deserialize(null);
  long pos = inFile.getPos();
  if (splitBytesCounter != null) {
    splitBytesCounter.increment(pos - offset);
  }
  inFile.close();
  return split;
}
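getOldSplitDetailsFromDisk is line-for-line the same logic as the new-API variant above; the only difference is that it deserializes to the mapred-era org.apache.hadoop.mapred.InputSplit interface rather than the mapreduce one, so a single split file format serves both APIs.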
@Private
void initializeInternal() throws IOException {
  // Primarily for visibility
  rrLock.lock();
  try {
    if (splitInfoViaEvents) {
      if (useNewApi) {
        mrReader = new MRReaderMapReduce(jobConf, getContext().getCounters(), inputRecordCounter,
            getContext().getApplicationId().getClusterTimestamp(),
            getContext().getTaskVertexIndex(), getContext().getApplicationId().getId(),
            getContext().getTaskIndex(), getContext().getTaskAttemptNumber());
      } else {
        mrReader = new MRReaderMapred(jobConf, getContext().getCounters(), inputRecordCounter);
      }
    } else {
      TaskSplitMetaInfo[] allMetaInfo = MRInputUtils.readSplits(jobConf);
      TaskSplitMetaInfo thisTaskMetaInfo = allMetaInfo[getContext().getTaskIndex()];
      TaskSplitIndex splitMetaInfo = new TaskSplitIndex(thisTaskMetaInfo.getSplitLocation(),
          thisTaskMetaInfo.getStartOffset());
      if (useNewApi) {
        org.apache.hadoop.mapreduce.InputSplit newInputSplit = MRInputUtils
            .getNewSplitDetailsFromDisk(splitMetaInfo, jobConf,
                getContext().getCounters().findCounter(TaskCounter.SPLIT_RAW_BYTES));
        mrReader = new MRReaderMapReduce(jobConf, newInputSplit, getContext().getCounters(),
            inputRecordCounter, getContext().getApplicationId().getClusterTimestamp(),
            getContext().getTaskVertexIndex(), getContext().getApplicationId().getId(),
            getContext().getTaskIndex(), getContext().getTaskAttemptNumber());
      } else {
        org.apache.hadoop.mapred.InputSplit oldInputSplit = MRInputUtils
            .getOldSplitDetailsFromDisk(splitMetaInfo, jobConf,
                getContext().getCounters().findCounter(TaskCounter.SPLIT_RAW_BYTES));
        mrReader = new MRReaderMapred(jobConf, oldInputSplit, getContext().getCounters(),
            inputRecordCounter);
      }
    }
  } finally {
    rrLock.unlock();
  }
  LOG.info("Initialized MRInput: " + getContext().getSourceVertexName());
}
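This initializeInternal variant (apparently an older revision of Tez's MRInput) shows the typical producer of a TaskSplitIndex: when split info does not arrive via events, it looks up the task's TaskSplitMetaInfo, wraps its location and offset in a TaskSplitIndex, and hands that to whichever of the two disk readers above matches useNewApi.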
public TestMapTask(String jobFile, TaskAttemptID taskId,
    int partition, TaskSplitIndex splitIndex,
    int numSlotsRequired) {
  super(jobFile, taskId, partition, splitIndex, numSlotsRequired);
}

public MapTask(String jobFile, TaskAttemptID taskId,
    int partition, TaskSplitIndex splitIndex,
    int numSlotsRequired) {
  super(jobFile, taskId, partition, numSlotsRequired);
  this.splitMetaInfo = splitIndex;
}
@SuppressWarnings("unchecked")
private <INKEY, INVALUE, OUTKEY, OUTVALUE>
void runOldMapper(final JobConf job,
                  final TaskSplitIndex splitIndex,
                  final TaskUmbilicalProtocol umbilical,
                  TaskReporter reporter
                  ) throws IOException, InterruptedException, ClassNotFoundException {
  InputSplit inputSplit = getSplitDetails(new Path(splitIndex.getSplitLocation()),
      splitIndex.getStartOffset());
  updateJobWithSplit(job, inputSplit);
  reporter.setInputSplit(inputSplit);
  RecordReader<INKEY, INVALUE> in = isSkipping() ?
      new SkippingRecordReader<INKEY, INVALUE>(umbilical, reporter, job) :
      new TrackedRecordReader<INKEY, INVALUE>(reporter, job);
  job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
  int numReduceTasks = conf.getNumReduceTasks();
  LOG.info("numReduceTasks: " + numReduceTasks);
  MapOutputCollector<OUTKEY, OUTVALUE> collector = null;
  if (numReduceTasks > 0) {
    collector = createSortingCollector(job, reporter);
  } else {
    collector = new DirectMapOutputCollector<OUTKEY, OUTVALUE>();
    MapOutputCollector.Context context =
        new MapOutputCollector.Context(this, job, reporter);
    collector.init(context);
  }
  MapRunnable<INKEY, INVALUE, OUTKEY, OUTVALUE> runner =
      ReflectionUtils.newInstance(job.getMapRunnerClass(), job);
  try {
    runner.run(in, new OldOutputCollector(collector, conf), reporter);
    mapPhase.complete();
    // start the sort phase only if there are reducers
    if (numReduceTasks > 0) {
      setPhase(TaskStatus.Phase.SORT);
    }
    statusUpdate(umbilical);
    collector.flush();
    in.close();
    in = null;
    collector.close();
    collector = null;
  } finally {
    closeQuietly(in);
    closeQuietly(collector);
  }
}
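runOldMapper, from Hadoop's MapTask, consumes the TaskSplitIndex directly: getSplitDetails(new Path(splitIndex.getSplitLocation()), splitIndex.getStartOffset()) performs the same seek-and-deserialize sequence as the Tez helpers, and the rebuilt split then drives the old-API record reader and map runner.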
@SuppressWarnings("unchecked")
private <INKEY, INVALUE, OUTKEY, OUTVALUE>
void runNewMapper(final JobConf job,
                  final TaskSplitIndex splitIndex,
                  final TaskUmbilicalProtocol umbilical,
                  TaskReporter reporter
                  ) throws IOException, ClassNotFoundException, InterruptedException {
  // make a task context so we can get the classes
  org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
      new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job, getTaskID(), reporter);
  // make a mapper
  org.apache.hadoop.mapreduce.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE> mapper =
      (org.apache.hadoop.mapreduce.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>)
          ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
  // make the input format
  org.apache.hadoop.mapreduce.InputFormat<INKEY, INVALUE> inputFormat =
      (org.apache.hadoop.mapreduce.InputFormat<INKEY, INVALUE>)
          ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job);
  // rebuild the input split
  org.apache.hadoop.mapreduce.InputSplit split = null;
  split = getSplitDetails(new Path(splitIndex.getSplitLocation()),
      splitIndex.getStartOffset());
  LOG.info("Processing split: " + split);
  org.apache.hadoop.mapreduce.RecordReader<INKEY, INVALUE> input =
      new NewTrackingRecordReader<INKEY, INVALUE>(split, inputFormat, reporter, taskContext);
  job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
  org.apache.hadoop.mapreduce.RecordWriter output = null;
  // get an output object
  if (job.getNumReduceTasks() == 0) {
    output = new NewDirectOutputCollector(taskContext, job, umbilical, reporter);
  } else {
    output = new NewOutputCollector(taskContext, job, umbilical, reporter);
  }
  org.apache.hadoop.mapreduce.MapContext<INKEY, INVALUE, OUTKEY, OUTVALUE> mapContext =
      new MapContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, getTaskID(),
          input, output, committer, reporter, split);
  org.apache.hadoop.mapreduce.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>.Context mapperContext =
      new WrappedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>().getMapContext(mapContext);
  try {
    input.initialize(split, mapperContext);
    mapper.run(mapperContext);
    mapPhase.complete();
    setPhase(TaskStatus.Phase.SORT);
    statusUpdate(umbilical);
    input.close();
    input = null;
    output.close(mapperContext);
    output = null;
  } finally {
    closeQuietly(input);
    closeQuietly(output, mapperContext);
  }
}
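runNewMapper mirrors the old-API path but rebuilds an org.apache.hadoop.mapreduce.InputSplit and wires it into the new-API mapper context; once again, the TaskSplitIndex contributes nothing beyond the split file path and the start offset.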
@Private
void initializeInternal() throws IOException {
  // Primarily for visibility
  rrLock.lock();
  try {
    if (splitInfoViaEvents) {
      if (useNewApi) {
        mrReader = new MRReaderMapReduce(jobConf, getContext().getCounters(), inputRecordCounter,
            getContext().getApplicationId().getClusterTimestamp(),
            getContext().getTaskVertexIndex(), getContext().getApplicationId().getId(),
            getContext().getTaskIndex(), getContext().getTaskAttemptNumber(), getContext());
      } else {
        mrReader = new MRReaderMapred(jobConf, getContext().getCounters(), inputRecordCounter,
            getContext());
      }
    } else {
      TaskSplitMetaInfo thisTaskMetaInfo = MRInputUtils.getSplits(jobConf,
          getContext().getTaskIndex());
      TaskSplitIndex splitMetaInfo = new TaskSplitIndex(thisTaskMetaInfo.getSplitLocation(),
          thisTaskMetaInfo.getStartOffset());
      long splitLength = -1;
      if (useNewApi) {
        org.apache.hadoop.mapreduce.InputSplit newInputSplit = MRInputUtils
            .getNewSplitDetailsFromDisk(splitMetaInfo, jobConf,
                getContext().getCounters().findCounter(TaskCounter.SPLIT_RAW_BYTES));
        try {
          splitLength = newInputSplit.getLength();
        } catch (InterruptedException e) {
          LOG.warn("Got interrupted while reading split length: ", e);
        }
        mrReader = new MRReaderMapReduce(jobConf, newInputSplit, getContext().getCounters(),
            inputRecordCounter, getContext().getApplicationId().getClusterTimestamp(),
            getContext().getTaskVertexIndex(), getContext().getApplicationId().getId(),
            getContext().getTaskIndex(), getContext().getTaskAttemptNumber(), getContext());
      } else {
        org.apache.hadoop.mapred.InputSplit oldInputSplit = MRInputUtils
            .getOldSplitDetailsFromDisk(splitMetaInfo, jobConf,
                getContext().getCounters().findCounter(TaskCounter.SPLIT_RAW_BYTES));
        splitLength = oldInputSplit.getLength();
        mrReader = new MRReaderMapred(jobConf, oldInputSplit, getContext().getCounters(),
            inputRecordCounter, getContext());
      }
      if (splitLength != -1) {
        getContext().getCounters().findCounter(TaskCounter.INPUT_SPLIT_LENGTH_BYTES)
            .increment(splitLength);
      }
    }
  } finally {
    rrLock.unlock();
  }
  LOG.info("Initialized MRInput: " + getContext().getSourceVertexName());
}
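Compared with the earlier initializeInternal above, this later revision fetches a single TaskSplitMetaInfo through MRInputUtils.getSplits, passes the input context through to the MRReader constructors, and additionally records the rebuilt split's length in the INPUT_SPLIT_LENGTH_BYTES counter.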