Listed below are example usages of org.apache.hadoop.mapreduce.InputFormat#getSplits(), drawn from several open-source projects; you can follow the accompanying links to view each example's full source code on GitHub.
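InputFormat#getSplits(JobContext) logically partitions a job's input into a list of InputSplit objects; the framework schedules one map task per split, and each split reports its size in bytes via getLength(). The short sketch below is not taken from any of the quoted projects; the class name and the command-line input path are illustrative assumptions. It shows the basic pattern that all of the examples share: configure a Job, instantiate an InputFormat, and iterate over the splits it returns.
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class SplitInspector {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        // Illustrative input path: any file or directory visible to the configured FileSystem.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        TextInputFormat format = new TextInputFormat();
        // One logical split is returned per prospective map task.
        List<InputSplit> splits = format.getSplits(job);
        for (InputSplit split : splits) {
            System.out.println(split + " length=" + split.getLength() + " bytes");
        }
    }
}
The first example, apparently from Apache Kylin's Hadoop job utilities, uses the same pattern to estimate the total map input size in megabytes (the zero-byte case is tracked as KYLIN-2470):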
public static double getTotalMapInputMB(Job job)
throws ClassNotFoundException, IOException, InterruptedException, JobException {
if (job == null) {
throw new JobException("Job is null");
}
long mapInputBytes = 0;
InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
for (InputSplit split : input.getSplits(job)) {
mapInputBytes += split.getLength();
}
// 0 input bytes is possible when the segment range hits no partition on a partitioned hive table (KYLIN-2470)
if (mapInputBytes == 0) {
logger.warn("Map input splits are 0 bytes, something is wrong?");
}
double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
return totalMapInputMB;
}
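The next example looks like the HadoopElementIterator constructor from Apache TinkerPop's hadoop-gremlin module: it builds a Hadoop Configuration from the graph configuration, resolves the input location (returning early when the graph is empty), and then opens one RecordReader per split returned by getSplits.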
public HadoopElementIterator(final HadoopGraph graph) {
try {
this.graph = graph;
final Configuration configuration = ConfUtil.makeHadoopConfiguration(this.graph.configuration());
final InputFormat<NullWritable, VertexWritable> inputFormat = ConfUtil.getReaderAsInputFormat(configuration);
if (inputFormat instanceof FileInputFormat) {
final Storage storage = FileSystemStorage.open(configuration);
if (!this.graph.configuration().containsKey(Constants.GREMLIN_HADOOP_INPUT_LOCATION))
return; // there is no input location and thus, no data (empty graph)
if (!Constants.getSearchGraphLocation(this.graph.configuration().getInputLocation(), storage).isPresent())
return; // there is no data at the input location (empty graph)
configuration.set(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR, Constants.getSearchGraphLocation(this.graph.configuration().getInputLocation(), storage).get());
}
final List<InputSplit> splits = inputFormat.getSplits(new JobContextImpl(configuration, new JobID(UUID.randomUUID().toString(), 1)));
for (final InputSplit split : splits) {
this.readers.add(inputFormat.createRecordReader(split, new TaskAttemptContextImpl(configuration, new TaskAttemptID())));
}
} catch (final Exception e) {
throw new IllegalStateException(e.getMessage(), e);
}
}
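The following test helper, apparently from Apache Rya's Pig integration, creates a fresh StatementPatternStorage for each input split and prepares it to read from its own RecordReader.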
protected List<StatementPatternStorage> createStorages(final String location) throws IOException, InterruptedException {
final List<StatementPatternStorage> storages = new ArrayList<StatementPatternStorage>();
StatementPatternStorage storage = new StatementPatternStorage();
final InputFormat<?, ?> inputFormat = storage.getInputFormat();
Job job = Job.getInstance(new Configuration());
storage.setLocation(location, job);
final List<InputSplit> splits = inputFormat.getSplits(job);
assertNotNull(splits);
for (final InputSplit inputSplit : splits) {
storage = new StatementPatternStorage();
job = Job.getInstance(new Configuration());
storage.setLocation(location, job);
final TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(job.getConfiguration(),
new TaskAttemptID("jtid", 0, TaskType.REDUCE, 0, 0));
final RecordReader<?, ?> recordReader = inputFormat.createRecordReader(inputSplit,
taskAttemptContext);
recordReader.initialize(inputSplit, taskAttemptContext);
storage.prepareToRead(recordReader, null);
storages.add(storage);
}
return storages;
}
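A variant of the Kylin-style helper at the top of the page: the split lengths are summed the same way, but a zero-byte input is treated as an error rather than just a warning.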
protected double getTotalMapInputMB() throws ClassNotFoundException, IOException, InterruptedException, JobException {
if (job == null) {
throw new JobException("Job is null");
}
long mapInputBytes = 0;
InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
for (InputSplit split : input.getSplits(job)) {
mapInputBytes += split.getLength();
}
if (mapInputBytes == 0) {
throw new IllegalArgumentException("Map input splits are 0 bytes, something is wrong!");
}
double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
return totalMapInputMB;
}
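The next snippet is in the style of Hadoop's TestMRSequenceFileInputFilter: it splits the input with SequenceFileInputFilter, reads every split through a dummy map task context, and counts how many records the filter accepts.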
private int countRecords(int numSplits)
throws IOException, InterruptedException {
InputFormat<Text, BytesWritable> format =
new SequenceFileInputFilter<Text, BytesWritable>();
if (numSplits == 0) {
numSplits =
random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
}
FileInputFormat.setMaxInputSplitSize(job,
fs.getFileStatus(inFile).getLen() / numSplits);
TaskAttemptContext context = MapReduceTestUtil.
createDummyMapTaskAttemptContext(job.getConfiguration());
// check each split
int count = 0;
for (InputSplit split : format.getSplits(job)) {
RecordReader<Text, BytesWritable> reader =
format.createRecordReader(split, context);
MapContext<Text, BytesWritable, Text, BytesWritable> mcontext =
new MapContextImpl<Text, BytesWritable, Text, BytesWritable>(
job.getConfiguration(),
context.getTaskAttemptID(), reader, null, null,
MapReduceTestUtil.createDummyReporter(), split);
reader.initialize(split, mcontext);
try {
while (reader.nextKeyValue()) {
LOG.info("Accept record " + reader.getCurrentKey().toString());
count++;
}
} finally {
reader.close();
}
}
return count;
}
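The two getSample implementations below match Hadoop's InputSampler.SplitSampler and InputSampler.IntervalSampler. The first takes a fixed number of records from the front of each sampled split; the second keeps a record whenever the ratio of kept records to records seen falls below the configured frequency.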
/**
* From each split sampled, take the first numSamples / numSplits records.
*/
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job)
throws IOException, InterruptedException {
List<InputSplit> splits = inf.getSplits(job);
ArrayList<K> samples = new ArrayList<K>(numSamples);
int splitsToSample = Math.min(maxSplitsSampled, splits.size());
int samplesPerSplit = numSamples / splitsToSample;
long records = 0;
for (int i = 0; i < splitsToSample; ++i) {
TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
job.getConfiguration(), new TaskAttemptID());
RecordReader<K,V> reader = inf.createRecordReader(
splits.get(i), samplingContext);
reader.initialize(splits.get(i), samplingContext);
while (reader.nextKeyValue()) {
samples.add(ReflectionUtils.copy(job.getConfiguration(),
reader.getCurrentKey(), null));
++records;
if ((i+1) * samplesPerSplit <= records) {
break;
}
}
reader.close();
}
return (K[])samples.toArray();
}
/**
* For each split sampled, emit when the ratio of the number of records
* retained to the total record count is less than the specified
* frequency.
*/
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job)
throws IOException, InterruptedException {
List<InputSplit> splits = inf.getSplits(job);
ArrayList<K> samples = new ArrayList<K>();
int splitsToSample = Math.min(maxSplitsSampled, splits.size());
long records = 0;
long kept = 0;
for (int i = 0; i < splitsToSample; ++i) {
TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
job.getConfiguration(), new TaskAttemptID());
RecordReader<K,V> reader = inf.createRecordReader(
splits.get(i), samplingContext);
reader.initialize(splits.get(i), samplingContext);
while (reader.nextKeyValue()) {
++records;
if ((double) kept / records < freq) {
samples.add(ReflectionUtils.copy(job.getConfiguration(),
reader.getCurrentKey(), null));
++kept;
}
}
reader.close();
}
return (K[])samples.toArray();
}
/**
* IndexableLoadFunc interface implementation
*/
@Override
public void initialize(Configuration conf) throws IOException {
try {
InputFormat inputFormat = this.getInputFormat();
TaskAttemptID id = HadoopShims.getNewTaskAttemptID();
if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
conf.set(MRConfiguration.JOB_CREDENTIALS_BINARY, System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
}
List<FileSplit> fileSplits = inputFormat.getSplits(HadoopShims.createJobContext(conf, null));
this.readers = new IndexedStorageRecordReader[fileSplits.size()];
int idx = 0;
Iterator<FileSplit> it = fileSplits.iterator();
while (it.hasNext()) {
FileSplit fileSplit = it.next();
TaskAttemptContext context = HadoopShims.createTaskAttemptContext(conf, id);
IndexedStorageRecordReader r = (IndexedStorageRecordReader) inputFormat.createRecordReader(fileSplit, context);
r.initialize(fileSplit, context);
this.readers[idx] = r;
idx++;
}
Arrays.sort(this.readers, this.readerComparator);
} catch (InterruptedException e) {
throw new IOException(e);
}
}
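Two Hadoop MapReduce format tests follow. The first writes random int/double records through SequenceFileAsBinaryOutputFormat and then re-reads them with SequenceFileInputFormat, replaying the same random seed to verify that every record survives the round trip.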
public void testBinary() throws IOException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
Path outdir = new Path(System.getProperty("test.build.data", "/tmp"),
"outseq");
Random r = new Random();
long seed = r.nextLong();
r.setSeed(seed);
FileOutputFormat.setOutputPath(job, outdir);
SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job,
IntWritable.class );
SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job,
DoubleWritable.class );
SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job,
CompressionType.BLOCK);
BytesWritable bkey = new BytesWritable();
BytesWritable bval = new BytesWritable();
TaskAttemptContext context =
MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
OutputFormat<BytesWritable, BytesWritable> outputFormat =
new SequenceFileAsBinaryOutputFormat();
OutputCommitter committer = outputFormat.getOutputCommitter(context);
committer.setupJob(job);
RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.
getRecordWriter(context);
IntWritable iwritable = new IntWritable();
DoubleWritable dwritable = new DoubleWritable();
DataOutputBuffer outbuf = new DataOutputBuffer();
LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
try {
for (int i = 0; i < RECORDS; ++i) {
iwritable = new IntWritable(r.nextInt());
iwritable.write(outbuf);
bkey.set(outbuf.getData(), 0, outbuf.getLength());
outbuf.reset();
dwritable = new DoubleWritable(r.nextDouble());
dwritable.write(outbuf);
bval.set(outbuf.getData(), 0, outbuf.getLength());
outbuf.reset();
writer.write(bkey, bval);
}
} finally {
writer.close(context);
}
committer.commitTask(context);
committer.commitJob(job);
InputFormat<IntWritable, DoubleWritable> iformat =
new SequenceFileInputFormat<IntWritable, DoubleWritable>();
int count = 0;
r.setSeed(seed);
SequenceFileInputFormat.setInputPaths(job, outdir);
LOG.info("Reading data by SequenceFileInputFormat");
for (InputSplit split : iformat.getSplits(job)) {
RecordReader<IntWritable, DoubleWritable> reader =
iformat.createRecordReader(split, context);
MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable>
mcontext = new MapContextImpl<IntWritable, DoubleWritable,
BytesWritable, BytesWritable>(job.getConfiguration(),
context.getTaskAttemptID(), reader, null, null,
MapReduceTestUtil.createDummyReporter(),
split);
reader.initialize(split, mcontext);
try {
int sourceInt;
double sourceDouble;
while (reader.nextKeyValue()) {
sourceInt = r.nextInt();
sourceDouble = r.nextDouble();
iwritable = reader.getCurrentKey();
dwritable = reader.getCurrentValue();
assertEquals(
"Keys don't match: " + "*" + iwritable.get() + ":" +
sourceInt + "*",
sourceInt, iwritable.get());
assertTrue(
"Vals don't match: " + "*" + dwritable.get() + ":" +
sourceDouble + "*",
Double.compare(dwritable.get(), sourceDouble) == 0 );
++count;
}
} finally {
reader.close();
}
}
assertEquals("Some records not found", RECORDS, count);
}
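The second test goes the other way: it writes Text key/value pairs with a plain SequenceFile.Writer, reads them back as raw bytes through SequenceFileAsBinaryInputFormat, and deserializes those bytes to confirm they match the original text.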
public void testBinary() throws IOException, InterruptedException {
Job job = Job.getInstance();
FileSystem fs = FileSystem.getLocal(job.getConfiguration());
Path dir = new Path(System.getProperty("test.build.data",".") + "/mapred");
Path file = new Path(dir, "testbinary.seq");
Random r = new Random();
long seed = r.nextLong();
r.setSeed(seed);
fs.delete(dir, true);
FileInputFormat.setInputPaths(job, dir);
Text tkey = new Text();
Text tval = new Text();
SequenceFile.Writer writer = new SequenceFile.Writer(fs,
job.getConfiguration(), file, Text.class, Text.class);
try {
for (int i = 0; i < RECORDS; ++i) {
tkey.set(Integer.toString(r.nextInt(), 36));
tval.set(Long.toString(r.nextLong(), 36));
writer.append(tkey, tval);
}
} finally {
writer.close();
}
TaskAttemptContext context = MapReduceTestUtil.
createDummyMapTaskAttemptContext(job.getConfiguration());
InputFormat<BytesWritable,BytesWritable> bformat =
new SequenceFileAsBinaryInputFormat();
int count = 0;
r.setSeed(seed);
BytesWritable bkey = new BytesWritable();
BytesWritable bval = new BytesWritable();
Text cmpkey = new Text();
Text cmpval = new Text();
DataInputBuffer buf = new DataInputBuffer();
FileInputFormat.setInputPaths(job, file);
for (InputSplit split : bformat.getSplits(job)) {
RecordReader<BytesWritable, BytesWritable> reader =
bformat.createRecordReader(split, context);
MapContext<BytesWritable, BytesWritable, BytesWritable, BytesWritable>
mcontext = new MapContextImpl<BytesWritable, BytesWritable,
BytesWritable, BytesWritable>(job.getConfiguration(),
context.getTaskAttemptID(), reader, null, null,
MapReduceTestUtil.createDummyReporter(),
split);
reader.initialize(split, mcontext);
try {
while (reader.nextKeyValue()) {
bkey = reader.getCurrentKey();
bval = reader.getCurrentValue();
tkey.set(Integer.toString(r.nextInt(), 36));
tval.set(Long.toString(r.nextLong(), 36));
buf.reset(bkey.getBytes(), bkey.getLength());
cmpkey.readFields(buf);
buf.reset(bval.getBytes(), bval.getLength());
cmpval.readFields(buf);
assertTrue(
"Keys don't match: " + "*" + cmpkey.toString() + ":" +
tkey.toString() + "*",
cmpkey.toString().equals(tkey.toString()));
assertTrue(
"Vals don't match: " + "*" + cmpval.toString() + ":" +
tval.toString() + "*",
cmpval.toString().equals(tval.toString()));
++count;
}
} finally {
reader.close();
}
}
assertEquals("Some records not found", RECORDS, count);
}
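Next, a CombineSequenceFileInputFormat test: several small sequence files are written, getSplits is expected to combine them into a single CombineFileSplit, and a BitSet verifies that every key is read exactly once.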
@Test(timeout=10000)
public void testFormat() throws IOException, InterruptedException {
Job job = Job.getInstance(conf);
Random random = new Random();
long seed = random.nextLong();
random.setSeed(seed);
localFs.delete(workDir, true);
FileInputFormat.setInputPaths(job, workDir);
final int length = 10000;
final int numFiles = 10;
// create files with a variety of lengths
createFiles(length, numFiles, random, job);
TaskAttemptContext context = MapReduceTestUtil.
createDummyMapTaskAttemptContext(job.getConfiguration());
// create a combine split for the files
InputFormat<IntWritable,BytesWritable> format =
new CombineSequenceFileInputFormat<IntWritable,BytesWritable>();
for (int i = 0; i < 3; i++) {
int numSplits =
random.nextInt(length/(SequenceFile.SYNC_INTERVAL/20)) + 1;
LOG.info("splitting: requesting = " + numSplits);
List<InputSplit> splits = format.getSplits(job);
LOG.info("splitting: got = " + splits.size());
// we should have a single split as the length is comfortably smaller than
// the block size
assertEquals("We got more than one splits!", 1, splits.size());
InputSplit split = splits.get(0);
assertEquals("It should be CombineFileSplit",
CombineFileSplit.class, split.getClass());
// check the split
BitSet bits = new BitSet(length);
RecordReader<IntWritable,BytesWritable> reader =
format.createRecordReader(split, context);
MapContext<IntWritable,BytesWritable,IntWritable,BytesWritable> mcontext =
new MapContextImpl<IntWritable,BytesWritable,IntWritable,BytesWritable>(job.getConfiguration(),
context.getTaskAttemptID(), reader, null, null,
MapReduceTestUtil.createDummyReporter(), split);
reader.initialize(split, mcontext);
assertEquals("reader class is CombineFileRecordReader.",
CombineFileRecordReader.class, reader.getClass());
try {
while (reader.nextKeyValue()) {
IntWritable key = reader.getCurrentKey();
BytesWritable value = reader.getCurrentValue();
assertNotNull("Value should not be null.", value);
final int k = key.get();
LOG.debug("read " + k);
assertFalse("Key in multiple partitions.", bits.get(k));
bits.set(k);
}
} finally {
reader.close();
}
assertEquals("Some keys in no partition.", length, bits.cardinality());
}
}
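Finally, a reservoir-style sampler matching Hadoop's InputSampler.RandomSampler: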
/**
* Randomize the split order, then take the specified number of keys from
* each split sampled, where each key is selected with the specified
* probability and possibly replaced by a subsequently selected key when
* the quota of keys from that split is satisfied.
*/
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job)
throws IOException, InterruptedException {
List<InputSplit> splits = inf.getSplits(job);
ArrayList<K> samples = new ArrayList<K>(numSamples);
int splitsToSample = Math.min(maxSplitsSampled, splits.size());
Random r = new Random();
long seed = r.nextLong();
r.setSeed(seed);
LOG.debug("seed: " + seed);
// shuffle splits
for (int i = 0; i < splits.size(); ++i) {
InputSplit tmp = splits.get(i);
int j = r.nextInt(splits.size());
splits.set(i, splits.get(j));
splits.set(j, tmp);
}
// our target rate is in terms of the maximum number of sample splits,
// but we accept the possibility of sampling additional splits to hit
// the target sample keyset
for (int i = 0; i < splitsToSample ||
(i < splits.size() && samples.size() < numSamples); ++i) {
TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
job.getConfiguration(), new TaskAttemptID());
RecordReader<K,V> reader = inf.createRecordReader(
splits.get(i), samplingContext);
reader.initialize(splits.get(i), samplingContext);
while (reader.nextKeyValue()) {
if (r.nextDouble() <= freq) {
if (samples.size() < numSamples) {
samples.add(ReflectionUtils.copy(job.getConfiguration(),
reader.getCurrentKey(), null));
} else {
// When exceeding the maximum number of samples, replace a
// random element with this one, then adjust the frequency
// to reflect the possibility of existing elements being
// pushed out
int ind = r.nextInt(numSamples);
if (ind != numSamples) {
samples.set(ind, ReflectionUtils.copy(job.getConfiguration(),
reader.getCurrentKey(), null));
}
freq *= (numSamples - 1) / (double) numSamples;
}
}
}
reader.close();
}
return (K[])samples.toArray();
}
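If these are indeed the samplers from Hadoop's InputSampler, they are normally not invoked directly. Instead, a sampler instance is passed to InputSampler.writePartitionFile, which calls getSample internally and writes the sampled keys into the partition file consumed by TotalOrderPartitioner. A minimal sketch, assuming Text keys and with the sampling parameters and partition-file path chosen purely for illustration:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class SamplerUsage {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        // The job's input format and input paths are assumed to be configured elsewhere;
        // the sampler's key type must match the input format's key type (Text here).
        job.setPartitionerClass(TotalOrderPartitioner.class);
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
                new Path("/tmp/partitions.lst")); // illustrative path
        // freq = 0.1, numSamples = 10000, maxSplitsSampled = 10 are illustrative values.
        InputSampler.Sampler<Text, Text> sampler =
                new InputSampler.RandomSampler<Text, Text>(0.1, 10000, 10);
        InputSampler.writePartitionFile(job, sampler);
    }
}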