Listed below are example usages of org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; follow the links to view the full source code on GitHub.
public static void runSortJob(Configuration conf, Path input, Path outputPath)
throws Exception {
Job job = new Job(conf);
job.setJarByClass(Main.class);
job.setMapperClass(SortMapReduce.Map.class);
job.setReducerClass(SortMapReduce.Reduce.class);
job.setInputFormatClass(KeyValueTextInputFormat.class);
job.setMapOutputKeyClass(Person.class);
job.setMapOutputValueClass(Person.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setPartitionerClass(PersonNamePartitioner.class);
job.setSortComparatorClass(PersonComparator.class);
job.setGroupingComparatorClass(PersonNameComparator.class);
FileInputFormat.setInputPaths(job, input);
FileOutputFormat.setOutputPath(job, outputPath);
job.waitForCompletion(true);
}
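The job above wires in a custom partitioner and comparators that are not shown here. As a rough illustration (not the original source, and assuming Person exposes a getName() accessor), a partitioner that routes records by the person's name could look like this:
import org.apache.hadoop.mapreduce.Partitioner;

public class PersonNamePartitioner extends Partitioner<Person, Person> {
  @Override
  public int getPartition(Person key, Person value, int numPartitions) {
    // Partition on the name only, so all records for the same person reach the
    // same reducer regardless of the fields used by the secondary sort.
    return (key.getName().hashCode() & Integer.MAX_VALUE) % numPartitions;
  }
}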
/**
* Refer to MRCompactorAvroKeyDedupJobRunner#configureInputAndOutputPaths(Job).
* @return true if no valid input paths are present for the MR job to process, where a path is valid if it is
* a directory containing one or more files.
*
*/
protected boolean configureInputAndOutputPaths(Job job, FileSystemDataset dataset) throws IOException {
boolean emptyDirectoryFlag = false;
String mrOutputBase = this.state.getProp(MRCompactor.COMPACTION_JOB_DIR);
CompactionPathParser parser = new CompactionPathParser(this.state);
CompactionPathParser.CompactionParserResult rst = parser.parse(dataset);
this.mrOutputPath = concatPaths(mrOutputBase, rst.getDatasetName(), rst.getDstSubDir(), rst.getTimeString());
log.info("Cleaning temporary MR output directory: " + mrOutputPath);
this.fs.delete(mrOutputPath, true);
this.mapReduceInputPaths = getGranularInputPaths(dataset.datasetRoot());
if (this.mapReduceInputPaths.isEmpty()) {
this.mapReduceInputPaths.add(dataset.datasetRoot());
emptyDirectoryFlag = true;
}
this.oldFiles = new HashSet<>();
for (Path path : mapReduceInputPaths) {
oldFiles.add(this.fs.makeQualified(path).toString());
FileInputFormat.addInputPath(job, path);
}
FileOutputFormat.setOutputPath(job, mrOutputPath);
return emptyDirectoryFlag;
}
public void execute(String inputPath1, String inputPath2, String outputPath) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "bigdiff");
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(inputPath1));
FileInputFormat.addInputPath(job, new Path(inputPath2));
FileOutputFormat.setOutputPath(job, new Path(outputPath));
job.waitForCompletion(true);
}
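A minimal usage sketch for the method above; BigDiffJob is a placeholder name for whatever class declares execute(), and the paths are hypothetical:
public static void main(String[] args) throws Exception {
  // Compare two datasets and write the differences to a third directory.
  new BigDiffJob().execute("/data/run-a", "/data/run-b", "/data/diff-out");
}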
private static void checkOuterConsistency(Job job, Path[] src)
throws IOException {
Path outf = FileOutputFormat.getOutputPath(job);
FileStatus[] outlist = cluster.getFileSystem().listStatus(outf, new
Utils.OutputFileUtils.OutputFilesFilter());
assertEquals("number of part files is more than 1. It is" + outlist.length,
1, outlist.length);
assertTrue("output file with zero length" + outlist[0].getLen(),
0 < outlist[0].getLen());
SequenceFile.Reader r =
new SequenceFile.Reader(cluster.getFileSystem(),
outlist[0].getPath(), job.getConfiguration());
IntWritable k = new IntWritable();
IntWritable v = new IntWritable();
while (r.next(k, v)) {
assertEquals("counts does not match", v.get(),
countProduct(k, src, job.getConfiguration()));
}
r.close();
}
public static void writeAvro(DataSet<Tuple2<Void, Person>> data, String outputPath) throws IOException {
// Set up the Hadoop Input Format
Job job = Job.getInstance();
// Set up Hadoop Output Format
HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new AvroParquetOutputFormat(), job);
FileOutputFormat.setOutputPath(job, new Path(outputPath));
AvroParquetOutputFormat.setSchema(job, Person.getClassSchema());
ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
ParquetOutputFormat.setEnableDictionary(job, true);
// Output & Execute
data.output(hadoopOutputFormat);
}
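Note that with Flink's DataSet API, data.output(...) only registers the sink; nothing is written until the surrounding ExecutionEnvironment is executed. A hedged usage sketch, where readPeople is a hypothetical helper and the path is a placeholder:
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
// 'readPeople' stands in for whatever builds the DataSet<Tuple2<Void, Person>> earlier in the program.
DataSet<Tuple2<Void, Person>> people = readPeople(env);
writeAvro(people, "hdfs:///tmp/people-parquet");
// The Parquet files are only written when the Flink job actually runs.
env.execute("write Person records as Parquet/Avro");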
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: MapperInputSplitInfo <in> <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, MapperInputSplitInfo.class.getSimpleName());
job.setJarByClass(MapperInputSplitInfo.class);
job.setMapperClass(MyMapper.class);
job.setCombinerClass(MyReducer.class);
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
/**
* Gets a configured instance of the stored {@link FileOutputFormat} in the configuration.
*
* @param conf the configuration to reference the keys from.
* @return a configured instance of the stored {@link FileOutputFormat} in the configuration.
* @throws IOException if there's an issue getting an instance of a FileOutputFormat from the
* configuration.
*/
@SuppressWarnings("rawtypes")
public static FileOutputFormat getFileOutputFormat(Configuration conf) throws IOException {
// Ensure the BigQuery output information is valid.
getMandatoryConfig(conf, OUTPUT_FORMAT_CLASS);
Class<?> confClass = OUTPUT_FORMAT_CLASS.get(conf, conf::getClass);
// Fail if the default value was used, or the class isn't a FileOutputFormat.
if (confClass == null) {
throw new IOException(
"Unable to resolve value for the configuration key '"
+ OUTPUT_FORMAT_CLASS.getKey()
+ "'.");
} else if (!FileOutputFormat.class.isAssignableFrom(confClass)) {
throw new IOException("The class " + confClass.getName() + " is not a FileOutputFormat.");
}
Class<? extends FileOutputFormat> fileOutputClass =
confClass.asSubclass(FileOutputFormat.class);
// Create a new instance and configure it if it's configurable.
return ReflectionUtils.newInstance(fileOutputClass, conf);
}
protected void setOutput() throws IOException
{
JsonNode output = get(root, "output");
JsonNode params = output.get("params");
if (params == null)
params = mapper.createObjectNode();
Path outputPath = new Path(getText(output, "path"));
FileOutputFormat.setOutputPath(job, outputPath);
if (params.has("overwrite") && Boolean.parseBoolean(getText(params, "overwrite")))
{
fs.delete(outputPath, true);
}
BlockSchema schema = new BlockSchema(output.get("schema"));
Storage storage = StorageFactory.get(getText(output, "type"));
storage.prepareOutput(job, conf, params, schema, outputPath);
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
// System.out.println(otherArgs);
if(otherArgs.length != 2) {
System.out.println("Usage:wordcount <in> <out>");
System.exit(2);
}
// if(args.length != 2) {
// System.out.println("param error!");
// System.exit(-1);
// }
Job job = new Job(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
public Job call() throws IOException, InterruptedException,
ClassNotFoundException {
job.setMapperClass(GridmixMapper.class);
job.setReducerClass(GridmixReducer.class);
job.setNumReduceTasks(jobdesc.getNumberReduces());
job.setMapOutputKeyClass(GridmixKey.class);
job.setMapOutputValueClass(GridmixRecord.class);
job.setSortComparatorClass(GridmixKey.Comparator.class);
job.setGroupingComparatorClass(SpecGroupingComparator.class);
job.setInputFormatClass(GridmixInputFormat.class);
job.setOutputFormatClass(RawBytesOutputFormat.class);
job.setPartitionerClass(DraftPartitioner.class);
job.setJarByClass(GridmixJob.class);
job.getConfiguration().setInt("gridmix.job.seq", seq);
job.getConfiguration().set(ORIGNAME, null == jobdesc.getJobID()
? "<unknown>" : jobdesc.getJobID().toString());
job.getConfiguration().setBoolean("mapred.used.genericoptionsparser", true);
FileInputFormat.addInputPath(job, new Path("ignored"));
FileOutputFormat.setOutputPath(job, outdir);
job.submit();
return job;
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
Job job = new Job(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
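For reference, the TokenizerMapper and IntSumReducer referenced above are the mapper and reducer from the standard Hadoop WordCount tutorial; they look roughly like this:
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
  private final static IntWritable one = new IntWritable(1);
  private Text word = new Text();

  public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
    StringTokenizer itr = new StringTokenizer(value.toString());
    while (itr.hasMoreTokens()) {
      word.set(itr.nextToken());
      context.write(word, one);   // emit (word, 1) for every token in the line
    }
  }
}

public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
  private IntWritable result = new IntWritable();

  public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable val : values) {
      sum += val.get();           // sum the partial counts for this word
    }
    result.set(sum);
    context.write(key, result);
  }
}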
/**
* Creates a simple copy job.
*
* @param conf Configuration object
* @param outdir Output directory.
* @param indirs Comma separated input directories.
* @return Job initialized for a data copy job.
* @throws Exception If an error occurs creating job configuration.
*/
public static Job createCopyJob(Configuration conf, Path outdir,
Path... indirs) throws Exception {
conf.setInt(MRJobConfig.NUM_MAPS, 3);
Job theJob = Job.getInstance(conf);
theJob.setJobName("DataMoveJob");
FileInputFormat.setInputPaths(theJob, indirs);
theJob.setMapperClass(DataCopyMapper.class);
FileOutputFormat.setOutputPath(theJob, outdir);
theJob.setOutputKeyClass(Text.class);
theJob.setOutputValueClass(Text.class);
theJob.setReducerClass(DataCopyReducer.class);
theJob.setNumReduceTasks(1);
return theJob;
}
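A minimal usage sketch for the helper above (the paths are hypothetical):
Configuration conf = new Configuration();
Job copyJob = createCopyJob(conf, new Path("/tmp/copy-out"),
    new Path("/data/in1"), new Path("/data/in2"));
// Run the copy job and check whether it completed successfully.
boolean succeeded = copyJob.waitForCompletion(true);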
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
// A is an m-by-n matrix; B is an n-by-p matrix.
conf.set("m", args[0]);
conf.set("n", args[1]);
conf.set("p", args[2]);
Job job = new Job(conf, "Matrix_Multiplication");
job.setJarByClass(MatMulDriver.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(MatMulMap.class);
//Don't use combiner if there is no scope of combining the output. Otherwise the job will get stuck.
//job.setCombinerClass(MatMulModGenReduce.class);
job.setReducerClass(MatMulReduce.class);
//args[3] is the input path.
FileInputFormat.addInputPath(job, new Path(args[3]));
//args[4] is the output path.
FileOutputFormat.setOutputPath(job, new Path(args[4]));
System.exit(job.waitForCompletion(true)?0:1);
}
public Job createSubmittableJob(String[] args) throws IOException {
Path partitionsPath = new Path(destPath, PARTITIONS_FILE_NAME);
generatePartitions(partitionsPath);
Job job = Job.getInstance(getConf(),
getConf().get("mapreduce.job.name", "hashTable_" + tableHash.tableName));
Configuration jobConf = job.getConfiguration();
jobConf.setLong(HASH_BATCH_SIZE_CONF_KEY, tableHash.batchSize);
jobConf.setBoolean(IGNORE_TIMESTAMPS, tableHash.ignoreTimestamps);
job.setJarByClass(HashTable.class);
TableMapReduceUtil.initTableMapperJob(tableHash.tableName, tableHash.initScan(),
HashMapper.class, ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);
// use a TotalOrderPartitioner and reducers to group region output into hash files
job.setPartitionerClass(TotalOrderPartitioner.class);
TotalOrderPartitioner.setPartitionFile(jobConf, partitionsPath);
job.setReducerClass(Reducer.class); // identity reducer
job.setNumReduceTasks(tableHash.numHashFiles);
job.setOutputKeyClass(ImmutableBytesWritable.class);
job.setOutputValueClass(ImmutableBytesWritable.class);
job.setOutputFormatClass(MapFileOutputFormat.class);
FileOutputFormat.setOutputPath(job, new Path(destPath, HASH_DATA_DIR));
return job;
}
private Path doMapReduce(final String inputFile) throws Exception {
final FileSystem fileSystem = FileSystem.get(conf);
final Path inputPath = new Path(inputFile);
final Path outputPath = fileSystem.makeQualified(new Path("target/out"));
fileSystem.delete(outputPath, true);
final Job job = Job.getInstance(conf);
FileInputFormat.setInputPaths(job, inputPath);
job.setInputFormatClass(CRAMInputFormat.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(SAMRecordWritable.class);
conf.set(CRAMTestNoHeaderOutputFormat.READ_HEADER_FROM_FILE, inputFile);
job.setOutputFormatClass(CRAMTestNoHeaderOutputFormat.class);
job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(SAMRecordWritable.class);
job.setNumReduceTasks(0);
FileOutputFormat.setOutputPath(job, outputPath);
final boolean success = job.waitForCompletion(true);
assertTrue(success);
return outputPath;
}
protected Job doLoad(Configuration conf, TableDescriptor tableDescriptor) throws Exception {
Path outputDir = getTestDir(TEST_NAME, "load-output");
LOG.info("Load output dir: " + outputDir);
NMapInputFormat.setNumMapTasks(conf, conf.getInt(NUM_MAP_TASKS_KEY, NUM_MAP_TASKS_DEFAULT));
conf.set(TABLE_NAME_KEY, tableDescriptor.getTableName().getNameAsString());
Job job = Job.getInstance(conf);
job.setJobName(TEST_NAME + " Load for " + tableDescriptor.getTableName());
job.setJarByClass(this.getClass());
setMapperClass(job);
job.setInputFormatClass(NMapInputFormat.class);
job.setNumReduceTasks(0);
setJobScannerConf(job);
FileOutputFormat.setOutputPath(job, outputDir);
TableMapReduceUtil.addDependencyJars(job);
TableMapReduceUtil.addDependencyJarsForClasses(job.getConfiguration(), AbstractHBaseTool.class);
TableMapReduceUtil.initCredentials(job);
assertTrue(job.waitForCompletion(true));
return job;
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: ElemValueCooccurrencesTest configFile outputDir");
System.exit(2);
}
Job job = Job.getInstance(conf);
job.setJarByClass(ElemValueCooccurrencesTest.class);
job.setInputFormatClass(ValueInputFormat.class);
job.setMapperClass(ElemCooccurrencesMapper.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(Text.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
conf = job.getConfiguration();
conf.addResource(otherArgs[0]);
conf.setClass(MarkLogicConstants.INPUT_VALUE_CLASS, Text.class,
Writable.class);
conf.setClass(MarkLogicConstants.INPUT_LEXICON_FUNCTION_CLASS,
ElemValueCooccurrencesFunction.class, ElemValueCooccurrences.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
public static void initializeCenters (Configuration conf, FileSystem fs, String pointsPath, String seqFilePath) throws Exception {
Path points = new Path (pointsPath);
Path seqFile = new Path (seqFilePath);
if (fs.exists(seqFile)) {
fs.delete(seqFile, true);
}
Job job = Job.getInstance(conf);
job.setMapperClass(CenterInitializer.class);
job.setReducerClass(Reducer.class);
job.setNumReduceTasks(0);
job.setMapOutputKeyClass(Centroid.class);
job.setMapOutputValueClass(Point.class);
job.setOutputKeyClass(Centroid.class);
job.setOutputValueClass(Point.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setInputFormatClass(TextInputFormat.class);
FileInputFormat.addInputPath(job, new Path(pointsPath));
FileOutputFormat.setOutputPath(job, seqFile);
job.waitForCompletion(true);
}
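A hedged usage sketch for the initializer above (file names are placeholders); the resulting SequenceFile of (Centroid, Point) pairs would then seed the first k-means iteration:
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
// Turn the raw text points into the initial centers SequenceFile.
initializeCenters(conf, fs, "/kmeans/points.txt", "/kmeans/centers.seq");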
public static boolean run(Configuration config, Map<String, String> paths)
throws IOException, ClassNotFoundException, InterruptedException {
String jobName = "step6";
Job job = Job.getInstance(config, jobName);
job.setJarByClass(Step6.class);
job.setJar("export\\ItemCF.jar");
job.setMapperClass(Step6_Mapper.class);
job.setReducerClass(Step6_Reducer.class);
job.setMapOutputKeyClass(PairWritable.class);
job.setMapOutputValueClass(Text.class);
//job.setSortComparatorClass(ScoreSort.class); // custom sort comparator
job.setGroupingComparatorClass(UserGroup.class); // custom grouping comparator
Path inPath = new Path(paths.get("Step6Input"));
Path outpath = new Path(paths.get("Step6Output"));
FileInputFormat.addInputPath(job, inPath);
FileOutputFormat.setOutputPath(job, outpath);
FileSystem fs = FileSystem.get(config);
if (fs.exists(outpath)) {
fs.delete(outpath, true);
}
return job.waitForCompletion(true);
}
public static void main(String[] args) throws Exception {
String[] otherArgs = new GenericOptionsParser(args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: xflowdstipcount <in> <out>");
System.exit(2);
}
Job job = Job.getInstance();
job.setJobName("xflow dstip count");
job.setJarByClass(XflowDstIPCount.class);
job.setMapperClass(ParesDstIPMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf=new Configuration();
// The test input for which you want to find the activity that the person should be doing
conf.set("test_input", args[0]);
Job job = new Job(conf);
job.setJarByClass(NBCDriver.class);
job.setJobName("Naive_Bayes_calssifier using Hadoop");
FileInputFormat.setInputPaths(job, new Path(args[1]));
FileOutputFormat.setOutputPath(job, new Path(args[2]));
job.setMapperClass(NBCMap.class);
job.setReducerClass(NBCReduce.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
boolean success = job.waitForCompletion(true);
System.exit(success ? 0 : 1);
}
public static void calculateNextRanks (Configuration conf, FileSystem fs, String inputPath, String outputPath) throws Exception {
Path outFile = new Path (outputPath);
if (fs.exists(outFile)) {
fs.delete(outFile, true);
}
Job job = Job.getInstance(conf);
job.setJarByClass(PageRankMapper.class);
job.setMapperClass(PageRankMapper.class);
job.setReducerClass(PageRankReducer.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(Message.class);
job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(Message.class);
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(inputPath));
FileOutputFormat.setOutputPath(job, outFile);
job.waitForCompletion(true);
}
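Because each call reads the previous iteration's SequenceFile output and writes the next one, this method is typically driven from a loop. A hedged sketch (paths and iteration count are placeholders):
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
for (int i = 0; i < 10; i++) {
  // Iteration i reads /pagerank/iter-i and produces /pagerank/iter-(i+1).
  calculateNextRanks(conf, fs, "/pagerank/iter-" + i, "/pagerank/iter-" + (i + 1));
}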
/**
* In the main method, the Patent job pipeline is set up: Mapper -> Combiner -> Reducer -> Partitioner.
*
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
// Configure and initialize the Job
Job job = Job.getInstance(new Configuration());
// Specify the Job's main (jar) class
job.setJarByClass(PatentMainController.class);
// Specify the Job's Mapper
job.setMapperClass(PatentMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// Specify the Job's input path
FileInputFormat.setInputPaths(job, new Path(args[0]));
// Specify the Job's Combiner
job.setCombinerClass(InverseIndexByKeywordCombiner.class);
// Specify the Job's Reducer
job.setReducerClass(PatentReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// Specify the Job's output path
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setPartitionerClass(PatentPartitioner.class);
// Specify the maximum number of reduce tasks
job.setNumReduceTasks(ConfUtil.getMax());
// Submit the job and wait for it to complete
job.waitForCompletion(true);
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: secondarysrot <in> <out>");
System.exit(2);
}
Job job = new Job(conf, "secondary sort");
job.setJarByClass(SecondarySort.class);
job.setMapperClass(MapClass.class);
job.setReducerClass(Reduce.class);
// group and partition by the first int in the pair
job.setPartitionerClass(FirstPartitioner.class);
job.setGroupingComparatorClass(FirstGroupingComparator.class);
// the map output is IntPair, IntWritable
job.setMapOutputKeyClass(IntPair.class);
job.setMapOutputValueClass(IntWritable.class);
// the reduce output is Text, IntWritable
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
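FirstPartitioner and FirstGroupingComparator come from the standard Hadoop SecondarySort example. As a rough sketch (assuming IntPair exposes getFirst()), the partitioner dispatches on the first integer only, so the sort comparator can order by the second integer within each partition:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public static class FirstPartitioner extends Partitioner<IntPair, IntWritable> {
  @Override
  public int getPartition(IntPair key, IntWritable value, int numPartitions) {
    // Partition only on the first int of the pair; records with the same first
    // value land in the same reducer and are then sorted by the second value.
    return Math.abs(key.getFirst() * 127) % numPartitions;
  }
}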
@Test
public void testNewApis() throws Exception {
Random r = new Random(System.currentTimeMillis());
Path tmpBaseDir = new Path("/tmp/wc-" + r.nextInt());
final Path inDir = new Path(tmpBaseDir, "input");
final Path outDir = new Path(tmpBaseDir, "output");
String input = "The quick brown fox\nhas many silly\nred fox sox\n";
FileSystem inFs = inDir.getFileSystem(conf);
FileSystem outFs = outDir.getFileSystem(conf);
outFs.delete(outDir, true);
if (!inFs.mkdirs(inDir)) {
throw new IOException("Mkdirs failed to create " + inDir.toString());
}
{
DataOutputStream file = inFs.create(new Path(inDir, "part-0"));
file.writeBytes(input);
file.close();
}
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(TestLocalModeWithNewApis.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, inDir);
FileOutputFormat.setOutputPath(job, outDir);
assertEquals(job.waitForCompletion(true), true);
String output = readOutput(outDir, conf);
assertEquals("The\t1\nbrown\t1\nfox\t2\nhas\t1\nmany\t1\n" +
"quick\t1\nred\t1\nsilly\t1\nsox\t1\n", output);
outFs.delete(tmpBaseDir, true);
}
/**
* Job configurator
*
* @param job job instance
* @param jarByClass class of the jar
* @param mapperClass mapper
* @param reducerClass reducer
* @param commaSeparatedInputFiles input paths
* @param outputPath output
* @throws IOException I/O exception
*/
public static void configureJob(Job job, Class<?> jarByClass,
Class<? extends Mapper> mapperClass, Class<? extends Reducer> reducerClass,
String commaSeparatedInputFiles, String outputPath)
throws IOException
{
job.setJarByClass(jarByClass);
job.setJobName(jarByClass.getName());
// mapper
job.setMapperClass(mapperClass);
// reducer
job.setReducerClass(reducerClass);
// input-output is warc
job.setInputFormatClass(WARCInputFormat.class);
// prevent producing empty files
LazyOutputFormat.setOutputFormatClass(job, WARCOutputFormat.class);
// intermediate data
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(WARCWritable.class);
// output data
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(WARCWritable.class);
// set output compression to GZip
FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
FileOutputFormat.setOutputPath(job, new Path(outputPath));
}
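A hedged usage sketch for the configurator above (class names and paths are placeholders, not from the source):
Job job = Job.getInstance(new Configuration());
configureJob(job, MyWarcTool.class, MyWarcMapper.class, MyWarcReducer.class,
    "/warc/segment-1.warc.gz,/warc/segment-2.warc.gz", "/warc/processed");
// GZip-compressed WARC output appears under /warc/processed once the job finishes.
job.waitForCompletion(true);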
public int run(String[] args) throws Exception {
if (args.length != 5) {
usage();
return 2;
}
LOG.info("starting");
Path inputDir = new Path(args[0]);
Path outputDir = new Path(args[1]);
getConf().setStrings(INPUT_URI_STRING, args[2]);
getConf().setStrings(INPUT_SCOPE_NAME, args[3]);
getConf().setStrings(INPUT_STREAM_NAME, args[4]);
getConf().setStrings(INPUT_DESERIALIZER, TextSerializer.class.getName());
getConf().setInt(MRJobConfig.NUM_MAPS, 1);
Job job = Job.getInstance(getConf());
TeraInputFormat.setInputPaths(job, inputDir);
FileOutputFormat.setOutputPath(job, outputDir);
job.setJobName("TeraStreamValidate");
job.setJarByClass(TeraStreamValidate.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(TeraSortMapper.class);
job.setNumReduceTasks(1);
job.setInputFormatClass(PravegaInputFormat.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
int ret = job.waitForCompletion(true) ? 0 : 1;
LOG.info("done");
return ret;
}
/**
* Run a test which creates a SequenceMapper / IdentityReducer
* job over a set of generated number files.
*/
private void doMultiReducerTest(int numMaps, int numReduces,
int parallelMaps, int parallelReduces) throws Exception {
Path in = getNumberDirPath();
Path out = getOutputPath();
// Clear data from any previous tests.
Configuration conf = new Configuration();
FileSystem fs = FileSystem.getLocal(conf);
if (fs.exists(out)) {
fs.delete(out, true);
}
if (fs.exists(in)) {
fs.delete(in, true);
}
for (int i = 0; i < numMaps; i++) {
makeNumberFile(i, 100);
}
Job job = Job.getInstance();
job.setNumReduceTasks(numReduces);
job.setMapperClass(SequenceMapper.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.addInputPath(job, in);
FileOutputFormat.setOutputPath(job, out);
LocalJobRunner.setLocalMaxRunningMaps(job, parallelMaps);
LocalJobRunner.setLocalMaxRunningReduces(job, parallelReduces);
boolean result = job.waitForCompletion(true);
assertTrue("Job failed!!", result);
verifyNumberJob(numMaps);
}
/**
* The MapReduce driver - setup and launch the job.
*
* @param args the command-line arguments
* @return the process exit code
* @throws Exception if something goes wrong
*/
public int run(final String[] args) throws Exception {
Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.MrIoOpts.values()).build();
int result = cli.runCmd();
if (result != 0) {
return result;
}
Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));
Configuration conf = super.getConf();
Job job = new Job(conf);
job.setJarByClass(AvroParquetMapReduce.class);
job.setInputFormatClass(AvroParquetInputFormat.class);
AvroParquetInputFormat.setInputPaths(job, inputPath);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(DoubleWritable.class);
job.setOutputFormatClass(AvroParquetOutputFormat.class);
FileOutputFormat.setOutputPath(job, outputPath);
AvroParquetOutputFormat.setSchema(job, StockAvg.SCHEMA$);
return job.waitForCompletion(true) ? 0 : 1;
}
public static void run() throws IOException, ClassNotFoundException,
InterruptedException {
String inputPath1 = ItemBasedCFDriver.path.get("step7InputPath1");
String inputPath2 = ItemBasedCFDriver.path.get("step7InputPath2");
String outputPath = ItemBasedCFDriver.path.get("step7OutputPath");
Configuration conf = new Configuration();
conf.set("mapred.textoutputformat.separator", ",");
Job job = Job.getInstance(conf);
HDFS hdfs = new HDFS(conf);
hdfs.rmr(outputPath);
job.setMapperClass(Step1_Mapper.class);
job.setJarByClass(IAndKMatrixMultiplicationStep1.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path(inputPath1), new Path(
inputPath2));
FileOutputFormat.setOutputPath(job, new Path(outputPath));
job.waitForCompletion(true);
}