org.apache.hadoop.mapred.TextInputFormat#addInputPath() source code examples

Listed below is example code that calls org.apache.hadoop.mapred.TextInputFormat#addInputPath(), collected from open-source projects; the full source files can be viewed on GitHub.
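
For orientation, here is a minimal self-contained sketch of the call before the project examples. It is not taken from any of the projects below; the class name AddInputPathSketch and the identity-mapper pass-through job are illustrative assumptions only. It shows the basic contract: addInputPath() is a static helper inherited from FileInputFormat that registers one input path on a JobConf and can be called repeatedly to add more.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;

// Hypothetical driver class, used here only to illustrate addInputPath().
public class AddInputPathSketch {
  public static void main(String[] args) throws Exception {
    // args: <input path 1> <input path 2> <output path>
    JobConf jobConf = new JobConf(AddInputPathSketch.class);
    jobConf.setJobName("addInputPath-sketch");

    // TextInputFormat reads plain text files as <byte offset, line> pairs.
    jobConf.setInputFormat(TextInputFormat.class);
    // addInputPath is inherited from FileInputFormat; each call appends one more input path.
    TextInputFormat.addInputPath(jobConf, new Path(args[0]));
    TextInputFormat.addInputPath(jobConf, new Path(args[1]));

    // Map-only pass-through job: copy the input lines straight to the output.
    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setNumReduceTasks(0);
    jobConf.setOutputKeyClass(LongWritable.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.setOutputFormat(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(jobConf, new Path(args[2]));

    JobClient.runJob(jobConf);
  }
}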

Example 1   Project: parquet-mr   File: TestZstandardCodec.java
private RunningJob runMapReduceJob(CompressionCodecName codec, JobConf jobConf, Configuration conf, Path parquetPath) throws IOException, ClassNotFoundException, InterruptedException {
  String writeSchema = "message example {\n" +
    "required int32 line;\n" +
    "required binary content;\n" +
    "}";

  // Clear any output left over from a previous run.
  FileSystem fileSystem = parquetPath.getFileSystem(conf);
  fileSystem.delete(parquetPath, true);

  // Read the job input as plain text; inputPath is a field of the enclosing test class.
  jobConf.setInputFormat(TextInputFormat.class);
  TextInputFormat.addInputPath(jobConf, inputPath);
  jobConf.setNumReduceTasks(0);

  // Write the map output as Parquet, compressed with the codec under test.
  jobConf.setOutputFormat(DeprecatedParquetOutputFormat.class);
  DeprecatedParquetOutputFormat.setCompression(jobConf, codec);
  DeprecatedParquetOutputFormat.setOutputPath(jobConf, parquetPath);
  DeprecatedParquetOutputFormat.setWriteSupportClass(jobConf, GroupWriteSupport.class);
  GroupWriteSupport.setSchema(MessageTypeParser.parseMessageType(writeSchema), jobConf);

  jobConf.setMapperClass(TestZstandardCodec.DumpMapper.class);
  return JobClient.runJob(jobConf);
}
 
Example 2   File: HadoopMapredCompatWordCount.java
public static void main(String[] args) throws Exception {
	if (args.length < 2) {
		System.err.println("Usage: WordCount <input path> <result path>");
		return;
	}

	final String inputPath = args[0];
	final String outputPath = args[1];

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	// Set up the Hadoop Input Format
	HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
	TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

	// Create a Flink job with it
	DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

	DataSet<Tuple2<Text, LongWritable>> words =
			text.flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer()))
				.groupBy(0).reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(new Counter(), new Counter()));

	// Set up Hadoop Output Format
	HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat =
			new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), new JobConf());
	hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
	TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

	// Output & Execute
	words.output(hadoopOutputFormat).setParallelism(1);
	env.execute("Hadoop Compat WordCount");
}
 
Example 3   Project: flink   File: HadoopMapredCompatWordCount.java
public static void main(String[] args) throws Exception {
	if (args.length < 2) {
		System.err.println("Usage: WordCount <input path> <result path>");
		return;
	}

	final String inputPath = args[0];
	final String outputPath = args[1];

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	// Set up the Hadoop Input Format
	HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
	TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

	// Create a Flink job with it
	DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

	DataSet<Tuple2<Text, LongWritable>> words =
			text.flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer()))
				.groupBy(0).reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(new Counter(), new Counter()));

	// Set up Hadoop Output Format
	HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat =
			new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), new JobConf());
	hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
	TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

	// Output & Execute
	words.output(hadoopOutputFormat).setParallelism(1);
	env.execute("Hadoop Compat WordCount");
}
 
Example 4   Project: flink   File: HadoopMapredCompatWordCount.java
public static void main(String[] args) throws Exception {
	if (args.length < 2) {
		System.err.println("Usage: WordCount <input path> <result path>");
		return;
	}

	final String inputPath = args[0];
	final String outputPath = args[1];

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	// Set up the Hadoop Input Format
	HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
	TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

	// Create a Flink job with it
	DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

	DataSet<Tuple2<Text, LongWritable>> words =
			text.flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer()))
				.groupBy(0).reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(new Counter(), new Counter()));

	// Set up Hadoop Output Format
	HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat =
			new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), new JobConf());
	hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
	TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

	// Output & Execute
	words.output(hadoopOutputFormat).setParallelism(1);
	env.execute("Hadoop Compat WordCount");
}
 
Example 5
@Override
public Plan getPlan(String... args) {
	// parse job parameters
	int numSubTasks   = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
	String dataInput = (args.length > 1 ? args[1] : "");
	String output    = (args.length > 2 ? args[2] : "");

	HadoopDataSource<LongWritable, Text> source = new HadoopDataSource<LongWritable, Text>(
			new TextInputFormat(), new JobConf(), "Input Lines");
	TextInputFormat.addInputPath(source.getJobConf(), new Path(dataInput));

	MapOperator mapper = MapOperator.builder(new TokenizeLine())
			.input(source)
			.name("Tokenize Lines")
			.build();
	ReduceOperator reducer = ReduceOperator.builder(CountWords.class, StringValue.class, 0)
			.input(mapper)
			.name("Count Words")
			.build();
	HadoopDataSink<Text, IntWritable> out = new HadoopDataSink<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(),new JobConf(), "Hadoop TextOutputFormat", reducer, Text.class, IntWritable.class);
	TextOutputFormat.setOutputPath(out.getJobConf(), new Path(output));

	Plan plan = new Plan(out, "Hadoop OutputFormat Example");
	plan.setDefaultParallelism(numSubTasks);
	return plan;
}
 
Example 6   Project: stratosphere   File: WordCount.java
@SuppressWarnings({ "rawtypes", "unchecked", "unused" })
@Override
public Plan getPlan(String... args) {
	// parse job parameters
	int numSubTasks   = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
	String dataInput = (args.length > 1 ? args[1] : "");
	String output    = (args.length > 2 ? args[2] : "");
	
	
	HadoopDataSource source = new HadoopDataSource(new TextInputFormat(), new JobConf(), "Input Lines");
	TextInputFormat.addInputPath(source.getJobConf(), new Path(dataInput));
	
	// Example with Wrapper Converter: a typed source with its own JobConf,
	// so the input path is registered on sourceHadoopType rather than on source again.
	HadoopDataSource<LongWritable,Text> sourceHadoopType = new HadoopDataSource<LongWritable, Text>(
			new TextInputFormat(), new JobConf(), "Input Lines", new WritableWrapperConverter<LongWritable, Text>());
	TextInputFormat.addInputPath(sourceHadoopType.getJobConf(), new Path(dataInput));
	
	MapOperator mapper = MapOperator.builder(new TokenizeLine())
		.input(source)
		.name("Tokenize Lines")
		.build();
	ReduceOperator reducer = ReduceOperator.builder(CountWords.class, StringValue.class, 0)
		.input(mapper)
		.name("Count Words")
		.build();
	FileDataSink out = new FileDataSink(new CsvOutputFormat(), output, reducer, "Word Counts");
	CsvOutputFormat.configureRecordFormat(out)
		.recordDelimiter('\n')
		.fieldDelimiter(' ')
		.field(StringValue.class, 0)
		.field(IntValue.class, 1);
	
	Plan plan = new Plan(out, "WordCount Example");
	plan.setDefaultParallelism(numSubTasks);
	return plan;
}