The following are example usages of org.apache.hadoop.mapreduce.Job#getInstance(), drawn from open-source projects; the full source of each example is available on GitHub.
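Before the project-specific excerpts, here is a minimal sketch of the typical Job.getInstance() pattern. The driver, mapper, and reducer class names (ExampleDriver, MyMapper, MyReducer) and the argument paths are hypothetical placeholders, and imports are omitted to match the excerpts below.
// Minimal driver sketch (hypothetical classes MyMapper / MyReducer assumed).
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "example-job"); // bind the job to conf and name it
job.setJarByClass(ExampleDriver.class);
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1); // submit and wait for completion
}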
/**
* Sets up the actual job.
*
* @param conf The current configuration.
* @param args The command line parameters.
* @return The newly created job.
* @throws IOException When setting up the job fails.
*/
public static Job createSubmittableJob(Configuration conf, String[] args)
throws IOException {
Triple<TableName, Scan, Path> arguments = ExportUtils.getArgumentsFromCommandLine(conf, args);
String tableName = arguments.getFirst().getNameAsString();
Path outputDir = arguments.getThird();
Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName));
job.setJobName(NAME + "_" + tableName);
job.setJarByClass(Export.class);
// Set optional scan parameters
Scan s = arguments.getSecond();
IdentityTableMapper.initJob(tableName, s, IdentityTableMapper.class, job);
// No reducers. Just write straight to output files.
job.setNumReduceTasks(0);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setOutputKeyClass(ImmutableBytesWritable.class);
job.setOutputValueClass(Result.class);
FileOutputFormat.setOutputPath(job, outputDir); // job conf doesn't contain the conf so doesn't have a default fs.
return job;
}
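For context, a driver might invoke createSubmittableJob roughly as follows. This is a hedged sketch modeled on the excerpt above, not the project's actual main method; it assumes only standard HBase and Hadoop APIs, with imports omitted as elsewhere on this page.
public static void main(String[] args) throws Exception {
// Build the export job from the command-line arguments and submit it.
Configuration conf = HBaseConfiguration.create();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
Job job = createSubmittableJob(conf, otherArgs);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}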
public static Job createJob() throws IOException {
final Configuration conf = new Configuration();
final Job baseJob = Job.getInstance(conf);
baseJob.setOutputKeyClass(Text.class);
baseJob.setOutputValueClass(IntWritable.class);
baseJob.setMapperClass(NewMapTokenizer.class);
baseJob.setCombinerClass(NewSummer.class);
baseJob.setReducerClass(NewSummer.class);
baseJob.setNumReduceTasks(1);
baseJob.getConfiguration().setInt(JobContext.IO_SORT_MB, 1);
baseJob.getConfiguration().set(JobContext.MAP_SORT_SPILL_PERCENT, "0.50");
baseJob.getConfiguration().setInt(JobContext.MAP_COMBINE_MIN_SPILLS, 3);
org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setMinInputSplitSize(
baseJob, Long.MAX_VALUE);
return baseJob;
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: ElemValueCooccurrencesTest configFile outputDir");
System.exit(2);
}
Job job = Job.getInstance(conf);
job.setJarByClass(ElemValueCooccurrencesTest.class);
job.setInputFormatClass(ValueInputFormat.class);
job.setMapperClass(ElemCooccurrencesMapper.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(Text.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
conf = job.getConfiguration();
conf.addResource(otherArgs[0]);
conf.setClass(MarkLogicConstants.INPUT_VALUE_CLASS, Text.class,
Writable.class);
conf.setClass(MarkLogicConstants.INPUT_LEXICON_FUNCTION_CLASS,
ElemValueCooccurrencesFunction.class, ElemValueCooccurrences.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
public void run() throws Exception{
long startTime = System.currentTimeMillis();
Configuration conf = new Configuration();
conf.set(TableOutputFormat.OUTPUT_TABLE, Constants.hbase_user_item_pref_table);
Job job = Job.getInstance(conf, "hbasewriter"+System.currentTimeMillis());
job.setJarByClass(UpdateCFJob.class);
job.setMapperClass(TokenizerMapper.class);
job.setReducerClass(HBaseWriteReducer.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(Text.class);
job.setOutputFormatClass(TableOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(input));
boolean isFinish = job.waitForCompletion(true);
long endTime = System.currentTimeMillis();
if(isFinish){
logger.info("UpdateCFJob job ["+job.getJobName()+"] run finished. It costs "+ (endTime - startTime) / 1000 +"s.");
} else {
logger.error("UpdateCFJob job ["+job.getJobName()+"] run failed.");
}
}
public static void main(String[] args) throws Exception {
CommandLine cli = StressTestUtils.parseCommandLine(OPTIONS, args);
Configuration configuration = new Configuration();
if (cli.hasOption(THROTTLING_SERVER_URI.getOpt())) {
configuration.setBoolean(USE_THROTTLING_SERVER, true);
String resourceLimited = cli.getOptionValue(RESOURCE_ID_OPT.getOpt(), "MRStressTest");
configuration.set(RESOURCE_ID, resourceLimited);
configuration.set(
BrokerConfigurationKeyGenerator.generateKey(new SharedRestClientFactory(),
new SharedRestClientKey(RestliLimiterFactory.RESTLI_SERVICE_NAME),
null, SharedRestClientFactory.SERVER_URI_KEY), cli.getOptionValue(THROTTLING_SERVER_URI.getOpt()));
}
if (cli.hasOption(LOCAL_QPS_OPT.getOpt())) {
configuration.set(LOCALLY_ENFORCED_QPS, cli.getOptionValue(LOCAL_QPS_OPT.getOpt()));
}
Job job = Job.getInstance(configuration, "ThrottlingStressTest");
job.getConfiguration().setBoolean("mapreduce.job.user.classpath.first", true);
job.getConfiguration().setBoolean("mapreduce.map.speculative", false);
job.getConfiguration().set(NUM_MAPPERS, cli.getOptionValue(NUM_MAPPERS_OPT.getOpt(), DEFAULT_MAPPERS));
StressTestUtils.populateConfigFromCli(job.getConfiguration(), cli);
job.setJarByClass(MRStressTest.class);
job.setMapperClass(StresserMapper.class);
job.setReducerClass(AggregatorReducer.class);
job.setInputFormatClass(MyInputFormat.class);
job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(DoubleWritable.class);
FileOutputFormat.setOutputPath(job, new Path("/tmp/MRStressTest" + System.currentTimeMillis()));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
@Test(expected = IllegalStateException.class)
public void testGetMrJobIdThrowsIllegalStateException() throws IOException {
Job job = Job.getInstance(new Configuration(false));
HadoopCmdOutput hadoopCmdOutput = new HadoopCmdOutput(job, new StringBuilder());
assertNull(hadoopCmdOutput.getMrJobId());
}
public static void run() throws IOException, ClassNotFoundException,
InterruptedException {
String inputPath = ItemBasedCFDriver.path.get("step4InputPath");
String outputPath = ItemBasedCFDriver.path.get("step4OutputPath");
Configuration conf = new Configuration();
conf.set("mapred.textoutputformat.separator", ":");
Job job = Job.getInstance(conf);
HDFS hdfs = new HDFS(conf);
hdfs.rmr(outputPath);
job.setMapperClass(Step4_Mapper.class);
job.setReducerClass(Step4_Reducer.class);
job.setJarByClass(CalculateSimilarityStep4.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path(inputPath));
FileOutputFormat.setOutputPath(job, new Path(outputPath));
job.waitForCompletion(true);
}
private static void createStoreSales(ExecutionEnvironment env) throws IOException {
DataSet<Tuple2<Void, StoreSalesTable>> storeSales = getStoreSalesDataSet(env).map(new StoreSalesToParquet());
Job job = Job.getInstance();
HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new ParquetThriftOutputFormat(), job);
ParquetThriftOutputFormat.setOutputPath(job, new Path(outputPath + "/storesales"));
ParquetThriftOutputFormat.setThriftClass(job, StoreSalesTable.class);
ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
ParquetThriftOutputFormat.setCompressOutput(job, true);
ParquetThriftOutputFormat.setEnableDictionary(job, false);
storeSales.output(hadoopOutputFormat);
}
public ConfigurationProxyV2( String namedCluster ) throws IOException {
job = Job.getInstance();
// Reset static HashSets for logging
ShimConfigsLoader.CLUSTER_NAME_FOR_LOGGING.clear();
ShimConfigsLoader.SITE_FILE_NAME.clear();
addConfigsForJobConf( namedCluster );
}
/**
* Test using the gzip codec for reading
*/
@Test(timeout=10000)
public void testGzip() throws IOException, InterruptedException {
Configuration conf = new Configuration(defaultConf);
CompressionCodec gzip = new GzipCodec();
ReflectionUtils.setConf(gzip, conf);
localFs.delete(workDir, true);
writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
"the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
"this is a test\nof gzip\n");
Job job = Job.getInstance(conf);
FileInputFormat.setInputPaths(job, workDir);
CombineTextInputFormat format = new CombineTextInputFormat();
List<InputSplit> splits = format.getSplits(job);
assertEquals("compressed splits == 1", 1, splits.size());
List<Text> results = readSplit(format, splits.get(0), job);
assertEquals("splits[0] length", 8, results.size());
final String[] firstList =
{"the quick", "brown", "fox jumped", "over", " the lazy", " dog"};
final String[] secondList = {"this is a test", "of gzip"};
String first = results.get(0).toString();
if (first.equals(firstList[0])) {
testResults(results, firstList, secondList);
} else if (first.equals(secondList[0])) {
testResults(results, secondList, firstList);
} else {
fail("unexpected first token!");
}
}
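The writeFile helper called in the test above is not part of this excerpt. A plausible sketch, assuming only the standard Hadoop FileSystem and CompressionCodec APIs (imports omitted as in the other excerpts), would be:
private static void writeFile(FileSystem fs, Path name, CompressionCodec codec, String contents) throws IOException {
// Write the contents to the given path, optionally through the supplied compression codec.
OutputStream stm = (codec == null) ? fs.create(name) : codec.createOutputStream(fs.create(name));
try {
stm.write(contents.getBytes(StandardCharsets.UTF_8));
} finally {
stm.close();
}
}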
@SuppressWarnings("unchecked")
public void testCommitter() throws Exception {
Job job = Job.getInstance();
FileOutputFormat.setOutputPath(job, outDir);
Configuration conf = job.getConfiguration();
conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
JobContext jContext = new JobContextImpl(conf, taskID.getJobID());
TaskAttemptContext tContext = new TaskAttemptContextImpl(conf, taskID);
FileOutputCommitter committer = new FileOutputCommitter(outDir, tContext);
// setup
committer.setupJob(jContext);
committer.setupTask(tContext);
// write output
TextOutputFormat theOutputFormat = new TextOutputFormat();
RecordWriter theRecordWriter = theOutputFormat.getRecordWriter(tContext);
writeOutput(theRecordWriter, tContext);
// do commit
committer.commitTask(tContext);
committer.commitJob(jContext);
// validate output
File expectedFile = new File(new Path(outDir, partFile).toString());
StringBuffer expectedOutput = new StringBuffer();
expectedOutput.append(key1).append('\t').append(val1).append("\n");
expectedOutput.append(val1).append("\n");
expectedOutput.append(val2).append("\n");
expectedOutput.append(key2).append("\n");
expectedOutput.append(key1).append("\n");
expectedOutput.append(key2).append('\t').append(val2).append("\n");
String output = UtilsForTests.slurp(expectedFile);
assertEquals(output, expectedOutput.toString());
FileUtil.fullyDelete(new File(outDir.toString()));
}
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);
Class[] kryoClassArray = new Class[]{Class.forName("scala.reflect.ClassTag$$anon$1"),
Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey")};
SparkConf conf = new SparkConf().setAppName("Build uhc dictionary with spark for:" + cubeName + " segment " + segmentId);
//serialization conf
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);
KylinSparkJobListener jobListener = new KylinSparkJobListener();
try (JavaSparkContext sc = new JavaSparkContext(conf)) {
sc.sc().addSparkListener(jobListener);
HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
Configuration hadoopConf = sc.hadoopConfiguration();
hadoopConf.set("mapreduce.input.pathFilter.class", "org.apache.kylin.engine.mr.steps.filter.UHCDictPathFilter");
final SerializableConfiguration sConf = new SerializableConfiguration(hadoopConf);
KylinConfig config = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);
CubeManager cubeMgr = CubeManager.getInstance(config);
CubeInstance cube = cubeMgr.getCube(cubeName);
final Job job = Job.getInstance(sConf.get());
// calculate source record bytes size
final LongAccumulator bytesWritten = sc.sc().longAccumulator();
String hdfsDir = sc.hadoopConfiguration().get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);
List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
int reducerCount = uhcColumns.size();
if (reducerCount == 0) {
return;
}
logger.info("RDD Output path: {}", outputPath);
logger.info("getTotalReducerNum: {}", reducerCount);
logger.info("counter path {}", counterPath);
JavaPairRDD<String, String> wholeSequenceFileNames = null;
for (TblColRef tblColRef : uhcColumns) {
String columnPath = inputPath + "/" + tblColRef.getIdentity();
if (!HadoopUtil.getFileSystem(columnPath).exists(new Path(columnPath))) {
continue;
}
if (wholeSequenceFileNames == null) {
wholeSequenceFileNames = sc.wholeTextFiles(columnPath);
} else {
wholeSequenceFileNames = wholeSequenceFileNames.union(sc.wholeTextFiles(columnPath));
}
}
if (wholeSequenceFileNames == null) {
logger.error("There're no sequence files at " + inputPath + " !");
return;
}
JavaPairRDD<String, Tuple3<Writable, Writable, String>> pairRDD = wholeSequenceFileNames.map(tuple -> tuple._1)
.mapToPair(new InputPathAndFilterAddFunction2(config, uhcColumns))
.filter(tuple -> tuple._1 != -1)
.reduceByKey((list1, list2) -> combineAllColumnDistinctValues(list1, list2))
.mapToPair(new ProcessUHCColumnValues(cubeName, config, hdfsDir, uhcColumns));
MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
NullWritable.class, ArrayPrimitiveWritable.class);
FileOutputFormat.setOutputPath(job, new Path(outputPath));
job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, outputPath);
// prevent creating a zero-sized default output
LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
MultipleOutputsRDD multipleOutputsRDD = MultipleOutputsRDD.rddToMultipleOutputsRDD(pairRDD);
multipleOutputsRDD.saveAsNewAPIHadoopDatasetWithMultipleOutputs(job.getConfiguration());
logger.info("Map input records={}", reducerCount);
logger.info("HDFS Read: {} HDFS Write", bytesWritten.value());
Map<String, String> counterMap = Maps.newHashMap();
counterMap.put(ExecutableConstants.SOURCE_RECORDS_COUNT, String.valueOf(reducerCount));
counterMap.put(ExecutableConstants.SOURCE_RECORDS_SIZE, String.valueOf(bytesWritten.value()));
// save counter to hdfs
HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);
HadoopUtil.deleteHDFSMeta(metaUrl);
}
}
/**
* Helper API to generate splits
* @param conf Configuration with all necessary information set to generate
* splits. The following are required at a minimum:
*
* - mapred.mapper.new-api: determines whether mapred.InputFormat or
* mapreduce.InputFormat is to be used
* - mapred.input.format.class or mapreduce.job.inputformat.class:
* determines the InputFormat class to be used
*
* In addition to this, all the configs needed by the InputFormat class also
* have to be set. For example, FileInputFormat needs the input directory
* paths to be set in the config.
*
* @param inputSplitsDir Directory in which the splits file and meta info file
* will be generated. job.split and job.splitmetainfo files in this directory
* will be overwritten. Should be a fully-qualified path.
*
* @return InputSplitInfo containing the split files' information and the
* number of splits generated, to be used in determining the parallelism of
* the map stage.
*
* @throws IOException
* @throws InterruptedException
* @throws ClassNotFoundException
*/
private static InputSplitInfoDisk generateInputSplits(Configuration conf,
Path inputSplitsDir) throws IOException, InterruptedException,
ClassNotFoundException {
Job job = Job.getInstance(conf);
JobConf jobConf = new JobConf(conf);
conf.setBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS, false);
if (jobConf.getUseNewMapper()) {
LOG.info("Generating new input splits"
+ ", splitsDir=" + inputSplitsDir.toString());
return writeNewSplits(job, inputSplitsDir);
} else {
LOG.info("Generating old input splits"
+ ", splitsDir=" + inputSplitsDir.toString());
return writeOldSplits(jobConf, inputSplitsDir);
}
}
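The Javadoc above lists the minimum configuration this helper expects. A caller using the new API with a FileInputFormat-based input might prepare the Configuration roughly like this; the keys are the standard Hadoop property names, while the input path is a hypothetical placeholder:
Configuration conf = new Configuration();
conf.setBoolean("mapred.mapper.new-api", true); // use the mapreduce.InputFormat API
conf.set("mapreduce.job.inputformat.class", TextInputFormat.class.getName()); // org.apache.hadoop.mapreduce.lib.input.TextInputFormat
conf.set("mapreduce.input.fileinputformat.inputdir", "/data/input"); // input paths required by FileInputFormat
// ... then pass conf (plus the splits directory) to generateInputSplits.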
@Override
public int run(String[] args) throws Exception {
Options options = new Options();
try {
options.addOption(OPTION_JOB_NAME);
options.addOption(OPTION_CUBE_NAME);
options.addOption(OPTION_SEGMENT_ID);
options.addOption(OPTION_INPUT_PATH);
options.addOption(OPTION_OUTPUT_PATH);
parseOptions(options, args);
String input = getOptionValue(OPTION_INPUT_PATH);
String output = getOptionValue(OPTION_OUTPUT_PATH);
String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase(Locale.ROOT);
String segmentID = getOptionValue(OPTION_SEGMENT_ID);
CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
CubeInstance cube = cubeMgr.getCube(cubeName);
CubeSegment cubeSeg = cube.getSegmentById(segmentID);
// start job
String jobName = getOptionValue(OPTION_JOB_NAME);
logger.info("Starting: " + jobName);
job = Job.getInstance(getConf(), jobName);
setJobClasspath(job, cube.getConfig());
// add metadata to distributed cache
Segments<CubeSegment> allSegs = cube.getMergingSegments(cubeSeg);
allSegs.add(cubeSeg);
attachSegmentsMetadataWithDict(allSegs, job.getConfiguration());
// Mapper
job.setMapperClass(MergeCuboidMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// Reducer
job.setReducerClass(CuboidReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// set inputs
IMROutput2.IMRMergeOutputFormat outputFormat = MRUtil.getBatchMergeOutputSide2(cubeSeg).getOutputFormat();
outputFormat.configureJobInput(job, input);
addInputDirs(input, job);
// set output
outputFormat.configureJobOutput(job, output, cubeSeg);
// set job configuration
job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);
return waitForCompletion(job);
} finally {
if (job != null)
cleanupTempConfFile(job.getConfiguration());
}
}
@Override
public int run(String[] args) throws Exception {
Options options = new Options();
try {
options.addOption(OPTION_JOB_NAME);
options.addOption(OPTION_CUBE_NAME);
options.addOption(OPTION_SEGMENT_NAME);
options.addOption(OPTION_INPUT_PATH);
options.addOption(OPTION_OUTPUT_PATH);
parseOptions(options, args);
String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase();
String segmentName = getOptionValue(OPTION_SEGMENT_NAME).toUpperCase();
KylinConfig config = KylinConfig.getInstanceFromEnv();
CubeManager cubeMgr = CubeManager.getInstance(config);
CubeInstance cube = cubeMgr.getCube(cubeName);
// start job
String jobName = getOptionValue(OPTION_JOB_NAME);
System.out.println("Starting: " + jobName);
job = Job.getInstance(getConf(), jobName);
setJobClasspath(job);
// set inputs
addInputDirs(getOptionValue(OPTION_INPUT_PATH), job);
Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
FileOutputFormat.setOutputPath(job, output);
// Mapper
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setMapperClass(MergeCuboidMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// Reducer - only one
job.setReducerClass(CuboidReducer.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// set job configuration
job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_NAME, segmentName);
// add metadata to distributed cache
attachKylinPropsAndMetadata(cube, job.getConfiguration());
setReduceTaskNum(job, config, cubeName, 0);
this.deletePath(job.getConfiguration(), output);
return waitForCompletion(job);
} catch (Exception e) {
logger.error("error in MergeCuboidJob", e);
printUsage(options);
throw e;
}
}
@Override
public int run(String[] args) throws Exception {
try {
Options options = new Options();
options.addOption(OPTION_JOB_NAME);
options.addOption(OPTION_SEGMENT_ID);
options.addOption(OPTION_CUBE_NAME);
options.addOption(OPTION_META_URL);
options.addOption(OPTION_MERGE_SEGMENT_IDS);
options.addOption(OPTION_OUTPUT_PATH_DICT);
options.addOption(OPTION_OUTPUT_PATH_STAT);
parseOptions(options, args);
final String segmentId = getOptionValue(OPTION_SEGMENT_ID);
final String segmentIds = getOptionValue(OPTION_MERGE_SEGMENT_IDS);
final String cubeName = getOptionValue(OPTION_CUBE_NAME);
final String metaUrl = getOptionValue(OPTION_META_URL);
final String dictOutputPath = getOptionValue(OPTION_OUTPUT_PATH_DICT);
final String statOutputPath = getOptionValue(OPTION_OUTPUT_PATH_STAT);
CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
CubeInstance cube = cubeMgr.getCube(cubeName);
CubeDesc cubeDesc = cube.getDescriptor();
CubeSegment segment = cube.getSegmentById(segmentId);
Segments<CubeSegment> mergingSeg = cube.getMergingSegments(segment);
job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
job.getConfiguration().set(BatchConstants.ARG_CUBE_NAME, cubeName);
job.getConfiguration().set(OPTION_META_URL.getOpt(), metaUrl);
job.getConfiguration().set(OPTION_SEGMENT_ID.getOpt(), segmentId);
job.getConfiguration().set(OPTION_MERGE_SEGMENT_IDS.getOpt(), segmentIds);
job.getConfiguration().set(OPTION_OUTPUT_PATH_STAT.getOpt(), statOutputPath);
job.getConfiguration().set("num.map.tasks", String.valueOf(cubeDesc.getAllColumnsNeedDictionaryBuilt().size() + 1));
job.setNumReduceTasks(1);
setJobClasspath(job, cube.getConfig());
// dump metadata to HDFS
attachSegmentsMetadataWithDict(mergingSeg, metaUrl);
// clean output dir
HadoopUtil.deletePath(job.getConfiguration(), new Path(dictOutputPath));
job.setMapperClass(MergeDictionaryMapper.class);
job.setReducerClass(MergeDictionaryReducer.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setInputFormatClass(IndexArrInputFormat.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.NONE);
SequenceFileOutputFormat.setOutputPath(job, new Path(dictOutputPath));
logger.info("Starting: " + job.getJobName());
return waitForCompletion(job);
} finally {
if (job != null)
cleanupTempConfFile(job.getConfiguration());
}
}
@Override
protected int run(CommandLine cmd) throws Exception {
String source = cmd.getOptionValue('s');
String workdir = cmd.getOptionValue('w');
String target = cmd.getOptionValue('t');
getConf().setBoolean(SKIP_INVALID_PROPERTY, cmd.hasOption('i'));
getConf().setBoolean(VERIFY_DATATYPE_VALUES_PROPERTY, cmd.hasOption('d'));
getConf().setBoolean(TRUNCATE_PROPERTY, cmd.hasOption('r'));
getConf().setInt(SPLIT_BITS_PROPERTY, Integer.parseInt(cmd.getOptionValue('b', "3")));
if (cmd.hasOption('g')) getConf().set(DEFAULT_CONTEXT_PROPERTY, cmd.getOptionValue('g'));
getConf().setBoolean(OVERRIDE_CONTEXT_PROPERTY, cmd.hasOption('o'));
getConf().setLong(DEFAULT_TIMESTAMP_PROPERTY, Long.parseLong(cmd.getOptionValue('e', String.valueOf(System.currentTimeMillis()))));
if (cmd.hasOption('m')) getConf().setLong("mapreduce.input.fileinputformat.split.maxsize", Long.parseLong(cmd.getOptionValue('m')));
TableMapReduceUtil.addDependencyJars(getConf(),
NTriplesUtil.class,
Rio.class,
AbstractRDFHandler.class,
RDFFormat.class,
RDFParser.class);
HBaseConfiguration.addHbaseResources(getConf());
Job job = Job.getInstance(getConf(), "HalyardBulkLoad -> " + workdir + " -> " + target);
job.setJarByClass(HalyardBulkLoad.class);
job.setMapperClass(RDFMapper.class);
job.setMapOutputKeyClass(ImmutableBytesWritable.class);
job.setMapOutputValueClass(KeyValue.class);
job.setInputFormatClass(RioFileInputFormat.class);
job.setSpeculativeExecution(false);
job.setReduceSpeculativeExecution(false);
try (HTable hTable = HalyardTableUtils.getTable(getConf(), target, true, getConf().getInt(SPLIT_BITS_PROPERTY, 3))) {
HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator());
FileInputFormat.setInputDirRecursive(job, true);
FileInputFormat.setInputPaths(job, source);
FileOutputFormat.setOutputPath(job, new Path(workdir));
TableMapReduceUtil.addDependencyJars(job);
TableMapReduceUtil.initCredentials(job);
if (job.waitForCompletion(true)) {
if (getConf().getBoolean(TRUNCATE_PROPERTY, false)) {
HalyardTableUtils.truncateTable(hTable).close();
}
new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(workdir), hTable);
LOG.info("Bulk Load Completed..");
return 0;
}
}
return -1;
}
@Override
public int run(final String[] args) throws Exception {
final Configuration conf = getConf();
final AccumuloRequiredOptions accumuloOptions =
(AccumuloRequiredOptions) inputStoreOptions.getFactoryOptions();
// job settings
final Job job = Job.getInstance(conf, ingestOptions.getJobName() + "NodeConversion");
job.setJarByClass(OSMConversionRunner.class);
job.getConfiguration().set("osm_mapping", ingestOptions.getMappingContents());
job.getConfiguration().set("arguments", ingestOptions.serializeToString());
if (ingestOptions.getVisibilityOptions().getVisibility() != null) {
job.getConfiguration().set(
AbstractMapReduceIngest.GLOBAL_VISIBILITY_KEY,
ingestOptions.getVisibilityOptions().getVisibility());
}
// input format
AbstractInputFormat.setConnectorInfo(
job,
accumuloOptions.getUser(),
new PasswordToken(accumuloOptions.getPassword()));
InputFormatBase.setInputTableName(job, ingestOptions.getQualifiedTableName());
AbstractInputFormat.setZooKeeperInstance(
job,
new ClientConfiguration().withInstance(accumuloOptions.getInstance()).withZkHosts(
accumuloOptions.getZookeeper()));
AbstractInputFormat.setScanAuthorizations(
job,
new Authorizations(ingestOptions.getVisibilityOptions().getVisibility()));
final IteratorSetting is = new IteratorSetting(50, "WholeRow", WholeRowIterator.class);
InputFormatBase.addIterator(job, is);
job.setInputFormatClass(AccumuloInputFormat.class);
final Range r = new Range();
// final ArrayList<Pair<Text, Text>> columns = new ArrayList<>();
InputFormatBase.setRanges(job, Arrays.asList(r));
// output format
GeoWaveOutputFormat.setStoreOptions(job.getConfiguration(), inputStoreOptions);
final AccumuloOptions options = new AccumuloOptions();
final AdapterStore as =
new AdapterStoreImpl(
new AccumuloOperations(
accumuloOptions.getZookeeper(),
accumuloOptions.getInstance(),
accumuloOptions.getUser(),
accumuloOptions.getPassword(),
accumuloOptions.getGeoWaveNamespace(),
options),
options);
for (final FeatureDataAdapter fda : FeatureDefinitionSet.featureAdapters.values()) {
as.addAdapter(fda);
GeoWaveOutputFormat.addDataAdapter(job.getConfiguration(), fda);
}
final Index primaryIndex =
new SpatialDimensionalityTypeProvider().createIndex(new SpatialOptions());
GeoWaveOutputFormat.addIndex(job.getConfiguration(), primaryIndex);
job.getConfiguration().set(AbstractMapReduceIngest.INDEX_NAMES_KEY, primaryIndex.getName());
job.setOutputFormatClass(GeoWaveOutputFormat.class);
job.setMapOutputKeyClass(GeoWaveOutputKey.class);
job.setMapOutputValueClass(SimpleFeature.class);
// mapper
job.setMapperClass(OSMConversionMapper.class);
// reducer
job.setNumReduceTasks(0);
return job.waitForCompletion(true) ? 0 : -1;
}
@Override
public int run(String[] args) throws Exception {
try {
Configuration conf = HBaseConfiguration.create();
Connection connection = ConnectionFactory.createConnection(conf);
String inputPath = args[0];
String outputPath = args[1];
final TableName tableName = TableName.valueOf(args[2]);
// tag::SETUP[]
Table table = connection.getTable(tableName);
Job job = Job.getInstance(conf, "ConvertToHFiles: Convert CSV to HFiles");
HFileOutputFormat2.configureIncrementalLoad(job, table,
connection.getRegionLocator(tableName)); // <1>
job.setInputFormatClass(TextInputFormat.class); // <2>
job.setJarByClass(ConvertToHFiles.class); // <3>
job.setJar("/home/cloudera/ahae/target/ahae.jar"); // <3>
job.setMapperClass(ConvertToHFilesMapper.class); // <4>
job.setMapOutputKeyClass(ImmutableBytesWritable.class); // <5>
job.setMapOutputValueClass(KeyValue.class); // <6>
FileInputFormat.setInputPaths(job, inputPath);
HFileOutputFormat2.setOutputPath(job, new Path(outputPath));
// end::SETUP[]
if (!job.waitForCompletion(true)) {
LOG.error("Failure");
} else {
LOG.info("Success");
return 0;
}
} catch (Exception e) {
e.printStackTrace();
}
return 1;
}
@Override
public int run(String[] args) throws Exception {
Options options = new Options();
try {
options.addOption(OPTION_JOB_NAME);
options.addOption(OPTION_CUBING_JOB_ID);
options.addOption(OPTION_OUTPUT_PATH);
options.addOption(OPTION_CUBE_NAME);
options.addOption(OPTION_SEGMENT_ID);
parseOptions(options, args);
job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
String job_id = getOptionValue(OPTION_CUBING_JOB_ID);
job.getConfiguration().set(BatchConstants.ARG_CUBING_JOB_ID, job_id);
String cubeName = getOptionValue(OPTION_CUBE_NAME);
String segmentID = getOptionValue(OPTION_SEGMENT_ID);
// ----------------------------------------------------------------------------
// add metadata to distributed cache
CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
CubeInstance cube = cubeMgr.getCube(cubeName);
CubeSegment segment = cube.getSegmentById(segmentID);
job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);
logger.info("Starting: " + job.getJobName());
job.getConfiguration().set("mapreduce.map.speculative", "false");
setJobClasspath(job, cube.getConfig());
// Mapper
job.setMapperClass(ExtractDictionaryFromGlobalMapper.class);
// Reducer
job.setNumReduceTasks(0);
// Input
IMRInput.IMRTableInputFormat flatTableInputFormat = MRUtil.getBatchCubingInputSide(segment)
.getFlatTableInputFormat();
flatTableInputFormat.configureJob(job);
// Output
// prevent creating a zero-sized default output
LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
FileOutputFormat.setOutputPath(job, output);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
deletePath(job.getConfiguration(), output);
attachSegmentMetadataWithDict(segment, job.getConfiguration());
return waitForCompletion(job);
} finally {
if (job != null)
cleanupTempConfFile(job.getConfiguration());
}
}