The following examples show how to use the org.apache.hadoop.mapreduce.lib.input.NLineInputFormat API class; follow the GitHub links to view the full source code.
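Before the project-specific snippets, here is a minimal, self-contained sketch of the typical NLineInputFormat setup. It is only an illustration: the WordLineDriver/WordLineMapper class names and the command-line paths are placeholders, not taken from any of the projects below.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordLineDriver {
    // Placeholder mapper: echoes each input line keyed by its byte offset.
    public static class WordLineMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(key, value);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "nline-example");
        job.setJarByClass(WordLineDriver.class);
        job.setMapperClass(WordLineMapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        // Each split covers at most 10 input lines, so every map task processes at most 10 lines.
        job.setInputFormatClass(NLineInputFormat.class);
        NLineInputFormat.setNumLinesPerSplit(job, 10);
        NLineInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}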
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    Path fqpath = new Path(fqPath);
    String fqname = fqpath.getName();
    String[] ns = fqname.split("\\.");
    //TODO: Handle also compressed files
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    splitRDD.foreach( split -> {
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
        writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);
    });
}
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);
    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);
    zips.foreach( splits -> {
        Path path = splits._1.getPath();
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
        FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);
        writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir+"/"+path.getParent().getName()+"_"+splits._1.getStart()+".fq");
    });
}
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    String[] ns = fst.getPath().getName().split("\\.");
    //TODO: Handle also compressed files
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);
    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);
    zips.foreach( splits -> {
        Path path = splits._1.getPath();
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
        FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);
        writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir, path.getParent().getName()+"_"+splits._1.getStart()+".fq");
    });
}
public static void setupJob(Job job, int minFeaturesPerSplit, long featureCount)
{
    if (minFeaturesPerSplit > 0)
    {
        if (featureCount < 0)
        {
            throw new IllegalArgumentException("Expected a feature count");
        }
        int maxMapTasks = job.getConfiguration().getInt("mapred.tasktracker.map.tasks.maximum", -1);
        if (maxMapTasks > 0)
        {
            int featuresPerSplit = (int) (featureCount / maxMapTasks);
            if (featuresPerSplit < minFeaturesPerSplit)
            {
                featuresPerSplit = minFeaturesPerSplit;
            }
            job.getConfiguration().setBoolean(USE_NLINE_FORMAT, true);
            NLineInputFormat.setNumLinesPerSplit(job, featuresPerSplit);
        }
    }
}
@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());
    job.setJarByClass(Phase3Step4LocalDeDuplication.class);
    job.setJobName(Phase3Step4LocalDeDuplication.class.getName());

    // paths
    String inputPath = args[0];
    // text files of ids to be deleted
    String outputPath = args[1];

    // input: reading max N lines for each mapper
    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.addInputPath(job, new Path(inputPath));
    job.getConfiguration().setInt("mapreduce.input.lineinputformat.linespermap", LINES);

    // mapper
    job.setMapperClass(LocalGreedyDeDuplicationMapper.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // reducer
    job.setReducerClass(IDCollectorReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
public static void main(String[] args) throws Exception {
    // 1. Set up the HDFS configuration
    String namenode_ip = "192.168.17.10";
    String hdfs = "hdfs://" + namenode_ip + ":9000";
    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", hdfs);
    conf.set("mapreduce.app-submission.cross-platform", "true");
    conf.setInt("mapreduce.input.lineinputformat.linespermap", 1000); // number of lines handled by each map task

    // 2. Configure the MapReduce job
    String jobName = "NLineInput";                    // job name
    Job job = Job.getInstance(conf, jobName);
    job.setJarByClass(NLineInput.class);              // driver class to run
    job.setJar("export\\NLineInput.jar");             // local jar to submit
    job.setMapperClass(NLineInputMapper.class);       // Mapper class
    job.setMapOutputKeyClass(Text.class);             // Mapper output key type
    job.setMapOutputValueClass(IntWritable.class);    // Mapper output value type
    job.setReducerClass(NLineInputReducer.class);     // Reducer class
    job.setOutputKeyClass(Text.class);                // Reducer output key type
    job.setOutputValueClass(IntWritable.class);       // Reducer output value type
    job.setInputFormatClass(NLineInputFormat.class);  // input format class

    // 3. Set the job input and output paths
    String dataDir = "/expr/nlineinput/data";         // input data directory
    String outputDir = "/expr/nlineinput/output";     // output directory
    Path inPath = new Path(hdfs + dataDir);
    Path outPath = new Path(hdfs + outputDir);
    FileInputFormat.addInputPath(job, inPath);
    FileOutputFormat.setOutputPath(job, outPath);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outPath)) {
        fs.delete(outPath, true);
    }

    // 4. Run the job
    System.out.println("Job: " + jobName + " is running...");
    if (job.waitForCompletion(true)) {
        System.out.println("success!");
        System.exit(0);
    } else {
        System.out.println("failed!");
        System.exit(1);
    }
}
private static JavaPairRDD<Text, SequencedFragment> interleaveReads(String fastq, String fastq2, int splitlen, JavaSparkContext sc) throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    FileStatus fst = fs.getFileStatus(new Path(fastq));
    FileStatus fst2 = fs.getFileStatus(new Path(fastq2));

    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    return zips.flatMapToPair( splits -> {
        FastqInputFormat.FastqRecordReader fqreader = new FastqInputFormat.FastqRecordReader(new Configuration(), splits._1);
        FastqInputFormat.FastqRecordReader fqreader2 = new FastqInputFormat.FastqRecordReader(new Configuration(), splits._2);

        ArrayList<Tuple2<Text, SequencedFragment>> reads = new ArrayList<Tuple2<Text, SequencedFragment>>();

        while (fqreader.nextKeyValue()) {
            String key = fqreader.getCurrentKey().toString();
            String[] keysplit = key.split(" ");
            key = keysplit[0];

            SequencedFragment sf = new SequencedFragment();
            sf.setQuality(new Text(fqreader.getCurrentValue().getQuality().toString()));
            sf.setSequence(new Text(fqreader.getCurrentValue().getSequence().toString()));

            if (fqreader2.nextKeyValue()) {
                String key2 = fqreader2.getCurrentKey().toString();
                String[] keysplit2 = key2.split(" ");
                key2 = keysplit2[0];
                //key2 = key2.replace(" 2:N:0:1","/2");

                SequencedFragment sf2 = new SequencedFragment();
                sf2.setQuality(new Text(fqreader2.getCurrentValue().getQuality().toString()));
                sf2.setSequence(new Text(fqreader2.getCurrentValue().getSequence().toString()));

                reads.add(new Tuple2<Text, SequencedFragment>(new Text(key), sf));
                reads.add(new Tuple2<Text, SequencedFragment>(new Text(key2), sf2));
            }
        }
        return reads.iterator();
    });
}
@Override
public JavaRDD<Quad> parseQuads(String path) {
    Configuration conf = new Configuration();
    Integer batchSize = config.getBatchSize();
    conf.set(NLineInputFormat.LINES_PER_MAP, batchSize.toString());

    if (config.getErrorHandling() == ParseErrorHandling.Throw) {
        conf.set(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, "false");
    } else {
        conf.set(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, "true");
    }

    Boolean isLineBased = config.getLineBasedFormat();
    if (isLineBased == null) {
        isLineBased = guessIsLineBasedFormat(path);
    }

    JavaRDD<Quad> quads;
    Integer partitions = config.getRepartition();

    if (isLineBased) {
        log.info("Parsing RDF in parallel with batch size: {}", batchSize);
        quads = sc.newAPIHadoopFile(path,
                NQuadsInputFormat.class,
                LongWritable.class, // position
                QuadWritable.class, // value
                conf).values().map(QuadWritable::get);
    } else {
        // let Jena guess the format, load whole files
        log.info("Input format is not line based, parsing RDF by Master node only.");
        quads = sc.newAPIHadoopFile(path,
                TriplesOrQuadsInputFormat.class,
                LongWritable.class, // position
                QuadWritable.class, // value
                conf).values().map(QuadWritable::get);
        if (partitions == null) {
            log.warn("Reading non-line based formats by master node only, consider setting --parsing.repartition to redistribute work to other nodes.");
        }
    }

    if (partitions != null) {
        log.info("Distributing workload, repartitioning into {} partitions", partitions);
        quads = quads.repartition(partitions);
    }

    final List<String> acceptedLanguages = config.getAcceptedLanguages();
    // if only some languages are accepted
    if (!acceptedLanguages.isEmpty()) {
        // filter out literals of unsupported languages
        quads = quads.filter(quad ->
                !quad.getObject().isLiteral() ||
                quad.getObject().getLiteralLanguage() == null ||
                quad.getObject().getLiteralLanguage().isEmpty() ||
                acceptedLanguages.contains(quad.getObject().getLiteralLanguage())
        );
    }

    return quads;
}
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException
{
    boolean useNLineFormat = context.getConfiguration().getBoolean(USE_NLINE_FORMAT, false);
    if (useNLineFormat)
    {
        List<InputSplit> splits = new NLineInputFormat().getSplits(context);
        // This is a workaround to what appears to be a bug in how NLineInputFormat
        // computes its splits. When there are multiple splits in a file, it seems
        // the start position in the last split is off by one. Note that this corrective
        // code needs to check the last split for each different file that appears
        // in the list of splits.
        for (int index = 2; index < splits.size(); index++)
        {
            FileSplit previousSplit = (FileSplit) splits.get(index - 1);
            FileSplit currSplit = (FileSplit) splits.get(index);
            // If this index is the last split, or we've moved on to splits from a different
            // file, then we need to adjust the last split for that file.
            int lastFileIndex = -1;
            if (index == splits.size() - 1)
            {
                lastFileIndex = index;
            }
            else if (!currSplit.getPath().equals(previousSplit.getPath()))
            {
                lastFileIndex = index - 1;
            }
            if (lastFileIndex >= 2)
            {
                FileSplit lastFileSplit = (FileSplit) splits.get(lastFileIndex);
                FileSplit priorSplit = (FileSplit) splits.get(lastFileIndex - 1);
                if (lastFileSplit.getPath().equals(priorSplit.getPath()))
                {
                    if (priorSplit.getPath().equals(lastFileSplit.getPath()) &&
                        priorSplit.getStart() + priorSplit.getLength() < lastFileSplit.getStart())
                    {
                        // Adjust the start of the last split so it begins right after the prior split
                        FileSplit replacement = new FileSplit(lastFileSplit.getPath(),
                                priorSplit.getStart() + priorSplit.getLength(),
                                lastFileSplit.getLength() + 1,
                                lastFileSplit.getLocations());
                        log.info("Replacing split: " + lastFileSplit);
                        log.info(" With split: " + replacement);
                        splits.set(lastFileIndex, replacement);
                    }
                }
            }
        }
        return splits;
    }
    else
    {
        return new TextInputFormat().getSplits(context);
    }
}
private Job prepareJob() throws IOException {
    // Basic configuration
    configuration.setInt("mapreduce.input.lineinputformat.linespermap", 1);
    configuration.set("reportingHost", this.reportingHost);
    configuration.setBoolean("mapreduce.map.output.compress", true);
    configuration.setBoolean("mapred.compress.map.output", true);
    configuration.setBoolean("mapred.output.compress", true);
    configuration.setClass("mapred.map.output.compression.codec", GzipCodec.class, CompressionCodec.class);
    configuration.setClass("mapred.output.compression.codec", GzipCodec.class, CompressionCodec.class);
    /* configuration.setBoolean("mapreduce.output.fileoutputformat.compress", true);
       configuration.setClass("mapreduce.output.fileoutputformat.compress.codec", GzipCodec.class, CompressionCodec.class);
       configuration.setCompressMapOutput(true);
    */
    // configuration.set("mapreduce.output.fileoutputformat.compress", "true");
    // configuration.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.GzipCodec");
    // configuration.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");

    // Job ret = new Job(configuration);
    Job ret = org.apache.hadoop.mapreduce.Job.getInstance(configuration);
    ret.setJarByClass(HDFSDistributor.class);
    ret.setJobName("PATH Test Data Generation");

    // Mapper
    ret.setMapperClass(DataGeneratorMapper.class);

    // Reducer (none)
    ret.setNumReduceTasks(0);

    // Input
    ret.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.addInputPath(ret, mapperInputFilePath);

    // Output
    // [BTR] Saw this used in an example w/NLineInputFormat; LazyOutputFormat defers creating
    // output files until the first record is written, so mappers that emit nothing produce
    // no empty part files. Left disabled here.
    // LazyOutputFormat.setOutputFormatClass(ret, TextOutputFormat.class);
    FileOutputFormat.setOutputPath(ret, mapperOutputFilePath);
    //ret.getConfiguration().setBoolean("mapred.output.compress", false);

    return ret;
}
private static void splitFastq(FileStatus fst, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    //TODO: Handle also compressed files
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, new Configuration(), splitlen);
    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    splitRDD.foreach( split -> {
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
        writeFastqFile(fqreader, new Configuration(), splitDir + "/" + split.getPath().getName()+"_"+split.getStart() + ".fq");
    });
}
@Override
public void setupJob(Job job) {
    NLineInputFormat.setNumLinesPerSplit(job, 1);
    job.setMapperClass(ImputationMapper.class);
    job.setInputFormatClass(NLineInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setNumReduceTasks(0);
}