Before a map task finishes, it merges its spill files. Each spill produces one spill file, and before the output is handed to the reduce side, the map merges these files into a single one. The merge is not done in one shot: the number of files merged at a time is capped by the parameter io.sort.factor, and when the actual number of spill files exceeds that value, intermediate temporary files are generated. In short, no merge round ever combines more than io.sort.factor files. The merge is implemented by the mergeParts function, which is invoked during the flush phase; by the time that phase runs, the job client already shows the map at 100%, so a map task reporting 100% progress has not necessarily finished. In outline: spill0.out, spill1.out, ..., spillN.out are merged, at most io.sort.factor at a time, into a single file.out plus its index file.out.index. The implementation:
private void mergeParts() throws IOException, InterruptedException,
                                 ClassNotFoundException {
  // get the approximate size of the final output/index files
  long finalOutFileSize = 0;
  long finalIndexFileSize = 0;
  final Path[] filename = new Path[numSpills];
  final TaskAttemptID mapId = getTaskID();
  // Collect all spill files
  for (int i = 0; i < numSpills; i++) {
    filename[i] = mapOutputFile.getSpillFile(i);
    finalOutFileSize += rfs.getFileStatus(filename[i]).getLen();
  }
  // If there is only one spill file, no merge is needed; just rename it
  if (numSpills == 1) { //the spill is the final output
    rfs.rename(filename[0],
               new Path(filename[0].getParent(), "file.out"));
    if (indexCacheList.size() == 0) {
      rfs.rename(mapOutputFile.getSpillIndexFile(0),
                 new Path(filename[0].getParent(), "file.out.index"));
    } else {
      indexCacheList.get(0).writeToFile(
          new Path(filename[0].getParent(), "file.out.index"), job);
    }
    return;
  }
  // Read the remaining index files into the cache
  for (int i = indexCacheList.size(); i < numSpills; ++i) {
    Path indexFileName = mapOutputFile.getSpillIndexFile(i);
    indexCacheList.add(new SpillRecord(indexFileName, job, null));
  }
  // Estimate the sizes of the final output and index files, then open an
  // output stream for writing
  finalOutFileSize += partitions * APPROX_HEADER_LENGTH;
  finalIndexFileSize = partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH;
  // The data file: file.out
  Path finalOutputFile =
      mapOutputFile.getOutputFileForWrite(finalOutFileSize);
  // The index file: file.out.index
  Path finalIndexFile =
      mapOutputFile.getOutputIndexFileForWrite(finalIndexFileSize);
  //The output stream for the final single output file
  FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096);
  // If the map produced no output at all, create empty files
  if (numSpills == 0) {
    //create dummy files
    IndexRecord rec = new IndexRecord();
    SpillRecord sr = new SpillRecord(partitions);
    try {
      for (int i = 0; i < partitions; i++) {
        long segmentStart = finalOut.getPos();
        Writer<K, V> writer =
            new Writer<K, V>(job, finalOut, keyClass, valClass, codec, null);
        writer.close();
        rec.startOffset = segmentStart;
        rec.rawLength = writer.getRawLength();
        rec.partLength = writer.getCompressedLength();
        sr.putIndex(rec, i);
      }
      sr.writeToFile(finalIndexFile, job);
    } finally {
      finalOut.close();
    }
    return;
  }
  {
    IndexRecord rec = new IndexRecord();
    final SpillRecord spillRec = new SpillRecord(partitions);
    // The final output is written to file.out in partition order
    for (int parts = 0; parts < partitions; parts++) {
      //create the segments to be merged
      List<Segment<K,V>> segmentList =
          new ArrayList<Segment<K, V>>(numSpills);
      // Walk the index files, extracting the same partition from every
      // spill file
      for (int i = 0; i < numSpills; i++) {
        IndexRecord indexRecord = indexCacheList.get(i).getIndex(parts);
        // Build the metadata of the segment to operate on
        Segment<K,V> s =
            new Segment<K,V>(job, rfs, filename[i], indexRecord.startOffset,
                             indexRecord.partLength, codec, true);
        // Segments of the same partition go into one list so they can be
        // handled together
        segmentList.add(i, s);
        if (LOG.isDebugEnabled()) {
          LOG.debug("MapId=" + mapId + " Reducer=" + parts +
                    "Spill =" + i + "(" + indexRecord.startOffset + "," +
                    indexRecord.rawLength + ", " + indexRecord.partLength + ")");
        }
      }
      // Start merging. There may be many spill files; this call first merges
      // them down until fewer than io.sort.factor remain, so that the final
      // merge finishes in one pass. The merge function is analyzed below.
      @SuppressWarnings("unchecked")
      RawKeyValueIterator kvIter = Merger.merge(job, rfs,
                     keyClass, valClass, codec,
                     segmentList, job.getInt("io.sort.factor", 100),
                     new Path(mapId.toString()),
                     job.getOutputKeyComparator(), reporter,
                     null, spilledRecordsCounter);
      long segmentStart = finalOut.getPos();
      Writer<K, V> writer =
          new Writer<K, V>(job, finalOut, keyClass, valClass, codec,
                           spilledRecordsCounter);
      // If a combiner is configured (and there are enough spills), run the
      // local combine while writing; otherwise write the merged data directly
      if (combinerRunner == null || numSpills < minSpillsForCombine) {
        Merger.writeFile(kvIter, writer, reporter, job);
      } else {
        combineCollector.setWriter(writer);
        combinerRunner.combine(kvIter, combineCollector);
      }
      //close
      writer.close();
      // Record the index information
      rec.startOffset = segmentStart;
      rec.rawLength = writer.getRawLength();
      rec.partLength = writer.getCompressedLength();
      spillRec.putIndex(rec, parts);
    }
    // Write out the index file
    spillRec.writeToFile(finalIndexFile, job);
    finalOut.close();
    // Delete the spill files
    for (int i = 0; i < numSpills; i++) {
      rfs.delete(filename[i], true);
    }
  }
}
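When mergeParts returns, each map task is left with exactly two files: file.out, holding all partitions back to back, and file.out.index, holding one (startOffset, rawLength, partLength) triple of three longs per partition, which is where the 24-byte MAP_OUTPUT_INDEX_RECORD_LENGTH comes from. As a purely hypothetical illustration (not actual Hadoop reader code, and ignoring the checksum that SpillRecord appends to the index file), a reader could locate one partition's segment like this:

import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Illustrative only: seek to the data of partition `part` in a finished
// map output directory, using the fixed-width index records.
static void seekToPartition(FileSystem fs, Path outDir, int part)
    throws IOException {
  final int IDX_REC_LEN = 24;            // 3 longs per partition
  FSDataInputStream idx = fs.open(new Path(outDir, "file.out.index"));
  idx.seek((long) part * IDX_REC_LEN);
  long startOffset = idx.readLong();     // where the segment starts in file.out
  long rawLength = idx.readLong();       // uncompressed length of the segment
  long partLength = idx.readLong();      // bytes on disk (possibly compressed)
  idx.close();
  FSDataInputStream data = fs.open(new Path(outDir, "file.out"));
  data.seek(startOffset);                // partition `part` begins here
  // ... read partLength bytes of IFile-formatted data for this partition ...
  data.close();
}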
The merge code above shows that when there are many spill files, a first round of merging reduces their number until it drops below io.sort.factor, so this value caps how many files are merged in one pass. Raising it reduces the number of merge rounds, which helps I/O to some extent, though not as directly as tuning io.sort.mb: the sort buffer size directly determines how many spill files are produced, and a larger buffer means fewer spills. Be careful with the extremes, however. An oversized buffer can trigger Linux's self-protection mechanism, the OOM killer, and a JVM's memory is finite: the more of it the buffer takes, the less remains for the temporary allocations a running task makes, which can end in a heap OutOfMemoryError. Tuning on a production cluster is a balancing act.
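For concreteness, here is a minimal sketch of setting both knobs on a Hadoop 1.x JobConf. The property names are the ones the code above reads; the values and the MyJob class are purely illustrative, not recommendations:

import org.apache.hadoop.mapred.JobConf;

JobConf conf = new JobConf(MyJob.class);  // MyJob: hypothetical job class
conf.setInt("io.sort.mb", 200);           // sort buffer in MB: fewer, larger spills
conf.setInt("io.sort.factor", 100);       // max number of streams merged per pass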
The function below builds the merge streams: each partition's segments are fed into a priority queue that drives the merge (a stripped-down sketch of that idea follows after the code).
RawKeyValueIterator merge(Class<K> keyClass, Class<V> valueClass,
                          int factor, int inMem, Path tmpDir,
                          Counters.Counter readsCounter,
                          Counters.Counter writesCounter)
    throws IOException {
  LOG.info("Merging " + segments.size() + " sorted segments");
  // The number of segments to process in this call
  int numSegments = segments.size();
  // Remember the original merge factor
  int origFactor = factor;
  int passNo = 1;
  do {
    // Compute the merge factor for this pass
    factor = getPassFactor(factor, passNo, numSegments - inMem);
    if (1 == passNo) {
      factor += inMem;
    }
    // The segments of one merge pass are first collected into a list and
    // then added to a priority queue for scheduling
    List<Segment<K, V>> segmentsToMerge =
        new ArrayList<Segment<K, V>>();
    int segmentsConsidered = 0;
    int numSegmentsToConsider = factor;
    long startBytes = 0; // starting bytes of segments of this merge
    while (true) {
      // Get the list of segments to merge in this round
      List<Segment<K, V>> mStream =
          getSegmentDescriptors(numSegmentsToConsider);
      for (Segment<K, V> segment : mStream) {
        // Initialize the segment: open the file and create a Reader whose
        // buffer size is controlled by the configurable io.file.buffer.size
        segment.init(readsCounter);
        // Start position of this segment
        long startPos = segment.getPosition();
        // Whether this segment still has records
        boolean hasNext = segment.next();
        // End position of this segment
        long endPos = segment.getPosition();
        startBytes += endPos - startPos;
        // If the segment has data to merge, add it to the merge set
        if (hasNext) {
          segmentsToMerge.add(segment);
          segmentsConsidered++;
        }
        else {
          segment.close();
          numSegments--; //we ignore this segment for the merge
        }
      }
      // Exit the loop once this pass has collected enough segments or there
      // is nothing left to merge
      if (segmentsConsidered == factor ||
          segments.size() == 0) {
        break;
      }
      numSegmentsToConsider = factor - segmentsConsidered;
    }
    // Initialize the priority queue and add the segments selected above
    initialize(segmentsToMerge.size());
    clear();
    for (Segment<K, V> segment : segmentsToMerge) {
      put(segment);
    }
    // If no more than factor segments remain, do the final merge and return.
    // Why is this not checked at the very beginning? Because empty segments
    // may have been dropped above. This branch produces no intermediate
    // merge file; our analysis focuses on the case that does produce one.
    if (numSegments <= factor) {
      // Reset totalBytesProcessed to track the progress of the final merge.
      // This is considered the progress of the reducePhase, the 3rd phase
      // of reduce task. Currently totalBytesProcessed is not used in sort
      // phase of reduce task (i.e. when intermediate merges happen).
      totalBytesProcessed = startBytes;
      //calculate the length of the remaining segments. Required for
      //calculating the merge progress
      long totalBytes = 0;
      for (int i = 0; i < segmentsToMerge.size(); i++) {
        totalBytes += segmentsToMerge.get(i).getLength();
      }
      if (totalBytes != 0) //being paranoid
        progPerByte = 1.0f / (float)totalBytes;
      if (totalBytes != 0)
        mergeProgress.set(totalBytesProcessed * progPerByte);
      else
        mergeProgress.set(1.0f); // Last pass and no segments left - we're done
      LOG.info("Down to the last merge-pass, with " + numSegments +
               " segments left of total size: " + totalBytes + " bytes");
      return this;
    } else {
      LOG.info("Merging " + segmentsToMerge.size() +
               " intermediate segments out of a total of " +
               (segments.size() + segmentsToMerge.size()));
      // This branch produces the intermediate merge files intermediate.1...
      long approxOutputSize = 0;
      for (Segment<K, V> s : segmentsToMerge) {
        approxOutputSize += s.getLength() +
                            ChecksumFileSystem.getApproxChkSumLength(
                                s.getLength());
      }
      // Pick the temporary file name and create it on local disk
      Path tmpFilename =
          new Path(tmpDir, "intermediate").suffix("." + passNo);
      Path outputFile = lDirAlloc.getLocalPathForWrite(
          tmpFilename.toString(),
          approxOutputSize, conf);
      Writer<K, V> writer =
          new Writer<K, V>(conf, fs, outputFile, keyClass, valueClass, codec,
                           writesCounter);
      // Write the merged data into the file
      writeFile(this, writer, reporter, conf);
      writer.close();
      //we finished one single level merge; now clean up the priority
      //queue
      this.close();
      // Add the file just written back into the segment list as a new segment
      Segment<K, V> tempSegment =
          new Segment<K, V>(conf, fs, outputFile, codec, false);
      segments.add(tempSegment);
      numSegments = segments.size();
      Collections.sort(segments, segmentComparator);
      passNo++; // bump the pass number
    }
    //we are worried about only the first pass merge factor. So reset the
    //factor to what it originally was
    factor = origFactor;
  } while (true);
}
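The priority queue mentioned above is what turns many sorted segments into one sorted stream: the heap is ordered by each segment's current key, the smallest head is popped and written out, and the segment it came from is re-inserted with its next record. Hadoop's MergeQueue interleaves this with IFile readers and progress reporting; the following self-contained sketch shows just the k-way idea using java.util.PriorityQueue over plain integers (all names here are illustrative, not Hadoop APIs):

import java.util.*;

class KWayMergeSketch {
  // Merge k sorted integer streams into one sorted list.
  static List<Integer> merge(List<Iterator<Integer>> inputs) {
    // Heap entries are {currentValue, inputIndex}, ordered by currentValue.
    PriorityQueue<int[]> heap =
        new PriorityQueue<>(Comparator.comparingInt((int[] e) -> e[0]));
    for (int i = 0; i < inputs.size(); i++) {
      if (inputs.get(i).hasNext()) {
        heap.add(new int[] { inputs.get(i).next(), i });
      }
    }
    List<Integer> out = new ArrayList<>();
    while (!heap.isEmpty()) {
      int[] top = heap.poll();              // smallest current head wins
      out.add(top[0]);
      Iterator<Integer> src = inputs.get(top[1]);
      if (src.hasNext()) {                  // refill from the same stream
        heap.add(new int[] { src.next(), top[1] });
      }
    }
    return out;
  }

  public static void main(String[] args) {
    List<Iterator<Integer>> in = Arrays.asList(
        Arrays.asList(1, 4, 7).iterator(),
        Arrays.asList(2, 5, 8).iterator(),
        Arrays.asList(3, 6, 9).iterator());
    System.out.println(merge(in));          // [1, 2, 3, 4, 5, 6, 7, 8, 9]
  }
}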
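Also worth isolating is getPassFactor, called at the top of each pass. From the second pass on it simply returns factor, but the first pass is deliberately shrunk so that every later pass can merge exactly factor segments, minimizing the number of passes and therefore the intermediate I/O. The sketch below follows the shape of the Hadoop implementation and works through one example:

class PassFactorSketch {
  // After pass 1, always merge `factor` segments at a time. On pass 1,
  // merge just enough so the remaining passes are all full-width merges.
  static int getPassFactor(int factor, int passNo, int numSegments) {
    if (passNo > 1 || numSegments <= factor || factor == 1) {
      return factor;
    }
    int mod = (numSegments - 1) % (factor - 1);
    if (mod == 0) {
      return factor;
    }
    return mod + 1;
  }

  public static void main(String[] args) {
    // 13 segments with io.sort.factor = 10: the first pass merges only 4,
    // leaving 13 - 4 + 1 = 10 segments, which the final pass merges at once.
    System.out.println(getPassFactor(10, 1, 13)); // 4
  }
}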