1. Initiating the HBase split
1.1 HBaseAdmin.split
/**
* Split a table or an individual region.
* Asynchronous operation.
*
* @param tableNameOrRegionName table or region to split
* @param splitPoint the explicit position to split on
* @throws IOException if a remote or network exception occurs
* @throws InterruptedException interrupt exception occurred
*/
public void split(final byte [] tableNameOrRegionName,
final byte [] splitPoint) throws IOException, InterruptedException {
CatalogTracker ct = getCatalogTracker();
try {
Pair<HRegionInfo, ServerName> regionServerPair
= getRegion(tableNameOrRegionName, ct); // if tableNameOrRegionName is a region name this yields a Pair<HRegionInfo, ServerName>; otherwise it is null
if (regionServerPair != null) {
if (regionServerPair.getSecond() == null) {
throw new NoServerForRegionException(Bytes.toStringBinary(tableNameOrRegionName));
} else {
// split the region; this is the overload analyzed in 1.2 below
split(regionServerPair.getSecond(), regionServerPair.getFirst(), splitPoint);
}
} else {
// we take this branch when tableNameOrRegionName is a table name
final String tableName = tableNameString(tableNameOrRegionName, ct);
List<Pair<HRegionInfo, ServerName>> pairs =
MetaReader.getTableRegionsAndLocations(ct,
tableName); // get the HRegionInfo and hosting ServerName of every region of the table
// if splitPoint is null, every region is split; otherwise only the region containing splitPoint is split
for (Pair<HRegionInfo, ServerName> pair: pairs) {
// May not be a server for a particular row
if (pair.getSecond() == null) continue;
HRegionInfo r = pair.getFirst();
// check for parents
if (r.isSplitParent()) continue;
// if a split point given, only split that particular region
if (splitPoint != null && !r.containsRow(splitPoint)) continue;
// call out to region server to do split now
split(pair.getSecond(), pair.getFirst(), splitPoint);
}
}
} finally {
cleanupCatalogTracker(ct);
}
}
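Seen from the client side, the two branches above correspond to the two usual ways of invoking a split: by table name (every region gets a split request) or with an explicit split point (only the region containing that row is split). Below is a minimal usage sketch against the 0.92/0.94-era client API; the table name "test_table" and the row key are illustrative assumptions, not taken from the source above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.util.Bytes;

public class SplitExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    HBaseAdmin admin = new HBaseAdmin(conf);
    try {
      // splitPoint == null: every region of the table gets a split request
      admin.split(Bytes.toBytes("test_table"), null);
      // explicit split point: only the region containing "row-5000" is split
      admin.split(Bytes.toBytes("test_table"), Bytes.toBytes("row-5000"));
    } finally {
      admin.close();
    }
  }
}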
1.2 HBaseAdmin.split
// this is the private split overload invoked by the method above
private void split(final ServerName sn, final HRegionInfo hri,
byte[] splitPoint) throws IOException {
HRegionInterface rs =
this.connection.getHRegionConnection(sn.getHostname(), sn.getPort()); // get an HRegionInterface RPC proxy to the hosting region server
rs.splitRegion(hri, splitPoint); // calls HRegionServer.splitRegion to split the region
}
1.3 HRegionServer.splitRegion: split the region on the region server
public void splitRegion(HRegionInfo regionInfo, byte[] splitPoint)
throws NotServingRegionException, IOException {
checkOpen(); // check that the server and HDFS are usable
HRegion region = getRegion(regionInfo.getRegionName()); // look up the region by its region name
region.flushcache(); // flush the memstore; the flush is skipped if the cache is empty, the region is closed, a flush is already in progress, or writes are disabled
region.forceSplit(splitPoint); // record the split request and the explicit split point
compactSplitThread.requestSplit(region, region.checkSplit()); // region.checkSplit() computes the split point; the split is then queued
}
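Note that splitRegion itself does not pick the split point: forceSplit only records the request, and checkSplit (section 2) re-derives the point via the split policy. Here is a stripped-down sketch of that handshake, with field and method names mirroring the ones referenced in section 2; this is an illustration, not the actual HRegion source.

public class ForceSplitSketch {
  private volatile boolean splitRequest = false;
  private volatile byte[] explicitSplitPoint = null;

  // called from HRegionServer.splitRegion(); only records the request
  void forceSplit(byte[] splitPoint) {
    splitRequest = true;
    if (splitPoint != null) {
      explicitSplitPoint = splitPoint; // null means "let the policy pick the midkey"
    }
  }

  // consulted later by checkSplit() and the split policy
  boolean shouldForceSplit() { return splitRequest; }
  byte[] getExplicitSplitPoint() { return explicitSplitPoint; }
}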
2. Determining the split point
2.1 HRegion.checkSplit
public byte[] checkSplit() {
// Can't split ROOT/META
if (this.regionInfo.isMetaTable()) {
if (shouldForceSplit()) {
LOG.warn("Cannot split root/meta regions in HBase 0.20 and above");
}
return null;
}
if (!splitPolicy.shouldSplit()) {
return null;
}
byte[] ret = splitPolicy.getSplitPoint();
if (ret != null) {
try {
checkRow(ret, "calculated split");
} catch (IOException e) {
LOG.error("Ignoring invalid split", e);
return null;
}
}
return ret;
}
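Whether shouldSplit() returns true is decided by the RegionSplitPolicy configured for the table. As a rough paraphrase (not the verbatim source) of what the simplest built-in policy, ConstantSizeRegionSplitPolicy, does inside its shouldSplit(): split if the split was forced or if any store has grown past the configured maximum file size, but never while a store still holds reference files from an earlier split.

// Paraphrased sketch of ConstantSizeRegionSplitPolicy.shouldSplit();
// desiredMaxFileSize is assumed to come from hbase.hregion.max.filesize.
protected boolean shouldSplit() {
  boolean force = region.shouldForceSplit();
  boolean foundABigStore = false;
  for (Store store : region.getStores().values()) {
    // a store that still contains reference files cannot be split again
    // until a compaction has rewritten them into real files
    if (!store.canSplit()) {
      return false;
    }
    if (store.getSize() > desiredMaxFileSize) {
      foundABigStore = true;
    }
  }
  return force || foundABigStore;
}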
2.2 RegionSplitPolicy.getSplitPoint: how the split point is chosen
// if an explicit split point was set on the region, return it; otherwise use the midkey of the largest store as the split point
protected byte[] getSplitPoint() {
byte[] explicitSplitPoint = this.region.getExplicitSplitPoint();
if (explicitSplitPoint != null) {
return explicitSplitPoint;
}
Map<byte[], Store> stores = region.getStores();
byte[] splitPointFromLargestStore = null;
long largestStoreSize = 0;
for (Store s : stores.values()) {
byte[] splitPoint = s.getSplitPoint();
long storeSize = s.getSize();
if (splitPoint != null && largestStoreSize < storeSize) {
splitPointFromLargestStore = splitPoint;
largestStoreSize = storeSize;
}
}
return splitPointFromLargestStore;
}
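Since getSplitPoint() is just an overridable method, a table can plug in its own policy. Below is a hedged sketch of a custom policy in the same spirit as HBase's KeyPrefixRegionSplitPolicy: it reuses the inherited size check but truncates the chosen split point to a fixed-length prefix, so rows sharing a prefix never end up in different regions. The class name and the prefix length are illustrative assumptions.

import java.util.Arrays;
import org.apache.hadoop.hbase.regionserver.ConstantSizeRegionSplitPolicy;

public class PrefixSplitPolicy extends ConstantSizeRegionSplitPolicy {
  private static final int PREFIX_LENGTH = 8; // assumed fixed-width row key prefix

  @Override
  protected byte[] getSplitPoint() {
    // explicit split point or the midkey of the largest store, as shown above
    byte[] splitPoint = super.getSplitPoint();
    if (splitPoint == null || splitPoint.length <= PREFIX_LENGTH) {
      return splitPoint;
    }
    // keep only the prefix so the region boundary falls between prefixes
    return Arrays.copyOf(splitPoint, PREFIX_LENGTH);
  }
}

A policy like this is normally wired in per table through the table descriptor (the HTableDescriptor SPLIT_POLICY attribute).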
3. Executing the split
3.1 CompactSplitThread.requestSplit
public synchronized void requestSplit(final HRegion r, byte[] midKey) {
if (midKey == null) {
LOG.debug("Region " + r.getRegionNameAsString() +
" not splittable because midkey=null");
return;
}
try {
this.splits.execute(new SplitRequest(r, midKey, this.server));
if (LOG.isDebugEnabled()) {
LOG.debug("Split requested for " + r + ". " + this);
}
} catch (RejectedExecutionException ree) {
LOG.info("Could not execute split for " + r, ree);
}
}
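The splits field here is a bounded executor dedicated to split requests, which is why submission can fail with a RejectedExecutionException and is merely logged. A self-contained sketch of that pattern (the pool size, task bodies and messages are illustrative, not HBase code):

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.RejectedExecutionException;

public class SplitPoolSketch {
  public static void main(String[] args) {
    ExecutorService splits = Executors.newFixedThreadPool(1);
    splits.execute(new Runnable() {
      public void run() { /* the SplitRequest would run here */ }
    });
    splits.shutdown();
    try {
      // once the pool is shut down (or a bounded queue is full), execute() rejects the task
      splits.execute(new Runnable() {
        public void run() { }
      });
    } catch (RejectedExecutionException ree) {
      // mirrors the catch block in requestSplit(): log and carry on
      System.out.println("Could not execute split: " + ree);
    }
  }
}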
3.2 SplitRequest.run
public void run() {
if (this.server.isStopping() || this.server.isStopped()) {
LOG.debug("Skipping split because server is stopping=" +
this.server.isStopping() + " or stopped=" + this.server.isStopped());
return;
}
try {
final long startTime = System.currentTimeMillis();
SplitTransaction st = new SplitTransaction(parent, midKey);
// If prepare does not return true, for some reason -- logged inside in
// the prepare call -- we are not ready to split just now. Just return.
// SplitTransaction.prepare() initializes the two daughter regions inside the SplitTransaction object and runs sanity checks, e.g. the split row must be contained in the parent region
if (!st.prepare()) return;
try {
st.execute(this.server, this.server);
this.server.getMetrics().incrementSplitSuccessCount();
} catch (Exception e) {
if (this.server.isStopping() || this.server.isStopped()) {
LOG.info(
"Skip rollback/cleanup of failed split of "
+ parent.getRegionNameAsString() + " because server is"
+ (this.server.isStopping() ? " stopping" : " stopped"), e);
return;
}
try {
LOG.info("Running rollback/cleanup of failed split of " +
parent.getRegionNameAsString() + "; " + e.getMessage(), e);
if (st.rollback(this.server, this.server)) {
LOG.info("Successful rollback of failed split of " +
parent.getRegionNameAsString());
this.server.getMetrics().incrementSplitFailureCount();
} else {
this.server.abort("Abort; we got an error after point-of-no-return");
}
} catch (RuntimeException ee) {
// ... (remainder of the error handling elided)
}
3.3 SplitTransaction.execute
/**
* Run the transaction.
* @param server Hosting server instance. Can be null when testing (won't try
* and update in zk if a null server)
* @param services Used to online/offline regions.
* @throws IOException If thrown, transaction failed. Call {@link #rollback(Server, RegionServerServices)}
* @return Regions created
* @throws IOException
* @see #rollback(Server, RegionServerServices)
*/
public PairOfSameType<HRegion> execute(final Server server,
final RegionServerServices services)
throws IOException {
PairOfSameType<HRegion> regions = createDaughters(server, services);
// createDaughters (above): creates the split temp dir, moves the region's znode to SPLITTING,
// closes the parent region and stops all its stores, creates the daughter dirs and places
// reference files for the parent's store files into them, creates daughter regions A and B,
// and marks the parent HRI offline in .META.
openDaughters(server, services, regions.getFirst(), regions.getSecond());
transitionZKNode(server, services, regions.getFirst(), regions.getSecond());
return regions;
}
3.3.1 SplitTransaction.createDaughters: create the two daughter regions (the parent's write lock is taken when the parent is closed)
/**
* Prepare the regions and region files.
* @param server Hosting server instance. Can be null when testing (won't try
* and update in zk if a null server)
* @param services Used to online/offline regions.
* @throws IOException If thrown, transaction failed. Call {@link #rollback(Server, RegionServerServices)}
* @return Regions created
*/
/* package */PairOfSameType<HRegion> createDaughters(final Server server,
final RegionServerServices services) throws IOException {
LOG.info("Starting split of region " + this.parent);
if ((server != null && server.isStopped()) ||
(services != null && services.isStopping())) {
throw new IOException("Server is stopped or stopping");
}
assert !this.parent.lock.writeLock().isHeldByCurrentThread(): "Unsafe to hold write lock while performing RPCs";
// Coprocessor callback
// this is where the BaseRegionObserver.preSplit coprocessor hook is triggered
if (this.parent.getCoprocessorHost() != null) {
this.parent.getCoprocessorHost().preSplit();
}
// If true, no cluster to write meta edits to or to update znodes in.
boolean testing = server == null? true:
server.getConfiguration().getBoolean("hbase.testing.nocluster", false);
this.fileSplitTimeout = testing ? this.fileSplitTimeout :
server.getConfiguration().getLong("hbase.regionserver.fileSplitTimeout",
this.fileSplitTimeout);
// Set ephemeral SPLITTING znode up in zk. Mocked servers sometimes don't
// have zookeeper so don't do zk stuff if server or zookeeper is null
if (server != null && server.getZooKeeper() != null) {
try {
// 1. create an ephemeral SPLITTING znode for this region in ZooKeeper
createNodeSplitting(server.getZooKeeper(),
this.parent.getRegionInfo(), server.getServerName());
} catch (KeeperException e) {
throw new IOException("Failed creating SPLITTING znode on " +
this.parent.getRegionNameAsString(), e);
}
}
// progress is recorded in: private final List<JournalEntry> journal = new ArrayList<JournalEntry>();
this.journal.add(JournalEntry.SET_SPLITTING_IN_ZK);
if (server != null && server.getZooKeeper() != null) {
try {
// Transition node from SPLITTING to SPLITTING after creating the split node.
// Master will get the callback for node change only if the transition is successful.
// Note that if the transition fails then the rollback will delete the created znode
// TODO : May be we can add some new state to znode and handle the new state incase of success/failure
// 2. transition this region's znode to the SPLITTING state so the master learns the split is in progress
this.znodeVersion = transitionNodeSplitting(server.getZooKeeper(),
this.parent.getRegionInfo(), server.getServerName(), -1);
} catch (KeeperException e) {
throw new IOException("Failed setting SPLITTING znode on "
+ this.parent.getRegionNameAsString(), e);
}
}
// 3. create the split directory on the filesystem
createSplitDir(this.parent.getFilesystem(), this.splitdir);
this.journal.add(JournalEntry.CREATE_SPLIT_DIR);
List<StoreFile> hstoreFilesToSplit = null;
Exception exceptionToThrow = null;
try{
// 4. wait for in-flight flushes and compactions to finish, then close the region
hstoreFilesToSplit = this.parent.close(false);
} catch (Exception e) {
exceptionToThrow = e;
}
if (exceptionToThrow == null && hstoreFilesToSplit == null) {
// The region was closed by a concurrent thread. We can't continue
// with the split, instead we must just abandon the split. If we
// reopen or split this could cause problems because the region has
// probably already been moved to a different server, or is in the
// process of moving to a different server.
exceptionToThrow = closedByOtherException;
}
if (exceptionToThrow != closedByOtherException) {
this.journal.add(JournalEntry.CLOSED_PARENT_REGION);
}
if (exceptionToThrow != null) {
if (exceptionToThrow instanceof IOException) throw (IOException)exceptionToThrow;
throw new IOException(exceptionToThrow);
}
if (!testing) {
// 5. remove the parent from the region server's online-region list (the parent is now offline)
services.removeFromOnlineRegions(this.parent.getRegionInfo().getEncodedName());
}
this.journal.add(JournalEntry.OFFLINED_PARENT);
// TODO: If splitStoreFiles were multithreaded would we complete steps in
// less elapsed time? St.Ack 20100920
//
// splitStoreFiles creates daughter region dirs under the parent splits dir
// Nothing to unroll here if failure -- clean up of CREATE_SPLIT_DIR will
// clean this up.
// 6. do the actual split of the store files: a thread pool of StoreFileSplitter tasks splits every HFile (StoreFile) of the region,
// writing, for each half, a reference file under the corresponding daughter region directory (no HFile data is copied; the references point back at the parent's files)
splitStoreFiles(this.splitdir, hstoreFilesToSplit);
// Log to the journal that we are creating region A, the first daughter
// region. We could fail halfway through. If we do, we could have left
// stuff in fs that needs cleanup -- a storefile or two. Thats why we
// add entry to journal BEFORE rather than AFTER the change.
// 7. create the left and right daughter regions; build each daughter's regioninfo from the reference files and write it to HDFS (the parent is offlined in .META. below)
this.journal.add(JournalEntry.STARTED_REGION_A_CREATION);
HRegion a = createDaughterRegion(this.hri_a, this.parent.rsServices);
// Ditto
this.journal.add(JournalEntry.STARTED_REGION_B_CREATION);
HRegion b = createDaughterRegion(this.hri_b, this.parent.rsServices);
// This is the point of no return. Adding subsequent edits to .META. as we
// do below when we do the daughter opens adding each to .META. can fail in
// various interesting ways the most interesting of which is a timeout
// BUT the edits all go through (See HBASE-3872). IF we reach the PONR
// then subsequent failures need to crash out this regionserver; the
// server shutdown processing should be able to fix-up the incomplete split.
// The offlined parent will have the daughters as extra columns. If
// we leave the daughter regions in place and do not remove them when we
// crash out, then they will have their references to the parent in place
// still and the server shutdown fixup of .META. will point to these
// regions.
// We should add PONR JournalEntry before offlineParentInMeta,so even if
// OfflineParentInMeta timeout,this will cause regionserver exit,and then
// master ServerShutdownHandler will fix daughter & avoid data loss. (See
// HBase-4562).
this.journal.add(JournalEntry.PONR);
// Edit parent in meta. Offlines parent region and adds splita and splitb.
if (!testing) {
MetaEditor.offlineParentInMeta(server.getCatalogTracker(),
this.parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo());
}
return new PairOfSameType<HRegion>(a, b);
}
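Every step above appends a JournalEntry so that rollback(Server, RegionServerServices) knows exactly how far the transaction progressed and which side effects need undoing. Below is a generic, self-contained sketch of that journal pattern, using the entry names that appear in the code above; the class shape and the undo logic are assumptions, not the SplitTransaction source.

import java.util.ArrayList;
import java.util.List;

public class SplitJournalSketch {
  enum JournalEntry {
    SET_SPLITTING_IN_ZK, CREATE_SPLIT_DIR, CLOSED_PARENT_REGION,
    OFFLINED_PARENT, STARTED_REGION_A_CREATION, STARTED_REGION_B_CREATION, PONR
  }

  private final List<JournalEntry> journal = new ArrayList<JournalEntry>();

  // called as each step of the transaction completes
  void record(JournalEntry entry) {
    journal.add(entry);
  }

  // walk the journal backwards, undoing only the steps that actually ran;
  // once the point of no return has been logged, rollback is no longer safe
  boolean rollback() {
    for (int i = journal.size() - 1; i >= 0; i--) {
      JournalEntry entry = journal.get(i);
      if (entry == JournalEntry.PONR) {
        return false; // caller must abort the region server instead (see 3.2)
      }
      undo(entry); // e.g. delete the SPLITTING znode, remove the split dir, reopen the parent
    }
    return true;
  }

  private void undo(JournalEntry entry) {
    // step-specific cleanup would go here
  }
}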
3.3.2 SplitTransaction.openDaughters: open the two daughter regions
/**
* Perform time consuming opening of the daughter regions.
* @param server Hosting server instance. Can be null when testing (won't try
* and update in zk if a null server)
* @param services Used to online/offline regions.
* @param a first daughter region
* @param b second daughter region
* @throws IOException If thrown, transaction failed. Call {@link #rollback(Server, RegionServerServices)}
*/
/* package */void openDaughters(final Server server,
final RegionServerServices services, HRegion a, HRegion b)
throws IOException {
boolean stopped = server != null && server.isStopped();
boolean stopping = services != null && services.isStopping();
// TODO: Is this check needed here?
if (stopped || stopping) {
LOG.info("Not opening daughters " +
b.getRegionInfo().getRegionNameAsString() +
" and " +
a.getRegionInfo().getRegionNameAsString() +
" because stopping=" + stopping + ", stopped=" + stopped);
} else {
// Open daughters in parallel.
// open the two daughter regions
// each opener ends up calling HRegion.openHRegion; the actual initialization happens in HRegion.initializeRegionInternals
DaughterOpener aOpener = new DaughterOpener(server, a);
DaughterOpener bOpener = new DaughterOpener(server, b);
aOpener.start();
bOpener.start();
try {
aOpener.join();
bOpener.join();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new IOException("Interrupted " + e.getMessage());
}
if (aOpener.getException() != null) {
throw new IOException("Failed " +
aOpener.getName(), aOpener.getException());
}
if (bOpener.getException() != null) {
throw new IOException("Failed " +
bOpener.getName(), bOpener.getException());
}
if (services != null) {
try {
// add 2nd daughter first (see HBASE-4335)
services.postOpenDeployTasks(b, server.getCatalogTracker(), true);
// Should add it to OnlineRegions
services.addToOnlineRegions(b);
services.postOpenDeployTasks(a, server.getCatalogTracker(), true);
services.addToOnlineRegions(a);
} catch (KeeperException ke) {
throw new IOException(ke);
}
}
}
}
a) DaughterOpener opens the region (it calls openDaughterRegion, which at the bottom calls HRegion.openHRegion):
1) writes the .regioninfo file to HDFS so the region can still be recovered if .META. is lost;
2) initializes its HStores, mainly through loadStoreFiles: that function builds one StoreFile object per file found under the store's path on HDFS, and for each StoreFile it creates a HalfStoreFileReader that reads the corresponding file of the parent region. In other words, what the daughter currently holds are reference files pointing at the parent's files, so all reads against the daughter are actually served from the parent (see the sketch after this list).
b) services.addToOnlineRegions adds the daughter regions to the region server's online-region list, and (via postOpenDeployTasks) they are registered in .META.
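To make the reference-file mechanism concrete, here is a toy, self-contained sketch of what a "half" reader does, with an in-memory map standing in for the parent's HFile (the names are assumptions, not the HalfStoreFileReader API): lookups are answered from the parent's data, but only for keys on this daughter's side of the split key.

import java.util.NavigableMap;
import java.util.TreeMap;

public class HalfReaderSketch {
  private final NavigableMap<String, String> parentFile; // stands in for the parent region's HFile
  private final String splitKey;
  private final boolean top; // true = the daughter covering [splitKey, end)

  HalfReaderSketch(NavigableMap<String, String> parentFile, String splitKey, boolean top) {
    this.parentFile = parentFile;
    this.splitKey = splitKey;
    this.top = top;
  }

  String get(String row) {
    // reject rows that belong to the other daughter before touching the parent's data
    boolean rowIsInTopHalf = row.compareTo(splitKey) >= 0;
    if (rowIsInTopHalf != top) {
      return null;
    }
    return parentFile.get(row); // the actual read still hits the parent's file
  }

  public static void main(String[] args) {
    NavigableMap<String, String> parent = new TreeMap<String, String>();
    parent.put("row-1", "a");
    parent.put("row-9", "b");
    HalfReaderSketch bottom = new HalfReaderSketch(parent, "row-5", false);
    System.out.println(bottom.get("row-1")); // "a": falls in the bottom half
    System.out.println(bottom.get("row-9")); // null: belongs to the top daughter
  }
}

The real HalfStoreFileReader applies the same clipping at the HFile scanner level; the daughter stops depending on the parent only after a compaction rewrites the referenced data into its own store files, at which point the parent region can be cleaned up.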
3.3.3 HRegion.openHRegion
/**
* Open HRegion.
* Calls initialize and sets sequenceid.
* @param reporter
* @return Returns <code>this</code>
* @throws IOException
*/
protected HRegion openHRegion(final CancelableProgressable reporter)
throws IOException {
checkCompressionCodecs();
// initialize the region:
// 1. checkRegionInfoOnFilesystem writes the HRegionInfo out to a .regioninfo file
// 2. cleanupTempDir clears out the old region temp directory
// 3. initialize the region's HStores and load their HFiles
// 4. read the recovered.edits files, replay the KeyValues they contain into the matching stores, and recover the region
long seqid = initialize(reporter);
if (this.log != null) {
this.log.setSequenceNumber(seqid);
}
return this;
}
3.4 SplitTransaction.transitionZKNode: update the znode state and wait for the split to finish
/**
* Finish off split transaction, transition the zknode
* @param server Hosting server instance. Can be null when testing (won't try
* and update in zk if a null server)
* @param services Used to online/offline regions.
* @param a first daughter region
* @param b second daughter region
* @throws IOException If thrown, transaction failed. Call {@link #rollback(Server, RegionServerServices)}
*/
/* package */void transitionZKNode(final Server server,
final RegionServerServices services, HRegion a, HRegion b)
throws IOException {
// Tell master about split by updating zk. If we fail, abort.
if (server != null && server.getZooKeeper() != null) {
try {
this.znodeVersion = transitionNodeSplit(server.getZooKeeper(),
parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo(),
server.getServerName(), this.znodeVersion);
int spins = 0;
// Now wait for the master to process the split. We know it's done
// when the znode is deleted. The reason we keep tickling the znode is
// that it's possible for the master to miss an event.
do {
if (spins % 10 == 0) {
LOG.debug("Still waiting on the master to process the split for " +
this.parent.getRegionInfo().getEncodedName());
}
Thread.sleep(100);
// When this returns -1 it means the znode doesn't exist
this.znodeVersion = tickleNodeSplit(server.getZooKeeper(),
parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo(),
server.getServerName(), this.znodeVersion);
spins++;
} while (this.znodeVersion != -1 && !server.isStopped()
&& !services.isStopping());
} catch (Exception e) {
if (e instanceof InterruptedException) {
Thread.currentThread().interrupt();
}
throw new IOException("Failed telling master about split", e);
}
}
// Coprocessor callback
if (this.parent.getCoprocessorHost() != null) {
this.parent.getCoprocessorHost().postSplit(a,b);
}
// Leaving here, the splitdir with its dross will be in place but since the
// split was successful, just leave it; it'll be cleaned when parent is
// deleted and cleaned up.
}
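One practical consequence of this design is that nothing reports completion back to the client: HBaseAdmin.split (section 1.1) returns as soon as its RPC does, and the master finishes its bookkeeping asynchronously through the znode handshake above. A client that needs the daughter regions to be visible has to poll; below is a minimal sketch against the 0.92/0.94-era admin API, where the table name, split row and timeout are illustrative assumptions.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.util.Bytes;

public class WaitForSplit {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    HBaseAdmin admin = new HBaseAdmin(conf);
    try {
      byte[] table = Bytes.toBytes("test_table");
      int regionsBefore = admin.getTableRegions(table).size();
      admin.split(table, Bytes.toBytes("row-5000"));
      // poll .META. until the daughters show up, or give up after ~30 seconds
      for (int i = 0; i < 300; i++) {
        if (admin.getTableRegions(table).size() > regionsBefore) {
          break;
        }
        Thread.sleep(100);
      }
    } finally {
      admin.close();
    }
  }
}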