View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  package org.apache.hadoop.hbase.master.procedure;
20  
21  import java.io.IOException;
22  import java.util.concurrent.CountDownLatch;
23  
24  import org.apache.commons.logging.Log;
25  import org.apache.commons.logging.LogFactory;
26  import org.apache.hadoop.conf.Configuration;
27  import org.apache.hadoop.fs.FileSystem;
28  import org.apache.hadoop.fs.Path;
29  import org.apache.hadoop.hbase.HBaseTestingUtility;
30  import org.apache.hadoop.hbase.HRegionInfo;
31  import org.apache.hadoop.hbase.HTableDescriptor;
32  import org.apache.hadoop.hbase.MiniHBaseCluster;
33  import org.apache.hadoop.hbase.TableName;
34  import org.apache.hadoop.hbase.master.HMaster;
35  import org.apache.hadoop.hbase.procedure2.Procedure;
36  import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
37  import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
38  import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility.TestProcedure;
39  import org.apache.hadoop.hbase.procedure2.store.ProcedureStore;
40  import org.apache.hadoop.hbase.procedure2.store.wal.WALProcedureStore;
41  import org.apache.hadoop.hbase.protobuf.generated.MasterProcedureProtos.CreateTableState;
42  import org.apache.hadoop.hbase.protobuf.generated.MasterProcedureProtos.DeleteTableState;
43  import org.apache.hadoop.hbase.protobuf.generated.MasterProcedureProtos.DisableTableState;
44  import org.apache.hadoop.hbase.protobuf.generated.MasterProcedureProtos.EnableTableState;
45  import org.apache.hadoop.hbase.protobuf.generated.MasterProcedureProtos.TruncateTableState;
46  import org.apache.hadoop.hbase.testclassification.LargeTests;
47  import org.apache.hadoop.hbase.util.Bytes;
48  import org.apache.hadoop.hbase.util.FSUtils;
49  import org.apache.hadoop.hbase.util.ModifyRegionUtils;
50  import org.junit.After;
51  import org.junit.Before;
52  import org.junit.Test;
53  import org.junit.experimental.categories.Category;
54  import org.mockito.Mockito;
55  
56  import static org.junit.Assert.assertEquals;
57  import static org.junit.Assert.assertTrue;
58  import static org.junit.Assert.fail;
59  
60  @Category(LargeTests.class)
61  public class TestMasterFailoverWithProcedures {
62    private static final Log LOG = LogFactory.getLog(TestMasterFailoverWithProcedures.class);
63  
64    protected static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
65  
66    private static void setupConf(Configuration conf) {
67      // don't waste time retrying with the roll, the test is already slow enough.
68      conf.setInt("hbase.procedure.store.wal.max.retries.before.roll", 1);
69      conf.setInt("hbase.procedure.store.wal.wait.before.roll", 0);
70      conf.setInt("hbase.procedure.store.wal.max.roll.retries", 1);
71      conf.setInt("hbase.procedure.store.wal.sync.failure.roll.max", 1);
72    }
73  
74    @Before
75    public void setup() throws Exception {
76      setupConf(UTIL.getConfiguration());
77      UTIL.startMiniCluster(2, 1);
78  
79      final ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();
80      ProcedureTestingUtility.setToggleKillBeforeStoreUpdate(procExec, false);
81      ProcedureTestingUtility.setKillBeforeStoreUpdate(procExec, false);
82    }
83  
84    @After
85    public void tearDown() throws Exception {
86      try {
87        UTIL.shutdownMiniCluster();
88      } catch (Exception e) {
89        LOG.warn("failure shutting down cluster", e);
90      }
91    }
92  
93    @Test(timeout=60000)
94    public void testWalRecoverLease() throws Exception {
95      final ProcedureStore masterStore = getMasterProcedureExecutor().getStore();
96      assertTrue("expected WALStore for this test", masterStore instanceof WALProcedureStore);
97  
98      HMaster firstMaster = UTIL.getHBaseCluster().getMaster();
99      // Abort Latch for the master store
100     final CountDownLatch masterStoreAbort = new CountDownLatch(1);
101     masterStore.registerListener(new ProcedureStore.ProcedureStoreListener() {
102       @Override
103       public void postSync() {}
104 
105       @Override
106       public void abortProcess() {
107         LOG.debug("Abort store of Master");
108         masterStoreAbort.countDown();
109       }
110     });
111 
112     // startup a fake master the new WAL store will take the lease
113     // and the active master should abort.
114     HMaster backupMaster3 = Mockito.mock(HMaster.class);
115     Mockito.doReturn(firstMaster.getConfiguration()).when(backupMaster3).getConfiguration();
116     Mockito.doReturn(true).when(backupMaster3).isActiveMaster();
117     final WALProcedureStore backupStore3 = new WALProcedureStore(firstMaster.getConfiguration(),
118         firstMaster.getMasterFileSystem().getFileSystem(),
119         ((WALProcedureStore)masterStore).getLogDir(),
120         new MasterProcedureEnv.WALStoreLeaseRecovery(backupMaster3));
121     // Abort Latch for the test store
122     final CountDownLatch backupStore3Abort = new CountDownLatch(1);
123     backupStore3.registerListener(new ProcedureStore.ProcedureStoreListener() {
124       @Override
125       public void postSync() {}
126 
127       @Override
128       public void abortProcess() {
129         LOG.debug("Abort store of backupMaster3");
130         backupStore3Abort.countDown();
131         backupStore3.stop(true);
132       }
133     });
134     backupStore3.start(1);
135     backupStore3.recoverLease();
136 
137     // Try to trigger a command on the master (WAL lease expired on the active one)
138     HTableDescriptor htd = MasterProcedureTestingUtility.createHTD(TableName.valueOf("mtb"), "f");
139     HRegionInfo[] regions = ModifyRegionUtils.createHRegionInfos(htd, null);
140     LOG.debug("submit proc");
141     try {
142       getMasterProcedureExecutor().submitProcedure(
143         new CreateTableProcedure(getMasterProcedureExecutor().getEnvironment(), htd, regions));
144       fail("expected RuntimeException 'sync aborted'");
145     } catch (RuntimeException e) {
146       LOG.info("got " + e.getMessage());
147     }
148     LOG.debug("wait master store abort");
149     masterStoreAbort.await();
150 
151     // Now the real backup master should start up
152     LOG.debug("wait backup master to startup");
153     waitBackupMaster(UTIL, firstMaster);
154     assertEquals(true, firstMaster.isStopped());
155 
156     // wait the store in here to abort (the test will fail due to timeout if it doesn't)
157     LOG.debug("wait the store to abort");
158     backupStore3.getStoreTracker().setDeleted(1, false);
159     try {
160       backupStore3.delete(1);
161       fail("expected RuntimeException 'sync aborted'");
162     } catch (RuntimeException e) {
163       LOG.info("got " + e.getMessage());
164     }
165     backupStore3Abort.await();
166   }
167 
168   /**
169    * Tests proper fencing in case the current WAL store is fenced
170    */
171   @Test
172   public void testWALfencingWithoutWALRolling() throws IOException {
173     testWALfencing(false);
174   }
175 
176   /**
177    * Tests proper fencing in case the current WAL store does not receive writes until after the
178    * new WAL does a couple of WAL rolls.
179    */
180   @Test
181   public void testWALfencingWithWALRolling() throws IOException {
182     testWALfencing(true);
183   }
184 
185   public void testWALfencing(boolean walRolls) throws IOException {
186     final ProcedureStore procStore = getMasterProcedureExecutor().getStore();
187     assertTrue("expected WALStore for this test", procStore instanceof WALProcedureStore);
188 
189     HMaster firstMaster = UTIL.getHBaseCluster().getMaster();
190 
191     // cause WAL rolling after a delete in WAL:
192     firstMaster.getConfiguration().setLong("hbase.procedure.store.wal.roll.threshold", 1);
193 
194     HMaster backupMaster3 = Mockito.mock(HMaster.class);
195     Mockito.doReturn(firstMaster.getConfiguration()).when(backupMaster3).getConfiguration();
196     Mockito.doReturn(true).when(backupMaster3).isActiveMaster();
197     final WALProcedureStore procStore2 = new WALProcedureStore(firstMaster.getConfiguration(),
198         firstMaster.getMasterFileSystem().getFileSystem(),
199         ((WALProcedureStore)procStore).getLogDir(),
200         new MasterProcedureEnv.WALStoreLeaseRecovery(backupMaster3));
201 
202     // start a second store which should fence the first one out
203     LOG.info("Starting new WALProcedureStore");
204     procStore2.start(1);
205     procStore2.recoverLease();
206 
207     // before writing back to the WAL store, optionally do a couple of WAL rolls (which causes
208     // to delete the old WAL files).
209     if (walRolls) {
210       LOG.info("Inserting into second WALProcedureStore, causing WAL rolls");
211       for (int i = 0; i < 512; i++) {
212         // insert something to the second store then delete it, causing a WAL roll(s)
213         Procedure proc2 = new TestProcedure(i);
214         procStore2.insert(proc2, null);
215         procStore2.delete(proc2.getProcId()); // delete the procedure so that the WAL is removed later
216       }
217     }
218 
219     // Now, insert something to the first store, should fail.
220     // If the store does a WAL roll and continue with another logId without checking higher logIds
221     // it will incorrectly succeed.
222     LOG.info("Inserting into first WALProcedureStore");
223     try {
224       procStore.insert(new TestProcedure(11), null);
225       fail("Inserting into Procedure Store should have failed");
226     } catch (Exception ex) {
227       LOG.info("Received expected exception", ex);
228     }
229   }
230 
231   // ==========================================================================
232   //  Test Create Table
233   // ==========================================================================
234   @Test(timeout=60000)
235   public void testCreateWithFailover() throws Exception {
236     // TODO: Should we try every step? (master failover takes long time)
237     // It is already covered by TestCreateTableProcedure
238     // but without the master restart, only the executor/store is restarted.
239     // Without Master restart we may not find bug in the procedure code
240     // like missing "wait" for resources to be available (e.g. RS)
241     testCreateWithFailoverAtStep(CreateTableState.CREATE_TABLE_ASSIGN_REGIONS.ordinal());
242   }
243 
244   private void testCreateWithFailoverAtStep(final int step) throws Exception {
245     final TableName tableName = TableName.valueOf("testCreateWithFailoverAtStep" + step);
246 
247     // create the table
248     ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();
249     ProcedureTestingUtility.setKillBeforeStoreUpdate(procExec, true);
250     ProcedureTestingUtility.setToggleKillBeforeStoreUpdate(procExec, true);
251 
252     // Start the Create procedure && kill the executor
253     byte[][] splitKeys = null;
254     HTableDescriptor htd = MasterProcedureTestingUtility.createHTD(tableName, "f1", "f2");
255     HRegionInfo[] regions = ModifyRegionUtils.createHRegionInfos(htd, splitKeys);
256     long procId = procExec.submitProcedure(
257       new CreateTableProcedure(procExec.getEnvironment(), htd, regions));
258     testRecoveryAndDoubleExecution(UTIL, procId, step, CreateTableState.values());
259 
260     MasterProcedureTestingUtility.validateTableCreation(
261       UTIL.getHBaseCluster().getMaster(), tableName, regions, "f1", "f2");
262   }
263 
264   // ==========================================================================
265   //  Test Delete Table
266   // ==========================================================================
267   @Test(timeout=60000)
268   public void testDeleteWithFailover() throws Exception {
269     // TODO: Should we try every step? (master failover takes long time)
270     // It is already covered by TestDeleteTableProcedure
271     // but without the master restart, only the executor/store is restarted.
272     // Without Master restart we may not find bug in the procedure code
273     // like missing "wait" for resources to be available (e.g. RS)
274     testDeleteWithFailoverAtStep(DeleteTableState.DELETE_TABLE_UNASSIGN_REGIONS.ordinal());
275   }
276 
277   private void testDeleteWithFailoverAtStep(final int step) throws Exception {
278     final TableName tableName = TableName.valueOf("testDeleteWithFailoverAtStep" + step);
279 
280     // create the table
281     byte[][] splitKeys = null;
282     HRegionInfo[] regions = MasterProcedureTestingUtility.createTable(
283       getMasterProcedureExecutor(), tableName, splitKeys, "f1", "f2");
284     Path tableDir = FSUtils.getTableDir(getRootDir(), tableName);
285     MasterProcedureTestingUtility.validateTableCreation(
286       UTIL.getHBaseCluster().getMaster(), tableName, regions, "f1", "f2");
287     UTIL.getHBaseAdmin().disableTable(tableName);
288 
289     ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();
290     ProcedureTestingUtility.setKillBeforeStoreUpdate(procExec, true);
291     ProcedureTestingUtility.setToggleKillBeforeStoreUpdate(procExec, true);
292 
293     // Start the Delete procedure && kill the executor
294     long procId = procExec.submitProcedure(
295       new DeleteTableProcedure(procExec.getEnvironment(), tableName));
296     testRecoveryAndDoubleExecution(UTIL, procId, step, DeleteTableState.values());
297 
298     MasterProcedureTestingUtility.validateTableDeletion(
299       UTIL.getHBaseCluster().getMaster(), tableName, regions, "f1", "f2");
300   }
301 
302   // ==========================================================================
303   //  Test Truncate Table
304   // ==========================================================================
305   @Test(timeout=90000)
306   public void testTruncateWithFailover() throws Exception {
307     // TODO: Should we try every step? (master failover takes long time)
308     // It is already covered by TestTruncateTableProcedure
309     // but without the master restart, only the executor/store is restarted.
310     // Without Master restart we may not find bug in the procedure code
311     // like missing "wait" for resources to be available (e.g. RS)
312     testTruncateWithFailoverAtStep(true, TruncateTableState.TRUNCATE_TABLE_ADD_TO_META.ordinal());
313   }
314 
315   private void testTruncateWithFailoverAtStep(final boolean preserveSplits, final int step)
316       throws Exception {
317     final TableName tableName = TableName.valueOf("testTruncateWithFailoverAtStep" + step);
318 
319     // create the table
320     final String[] families = new String[] { "f1", "f2" };
321     final byte[][] splitKeys = new byte[][] {
322       Bytes.toBytes("a"), Bytes.toBytes("b"), Bytes.toBytes("c")
323     };
324     HRegionInfo[] regions = MasterProcedureTestingUtility.createTable(
325       getMasterProcedureExecutor(), tableName, splitKeys, families);
326     // load and verify that there are rows in the table
327     MasterProcedureTestingUtility.loadData(
328       UTIL.getConnection(), tableName, 100, splitKeys, families);
329     assertEquals(100, UTIL.countRows(tableName));
330     // disable the table
331     UTIL.getHBaseAdmin().disableTable(tableName);
332 
333     ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();
334     ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true);
335 
336     // Start the Truncate procedure && kill the executor
337     long procId = procExec.submitProcedure(
338       new TruncateTableProcedure(procExec.getEnvironment(), tableName, preserveSplits));
339     testRecoveryAndDoubleExecution(UTIL, procId, step, TruncateTableState.values());
340 
341     ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, false);
342     UTIL.waitUntilAllRegionsAssigned(tableName);
343 
344     // validate the table regions and layout
345     if (preserveSplits) {
346       assertEquals(1 + splitKeys.length, UTIL.getHBaseAdmin().getTableRegions(tableName).size());
347     } else {
348       regions = UTIL.getHBaseAdmin().getTableRegions(tableName).toArray(new HRegionInfo[1]);
349       assertEquals(1, regions.length);
350     }
351     MasterProcedureTestingUtility.validateTableCreation(
352       UTIL.getHBaseCluster().getMaster(), tableName, regions, families);
353 
354     // verify that there are no rows in the table
355     assertEquals(0, UTIL.countRows(tableName));
356 
357     // verify that the table is read/writable
358     MasterProcedureTestingUtility.loadData(
359       UTIL.getConnection(), tableName, 50, splitKeys, families);
360     assertEquals(50, UTIL.countRows(tableName));
361   }
362 
363   // ==========================================================================
364   //  Test Disable Table
365   // ==========================================================================
366   @Test(timeout=60000)
367   public void testDisableTableWithFailover() throws Exception {
368     // TODO: Should we try every step? (master failover takes long time)
369     // It is already covered by TestDisableTableProcedure
370     // but without the master restart, only the executor/store is restarted.
371     // Without Master restart we may not find bug in the procedure code
372     // like missing "wait" for resources to be available (e.g. RS)
373     testDisableTableWithFailoverAtStep(
374       DisableTableState.DISABLE_TABLE_MARK_REGIONS_OFFLINE.ordinal());
375   }
376 
377   private void testDisableTableWithFailoverAtStep(final int step) throws Exception {
378     final TableName tableName = TableName.valueOf("testDisableTableWithFailoverAtStep" + step);
379 
380     // create the table
381     final byte[][] splitKeys = new byte[][] {
382       Bytes.toBytes("a"), Bytes.toBytes("b"), Bytes.toBytes("c")
383     };
384     MasterProcedureTestingUtility.createTable(
385       getMasterProcedureExecutor(), tableName, splitKeys, "f1", "f2");
386 
387     ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();
388     ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true);
389 
390     // Start the Delete procedure && kill the executor
391     long procId = procExec.submitProcedure(
392       new DisableTableProcedure(procExec.getEnvironment(), tableName, false));
393     testRecoveryAndDoubleExecution(UTIL, procId, step, DisableTableState.values());
394 
395     MasterProcedureTestingUtility.validateTableIsDisabled(
396       UTIL.getHBaseCluster().getMaster(), tableName);
397   }
398 
399   // ==========================================================================
400   //  Test Enable Table
401   // ==========================================================================
402   @Test(timeout=60000)
403   public void testEnableTableWithFailover() throws Exception {
404     // TODO: Should we try every step? (master failover takes long time)
405     // It is already covered by TestEnableTableProcedure
406     // but without the master restart, only the executor/store is restarted.
407     // Without Master restart we may not find bug in the procedure code
408     // like missing "wait" for resources to be available (e.g. RS)
409     testEnableTableWithFailoverAtStep(
410       EnableTableState.ENABLE_TABLE_MARK_REGIONS_ONLINE.ordinal());
411   }
412 
413   private void testEnableTableWithFailoverAtStep(final int step) throws Exception {
414     final TableName tableName = TableName.valueOf("testEnableTableWithFailoverAtStep" + step);
415 
416     // create the table
417     final byte[][] splitKeys = new byte[][] {
418       Bytes.toBytes("a"), Bytes.toBytes("b"), Bytes.toBytes("c")
419     };
420     MasterProcedureTestingUtility.createTable(
421       getMasterProcedureExecutor(), tableName, splitKeys, "f1", "f2");
422     UTIL.getHBaseAdmin().disableTable(tableName);
423 
424     ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();
425     ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true);
426 
427     // Start the Delete procedure && kill the executor
428     long procId = procExec.submitProcedure(
429       new EnableTableProcedure(procExec.getEnvironment(), tableName, false));
430     testRecoveryAndDoubleExecution(UTIL, procId, step, EnableTableState.values());
431 
432     MasterProcedureTestingUtility.validateTableIsEnabled(
433       UTIL.getHBaseCluster().getMaster(), tableName);
434   }
435 
436   // ==========================================================================
437   //  Test Helpers
438   // ==========================================================================
439   public static <TState> void testRecoveryAndDoubleExecution(final HBaseTestingUtility testUtil,
440       final long procId, final int lastStepBeforeFailover, TState[] states) throws Exception {
441     ProcedureExecutor<MasterProcedureEnv> procExec =
442       testUtil.getHBaseCluster().getMaster().getMasterProcedureExecutor();
443     ProcedureTestingUtility.waitProcedure(procExec, procId);
444 
445     for (int i = 0; i < lastStepBeforeFailover; ++i) {
446       LOG.info("Restart "+ i +" exec state: " + states[i]);
447       ProcedureTestingUtility.assertProcNotYetCompleted(procExec, procId);
448       ProcedureTestingUtility.restart(procExec);
449       ProcedureTestingUtility.waitProcedure(procExec, procId);
450     }
451     ProcedureTestingUtility.assertProcNotYetCompleted(procExec, procId);
452 
453     LOG.info("Trigger master failover");
454     masterFailover(testUtil);
455 
456     procExec = testUtil.getHBaseCluster().getMaster().getMasterProcedureExecutor();
457     ProcedureTestingUtility.waitProcedure(procExec, procId);
458     ProcedureTestingUtility.assertProcNotFailed(procExec, procId);
459   }
460 
461   // ==========================================================================
462   //  Master failover utils
463   // ==========================================================================
464   public static void masterFailover(final HBaseTestingUtility testUtil)
465       throws Exception {
466     MiniHBaseCluster cluster = testUtil.getMiniHBaseCluster();
467 
468     // Kill the master
469     HMaster oldMaster = cluster.getMaster();
470     cluster.killMaster(cluster.getMaster().getServerName());
471 
472     // Wait the secondary
473     waitBackupMaster(testUtil, oldMaster);
474   }
475 
476   public static void waitBackupMaster(final HBaseTestingUtility testUtil,
477       final HMaster oldMaster) throws Exception {
478     MiniHBaseCluster cluster = testUtil.getMiniHBaseCluster();
479 
480     HMaster newMaster = cluster.getMaster();
481     while (newMaster == null || newMaster == oldMaster) {
482       Thread.sleep(250);
483       newMaster = cluster.getMaster();
484     }
485 
486     while (!(newMaster.isActiveMaster() && newMaster.isInitialized())) {
487       Thread.sleep(250);
488     }
489   }
490 
491   // ==========================================================================
492   //  Helpers
493   // ==========================================================================
494   private MasterProcedureEnv getMasterProcedureEnv() {
495     return getMasterProcedureExecutor().getEnvironment();
496   }
497 
498   private ProcedureExecutor<MasterProcedureEnv> getMasterProcedureExecutor() {
499     return UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor();
500   }
501 
502   private FileSystem getFileSystem() {
503     return UTIL.getHBaseCluster().getMaster().getMasterFileSystem().getFileSystem();
504   }
505 
506   private Path getRootDir() {
507     return UTIL.getHBaseCluster().getMaster().getMasterFileSystem().getRootDir();
508   }
509 
510   private Path getTempDir() {
511     return UTIL.getHBaseCluster().getMaster().getMasterFileSystem().getTempDir();
512   }
513 }