View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.regionserver;
20  
21  
22  import static org.junit.Assert.assertTrue;
23  
24  import java.io.IOException;
25  import java.util.ArrayList;
26  import java.util.List;
27  import java.util.concurrent.CountDownLatch;
28  
29  import org.apache.commons.logging.Log;
30  import org.apache.commons.logging.LogFactory;
31  import org.apache.hadoop.conf.Configuration;
32  import org.apache.hadoop.fs.FileSystem;
33  import org.apache.hadoop.fs.Path;
34  import org.apache.hadoop.hbase.Cell;
35  import org.apache.hadoop.hbase.CellScanner;
36  import org.apache.hadoop.hbase.HBaseTestingUtility;
37  import org.apache.hadoop.hbase.HConstants;
38  import org.apache.hadoop.hbase.HTableDescriptor;
39  import org.apache.hadoop.hbase.Server;
40  import org.apache.hadoop.hbase.TableName;
41  import org.apache.hadoop.hbase.client.Durability;
42  import org.apache.hadoop.hbase.client.Put;
43  import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
44  import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
45  import org.apache.hadoop.hbase.testclassification.MediumTests;
46  import org.apache.hadoop.hbase.util.Bytes;
47  import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper;
48  import org.apache.hadoop.hbase.util.Threads;
49  import org.apache.hadoop.hbase.wal.WAL;
50  import org.apache.hadoop.hbase.wal.WALKey;
51  import org.apache.hadoop.hbase.wal.WALProvider.Writer;
52  import org.junit.After;
53  import org.junit.Before;
54  import org.junit.Ignore;
55  import org.junit.Rule;
56  import org.junit.Test;
57  import org.junit.experimental.categories.Category;
58  import org.junit.rules.TestName;
59  import org.mockito.Mockito;
60  
61  /**
62   * Testing for lock up of WAL subsystem.
63   * Copied from TestHRegion.
64   */
65  @Category({MediumTests.class})
66  public class TestWALLockup {
67    private static final Log LOG = LogFactory.getLog(TestWALLockup.class);
68    @Rule public TestName name = new TestName();
69  
70    private static final String COLUMN_FAMILY = "MyCF";
71    private static final byte [] COLUMN_FAMILY_BYTES = Bytes.toBytes(COLUMN_FAMILY);
72  
73    HRegion region = null;
74    // Do not run unit tests in parallel (? Why not?  It don't work?  Why not?  St.Ack)
75    private static HBaseTestingUtility TEST_UTIL;
76    private static Configuration CONF ;
77    private String dir;
78  
79    // Test names
80    protected TableName tableName;
81  
82    @Before
83    public void setup() throws IOException {
84      TEST_UTIL = HBaseTestingUtility.createLocalHTU();
85      CONF = TEST_UTIL.getConfiguration();
86      // Disable block cache.
87      CONF.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0f);
88      dir = TEST_UTIL.getDataTestDir("TestHRegion").toString();
89      tableName = TableName.valueOf(name.getMethodName());
90    }
91  
92    @After
93    public void tearDown() throws Exception {
94      EnvironmentEdgeManagerTestHelper.reset();
95      LOG.info("Cleaning test directory: " + TEST_UTIL.getDataTestDir());
96      TEST_UTIL.cleanupTestDir();
97    }
98  
99    String getName() {
100     return name.getMethodName();
101   }
102 
103   /**
104    * Reproduce locking up that happens when we get an inopportune sync during setup for
105    * zigzaglatch wait. See HBASE-14317. If below is broken, we will see this test timeout because
106    * it is locked up.
107    * <p>First I need to set up some mocks for Server and RegionServerServices. I also need to
108    * set up a dodgy WAL that will throw an exception when we go to append to it.
109    */
110   @Ignore @Test (timeout=30000)
111   public void testLockupWhenSyncInMiddleOfZigZagSetup() throws IOException {
112     // A WAL that we can have throw exceptions when a flag is set.
113     class DodgyFSLog extends FSHLog {
114       // Set this when want the WAL to start throwing exceptions.
115       volatile boolean throwException = false;
116 
117       // Latch to hold up processing until after another operation has had time to run.
118       CountDownLatch latch = new CountDownLatch(1);
119 
120       public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf)
121       throws IOException {
122         super(fs, root, logDir, conf);
123       }
124 
125       @Override
126       protected void afterCreatingZigZagLatch() {
127         // If throwException set, then append will throw an exception causing the WAL to be
128         // rolled. We'll come in here. Hold up processing until a sync can get in before
129         // the zigzag has time to complete its setup and get its own sync in. This is what causes
130         // the lock up we've seen in production.
131         if (throwException) {
132           try {
133             LOG.info("LATCHED");
134             this.latch.await();
135           } catch (InterruptedException e) {
136             // TODO Auto-generated catch block
137             e.printStackTrace();
138           }
139         }
140       }
141 
142       @Override
143       protected void beforeWaitOnSafePoint() {
144         if (throwException) {
145           LOG.info("COUNTDOWN");
146           // Don't countdown latch until someone waiting on it otherwise, the above
147           // afterCreatingZigZagLatch will get to the latch and no one will ever free it and we'll
148           // be stuck; test won't go down
149           while (this.latch.getCount() <= 0) Threads.sleep(1);
150           this.latch.countDown();
151         }
152       }
153 
154       @Override
155       protected Writer createWriterInstance(Path path) throws IOException {
156         final Writer w = super.createWriterInstance(path);
157         return new Writer() {
158           @Override
159           public void close() throws IOException {
160             w.close();
161           }
162 
163           @Override
164           public void sync() throws IOException {
165             if (throwException) {
166               throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
167             }
168             w.sync();
169           }
170 
171           @Override
172           public void append(Entry entry) throws IOException {
173             if (throwException) {
174               throw new IOException("FAKE! Failed to replace a bad datanode...APPEND");
175             }
176             w.append(entry);
177           }
178 
179           @Override
180           public long getLength() throws IOException {
181             return w.getLength();
182           }
183         };
184       }
185     }
186 
187     // Mocked up server and regionserver services. Needed below.
188     Server server = Mockito.mock(Server.class);
189     Mockito.when(server.getConfiguration()).thenReturn(CONF);
190     Mockito.when(server.isStopped()).thenReturn(false);
191     Mockito.when(server.isAborted()).thenReturn(false);
192     RegionServerServices services = Mockito.mock(RegionServerServices.class);
193 
194     // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
195     FileSystem fs = FileSystem.get(CONF);
196     Path rootDir = new Path(dir + getName());
197     DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF);
198     Path originalWAL = dodgyWAL.getCurrentFileName();
199     // I need a log roller running.
200     LogRoller logRoller = new LogRoller(server, services);
201     logRoller.addWAL(dodgyWAL);
202     // There is no 'stop' once a logRoller is running.. it just dies.
203     logRoller.start();
204     // Now get a region and start adding in edits.
205     HTableDescriptor htd = new HTableDescriptor(TableName.META_TABLE_NAME);
206     final HRegion region = initHRegion(tableName, null, null, dodgyWAL);
207     byte [] bytes = Bytes.toBytes(getName());
208     try {
209       // First get something into memstore. Make a Put and then pull the Cell out of it. Will
210       // manage append and sync carefully in below to manufacture hang. We keep adding same
211       // edit. WAL subsystem doesn't care.
212       Put put = new Put(bytes);
213       put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes);
214       WALKey key = new WALKey(region.getRegionInfo().getEncodedNameAsBytes(), htd.getTableName());
215       WALEdit edit = new WALEdit();
216       List<Cell> cells = new ArrayList<Cell>();
217       for (CellScanner cs = put.cellScanner(); cs.advance();) {
218         edit.add(cs.current());
219         cells.add(cs.current());
220       }
221       // Put something in memstore and out in the WAL. Do a big number of appends so we push
222       // out other side of the ringbuffer. If small numbers, stuff doesn't make it to WAL
223       for (int i = 0; i < 1000; i++) {
224         dodgyWAL.append(htd, region.getRegionInfo(), key, edit, region.getSequenceId(), true,
225           cells);
226       }
227       // Set it so we start throwing exceptions.
228       dodgyWAL.throwException = true;
229       // This append provokes a WAL roll.
230       dodgyWAL.append(htd, region.getRegionInfo(), key, edit, region.getSequenceId(), true, cells);
231       boolean exception = false;
232       try {
233         dodgyWAL.sync();
234       } catch (Exception e) {
235         exception = true;
236       }
237       assertTrue("Did not get sync exception", exception);
238 
239       // Get a memstore flush going too so we have same hung profile as up in the issue over
240       // in HBASE-14317. Flush hangs trying to get sequenceid because the ringbuffer is held up
241       // by the zigzaglatch waiting on syncs to come home.
242       Thread t = new Thread ("flusher") {
243         public void run() {
244           try {
245             region.flush(false);
246           } catch (IOException e) {
247             // TODO Auto-generated catch block
248             e.printStackTrace();
249           }
250         };
251       };
252       t.setDaemon(true);
253       t.start();
254       // Wait till it gets into flushing. It will get stuck on getSequenceId. Then proceed.
255       while (!region.writestate.flushing) Threads.sleep(1);
256       // Now assert I got a new WAL file put in place even though loads of errors above.
257       assertTrue(originalWAL != dodgyWAL.getCurrentFileName());
258       // Can I append to it?
259       dodgyWAL.throwException = false;
260       region.put(put);
261     } finally {
262       // To stop logRoller, its server has to say it is stopped.
263       Mockito.when(server.isStopped()).thenReturn(true);
264       if (logRoller != null) logRoller.interrupt();
265       if (region != null) region.close();
266       if (dodgyWAL != null) dodgyWAL.close();
267     }
268   }
269 
270   /**
271    * @return A region on which you must call
272    *         {@link HBaseTestingUtility#closeRegionAndWAL(HRegion)} when done.
273    */
274   public HRegion initHRegion(TableName tableName, byte[] startKey, byte[] stopKey, WAL wal)
275   throws IOException {
276     return TEST_UTIL.createLocalHRegion(tableName.getName(), startKey, stopKey,
277       getName(), CONF, false, Durability.SYNC_WAL,
278       wal, COLUMN_FAMILY_BYTES);
279   }
280 }