View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase;
19  
20  import java.io.IOException;
21  import java.util.ArrayList;
22  import java.util.Comparator;
23  import java.util.List;
24  import java.util.Set;
25  import java.util.TreeSet;
26  
27  import org.apache.hadoop.conf.Configuration;
28  import org.apache.hadoop.hbase.ClusterManager.ServiceType;
29  import org.apache.hadoop.hbase.classification.InterfaceAudience;
30  import org.apache.hadoop.hbase.client.Admin;
31  import org.apache.hadoop.hbase.client.ClusterConnection;
32  import org.apache.hadoop.hbase.client.Connection;
33  import org.apache.hadoop.hbase.client.ConnectionFactory;
34  import org.apache.hadoop.hbase.client.RegionLocator;
35  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
36  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
37  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.ServerInfo;
38  import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
39  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MasterService;
40  import org.apache.hadoop.hbase.util.Bytes;
41  import org.apache.hadoop.hbase.util.Threads;
42  
43  /**
44   * Manages the interactions with an already deployed distributed cluster (as opposed to
45   * a pseudo-distributed, or mini/local cluster). This is used by integration and system tests.
46   */
47  @InterfaceAudience.Private
48  public class DistributedHBaseCluster extends HBaseCluster {
49    private Admin admin;
50    private final Connection connection;
51  
52    private ClusterManager clusterManager;
53  
54    public DistributedHBaseCluster(Configuration conf, ClusterManager clusterManager)
55        throws IOException {
56      super(conf);
57      this.clusterManager = clusterManager;
58      this.connection = ConnectionFactory.createConnection(conf);
59      this.admin = this.connection.getAdmin();
60      this.initialClusterStatus = getClusterStatus();
61    }
62  
63    public void setClusterManager(ClusterManager clusterManager) {
64      this.clusterManager = clusterManager;
65    }
66  
67    public ClusterManager getClusterManager() {
68      return clusterManager;
69    }
70  
71    /**
72     * Returns a ClusterStatus for this HBase cluster
73     * @throws IOException
74     */
75    @Override
76    public ClusterStatus getClusterStatus() throws IOException {
77      return admin.getClusterStatus();
78    }
79  
80    @Override
81    public ClusterStatus getInitialClusterStatus() throws IOException {
82      return initialClusterStatus;
83    }
84  
85    @Override
86    public void close() throws IOException {
87      if (this.admin != null) {
88        admin.close();
89      }
90      if (this.connection != null && !this.connection.isClosed()) {
91        this.connection.close();
92      }
93    }
94  
95    @Override
96    public AdminProtos.AdminService.BlockingInterface getAdminProtocol(ServerName serverName)
97    throws IOException {
98      return ((ClusterConnection)this.connection).getAdmin(serverName);
99    }
100 
101   @Override
102   public ClientProtos.ClientService.BlockingInterface getClientProtocol(ServerName serverName)
103   throws IOException {
104     return ((ClusterConnection)this.connection).getClient(serverName);
105   }
106 
107   @Override
108   public void startRegionServer(String hostname, int port) throws IOException {
109     LOG.info("Starting RS on: " + hostname);
110     clusterManager.start(ServiceType.HBASE_REGIONSERVER, hostname, port);
111   }
112 
113   @Override
114   public void killRegionServer(ServerName serverName) throws IOException {
115     LOG.info("Aborting RS: " + serverName.getServerName());
116     clusterManager.kill(ServiceType.HBASE_REGIONSERVER,
117             serverName.getHostname(),
118             serverName.getPort());
119   }
120 
121   @Override
122   public void stopRegionServer(ServerName serverName) throws IOException {
123     LOG.info("Stopping RS: " + serverName.getServerName());
124     clusterManager.stop(ServiceType.HBASE_REGIONSERVER,
125             serverName.getHostname(),
126             serverName.getPort());
127   }
128 
129   @Override
130   public void waitForRegionServerToStop(ServerName serverName, long timeout) throws IOException {
131     waitForServiceToStop(ServiceType.HBASE_REGIONSERVER, serverName, timeout);
132   }
133 
134   private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
135     throws IOException {
136     LOG.info("Waiting service:" + service + " to stop: " + serverName.getServerName());
137     long start = System.currentTimeMillis();
138 
139     while ((System.currentTimeMillis() - start) < timeout) {
140       if (!clusterManager.isRunning(service, serverName.getHostname(), serverName.getPort())) {
141         return;
142       }
143       Threads.sleep(1000);
144     }
145     throw new IOException("did timeout waiting for service to stop:" + serverName);
146   }
147 
148   @Override
149   public MasterService.BlockingInterface getMasterAdminService()
150   throws IOException {
151     return ((ClusterConnection)this.connection).getMaster();
152   }
153 
154   @Override
155   public void startMaster(String hostname, int port) throws IOException {
156     LOG.info("Starting Master on: " + hostname + ":" + port);
157     clusterManager.start(ServiceType.HBASE_MASTER, hostname, port);
158   }
159 
160   @Override
161   public void killMaster(ServerName serverName) throws IOException {
162     LOG.info("Aborting Master: " + serverName.getServerName());
163     clusterManager.kill(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort());
164   }
165 
166   @Override
167   public void stopMaster(ServerName serverName) throws IOException {
168     LOG.info("Stopping Master: " + serverName.getServerName());
169     clusterManager.stop(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort());
170   }
171 
172   @Override
173   public void waitForMasterToStop(ServerName serverName, long timeout) throws IOException {
174     waitForServiceToStop(ServiceType.HBASE_MASTER, serverName, timeout);
175   }
176 
177   @Override
178   public boolean waitForActiveAndReadyMaster(long timeout) throws IOException {
179     long start = System.currentTimeMillis();
180     while (System.currentTimeMillis() - start < timeout) {
181       try {
182         getMasterAdminService();
183         return true;
184       } catch (MasterNotRunningException m) {
185         LOG.warn("Master not started yet " + m);
186       } catch (ZooKeeperConnectionException e) {
187         LOG.warn("Failed to connect to ZK " + e);
188       }
189       Threads.sleep(1000);
190     }
191     return false;
192   }
193 
194   @Override
195   public ServerName getServerHoldingRegion(TableName tn, byte[] regionName) throws IOException {
196     HRegionLocation regionLoc = null;
197     try (RegionLocator locator = connection.getRegionLocator(tn)) {
198       regionLoc = locator.getRegionLocation(regionName);
199     }
200     if (regionLoc == null) {
201       LOG.warn("Cannot find region server holding region " + Bytes.toString(regionName) +
202         ", start key [" + Bytes.toString(HRegionInfo.getStartKey(regionName)) + "]");
203       return null;
204     }
205 
206     AdminProtos.AdminService.BlockingInterface client =
207         ((ClusterConnection)this.connection).getAdmin(regionLoc.getServerName());
208     ServerInfo info = ProtobufUtil.getServerInfo(null, client);
209     return ProtobufUtil.toServerName(info.getServerName());
210   }
211 
212   @Override
213   public void waitUntilShutDown() {
214     // Simply wait for a few seconds for now (after issuing serverManager.kill
215     throw new RuntimeException("Not implemented yet");
216   }
217 
218   @Override
219   public void shutdown() throws IOException {
220     // not sure we want this
221     throw new RuntimeException("Not implemented yet");
222   }
223 
224   @Override
225   public boolean isDistributedCluster() {
226     return true;
227   }
228 
229   @Override
230   public boolean restoreClusterStatus(ClusterStatus initial) throws IOException {
231     ClusterStatus current = getClusterStatus();
232 
233     LOG.info("Restoring cluster - started");
234 
235     // do a best effort restore
236     boolean success = true;
237     success = restoreMasters(initial, current) & success;
238     success = restoreRegionServers(initial, current) & success;
239     success = restoreAdmin() & success;
240 
241     LOG.info("Restoring cluster - done");
242     return success;
243   }
244 
245   protected boolean restoreMasters(ClusterStatus initial, ClusterStatus current) {
246     List<IOException> deferred = new ArrayList<IOException>();
247     //check whether current master has changed
248     final ServerName initMaster = initial.getMaster();
249     if (!ServerName.isSameHostnameAndPort(initMaster, current.getMaster())) {
250       LOG.info("Restoring cluster - Initial active master : "
251               + initMaster.getHostAndPort()
252               + " has changed to : "
253               + current.getMaster().getHostAndPort());
254       // If initial master is stopped, start it, before restoring the state.
255       // It will come up as a backup master, if there is already an active master.
256       try {
257         if (!clusterManager.isRunning(ServiceType.HBASE_MASTER,
258                 initMaster.getHostname(), initMaster.getPort())) {
259           LOG.info("Restoring cluster - starting initial active master at:"
260                   + initMaster.getHostAndPort());
261           startMaster(initMaster.getHostname(), initMaster.getPort());
262         }
263 
264         // master has changed, we would like to undo this.
265         // 1. Kill the current backups
266         // 2. Stop current master
267         // 3. Start backup masters
268         for (ServerName currentBackup : current.getBackupMasters()) {
269           if (!ServerName.isSameHostnameAndPort(currentBackup, initMaster)) {
270             LOG.info("Restoring cluster - stopping backup master: " + currentBackup);
271             stopMaster(currentBackup);
272           }
273         }
274         LOG.info("Restoring cluster - stopping active master: " + current.getMaster());
275         stopMaster(current.getMaster());
276         waitForActiveAndReadyMaster(); // wait so that active master takes over
277       } catch (IOException ex) {
278         // if we fail to start the initial active master, we do not want to continue stopping
279         // backup masters. Just keep what we have now
280         deferred.add(ex);
281       }
282 
283       //start backup masters
284       for (ServerName backup : initial.getBackupMasters()) {
285         try {
286           //these are not started in backup mode, but we should already have an active master
287           if (!clusterManager.isRunning(ServiceType.HBASE_MASTER,
288                   backup.getHostname(),
289                   backup.getPort())) {
290             LOG.info("Restoring cluster - starting initial backup master: "
291                     + backup.getHostAndPort());
292             startMaster(backup.getHostname(), backup.getPort());
293           }
294         } catch (IOException ex) {
295           deferred.add(ex);
296         }
297       }
298     } else {
299       //current master has not changed, match up backup masters
300       Set<ServerName> toStart = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
301       Set<ServerName> toKill = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
302       toStart.addAll(initial.getBackupMasters());
303       toKill.addAll(current.getBackupMasters());
304 
305       for (ServerName server : current.getBackupMasters()) {
306         toStart.remove(server);
307       }
308       for (ServerName server: initial.getBackupMasters()) {
309         toKill.remove(server);
310       }
311 
312       for (ServerName sn:toStart) {
313         try {
314           if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) {
315             LOG.info("Restoring cluster - starting initial backup master: " + sn.getHostAndPort());
316             startMaster(sn.getHostname(), sn.getPort());
317           }
318         } catch (IOException ex) {
319           deferred.add(ex);
320         }
321       }
322 
323       for (ServerName sn:toKill) {
324         try {
325           if(clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) {
326             LOG.info("Restoring cluster - stopping backup master: " + sn.getHostAndPort());
327             stopMaster(sn);
328           }
329         } catch (IOException ex) {
330           deferred.add(ex);
331         }
332       }
333     }
334     if (!deferred.isEmpty()) {
335       LOG.warn("Restoring cluster - restoring region servers reported "
336               + deferred.size() + " errors:");
337       for (int i=0; i<deferred.size() && i < 3; i++) {
338         LOG.warn(deferred.get(i));
339       }
340     }
341 
342     return deferred.isEmpty();
343   }
344 
345 
346   private static class ServerNameIgnoreStartCodeComparator implements Comparator<ServerName> {
347     @Override
348     public int compare(ServerName o1, ServerName o2) {
349       int compare = o1.getHostname().compareToIgnoreCase(o2.getHostname());
350       if (compare != 0) return compare;
351       compare = o1.getPort() - o2.getPort();
352       if (compare != 0) return compare;
353       return 0;
354     }
355   }
356 
357   protected boolean restoreRegionServers(ClusterStatus initial, ClusterStatus current) {
358     Set<ServerName> toStart = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
359     Set<ServerName> toKill = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
360     toStart.addAll(initial.getServers());
361     toKill.addAll(current.getServers());
362 
363     for (ServerName server : current.getServers()) {
364       toStart.remove(server);
365     }
366     for (ServerName server: initial.getServers()) {
367       toKill.remove(server);
368     }
369 
370     List<IOException> deferred = new ArrayList<IOException>();
371 
372     for(ServerName sn:toStart) {
373       try {
374         if (!clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER,
375                 sn.getHostname(),
376                 sn.getPort())) {
377           LOG.info("Restoring cluster - starting initial region server: " + sn.getHostAndPort());
378           startRegionServer(sn.getHostname(), sn.getPort());
379         }
380       } catch (IOException ex) {
381         deferred.add(ex);
382       }
383     }
384 
385     for(ServerName sn:toKill) {
386       try {
387         if (clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER,
388                 sn.getHostname(),
389                 sn.getPort())) {
390           LOG.info("Restoring cluster - stopping initial region server: " + sn.getHostAndPort());
391           stopRegionServer(sn);
392         }
393       } catch (IOException ex) {
394         deferred.add(ex);
395       }
396     }
397     if (!deferred.isEmpty()) {
398       LOG.warn("Restoring cluster - restoring region servers reported "
399               + deferred.size() + " errors:");
400       for (int i=0; i<deferred.size() && i < 3; i++) {
401         LOG.warn(deferred.get(i));
402       }
403     }
404 
405     return deferred.isEmpty();
406   }
407 
408   protected boolean restoreAdmin() throws IOException {
409     // While restoring above, if the HBase Master which was initially the Active one, was down
410     // and the restore put the cluster back to Initial configuration, HAdmin instance will need
411     // to refresh its connections (otherwise it will return incorrect information) or we can
412     // point it to new instance.
413     try {
414       admin.close();
415     } catch (IOException ioe) {
416       LOG.warn("While closing the old connection", ioe);
417     }
418     this.admin = this.connection.getAdmin();
419     LOG.info("Added new HBaseAdmin");
420     return true;
421   }
422 }