1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.hadoop.hbase;
19
20 import java.io.IOException;
21 import java.util.ArrayList;
22 import java.util.Comparator;
23 import java.util.List;
24 import java.util.Set;
25 import java.util.TreeSet;
26
27 import org.apache.hadoop.conf.Configuration;
28 import org.apache.hadoop.hbase.ClusterManager.ServiceType;
29 import org.apache.hadoop.hbase.classification.InterfaceAudience;
30 import org.apache.hadoop.hbase.client.Admin;
31 import org.apache.hadoop.hbase.client.ClusterConnection;
32 import org.apache.hadoop.hbase.client.Connection;
33 import org.apache.hadoop.hbase.client.ConnectionFactory;
34 import org.apache.hadoop.hbase.client.RegionLocator;
35 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
36 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
37 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.ServerInfo;
38 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
39 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MasterService;
40 import org.apache.hadoop.hbase.util.Bytes;
41 import org.apache.hadoop.hbase.util.Threads;
42
43
44
45
46
47 @InterfaceAudience.Private
48 public class DistributedHBaseCluster extends HBaseCluster {
49 private Admin admin;
50 private final Connection connection;
51
52 private ClusterManager clusterManager;
53
54 public DistributedHBaseCluster(Configuration conf, ClusterManager clusterManager)
55 throws IOException {
56 super(conf);
57 this.clusterManager = clusterManager;
58 this.connection = ConnectionFactory.createConnection(conf);
59 this.admin = this.connection.getAdmin();
60 this.initialClusterStatus = getClusterStatus();
61 }
62
63 public void setClusterManager(ClusterManager clusterManager) {
64 this.clusterManager = clusterManager;
65 }
66
67 public ClusterManager getClusterManager() {
68 return clusterManager;
69 }
70
71
72
73
74
75 @Override
76 public ClusterStatus getClusterStatus() throws IOException {
77 return admin.getClusterStatus();
78 }
79
80 @Override
81 public ClusterStatus getInitialClusterStatus() throws IOException {
82 return initialClusterStatus;
83 }
84
85 @Override
86 public void close() throws IOException {
87 if (this.admin != null) {
88 admin.close();
89 }
90 if (this.connection != null && !this.connection.isClosed()) {
91 this.connection.close();
92 }
93 }
94
95 @Override
96 public AdminProtos.AdminService.BlockingInterface getAdminProtocol(ServerName serverName)
97 throws IOException {
98 return ((ClusterConnection)this.connection).getAdmin(serverName);
99 }
100
101 @Override
102 public ClientProtos.ClientService.BlockingInterface getClientProtocol(ServerName serverName)
103 throws IOException {
104 return ((ClusterConnection)this.connection).getClient(serverName);
105 }
106
107 @Override
108 public void startRegionServer(String hostname, int port) throws IOException {
109 LOG.info("Starting RS on: " + hostname);
110 clusterManager.start(ServiceType.HBASE_REGIONSERVER, hostname, port);
111 }
112
113 @Override
114 public void killRegionServer(ServerName serverName) throws IOException {
115 LOG.info("Aborting RS: " + serverName.getServerName());
116 clusterManager.kill(ServiceType.HBASE_REGIONSERVER,
117 serverName.getHostname(),
118 serverName.getPort());
119 }
120
121 @Override
122 public void stopRegionServer(ServerName serverName) throws IOException {
123 LOG.info("Stopping RS: " + serverName.getServerName());
124 clusterManager.stop(ServiceType.HBASE_REGIONSERVER,
125 serverName.getHostname(),
126 serverName.getPort());
127 }
128
129 @Override
130 public void waitForRegionServerToStop(ServerName serverName, long timeout) throws IOException {
131 waitForServiceToStop(ServiceType.HBASE_REGIONSERVER, serverName, timeout);
132 }
133
134 private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
135 throws IOException {
136 LOG.info("Waiting service:" + service + " to stop: " + serverName.getServerName());
137 long start = System.currentTimeMillis();
138
139 while ((System.currentTimeMillis() - start) < timeout) {
140 if (!clusterManager.isRunning(service, serverName.getHostname(), serverName.getPort())) {
141 return;
142 }
143 Threads.sleep(1000);
144 }
145 throw new IOException("did timeout waiting for service to stop:" + serverName);
146 }
147
148 @Override
149 public MasterService.BlockingInterface getMasterAdminService()
150 throws IOException {
151 return ((ClusterConnection)this.connection).getMaster();
152 }
153
154 @Override
155 public void startMaster(String hostname, int port) throws IOException {
156 LOG.info("Starting Master on: " + hostname + ":" + port);
157 clusterManager.start(ServiceType.HBASE_MASTER, hostname, port);
158 }
159
160 @Override
161 public void killMaster(ServerName serverName) throws IOException {
162 LOG.info("Aborting Master: " + serverName.getServerName());
163 clusterManager.kill(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort());
164 }
165
166 @Override
167 public void stopMaster(ServerName serverName) throws IOException {
168 LOG.info("Stopping Master: " + serverName.getServerName());
169 clusterManager.stop(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort());
170 }
171
172 @Override
173 public void waitForMasterToStop(ServerName serverName, long timeout) throws IOException {
174 waitForServiceToStop(ServiceType.HBASE_MASTER, serverName, timeout);
175 }
176
177 @Override
178 public boolean waitForActiveAndReadyMaster(long timeout) throws IOException {
179 long start = System.currentTimeMillis();
180 while (System.currentTimeMillis() - start < timeout) {
181 try {
182 getMasterAdminService();
183 return true;
184 } catch (MasterNotRunningException m) {
185 LOG.warn("Master not started yet " + m);
186 } catch (ZooKeeperConnectionException e) {
187 LOG.warn("Failed to connect to ZK " + e);
188 }
189 Threads.sleep(1000);
190 }
191 return false;
192 }
193
194 @Override
195 public ServerName getServerHoldingRegion(TableName tn, byte[] regionName) throws IOException {
196 HRegionLocation regionLoc = null;
197 try (RegionLocator locator = connection.getRegionLocator(tn)) {
198 regionLoc = locator.getRegionLocation(regionName);
199 }
200 if (regionLoc == null) {
201 LOG.warn("Cannot find region server holding region " + Bytes.toString(regionName) +
202 ", start key [" + Bytes.toString(HRegionInfo.getStartKey(regionName)) + "]");
203 return null;
204 }
205
206 AdminProtos.AdminService.BlockingInterface client =
207 ((ClusterConnection)this.connection).getAdmin(regionLoc.getServerName());
208 ServerInfo info = ProtobufUtil.getServerInfo(null, client);
209 return ProtobufUtil.toServerName(info.getServerName());
210 }
211
212 @Override
213 public void waitUntilShutDown() {
214
215 throw new RuntimeException("Not implemented yet");
216 }
217
218 @Override
219 public void shutdown() throws IOException {
220
221 throw new RuntimeException("Not implemented yet");
222 }
223
224 @Override
225 public boolean isDistributedCluster() {
226 return true;
227 }
228
229 @Override
230 public boolean restoreClusterStatus(ClusterStatus initial) throws IOException {
231 ClusterStatus current = getClusterStatus();
232
233 LOG.info("Restoring cluster - started");
234
235
236 boolean success = true;
237 success = restoreMasters(initial, current) & success;
238 success = restoreRegionServers(initial, current) & success;
239 success = restoreAdmin() & success;
240
241 LOG.info("Restoring cluster - done");
242 return success;
243 }
244
245 protected boolean restoreMasters(ClusterStatus initial, ClusterStatus current) {
246 List<IOException> deferred = new ArrayList<IOException>();
247
248 final ServerName initMaster = initial.getMaster();
249 if (!ServerName.isSameHostnameAndPort(initMaster, current.getMaster())) {
250 LOG.info("Restoring cluster - Initial active master : "
251 + initMaster.getHostAndPort()
252 + " has changed to : "
253 + current.getMaster().getHostAndPort());
254
255
256 try {
257 if (!clusterManager.isRunning(ServiceType.HBASE_MASTER,
258 initMaster.getHostname(), initMaster.getPort())) {
259 LOG.info("Restoring cluster - starting initial active master at:"
260 + initMaster.getHostAndPort());
261 startMaster(initMaster.getHostname(), initMaster.getPort());
262 }
263
264
265
266
267
268 for (ServerName currentBackup : current.getBackupMasters()) {
269 if (!ServerName.isSameHostnameAndPort(currentBackup, initMaster)) {
270 LOG.info("Restoring cluster - stopping backup master: " + currentBackup);
271 stopMaster(currentBackup);
272 }
273 }
274 LOG.info("Restoring cluster - stopping active master: " + current.getMaster());
275 stopMaster(current.getMaster());
276 waitForActiveAndReadyMaster();
277 } catch (IOException ex) {
278
279
280 deferred.add(ex);
281 }
282
283
284 for (ServerName backup : initial.getBackupMasters()) {
285 try {
286
287 if (!clusterManager.isRunning(ServiceType.HBASE_MASTER,
288 backup.getHostname(),
289 backup.getPort())) {
290 LOG.info("Restoring cluster - starting initial backup master: "
291 + backup.getHostAndPort());
292 startMaster(backup.getHostname(), backup.getPort());
293 }
294 } catch (IOException ex) {
295 deferred.add(ex);
296 }
297 }
298 } else {
299
300 Set<ServerName> toStart = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
301 Set<ServerName> toKill = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
302 toStart.addAll(initial.getBackupMasters());
303 toKill.addAll(current.getBackupMasters());
304
305 for (ServerName server : current.getBackupMasters()) {
306 toStart.remove(server);
307 }
308 for (ServerName server: initial.getBackupMasters()) {
309 toKill.remove(server);
310 }
311
312 for (ServerName sn:toStart) {
313 try {
314 if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) {
315 LOG.info("Restoring cluster - starting initial backup master: " + sn.getHostAndPort());
316 startMaster(sn.getHostname(), sn.getPort());
317 }
318 } catch (IOException ex) {
319 deferred.add(ex);
320 }
321 }
322
323 for (ServerName sn:toKill) {
324 try {
325 if(clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) {
326 LOG.info("Restoring cluster - stopping backup master: " + sn.getHostAndPort());
327 stopMaster(sn);
328 }
329 } catch (IOException ex) {
330 deferred.add(ex);
331 }
332 }
333 }
334 if (!deferred.isEmpty()) {
335 LOG.warn("Restoring cluster - restoring region servers reported "
336 + deferred.size() + " errors:");
337 for (int i=0; i<deferred.size() && i < 3; i++) {
338 LOG.warn(deferred.get(i));
339 }
340 }
341
342 return deferred.isEmpty();
343 }
344
345
346 private static class ServerNameIgnoreStartCodeComparator implements Comparator<ServerName> {
347 @Override
348 public int compare(ServerName o1, ServerName o2) {
349 int compare = o1.getHostname().compareToIgnoreCase(o2.getHostname());
350 if (compare != 0) return compare;
351 compare = o1.getPort() - o2.getPort();
352 if (compare != 0) return compare;
353 return 0;
354 }
355 }
356
357 protected boolean restoreRegionServers(ClusterStatus initial, ClusterStatus current) {
358 Set<ServerName> toStart = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
359 Set<ServerName> toKill = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
360 toStart.addAll(initial.getServers());
361 toKill.addAll(current.getServers());
362
363 for (ServerName server : current.getServers()) {
364 toStart.remove(server);
365 }
366 for (ServerName server: initial.getServers()) {
367 toKill.remove(server);
368 }
369
370 List<IOException> deferred = new ArrayList<IOException>();
371
372 for(ServerName sn:toStart) {
373 try {
374 if (!clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER,
375 sn.getHostname(),
376 sn.getPort())) {
377 LOG.info("Restoring cluster - starting initial region server: " + sn.getHostAndPort());
378 startRegionServer(sn.getHostname(), sn.getPort());
379 }
380 } catch (IOException ex) {
381 deferred.add(ex);
382 }
383 }
384
385 for(ServerName sn:toKill) {
386 try {
387 if (clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER,
388 sn.getHostname(),
389 sn.getPort())) {
390 LOG.info("Restoring cluster - stopping initial region server: " + sn.getHostAndPort());
391 stopRegionServer(sn);
392 }
393 } catch (IOException ex) {
394 deferred.add(ex);
395 }
396 }
397 if (!deferred.isEmpty()) {
398 LOG.warn("Restoring cluster - restoring region servers reported "
399 + deferred.size() + " errors:");
400 for (int i=0; i<deferred.size() && i < 3; i++) {
401 LOG.warn(deferred.get(i));
402 }
403 }
404
405 return deferred.isEmpty();
406 }
407
408 protected boolean restoreAdmin() throws IOException {
409
410
411
412
413 try {
414 admin.close();
415 } catch (IOException ioe) {
416 LOG.warn("While closing the old connection", ioe);
417 }
418 this.admin = this.connection.getAdmin();
419 LOG.info("Added new HBaseAdmin");
420 return true;
421 }
422 }