View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master.balancer;
19  
20  import java.util.ArrayDeque;
21  import java.util.Arrays;
22  import java.util.Collection;
23  import java.util.Deque;
24  import java.util.HashMap;
25  import java.util.LinkedList;
26  import java.util.List;
27  import java.util.Map;
28  import java.util.Map.Entry;
29  import java.util.Random;
30  
31  import org.apache.commons.logging.Log;
32  import org.apache.commons.logging.LogFactory;
33  import org.apache.hadoop.hbase.classification.InterfaceAudience;
34  import org.apache.hadoop.conf.Configuration;
35  import org.apache.hadoop.hbase.ClusterStatus;
36  import org.apache.hadoop.hbase.HBaseInterfaceAudience;
37  import org.apache.hadoop.hbase.HConstants;
38  import org.apache.hadoop.hbase.HRegionInfo;
39  import org.apache.hadoop.hbase.HTableDescriptor;
40  import org.apache.hadoop.hbase.RegionLoad;
41  import org.apache.hadoop.hbase.ServerLoad;
42  import org.apache.hadoop.hbase.ServerName;
43  import org.apache.hadoop.hbase.TableName;
44  import org.apache.hadoop.hbase.master.MasterServices;
45  import org.apache.hadoop.hbase.master.RegionPlan;
46  import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer.Cluster.Action;
47  import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer.Cluster.Action.Type;
48  import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer.Cluster.AssignRegionAction;
49  import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer.Cluster.MoveRegionAction;
50  import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer.Cluster.SwapRegionsAction;
51  import org.apache.hadoop.hbase.util.Bytes;
52  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
53  
54  /**
55   * <p>This is a best effort load balancer. Given a cost function F(C) => x, it will
56   * randomly try and mutate the cluster to Cprime. If F(Cprime) < F(C) then the
57   * new cluster state becomes the plan. It includes cost functions to compute the cost of:</p>
58   * <ul>
59   * <li>Region Load</li>
60   * <li>Table Load</li>
61   * <li>Data Locality</li>
62   * <li>Memstore Sizes</li>
63   * <li>Storefile Sizes</li>
64   * </ul>
65   *
66   *
67   * <p>Every cost function returns a number between 0 and 1 inclusive; where 0 is the lowest cost
68   * best solution, and 1 is the highest possible cost and the worst solution.  The computed costs are
69   * scaled by their respective multipliers:</p>
70   *
71   * <ul>
72   *   <li>hbase.master.balancer.stochastic.regionLoadCost</li>
73   *   <li>hbase.master.balancer.stochastic.moveCost</li>
74   *   <li>hbase.master.balancer.stochastic.tableLoadCost</li>
75   *   <li>hbase.master.balancer.stochastic.localityCost</li>
76   *   <li>hbase.master.balancer.stochastic.memstoreSizeCost</li>
77   *   <li>hbase.master.balancer.stochastic.storefileSizeCost</li>
78   * </ul>
79   *
80   * <p>In addition to the above configurations, the balancer can be tuned by the following
81   * configuration values:</p>
82   * <ul>
83   *   <li>hbase.master.balancer.stochastic.maxMoveRegions which
84   *   controls the maximum number of regions that can be moved in a single invocation of this
85   *   balancer.</li>
86   *   <li>hbase.master.balancer.stochastic.stepsPerRegion is the coefficient by which the number of
87   *   regions is multiplied to try and get the number of times the balancer will
88   *   mutate all servers.</li>
89   *   <li>hbase.master.balancer.stochastic.maxSteps which controls the maximum number of times that
90   *   the balancer will try and mutate all the servers. The balancer will use the minimum of this
91   *   value and the above computation.</li>
92   * </ul>
93   *
94   * <p>This balancer is best used with hbase.master.loadbalance.bytable set to false
95   * so that the balancer gets the full picture of all loads on the cluster.</p>
96   */
97  @InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.CONFIG)
98  public class StochasticLoadBalancer extends BaseLoadBalancer {
99  
  // Configuration keys read in setConf(); each one overrides the matching default below.
  protected static final String STEPS_PER_REGION_KEY =
      "hbase.master.balancer.stochastic.stepsPerRegion";
  protected static final String MAX_STEPS_KEY =
      "hbase.master.balancer.stochastic.maxSteps";
  protected static final String MAX_RUNNING_TIME_KEY =
      "hbase.master.balancer.stochastic.maxRunningTime";
  protected static final String KEEP_REGION_LOADS =
      "hbase.master.balancer.stochastic.numRegionLoadsToRemember";
  // NOTE(review): no use of this separator is visible in this part of the file.
  private static final String TABLE_FUNCTION_SEP = "_";

  // Shared source of randomness for the candidate generators and the stochastic walk.
  private static final Random RANDOM = new Random(System.currentTimeMillis());
  private static final Log LOG = LogFactory.getLog(StochasticLoadBalancer.class);

  // Recent RegionLoad samples per region, keyed by the string form of the key found in
  // ServerLoad.getRegionsLoad(). Rebuilt by updateRegionLoad() on every cluster status update.
  Map<String, Deque<RegionLoad>> loads = new HashMap<String, Deque<RegionLoad>>();

  // values are defaults
  // Upper bound on the number of steps of the stochastic walk.
  private int maxSteps = 1000000;
  // Multiplied by the number of regions and the number of servers to size the walk.
  private int stepsPerRegion = 800;
  // Compared against elapsed EnvironmentEdgeManager time in balanceCluster().
  private long maxRunningTime = 30 * 1000 * 1; // 30 seconds.
  // Maximum number of RegionLoad samples kept per region in loads.
  private int numRegionLoadsToRemember = 15;

  // Generators the walk picks from at random; created once, on the first setConf() call.
  private CandidateGenerator[] candidateGenerators;
  // The cost functions that are fed region loads and cluster status; also part of costFunctions.
  private CostFromRegionLoadFunction[] regionLoadFunctions;
  // Every cost function that contributes to computeCost(); rebuilt by each setConf() call.
  private CostFunction[] costFunctions;

  // to save and report costs to JMX
  private Double curOverallCost = 0d;
  // Scratch array filled by computeCost(); one slot per entry of costFunctions.
  private Double[] tempFunctionCosts;
  // Copy of tempFunctionCosts taken whenever a cost is accepted as the current cost.
  private Double[] curFunctionCosts;

  // Keep locality based picker and cost function to alert them
  // when new services are offered
  private LocalityBasedCandidateGenerator localityCandidateGenerator;
  private LocalityCostFunction localityCost;
  // Kept as fields so areSomeRegionReplicasColocated() can query them directly.
  private RegionReplicaHostCostFunction regionReplicaHostCostFunction;
  private RegionReplicaRackCostFunction regionReplicaRackCostFunction;
  // Read from HConstants.HBASE_MASTER_LOADBALANCE_BYTABLE in setConf().
  private boolean isByTable = false;
  // Set by balanceCluster(TableName, Map); used to label the costs reported to JMX.
  private TableName tableName = null;
138 
139   /**
140    * The constructor that pass a MetricsStochasticBalancer to BaseLoadBalancer to replace its
141    * default MetricsBalancer
142    */
143   public StochasticLoadBalancer() {
144     super(new MetricsStochasticBalancer());
145   }
146 
147   @Override
148   public void onConfigurationChange(Configuration conf) {
149     setConf(conf);
150   }
151 
152   @Override
153   public synchronized void setConf(Configuration conf) {
154     super.setConf(conf);
155     LOG.info("loading config");
156 
157     maxSteps = conf.getInt(MAX_STEPS_KEY, maxSteps);
158 
159     stepsPerRegion = conf.getInt(STEPS_PER_REGION_KEY, stepsPerRegion);
160     maxRunningTime = conf.getLong(MAX_RUNNING_TIME_KEY, maxRunningTime);
161 
162     numRegionLoadsToRemember = conf.getInt(KEEP_REGION_LOADS, numRegionLoadsToRemember);
163     isByTable = conf.getBoolean(HConstants.HBASE_MASTER_LOADBALANCE_BYTABLE, isByTable);
164 
165     if (localityCandidateGenerator == null) {
166       localityCandidateGenerator = new LocalityBasedCandidateGenerator(services);
167     }
168     localityCost = new LocalityCostFunction(conf, services);
169 
170     if (candidateGenerators == null) {
171       candidateGenerators = new CandidateGenerator[] {
172           new RandomCandidateGenerator(),
173           new LoadCandidateGenerator(),
174           localityCandidateGenerator,
175           new RegionReplicaRackCandidateGenerator(),
176       };
177     }
178 
179     regionLoadFunctions = new CostFromRegionLoadFunction[] {
180       new ReadRequestCostFunction(conf),
181       new WriteRequestCostFunction(conf),
182       new MemstoreSizeCostFunction(conf),
183       new StoreFileCostFunction(conf)
184     };
185 
186     regionReplicaHostCostFunction = new RegionReplicaHostCostFunction(conf);
187     regionReplicaRackCostFunction = new RegionReplicaRackCostFunction(conf);
188 
189     costFunctions = new CostFunction[]{
190       new RegionCountSkewCostFunction(conf),
191       new PrimaryRegionCountSkewCostFunction(conf),
192       new MoveCostFunction(conf),
193       localityCost,
194       new TableSkewCostFunction(conf),
195       regionReplicaHostCostFunction,
196       regionReplicaRackCostFunction,
197       regionLoadFunctions[0],
198       regionLoadFunctions[1],
199       regionLoadFunctions[2],
200       regionLoadFunctions[3],
201     };
202 
203     curFunctionCosts= new Double[costFunctions.length];
204     tempFunctionCosts= new Double[costFunctions.length];
205 
206   }
207 
208   @Override
209   protected void setSlop(Configuration conf) {
210     this.slop = conf.getFloat("hbase.regions.slop", 0.001F);
211   }
212 
213   @Override
214   public synchronized void setClusterStatus(ClusterStatus st) {
215     super.setClusterStatus(st);
216     updateRegionLoad();
217     for(CostFromRegionLoadFunction cost : regionLoadFunctions) {
218       cost.setClusterStatus(st);
219     }
220 
221     // update metrics size
222     try {
223       // by-table or ensemble mode
224       int tablesCount = isByTable ? services.getTableDescriptors().getAll().size() : 1;
225       int functionsCount = getCostFunctionNames().length;
226 
227       updateMetricsSize(tablesCount * (functionsCount + 1)); // +1 for overall
228     } catch (Exception e) {
229       LOG.error("failed to get the size of all tables, exception = " + e.getMessage());
230     }
231   }
232 
233   /**
234    * Update the number of metrics that are reported to JMX
235    */
236   public void updateMetricsSize(int size) {
237     if (metricsBalancer instanceof MetricsStochasticBalancer) {
238         ((MetricsStochasticBalancer) metricsBalancer).updateMetricsSize(size);
239     }
240   }
241 
242   @Override
243   public synchronized void setMasterServices(MasterServices masterServices) {
244     super.setMasterServices(masterServices);
245     this.localityCost.setServices(masterServices);
246     this.localityCandidateGenerator.setServices(masterServices);
247 
248   }
249 
250   @Override
251   protected synchronized boolean areSomeRegionReplicasColocated(Cluster c) {
252     regionReplicaHostCostFunction.init(c);
253     if (regionReplicaHostCostFunction.cost() > 0) return true;
254     regionReplicaRackCostFunction.init(c);
255     if (regionReplicaRackCostFunction.cost() > 0) return true;
256     return false;
257   }
258 
259   @Override
260   public synchronized List<RegionPlan> balanceCluster(TableName tableName, Map<ServerName,
261     List<HRegionInfo>> clusterState) {
262     this.tableName = tableName;
263     return balanceCluster(clusterState);
264   }
265 
266   /**
267    * Given the cluster state this will try and approach an optimal balance. This
268    * should always approach the optimal state given enough steps.
269    */
270   @Override
271   public synchronized List<RegionPlan> balanceCluster(Map<ServerName,
272     List<HRegionInfo>> clusterState) {
273     List<RegionPlan> plans = balanceMasterRegions(clusterState);
274     if (plans != null || clusterState == null || clusterState.size() <= 1) {
275       return plans;
276     }
277 
278     if (masterServerName != null && clusterState.containsKey(masterServerName)) {
279       if (clusterState.size() <= 2) {
280         return null;
281       }
282       clusterState = new HashMap<ServerName, List<HRegionInfo>>(clusterState);
283       clusterState.remove(masterServerName);
284     }
285 
286     // On clusters with lots of HFileLinks or lots of reference files,
287     // instantiating the storefile infos can be quite expensive.
288     // Allow turning this feature off if the locality cost is not going to
289     // be used in any computations.
290     RegionLocationFinder finder = null;
291     if (this.localityCost != null && this.localityCost.getMultiplier() > 0) {
292       finder = this.regionFinder;
293     }
294 
295     //The clusterState that is given to this method contains the state
296     //of all the regions in the table(s) (that's true today)
297     // Keep track of servers to iterate through them.
298     Cluster cluster = new Cluster(clusterState, loads, finder, rackManager);
299 
300     if (!needsBalance(cluster)) {
301       return null;
302     }
303 
304     long startTime = EnvironmentEdgeManager.currentTime();
305 
306     initCosts(cluster);
307 
308     double currentCost = computeCost(cluster, Double.MAX_VALUE);
309     curOverallCost = currentCost;
310     for (int i = 0; i < this.curFunctionCosts.length; i++) {
311       curFunctionCosts[i] = tempFunctionCosts[i];
312     }
313 
314     double initCost = currentCost;
315     double newCost = currentCost;
316 
317     long computedMaxSteps = Math.min(this.maxSteps,
318         ((long)cluster.numRegions * (long)this.stepsPerRegion * (long)cluster.numServers));
319     // Perform a stochastic walk to see if we can get a good fit.
320     long step;
321 
322     for (step = 0; step < computedMaxSteps; step++) {
323       int generatorIdx = RANDOM.nextInt(candidateGenerators.length);
324       CandidateGenerator p = candidateGenerators[generatorIdx];
325       Cluster.Action action = p.generate(cluster);
326 
327       if (action.type == Type.NULL) {
328         continue;
329       }
330 
331       cluster.doAction(action);
332       updateCostsWithAction(cluster, action);
333 
334       newCost = computeCost(cluster, currentCost);
335 
336       // Should this be kept?
337       if (newCost < currentCost) {
338         currentCost = newCost;
339 
340         // save for JMX
341         curOverallCost = currentCost;
342         for (int i = 0; i < this.curFunctionCosts.length; i++) {
343           curFunctionCosts[i] = tempFunctionCosts[i];
344         }
345       } else {
346         // Put things back the way they were before.
347         // TODO: undo by remembering old values
348         Action undoAction = action.undoAction();
349         cluster.doAction(undoAction);
350         updateCostsWithAction(cluster, undoAction);
351       }
352 
353       if (EnvironmentEdgeManager.currentTime() - startTime >
354           maxRunningTime) {
355         break;
356       }
357     }
358 
359     long endTime = EnvironmentEdgeManager.currentTime();
360 
361     metricsBalancer.balanceCluster(endTime - startTime);
362 
363     // update costs metrics
364     updateStochasticCosts(tableName, curOverallCost, curFunctionCosts);
365     if (initCost > currentCost) {
366       plans = createRegionPlans(cluster);
367       if (LOG.isDebugEnabled()) {
368         LOG.debug("Finished computing new load balance plan.  Computation took "
369             + (endTime - startTime) + "ms to try " + step
370             + " different iterations.  Found a solution that moves "
371             + plans.size() + " regions; Going from a computed cost of "
372             + initCost + " to a new cost of " + currentCost);
373       }
374 
375       return plans;
376     }
377     if (LOG.isDebugEnabled()) {
378       LOG.debug("Could not find a better load balance plan.  Tried "
379           + step + " different configurations in " + (endTime - startTime)
380           + "ms, and did not find anything with a computed cost less than " + initCost);
381     }
382     return null;
383   }
384 
385   /**
386    * update costs to JMX
387    */
388   private void updateStochasticCosts(TableName tableName, Double overall, Double[] subCosts) {
389     if (tableName == null) return;
390 
391     // check if the metricsBalancer is MetricsStochasticBalancer before casting
392     if (metricsBalancer instanceof MetricsStochasticBalancer) {
393       MetricsStochasticBalancer balancer = (MetricsStochasticBalancer) metricsBalancer;
394       // overall cost
395       balancer.updateStochasticCost(tableName.getNameAsString(),
396         "Overall", "Overall cost", overall);
397 
398       // each cost function
399       for (int i = 0; i < costFunctions.length; i++) {
400         CostFunction costFunction = costFunctions[i];
401         String costFunctionName = costFunction.getClass().getSimpleName();
402         Double costPercent = (overall == 0) ? 0 : (subCosts[i] / overall);
403         // TODO: cost function may need a specific description
404         balancer.updateStochasticCost(tableName.getNameAsString(), costFunctionName,
405           "The percent of " + costFunctionName, costPercent);
406       }
407     }
408   }
409 
410 
411   /**
412    * Create all of the RegionPlan's needed to move from the initial cluster state to the desired
413    * state.
414    *
415    * @param cluster The state of the cluster
416    * @return List of RegionPlan's that represent the moves needed to get to desired final state.
417    */
418   private List<RegionPlan> createRegionPlans(Cluster cluster) {
419     List<RegionPlan> plans = new LinkedList<RegionPlan>();
420     for (int regionIndex = 0;
421          regionIndex < cluster.regionIndexToServerIndex.length; regionIndex++) {
422       int initialServerIndex = cluster.initialRegionIndexToServerIndex[regionIndex];
423       int newServerIndex = cluster.regionIndexToServerIndex[regionIndex];
424 
425       if (initialServerIndex != newServerIndex) {
426         HRegionInfo region = cluster.regions[regionIndex];
427         ServerName initialServer = cluster.servers[initialServerIndex];
428         ServerName newServer = cluster.servers[newServerIndex];
429 
430         if (LOG.isTraceEnabled()) {
431           LOG.trace("Moving Region " + region.getEncodedName() + " from server "
432               + initialServer.getHostname() + " to " + newServer.getHostname());
433         }
434         RegionPlan rp = new RegionPlan(region, initialServer, newServer);
435         plans.add(rp);
436       }
437     }
438     return plans;
439   }
440 
441   /**
442    * Store the current region loads.
443    */
444   private synchronized void updateRegionLoad() {
445     // We create a new hashmap so that regions that are no longer there are removed.
446     // However we temporarily need the old loads so we can use them to keep the rolling average.
447     Map<String, Deque<RegionLoad>> oldLoads = loads;
448     loads = new HashMap<String, Deque<RegionLoad>>();
449 
450     for (ServerName sn : clusterStatus.getServers()) {
451       ServerLoad sl = clusterStatus.getLoad(sn);
452       if (sl == null) {
453         continue;
454       }
455       for (Entry<byte[], RegionLoad> entry : sl.getRegionsLoad().entrySet()) {
456         Deque<RegionLoad> rLoads = oldLoads.get(Bytes.toString(entry.getKey()));
457         if (rLoads == null) {
458           // There was nothing there
459           rLoads = new ArrayDeque<RegionLoad>();
460         } else if (rLoads.size() >= numRegionLoadsToRemember) {
461           rLoads.remove();
462         }
463         rLoads.add(entry.getValue());
464         loads.put(Bytes.toString(entry.getKey()), rLoads);
465 
466       }
467     }
468 
469     for(CostFromRegionLoadFunction cost : regionLoadFunctions) {
470       cost.setLoads(loads);
471     }
472   }
473 
474   protected void initCosts(Cluster cluster) {
475     for (CostFunction c:costFunctions) {
476       c.init(cluster);
477     }
478   }
479 
480   protected void updateCostsWithAction(Cluster cluster, Action action) {
481     for (CostFunction c : costFunctions) {
482       c.postAction(action);
483     }
484   }
485 
486   /**
487    * Get the names of the cost functions
488    */
489   public String[] getCostFunctionNames() {
490     if (costFunctions == null) return null;
491     String[] ret = new String[costFunctions.length];
492     for (int i = 0; i < costFunctions.length; i++) {
493       CostFunction c = costFunctions[i];
494       ret[i] = c.getClass().getSimpleName();
495     }
496 
497     return ret;
498   }
499 
500   /**
501    * This is the main cost function.  It will compute a cost associated with a proposed cluster
502    * state.  All different costs will be combined with their multipliers to produce a double cost.
503    *
504    * @param cluster The state of the cluster
505    * @param previousCost the previous cost. This is used as an early out.
506    * @return a double of a cost associated with the proposed cluster state.  This cost is an
507    *         aggregate of all individual cost functions.
508    */
509   protected double computeCost(Cluster cluster, double previousCost) {
510     double total = 0;
511 
512     for (int i = 0; i < costFunctions.length; i++) {
513       CostFunction c = costFunctions[i];
514       this.tempFunctionCosts[i] = 0.0;
515 
516       if (c.getMultiplier() <= 0) {
517         continue;
518       }
519 
520       Float multiplier = c.getMultiplier();
521       Double cost = c.cost();
522 
523       this.tempFunctionCosts[i] = multiplier*cost;
524       total += this.tempFunctionCosts[i];
525 
526       if (total > previousCost) {
527         break;
528       }
529     }
530 
531     return total;
532   }
533 
534   /** Generates a candidate action to be applied to the cluster for cost function search */
535   abstract static class CandidateGenerator {
536     abstract Cluster.Action generate(Cluster cluster);
537 
538     /**
539      * From a list of regions pick a random one. Null can be returned which
540      * {@link StochasticLoadBalancer#balanceCluster(Map)} recognize as signal to try a region move
541      * rather than swap.
542      *
543      * @param cluster        The state of the cluster
544      * @param server         index of the server
545      * @param chanceOfNoSwap Chance that this will decide to try a move rather
546      *                       than a swap.
547      * @return a random {@link HRegionInfo} or null if an asymmetrical move is
548      *         suggested.
549      */
550     protected int pickRandomRegion(Cluster cluster, int server, double chanceOfNoSwap) {
551       // Check to see if this is just a move.
552       if (cluster.regionsPerServer[server].length == 0 || RANDOM.nextFloat() < chanceOfNoSwap) {
553         // signal a move only.
554         return -1;
555       }
556       int rand = RANDOM.nextInt(cluster.regionsPerServer[server].length);
557       return cluster.regionsPerServer[server][rand];
558 
559     }
560     protected int pickRandomServer(Cluster cluster) {
561       if (cluster.numServers < 1) {
562         return -1;
563       }
564 
565       return RANDOM.nextInt(cluster.numServers);
566     }
567 
568     protected int pickRandomRack(Cluster cluster) {
569       if (cluster.numRacks < 1) {
570         return -1;
571       }
572 
573       return RANDOM.nextInt(cluster.numRacks);
574     }
575 
576     protected int pickOtherRandomServer(Cluster cluster, int serverIndex) {
577       if (cluster.numServers < 2) {
578         return -1;
579       }
580       while (true) {
581         int otherServerIndex = pickRandomServer(cluster);
582         if (otherServerIndex != serverIndex) {
583           return otherServerIndex;
584         }
585       }
586     }
587 
588     protected int pickOtherRandomRack(Cluster cluster, int rackIndex) {
589       if (cluster.numRacks < 2) {
590         return -1;
591       }
592       while (true) {
593         int otherRackIndex = pickRandomRack(cluster);
594         if (otherRackIndex != rackIndex) {
595           return otherRackIndex;
596         }
597       }
598     }
599 
600     protected Cluster.Action pickRandomRegions(Cluster cluster,
601                                                        int thisServer,
602                                                        int otherServer) {
603       if (thisServer < 0 || otherServer < 0) {
604         return Cluster.NullAction;
605       }
606 
607       // Decide who is most likely to need another region
608       int thisRegionCount = cluster.getNumRegions(thisServer);
609       int otherRegionCount = cluster.getNumRegions(otherServer);
610 
611       // Assign the chance based upon the above
612       double thisChance = (thisRegionCount > otherRegionCount) ? 0 : 0.5;
613       double otherChance = (thisRegionCount <= otherRegionCount) ? 0 : 0.5;
614 
615       int thisRegion = pickRandomRegion(cluster, thisServer, thisChance);
616       int otherRegion = pickRandomRegion(cluster, otherServer, otherChance);
617 
618       return getAction(thisServer, thisRegion, otherServer, otherRegion);
619     }
620 
621     protected Cluster.Action getAction(int fromServer, int fromRegion,
622         int toServer, int toRegion) {
623       if (fromServer < 0 || toServer < 0) {
624         return Cluster.NullAction;
625       }
626       if (fromRegion > 0 && toRegion > 0) {
627         return new Cluster.SwapRegionsAction(fromServer, fromRegion,
628           toServer, toRegion);
629       } else if (fromRegion > 0) {
630         return new Cluster.MoveRegionAction(fromRegion, fromServer, toServer);
631       } else if (toRegion > 0) {
632         return new Cluster.MoveRegionAction(toRegion, toServer, fromServer);
633       } else {
634         return Cluster.NullAction;
635       }
636     }
637   }
638 
639   static class RandomCandidateGenerator extends CandidateGenerator {
640 
641     @Override
642     Cluster.Action generate(Cluster cluster) {
643 
644       int thisServer = pickRandomServer(cluster);
645 
646       // Pick the other server
647       int otherServer = pickOtherRandomServer(cluster, thisServer);
648 
649       return pickRandomRegions(cluster, thisServer, otherServer);
650     }
651   }
652 
653   static class LoadCandidateGenerator extends CandidateGenerator {
654 
655     @Override
656     Cluster.Action generate(Cluster cluster) {
657       cluster.sortServersByRegionCount();
658       int thisServer = pickMostLoadedServer(cluster, -1);
659       int otherServer = pickLeastLoadedServer(cluster, thisServer);
660 
661       return pickRandomRegions(cluster, thisServer, otherServer);
662     }
663 
664     private int pickLeastLoadedServer(final Cluster cluster, int thisServer) {
665       Integer[] servers = cluster.serverIndicesSortedByRegionCount;
666 
667       int index = 0;
668       while (servers[index] == null || servers[index] == thisServer) {
669         index++;
670         if (index == servers.length) {
671           return -1;
672         }
673       }
674       return servers[index];
675     }
676 
677     private int pickMostLoadedServer(final Cluster cluster, int thisServer) {
678       Integer[] servers = cluster.serverIndicesSortedByRegionCount;
679 
680       int index = servers.length - 1;
681       while (servers[index] == null || servers[index] == thisServer) {
682         index--;
683         if (index < 0) {
684           return -1;
685         }
686       }
687       return servers[index];
688     }
689   }
690 
691   static class LocalityBasedCandidateGenerator extends CandidateGenerator {
692 
693     private MasterServices masterServices;
694 
695     LocalityBasedCandidateGenerator(MasterServices masterServices) {
696       this.masterServices = masterServices;
697     }
698 
699     @Override
700     Cluster.Action generate(Cluster cluster) {
701       if (this.masterServices == null) {
702         return Cluster.NullAction;
703       }
704       // Pick a random region server
705       int thisServer = pickRandomServer(cluster);
706 
707       // Pick a random region on this server
708       int thisRegion = pickRandomRegion(cluster, thisServer, 0.0f);
709 
710       if (thisRegion == -1) {
711         return Cluster.NullAction;
712       }
713 
714       // Pick the server with the highest locality
715       int otherServer = pickHighestLocalityServer(cluster, thisServer, thisRegion);
716 
717       if (otherServer == -1) {
718         return Cluster.NullAction;
719       }
720 
721       // pick an region on the other server to potentially swap
722       int otherRegion = this.pickRandomRegion(cluster, otherServer, 0.5f);
723 
724       return getAction(thisServer, thisRegion, otherServer, otherRegion);
725     }
726 
727     private int pickHighestLocalityServer(Cluster cluster, int thisServer, int thisRegion) {
728       int[] regionLocations = cluster.regionLocations[thisRegion];
729 
730       if (regionLocations == null || regionLocations.length <= 1) {
731         return pickOtherRandomServer(cluster, thisServer);
732       }
733 
734       for (int loc : regionLocations) {
735         if (loc >= 0 && loc != thisServer) { // find the first suitable server
736           return loc;
737         }
738       }
739 
740       // no location found
741       return pickOtherRandomServer(cluster, thisServer);
742     }
743 
744     void setServices(MasterServices services) {
745       this.masterServices = services;
746     }
747   }
748 
749   /**
750    * Generates candidates which moves the replicas out of the region server for
751    * co-hosted region replicas
752    */
753   static class RegionReplicaCandidateGenerator extends CandidateGenerator {
754 
755     RandomCandidateGenerator randomGenerator = new RandomCandidateGenerator();
756 
757     /**
758      * Randomly select one regionIndex out of all region replicas co-hosted in the same group
759      * (a group is a server, host or rack)
760      * @param primariesOfRegionsPerGroup either Cluster.primariesOfRegionsPerServer,
761      * primariesOfRegionsPerHost or primariesOfRegionsPerRack
762      * @param regionsPerGroup either Cluster.regionsPerServer, regionsPerHost or regionsPerRack
763      * @param regionIndexToPrimaryIndex Cluster.regionsIndexToPrimaryIndex
764      * @return a regionIndex for the selected primary or -1 if there is no co-locating
765      */
766     int selectCoHostedRegionPerGroup(int[] primariesOfRegionsPerGroup, int[] regionsPerGroup
767         , int[] regionIndexToPrimaryIndex) {
768       int currentPrimary = -1;
769       int currentPrimaryIndex = -1;
770       int selectedPrimaryIndex = -1;
771       double currentLargestRandom = -1;
772       // primariesOfRegionsPerGroup is a sorted array. Since it contains the primary region
773       // ids for the regions hosted in server, a consecutive repetition means that replicas
774       // are co-hosted
775       for (int j = 0; j <= primariesOfRegionsPerGroup.length; j++) {
776         int primary = j < primariesOfRegionsPerGroup.length
777             ? primariesOfRegionsPerGroup[j] : -1;
778         if (primary != currentPrimary) { // check for whether we see a new primary
779           int numReplicas = j - currentPrimaryIndex;
780           if (numReplicas > 1) { // means consecutive primaries, indicating co-location
781             // decide to select this primary region id or not
782             double currentRandom = RANDOM.nextDouble();
783             // we don't know how many region replicas are co-hosted, we will randomly select one
784             // using reservoir sampling (http://gregable.com/2007/10/reservoir-sampling.html)
785             if (currentRandom > currentLargestRandom) {
786               selectedPrimaryIndex = currentPrimary;
787               currentLargestRandom = currentRandom;
788             }
789           }
790           currentPrimary = primary;
791           currentPrimaryIndex = j;
792         }
793       }
794 
795       // we have found the primary id for the region to move. Now find the actual regionIndex
796       // with the given primary, prefer to move the secondary region.
797       for (int j = 0; j < regionsPerGroup.length; j++) {
798         int regionIndex = regionsPerGroup[j];
799         if (selectedPrimaryIndex == regionIndexToPrimaryIndex[regionIndex]) {
800           // always move the secondary, not the primary
801           if (selectedPrimaryIndex != regionIndex) {
802             return regionIndex;
803           }
804         }
805       }
806       return -1;
807     }
808 
809     @Override
810     Cluster.Action generate(Cluster cluster) {
811       int serverIndex = pickRandomServer(cluster);
812       if (cluster.numServers <= 1 || serverIndex == -1) {
813         return Cluster.NullAction;
814       }
815 
816       int regionIndex = selectCoHostedRegionPerGroup(
817         cluster.primariesOfRegionsPerServer[serverIndex],
818         cluster.regionsPerServer[serverIndex],
819         cluster.regionIndexToPrimaryIndex);
820 
821       // if there are no pairs of region replicas co-hosted, default to random generator
822       if (regionIndex == -1) {
823         // default to randompicker
824         return randomGenerator.generate(cluster);
825       }
826 
827       int toServerIndex = pickOtherRandomServer(cluster, serverIndex);
828       int toRegionIndex = pickRandomRegion(cluster, toServerIndex, 0.9f);
829       return getAction(serverIndex, regionIndex, toServerIndex, toRegionIndex);
830     }
831   }
832 
833   /**
834    * Generates candidates which moves the replicas out of the rack for
835    * co-hosted region replicas in the same rack
836    */
837   static class RegionReplicaRackCandidateGenerator extends RegionReplicaCandidateGenerator {
838     @Override
839     Cluster.Action generate(Cluster cluster) {
840       int rackIndex = pickRandomRack(cluster);
841       if (cluster.numRacks <= 1 || rackIndex == -1) {
842         return super.generate(cluster);
843       }
844 
845       int regionIndex = selectCoHostedRegionPerGroup(
846         cluster.primariesOfRegionsPerRack[rackIndex],
847         cluster.regionsPerRack[rackIndex],
848         cluster.regionIndexToPrimaryIndex);
849 
850       // if there are no pairs of region replicas co-hosted, default to random generator
851       if (regionIndex == -1) {
852         // default to randompicker
853         return randomGenerator.generate(cluster);
854       }
855 
856       int serverIndex = cluster.regionIndexToServerIndex[regionIndex];
857       int toRackIndex = pickOtherRandomRack(cluster, rackIndex);
858 
859       int rand = RANDOM.nextInt(cluster.serversPerRack[toRackIndex].length);
860       int toServerIndex = cluster.serversPerRack[toRackIndex][rand];
861       int toRegionIndex = pickRandomRegion(cluster, toServerIndex, 0.9f);
862       return getAction(serverIndex, regionIndex, toServerIndex, toRegionIndex);
863     }
864   }
865 
866   /**
867    * Base class of StochasticLoadBalancer's Cost Functions.
868    */
869   abstract static class CostFunction {
870 
871     private float multiplier = 0;
872 
873     protected Cluster cluster;
874 
875     CostFunction(Configuration c) {
876 
877     }
878 
879     float getMultiplier() {
880       return multiplier;
881     }
882 
883     void setMultiplier(float m) {
884       this.multiplier = m;
885     }
886 
887     /** Called once per LB invocation to give the cost function
888      * to initialize it's state, and perform any costly calculation.
889      */
890     void init(Cluster cluster) {
891       this.cluster = cluster;
892     }
893 
894     /** Called once per cluster Action to give the cost function
895      * an opportunity to update it's state. postAction() is always
896      * called at least once before cost() is called with the cluster
897      * that this action is performed on. */
898     void postAction(Action action) {
899       switch (action.type) {
900       case NULL: break;
901       case ASSIGN_REGION:
902         AssignRegionAction ar = (AssignRegionAction) action;
903         regionMoved(ar.region, -1, ar.server);
904         break;
905       case MOVE_REGION:
906         MoveRegionAction mra = (MoveRegionAction) action;
907         regionMoved(mra.region, mra.fromServer, mra.toServer);
908         break;
909       case SWAP_REGIONS:
910         SwapRegionsAction a = (SwapRegionsAction) action;
911         regionMoved(a.fromRegion, a.fromServer, a.toServer);
912         regionMoved(a.toRegion, a.toServer, a.fromServer);
913         break;
914       default:
915         throw new RuntimeException("Uknown action:" + action.type);
916       }
917     }
918 
919     protected void regionMoved(int region, int oldServer, int newServer) {
920     }
921 
922     abstract double cost();
923 
924     /**
925      * Function to compute a scaled cost using {@link DescriptiveStatistics}. It
926      * assumes that this is a zero sum set of costs.  It assumes that the worst case
927      * possible is all of the elements in one region server and the rest having 0.
928      *
929      * @param stats the costs
930      * @return a scaled set of costs.
931      */
932     protected double costFromArray(double[] stats) {
933       double totalCost = 0;
934       double total = getSum(stats);
935 
936       double count = stats.length;
937       double mean = total/count;
938 
939       // Compute max as if all region servers had 0 and one had the sum of all costs.  This must be
940       // a zero sum cost for this to make sense.
941       double max = ((count - 1) * mean) + (total - mean);
942 
943       // It's possible that there aren't enough regions to go around
944       double min;
945       if (count > total) {
946         min = ((count - total) * mean) + ((1 - mean) * total);
947       } else {
948         // Some will have 1 more than everything else.
949         int numHigh = (int) (total - (Math.floor(mean) * count));
950         int numLow = (int) (count - numHigh);
951 
952         min = (numHigh * (Math.ceil(mean) - mean)) + (numLow * (mean - Math.floor(mean)));
953 
954       }
955       min = Math.max(0, min);
956       for (int i=0; i<stats.length; i++) {
957         double n = stats[i];
958         double diff = Math.abs(mean - n);
959         totalCost += diff;
960       }
961 
962       double scaled =  scale(min, max, totalCost);
963       return scaled;
964     }
965 
966     private double getSum(double[] stats) {
967       double total = 0;
968       for(double s:stats) {
969         total += s;
970       }
971       return total;
972     }
973 
974     /**
975      * Scale the value between 0 and 1.
976      *
977      * @param min   Min value
978      * @param max   The Max value
979      * @param value The value to be scaled.
980      * @return The scaled value.
981      */
982     protected double scale(double min, double max, double value) {
983       if (max <= min || value <= min) {
984         return 0;
985       }
986       if ((max - min) == 0) return 0;
987 
988       return Math.max(0d, Math.min(1d, (value - min) / (max - min)));
989     }
990   }
991 
992   /**
993    * Given the starting state of the regions and a potential ending state
994    * compute cost based upon the number of regions that have moved.
995    */
996   static class MoveCostFunction extends CostFunction {
997     private static final String MOVE_COST_KEY = "hbase.master.balancer.stochastic.moveCost";
998     private static final String MAX_MOVES_PERCENT_KEY =
999         "hbase.master.balancer.stochastic.maxMovePercent";
1000     private static final float DEFAULT_MOVE_COST = 100;
1001     private static final int DEFAULT_MAX_MOVES = 600;
1002     private static final float DEFAULT_MAX_MOVE_PERCENT = 0.25f;
1003 
1004     private final float maxMovesPercent;
1005 
1006     MoveCostFunction(Configuration conf) {
1007       super(conf);
1008 
1009       // Move cost multiplier should be the same cost or higher than the rest of the costs to ensure
1010       // that large benefits are need to overcome the cost of a move.
1011       this.setMultiplier(conf.getFloat(MOVE_COST_KEY, DEFAULT_MOVE_COST));
1012       // What percent of the number of regions a single run of the balancer can move.
1013       maxMovesPercent = conf.getFloat(MAX_MOVES_PERCENT_KEY, DEFAULT_MAX_MOVE_PERCENT);
1014     }
1015 
1016     @Override
1017     double cost() {
1018       // Try and size the max number of Moves, but always be prepared to move some.
1019       int maxMoves = Math.max((int) (cluster.numRegions * maxMovesPercent),
1020           DEFAULT_MAX_MOVES);
1021 
1022       double moveCost = cluster.numMovedRegions;
1023 
1024       // Don't let this single balance move more than the max moves.
1025       // This allows better scaling to accurately represent the actual cost of a move.
1026       if (moveCost > maxMoves) {
1027         return 1000000;   // return a number much greater than any of the other cost
1028       }
1029 
1030       return scale(0, cluster.numRegions, moveCost);
1031     }
1032   }
1033 
1034   /**
1035    * Compute the cost of a potential cluster state from skew in number of
1036    * regions on a cluster.
1037    */
1038   static class RegionCountSkewCostFunction extends CostFunction {
1039     private static final String REGION_COUNT_SKEW_COST_KEY =
1040         "hbase.master.balancer.stochastic.regionCountCost";
1041     private static final float DEFAULT_REGION_COUNT_SKEW_COST = 500;
1042 
1043     private double[] stats = null;
1044 
1045     RegionCountSkewCostFunction(Configuration conf) {
1046       super(conf);
1047       // Load multiplier should be the greatest as it is the most general way to balance data.
1048       this.setMultiplier(conf.getFloat(REGION_COUNT_SKEW_COST_KEY, DEFAULT_REGION_COUNT_SKEW_COST));
1049     }
1050 
1051     @Override
1052     double cost() {
1053       if (stats == null || stats.length != cluster.numServers) {
1054         stats = new double[cluster.numServers];
1055       }
1056 
1057       for (int i =0; i < cluster.numServers; i++) {
1058         stats[i] = cluster.regionsPerServer[i].length;
1059       }
1060 
1061       return costFromArray(stats);
1062     }
1063   }
1064 
1065   /**
1066    * Compute the cost of a potential cluster state from skew in number of
1067    * primary regions on a cluster.
1068    */
1069   static class PrimaryRegionCountSkewCostFunction extends CostFunction {
1070     private static final String PRIMARY_REGION_COUNT_SKEW_COST_KEY =
1071         "hbase.master.balancer.stochastic.primaryRegionCountCost";
1072     private static final float DEFAULT_PRIMARY_REGION_COUNT_SKEW_COST = 500;
1073 
1074     private double[] stats = null;
1075 
1076     PrimaryRegionCountSkewCostFunction(Configuration conf) {
1077       super(conf);
1078       // Load multiplier should be the greatest as primary regions serve majority of reads/writes.
1079       this.setMultiplier(conf.getFloat(PRIMARY_REGION_COUNT_SKEW_COST_KEY,
1080         DEFAULT_PRIMARY_REGION_COUNT_SKEW_COST));
1081     }
1082 
1083     @Override
1084     double cost() {
1085       if (!cluster.hasRegionReplicas) {
1086         return 0;
1087       }
1088       if (stats == null || stats.length != cluster.numServers) {
1089         stats = new double[cluster.numServers];
1090       }
1091 
1092       for (int i =0; i < cluster.numServers; i++) {
1093         stats[i] = 0;
1094         for (int regionIdx : cluster.regionsPerServer[i]) {
1095           if (regionIdx == cluster.regionIndexToPrimaryIndex[regionIdx]) {
1096             stats[i] ++;
1097           }
1098         }
1099       }
1100 
1101       return costFromArray(stats);
1102     }
1103   }
1104 
1105   /**
1106    * Compute the cost of a potential cluster configuration based upon how evenly
1107    * distributed tables are.
1108    */
1109   static class TableSkewCostFunction extends CostFunction {
1110 
1111     private static final String TABLE_SKEW_COST_KEY =
1112         "hbase.master.balancer.stochastic.tableSkewCost";
1113     private static final float DEFAULT_TABLE_SKEW_COST = 35;
1114 
1115     TableSkewCostFunction(Configuration conf) {
1116       super(conf);
1117       this.setMultiplier(conf.getFloat(TABLE_SKEW_COST_KEY, DEFAULT_TABLE_SKEW_COST));
1118     }
1119 
1120     @Override
1121     double cost() {
1122       double max = cluster.numRegions;
1123       double min = ((double) cluster.numRegions) / cluster.numServers;
1124       double value = 0;
1125 
1126       for (int i = 0; i < cluster.numMaxRegionsPerTable.length; i++) {
1127         value += cluster.numMaxRegionsPerTable[i];
1128       }
1129 
1130       return scale(min, max, value);
1131     }
1132   }
1133 
1134   /**
1135    * Compute a cost of a potential cluster configuration based upon where
1136    * {@link org.apache.hadoop.hbase.regionserver.StoreFile}s are located.
1137    */
1138   static class LocalityCostFunction extends CostFunction {
1139 
1140     private static final String LOCALITY_COST_KEY = "hbase.master.balancer.stochastic.localityCost";
1141     private static final float DEFAULT_LOCALITY_COST = 25;
1142 
1143     private MasterServices services;
1144 
1145     LocalityCostFunction(Configuration conf, MasterServices srv) {
1146       super(conf);
1147       this.setMultiplier(conf.getFloat(LOCALITY_COST_KEY, DEFAULT_LOCALITY_COST));
1148       this.services = srv;
1149     }
1150 
1151     void setServices(MasterServices srvc) {
1152       this.services = srvc;
1153     }
1154 
1155     @Override
1156     double cost() {
1157       double max = 0;
1158       double cost = 0;
1159 
1160       // If there's no master so there's no way anything else works.
1161       if (this.services == null) {
1162         return cost;
1163       }
1164 
1165       for (int i = 0; i < cluster.regionLocations.length; i++) {
1166         max += 1;
1167         int serverIndex = cluster.regionIndexToServerIndex[i];
1168         int[] regionLocations = cluster.regionLocations[i];
1169 
1170         // If we can't find where the data is getTopBlock returns null.
1171         // so count that as being the best possible.
1172         if (regionLocations == null) {
1173           continue;
1174         }
1175 
1176         int index = -1;
1177         for (int j = 0; j < regionLocations.length; j++) {
1178           if (regionLocations[j] >= 0 && regionLocations[j] == serverIndex) {
1179             index = j;
1180             break;
1181           }
1182         }
1183 
1184         if (index < 0) {
1185           if (regionLocations.length > 0) {
1186             cost += 1;
1187           }
1188         } else {
1189           cost += (double) index / (double) regionLocations.length;
1190         }
1191       }
1192       return scale(0, max, cost);
1193     }
1194   }
1195 
1196   /**
1197    * Base class the allows writing costs functions from rolling average of some
1198    * number from RegionLoad.
1199    */
1200   abstract static class CostFromRegionLoadFunction extends CostFunction {
1201 
1202     private ClusterStatus clusterStatus = null;
1203     private Map<String, Deque<RegionLoad>> loads = null;
1204     private double[] stats = null;
1205     CostFromRegionLoadFunction(Configuration conf) {
1206       super(conf);
1207     }
1208 
1209     void setClusterStatus(ClusterStatus status) {
1210       this.clusterStatus = status;
1211     }
1212 
1213     void setLoads(Map<String, Deque<RegionLoad>> l) {
1214       this.loads = l;
1215     }
1216 
1217     @Override
1218     double cost() {
1219       if (clusterStatus == null || loads == null) {
1220         return 0;
1221       }
1222 
1223       if (stats == null || stats.length != cluster.numServers) {
1224         stats = new double[cluster.numServers];
1225       }
1226 
1227       for (int i =0; i < stats.length; i++) {
1228         //Cost this server has from RegionLoad
1229         long cost = 0;
1230 
1231         // for every region on this server get the rl
1232         for(int regionIndex:cluster.regionsPerServer[i]) {
1233           Collection<RegionLoad> regionLoadList =  cluster.regionLoads[regionIndex];
1234 
1235           // Now if we found a region load get the type of cost that was requested.
1236           if (regionLoadList != null) {
1237             cost += getRegionLoadCost(regionLoadList);
1238           }
1239         }
1240 
1241         // Add the total cost to the stats.
1242         stats[i] = cost;
1243       }
1244 
1245       // Now return the scaled cost from data held in the stats object.
1246       return costFromArray(stats);
1247     }
1248 
1249     protected double getRegionLoadCost(Collection<RegionLoad> regionLoadList) {
1250       double cost = 0;
1251 
1252       for (RegionLoad rl : regionLoadList) {
1253         double toAdd = getCostFromRl(rl);
1254 
1255         if (cost == 0) {
1256           cost = toAdd;
1257         } else {
1258           cost = (.5 * cost) + (.5 * toAdd);
1259         }
1260       }
1261 
1262       return cost;
1263     }
1264 
1265     protected abstract double getCostFromRl(RegionLoad rl);
1266   }
1267 
1268   /**
1269    * Compute the cost of total number of read requests  The more unbalanced the higher the
1270    * computed cost will be.  This uses a rolling average of regionload.
1271    */
1272 
1273   static class ReadRequestCostFunction extends CostFromRegionLoadFunction {
1274 
1275     private static final String READ_REQUEST_COST_KEY =
1276         "hbase.master.balancer.stochastic.readRequestCost";
1277     private static final float DEFAULT_READ_REQUEST_COST = 5;
1278 
1279     ReadRequestCostFunction(Configuration conf) {
1280       super(conf);
1281       this.setMultiplier(conf.getFloat(READ_REQUEST_COST_KEY, DEFAULT_READ_REQUEST_COST));
1282     }
1283 
1284 
1285     @Override
1286     protected double getCostFromRl(RegionLoad rl) {
1287       return rl.getReadRequestsCount();
1288     }
1289   }
1290 
1291   /**
1292    * Compute the cost of total number of write requests.  The more unbalanced the higher the
1293    * computed cost will be.  This uses a rolling average of regionload.
1294    */
1295   static class WriteRequestCostFunction extends CostFromRegionLoadFunction {
1296 
1297     private static final String WRITE_REQUEST_COST_KEY =
1298         "hbase.master.balancer.stochastic.writeRequestCost";
1299     private static final float DEFAULT_WRITE_REQUEST_COST = 5;
1300 
1301     WriteRequestCostFunction(Configuration conf) {
1302       super(conf);
1303       this.setMultiplier(conf.getFloat(WRITE_REQUEST_COST_KEY, DEFAULT_WRITE_REQUEST_COST));
1304     }
1305 
1306     @Override
1307     protected double getCostFromRl(RegionLoad rl) {
1308       return rl.getWriteRequestsCount();
1309     }
1310   }
1311 
1312   /**
1313    * A cost function for region replicas. We give a very high cost to hosting
1314    * replicas of the same region in the same host. We do not prevent the case
1315    * though, since if numReplicas > numRegionServers, we still want to keep the
1316    * replica open.
1317    */
1318   static class RegionReplicaHostCostFunction extends CostFunction {
1319     private static final String REGION_REPLICA_HOST_COST_KEY =
1320         "hbase.master.balancer.stochastic.regionReplicaHostCostKey";
1321     private static final float DEFAULT_REGION_REPLICA_HOST_COST_KEY = 100000;
1322 
1323     long maxCost = 0;
1324     long[] costsPerGroup; // group is either server, host or rack
1325     int[][] primariesOfRegionsPerGroup;
1326 
1327     public RegionReplicaHostCostFunction(Configuration conf) {
1328       super(conf);
1329       this.setMultiplier(conf.getFloat(REGION_REPLICA_HOST_COST_KEY,
1330         DEFAULT_REGION_REPLICA_HOST_COST_KEY));
1331     }
1332 
1333     @Override
1334     void init(Cluster cluster) {
1335       super.init(cluster);
1336       // max cost is the case where every region replica is hosted together regardless of host
1337       maxCost = cluster.numHosts > 1 ? getMaxCost(cluster) : 0;
1338       costsPerGroup = new long[cluster.numHosts];
1339       primariesOfRegionsPerGroup = cluster.multiServersPerHost // either server based or host based
1340           ? cluster.primariesOfRegionsPerHost
1341           : cluster.primariesOfRegionsPerServer;
1342       for (int i = 0 ; i < primariesOfRegionsPerGroup.length; i++) {
1343         costsPerGroup[i] = costPerGroup(primariesOfRegionsPerGroup[i]);
1344       }
1345     }
1346 
1347     long getMaxCost(Cluster cluster) {
1348       if (!cluster.hasRegionReplicas) {
1349         return 0; // short circuit
1350       }
1351       // max cost is the case where every region replica is hosted together regardless of host
1352       int[] primariesOfRegions = new int[cluster.numRegions];
1353       System.arraycopy(cluster.regionIndexToPrimaryIndex, 0, primariesOfRegions, 0,
1354           cluster.regions.length);
1355 
1356       Arrays.sort(primariesOfRegions);
1357 
1358       // compute numReplicas from the sorted array
1359       return costPerGroup(primariesOfRegions);
1360     }
1361 
1362     @Override
1363     double cost() {
1364       if (maxCost <= 0) {
1365         return 0;
1366       }
1367 
1368       long totalCost = 0;
1369       for (int i = 0 ; i < costsPerGroup.length; i++) {
1370         totalCost += costsPerGroup[i];
1371       }
1372       return scale(0, maxCost, totalCost);
1373     }
1374 
1375     /**
1376      * For each primary region, it computes the total number of replicas in the array (numReplicas)
1377      * and returns a sum of numReplicas-1 squared. For example, if the server hosts
1378      * regions a, b, c, d, e, f where a and b are same replicas, and c,d,e are same replicas, it
1379      * returns (2-1) * (2-1) + (3-1) * (3-1) + (1-1) * (1-1).
1380      * @param primariesOfRegions a sorted array of primary regions ids for the regions hosted
1381      * @return a sum of numReplicas-1 squared for each primary region in the group.
1382      */
1383     protected long costPerGroup(int[] primariesOfRegions) {
1384       long cost = 0;
1385       int currentPrimary = -1;
1386       int currentPrimaryIndex = -1;
1387       // primariesOfRegions is a sorted array of primary ids of regions. Replicas of regions
1388       // sharing the same primary will have consecutive numbers in the array.
1389       for (int j = 0 ; j <= primariesOfRegions.length; j++) {
1390         int primary = j < primariesOfRegions.length ? primariesOfRegions[j] : -1;
1391         if (primary != currentPrimary) { // we see a new primary
1392           int numReplicas = j - currentPrimaryIndex;
1393           // square the cost
1394           if (numReplicas > 1) { // means consecutive primaries, indicating co-location
1395             cost += (numReplicas - 1) * (numReplicas - 1);
1396           }
1397           currentPrimary = primary;
1398           currentPrimaryIndex = j;
1399         }
1400       }
1401 
1402       return cost;
1403     }
1404 
1405     @Override
1406     protected void regionMoved(int region, int oldServer, int newServer) {
1407       if (maxCost <= 0) {
1408         return; // no need to compute
1409       }
1410       if (cluster.multiServersPerHost) {
1411         int oldHost = cluster.serverIndexToHostIndex[oldServer];
1412         int newHost = cluster.serverIndexToHostIndex[newServer];
1413         if (newHost != oldHost) {
1414           costsPerGroup[oldHost] = costPerGroup(cluster.primariesOfRegionsPerHost[oldHost]);
1415           costsPerGroup[newHost] = costPerGroup(cluster.primariesOfRegionsPerHost[newHost]);
1416         }
1417       } else {
1418         costsPerGroup[oldServer] = costPerGroup(cluster.primariesOfRegionsPerServer[oldServer]);
1419         costsPerGroup[newServer] = costPerGroup(cluster.primariesOfRegionsPerServer[newServer]);
1420       }
1421     }
1422   }
1423 
1424   /**
1425    * A cost function for region replicas for the rack distribution. We give a relatively high
1426    * cost to hosting replicas of the same region in the same rack. We do not prevent the case
1427    * though.
1428    */
1429   static class RegionReplicaRackCostFunction extends RegionReplicaHostCostFunction {
1430     private static final String REGION_REPLICA_RACK_COST_KEY =
1431         "hbase.master.balancer.stochastic.regionReplicaRackCostKey";
1432     private static final float DEFAULT_REGION_REPLICA_RACK_COST_KEY = 10000;
1433 
1434     public RegionReplicaRackCostFunction(Configuration conf) {
1435       super(conf);
1436       this.setMultiplier(conf.getFloat(REGION_REPLICA_RACK_COST_KEY,
1437         DEFAULT_REGION_REPLICA_RACK_COST_KEY));
1438     }
1439 
1440     @Override
1441     void init(Cluster cluster) {
1442       this.cluster = cluster;
1443       if (cluster.numRacks <= 1) {
1444         maxCost = 0;
1445         return; // disabled for 1 rack
1446       }
1447       // max cost is the case where every region replica is hosted together regardless of rack
1448       maxCost = getMaxCost(cluster);
1449       costsPerGroup = new long[cluster.numRacks];
1450       for (int i = 0 ; i < cluster.primariesOfRegionsPerRack.length; i++) {
1451         costsPerGroup[i] = costPerGroup(cluster.primariesOfRegionsPerRack[i]);
1452       }
1453     }
1454 
1455     @Override
1456     protected void regionMoved(int region, int oldServer, int newServer) {
1457       if (maxCost <= 0) {
1458         return; // no need to compute
1459       }
1460       int oldRack = cluster.serverIndexToRackIndex[oldServer];
1461       int newRack = cluster.serverIndexToRackIndex[newServer];
1462       if (newRack != oldRack) {
1463         costsPerGroup[oldRack] = costPerGroup(cluster.primariesOfRegionsPerRack[oldRack]);
1464         costsPerGroup[newRack] = costPerGroup(cluster.primariesOfRegionsPerRack[newRack]);
1465       }
1466     }
1467   }
1468 
1469   /**
1470    * Compute the cost of total memstore size.  The more unbalanced the higher the
1471    * computed cost will be.  This uses a rolling average of regionload.
1472    */
1473   static class MemstoreSizeCostFunction extends CostFromRegionLoadFunction {
1474 
1475     private static final String MEMSTORE_SIZE_COST_KEY =
1476         "hbase.master.balancer.stochastic.memstoreSizeCost";
1477     private static final float DEFAULT_MEMSTORE_SIZE_COST = 5;
1478 
1479     MemstoreSizeCostFunction(Configuration conf) {
1480       super(conf);
1481       this.setMultiplier(conf.getFloat(MEMSTORE_SIZE_COST_KEY, DEFAULT_MEMSTORE_SIZE_COST));
1482     }
1483 
1484     @Override
1485     protected double getCostFromRl(RegionLoad rl) {
1486       return rl.getMemStoreSizeMB();
1487     }
1488   }
1489   /**
1490    * Compute the cost of total open storefiles size.  The more unbalanced the higher the
1491    * computed cost will be.  This uses a rolling average of regionload.
1492    */
1493   static class StoreFileCostFunction extends CostFromRegionLoadFunction {
1494 
1495     private static final String STOREFILE_SIZE_COST_KEY =
1496         "hbase.master.balancer.stochastic.storefileSizeCost";
1497     private static final float DEFAULT_STOREFILE_SIZE_COST = 5;
1498 
1499     StoreFileCostFunction(Configuration conf) {
1500       super(conf);
1501       this.setMultiplier(conf.getFloat(STOREFILE_SIZE_COST_KEY, DEFAULT_STOREFILE_SIZE_COST));
1502     }
1503 
1504     @Override
1505     protected double getCostFromRl(RegionLoad rl) {
1506       return rl.getStorefileSizeMB();
1507     }
1508   }
1509 
1510   /**
1511    * A helper function to compose the attribute name from tablename and costfunction name
1512    */
1513   public static String composeAttributeName(String tableName, String costFunctionName) {
1514     return tableName + TABLE_FUNCTION_SEP + costFunctionName;
1515   }
1516 }