/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapred;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.URL;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.ClusterStatus.BlackListInfo;
import org.apache.hadoop.mapreduce.Cluster;
import org.apache.hadoop.mapreduce.ClusterMetrics;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.QueueInfo;
import org.apache.hadoop.mapreduce.TaskTrackerInfo;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.security.token.delegation.DelegationTokenIdentifier;
import org.apache.hadoop.mapreduce.tools.CLI;
import org.apache.hadoop.mapreduce.util.ConfigUtil;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.SecretManager.InvalidToken;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.security.token.TokenRenewer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * <code>JobClient</code> is the primary interface for the user-job to interact
 * with the cluster.
 *
 * <code>JobClient</code> provides facilities to submit jobs, track their
 * progress, access component-tasks' reports/logs, get the Map-Reduce cluster
 * status information, etc.
 *
 * <p>The job submission process involves:
 * <ol>
 *   <li>
 *   Checking the input and output specifications of the job.
 *   </li>
 *   <li>
 *   Computing the {@link InputSplit}s for the job.
 *   </li>
 *   <li>
 *   Setting up the requisite accounting information for the
 *   {@link DistributedCache} of the job, if necessary.
 *   </li>
 *   <li>
 *   Copying the job's jar and configuration to the map-reduce system
 *   directory on the distributed file-system.
 *   </li>
 *   <li>
 *   Submitting the job to the cluster and optionally monitoring
 *   its status.
 *   </li>
 * </ol></p>
 *
 * Normally the user creates the application, describes various facets of the
 * job via {@link JobConf} and then uses the <code>JobClient</code> to submit
 * the job and monitor its progress.
 *
 * <p>Here is an example on how to use <code>JobClient</code>:</p>
 * <p><blockquote><pre>
 *     // Create a new JobConf
 *     JobConf job = new JobConf(new Configuration(), MyJob.class);
 *
 *     // Specify various job-specific parameters
 *     job.setJobName("myjob");
 *
 *     job.setInputPath(new Path("in"));
 *     job.setOutputPath(new Path("out"));
 *
 *     job.setMapperClass(MyJob.MyMapper.class);
 *     job.setReducerClass(MyJob.MyReducer.class);
 *
 *     // Submit the job, then poll for progress until the job is complete
 *     JobClient.runJob(job);
 * </pre></blockquote></p>
 *
 * <h4 id="JobControl">Job Control</h4>
 *
 * <p>At times clients would chain map-reduce jobs to accomplish complex tasks
 * which cannot be done via a single map-reduce job. This is fairly easy since
 * the output of the job typically goes to the distributed file-system, and
 * that can in turn be used as the input for the next job.</p>
 *
 * <p>However, this also means that the onus of ensuring jobs are complete
 * (success/failure) lies squarely on the clients. In such situations the
 * various job-control options are:
 * <ol>
 *   <li>
 *   {@link #runJob(JobConf)} : submits the job and returns only after
 *   the job has completed.
 *   </li>
 *   <li>
 *   {@link #submitJob(JobConf)} : only submits the job; the client should
 *   then poll the returned handle to the {@link RunningJob} to query status
 *   and make scheduling decisions (see the sketch below).
 *   </li>
 *   <li>
 *   {@link JobConf#setJobEndNotificationURI(String)} : sets up a notification
 *   on job-completion, thus avoiding polling.
 *   </li>
 * </ol></p>
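 *
 * <p>For example, a client might submit a job and poll it to completion
 * (an illustrative sketch, not part of the original documentation; error
 * and <code>InterruptedException</code> handling omitted):</p>
 * <p><blockquote><pre>
 *     JobConf conf = new JobConf(MyJob.class);
 *     JobClient client = new JobClient(conf);
 *     RunningJob running = client.submitJob(conf);
 *     while (!running.isComplete()) {
 *       Thread.sleep(5000);   // poll at a suitable interval
 *     }
 *     if (!running.isSuccessful()) {
 *       // react to the failure, e.g. skip dependent jobs
 *     }
 * </pre></blockquote></p>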
 *
 * @see JobConf
 * @see ClusterStatus
 * @see Tool
 * @see DistributedCache
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class JobClient extends CLI {

  @InterfaceAudience.Private
  public static final String MAPREDUCE_CLIENT_RETRY_POLICY_ENABLED_KEY =
      "mapreduce.jobclient.retry.policy.enabled";
  @InterfaceAudience.Private
  public static final boolean MAPREDUCE_CLIENT_RETRY_POLICY_ENABLED_DEFAULT =
      false;
  @InterfaceAudience.Private
  public static final String MAPREDUCE_CLIENT_RETRY_POLICY_SPEC_KEY =
      "mapreduce.jobclient.retry.policy.spec";
  @InterfaceAudience.Private
  public static final String MAPREDUCE_CLIENT_RETRY_POLICY_SPEC_DEFAULT =
      "10000,6,60000,10"; // t1,n1,t2,n2,...

  public static enum TaskStatusFilter { NONE, KILLED, FAILED, SUCCEEDED, ALL }
  private TaskStatusFilter taskOutputFilter = TaskStatusFilter.FAILED;
  /* Notes that getDelegationToken was called. Again, this is a hack for Oozie
   * to make sure we add history server delegation tokens to the credentials
   * for the job. Since the API only allows one delegation token to be
   * returned, we have to add this hack.
   */
  private boolean getDelegationTokenCalled = false;
  /* Notes the renewer that will renew the delegation token. */
  private String dtRenewer = null;
  /* Do we need a HS delegation token for this client? */
  static final String HS_DELEGATION_TOKEN_REQUIRED
      = "mapreduce.history.server.delegationtoken.required";
  static final String HS_DELEGATION_TOKEN_RENEWER
      = "mapreduce.history.server.delegationtoken.renewer";

  static {
    ConfigUtil.loadResources();
  }

  /**
   * A NetworkedJob is an implementation of RunningJob. It holds
   * a JobProfile object to provide some info, and interacts with the
   * remote service to provide certain functionality.
   */
  static class NetworkedJob implements RunningJob {
    Job job;
    /**
     * We store a JobProfile and a timestamp for when we last
     * acquired the job profile. If the job is null, then we cannot
     * perform any of the tasks. The job might be null if the cluster
     * has completely forgotten about the job (e.g., 24 hours after the
     * job completes).
     */
    public NetworkedJob(JobStatus status, Cluster cluster) throws IOException {
      job = Job.getInstance(cluster, status, new JobConf(status.getJobFile()));
    }

    public NetworkedJob(Job job) throws IOException {
      this.job = job;
    }

    public Configuration getConfiguration() {
      return job.getConfiguration();
    }

    /**
     * An identifier for the job.
     */
    public JobID getID() {
      return JobID.downgrade(job.getJobID());
    }

    /** @deprecated This method is deprecated and will be removed. Applications
     * should rather use {@link #getID()}. */
    @Deprecated
    public String getJobID() {
      return getID().toString();
    }

    /**
     * The user-specified job name.
     */
    public String getJobName() {
      return job.getJobName();
    }

    /**
     * The name of the job file.
     */
    public String getJobFile() {
      return job.getJobFile();
    }

    /**
     * A URL where the job's status can be seen.
     */
    public String getTrackingURL() {
      return job.getTrackingURL();
    }

    /**
     * A float between 0.0 and 1.0, indicating the fraction of map work
     * completed.
     */
    public float mapProgress() throws IOException {
      return job.mapProgress();
    }

    /**
     * A float between 0.0 and 1.0, indicating the fraction of reduce work
     * completed.
     */
    public float reduceProgress() throws IOException {
      return job.reduceProgress();
    }

    /**
     * A float between 0.0 and 1.0, indicating the fraction of cleanup work
     * completed.
     */
    public float cleanupProgress() throws IOException {
      try {
        return job.cleanupProgress();
      } catch (InterruptedException ie) {
        throw new IOException(ie);
      }
    }

    /**
     * A float between 0.0 and 1.0, indicating the fraction of setup work
     * completed.
     */
    public float setupProgress() throws IOException {
      return job.setupProgress();
    }

    /**
     * Returns immediately whether the whole job is done yet or not.
     */
    public synchronized boolean isComplete() throws IOException {
      return job.isComplete();
    }

    /**
     * True iff job completed successfully.
     */
    public synchronized boolean isSuccessful() throws IOException {
      return job.isSuccessful();
    }

    /**
     * Blocks until the job is finished.
     */
    public void waitForCompletion() throws IOException {
      try {
        job.waitForCompletion(false);
      } catch (InterruptedException ie) {
        throw new IOException(ie);
      } catch (ClassNotFoundException ce) {
        throw new IOException(ce);
      }
    }

    /**
     * Tells the service to get the state of the current job.
     */
    public synchronized int getJobState() throws IOException {
      try {
        return job.getJobState().getValue();
      } catch (InterruptedException ie) {
        throw new IOException(ie);
      }
    }

    /**
     * Tells the service to terminate the current job.
     */
    public synchronized void killJob() throws IOException {
      job.killJob();
    }

    /**
     * Set the priority of the job.
     * @param priority new priority of the job.
     */
    public synchronized void setJobPriority(String priority)
        throws IOException {
      try {
        job.setPriority(
            org.apache.hadoop.mapreduce.JobPriority.valueOf(priority));
      } catch (InterruptedException ie) {
        throw new IOException(ie);
      }
    }

    /**
     * Kill the indicated task attempt.
     * @param taskId the id of the task to kill.
     * @param shouldFail if true the task is failed and added to the failed
     * tasks list, otherwise it is just killed, without affecting the job's
     * failure status.
     */
    public synchronized void killTask(TaskAttemptID taskId,
        boolean shouldFail) throws IOException {
      if (shouldFail) {
        job.failTask(taskId);
      } else {
        job.killTask(taskId);
      }
    }

    /** @deprecated Applications should rather use
     * {@link #killTask(TaskAttemptID, boolean)}. */
    @Deprecated
    public synchronized void killTask(String taskId, boolean shouldFail)
        throws IOException {
      killTask(TaskAttemptID.forName(taskId), shouldFail);
    }

    /**
     * Fetch task completion events from the cluster for this job, in
     * windows of ten events starting at the given offset.
     */
    public synchronized TaskCompletionEvent[] getTaskCompletionEvents(
        int startFrom) throws IOException {
      try {
        org.apache.hadoop.mapreduce.TaskCompletionEvent[] events =
            job.getTaskCompletionEvents(startFrom, 10);
        TaskCompletionEvent[] ret = new TaskCompletionEvent[events.length];
        for (int i = 0; i < events.length; i++) {
          ret[i] = TaskCompletionEvent.downgrade(events[i]);
        }
        return ret;
      } catch (InterruptedException ie) {
        throw new IOException(ie);
      }
    }
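
    /*
     * A sketch (added commentary, not part of the original class) of how a
     * caller might page through all completion events, given the ten-event
     * window returned per call:
     *
     *   int from = 0;
     *   TaskCompletionEvent[] events;
     *   do {
     *     events = runningJob.getTaskCompletionEvents(from);
     *     for (TaskCompletionEvent event : events) {
     *       System.out.println(event.getTaskAttemptId() + " "
     *           + event.getTaskStatus());
     *     }
     *     from += events.length;
     *   } while (events.length > 0);
     */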

    /**
     * Dump stats to screen.
     */
    @Override
    public String toString() {
      return job.toString();
    }

    /**
     * Returns the counters for this job.
     */
    public Counters getCounters() throws IOException {
      Counters result = null;
      org.apache.hadoop.mapreduce.Counters temp = job.getCounters();
      if (temp != null) {
        result = Counters.downgrade(temp);
      }
      return result;
    }

    @Override
    public String[] getTaskDiagnostics(TaskAttemptID id) throws IOException {
      try {
        return job.getTaskDiagnostics(id);
      } catch (InterruptedException ie) {
        throw new IOException(ie);
      }
    }

    public String getHistoryUrl() throws IOException {
      try {
        return job.getHistoryUrl();
      } catch (InterruptedException ie) {
        throw new IOException(ie);
      }
    }

    public boolean isRetired() throws IOException {
      try {
        return job.isRetired();
      } catch (InterruptedException ie) {
        throw new IOException(ie);
      }
    }

    boolean monitorAndPrintJob() throws IOException, InterruptedException {
      return job.monitorAndPrintJob();
    }

    @Override
    public String getFailureInfo() throws IOException {
      try {
        return job.getStatus().getFailureInfo();
      } catch (InterruptedException ie) {
        throw new IOException(ie);
      }
    }

    @Override
    public JobStatus getJobStatus() throws IOException {
      try {
        return JobStatus.downgrade(job.getStatus());
      } catch (InterruptedException ie) {
        throw new IOException(ie);
      }
    }
  }

  /**
   * UGI of the client. We store this UGI when the client is created and
   * then make sure that the same UGI is used to run the various protocols.
   */
  UserGroupInformation clientUgi;

  /**
   * Create a job client.
   */
  public JobClient() {
  }

  /**
   * Build a job client with the given {@link JobConf}, and connect to the
   * default cluster.
   *
   * @param conf the job configuration.
   * @throws IOException
   */
  public JobClient(JobConf conf) throws IOException {
    init(conf);
  }

  /**
   * Build a job client with the given {@link Configuration},
   * and connect to the default cluster.
   *
   * @param conf the configuration.
   * @throws IOException
   */
  public JobClient(Configuration conf) throws IOException {
    init(new JobConf(conf));
  }

  /**
   * Connect to the default cluster.
   * @param conf the job configuration.
   * @throws IOException
   */
  public void init(JobConf conf) throws IOException {
    setConf(conf);
    cluster = new Cluster(conf);
    clientUgi = UserGroupInformation.getCurrentUser();
  }

  @InterfaceAudience.Private
  public static class Renewer extends TokenRenewer {

    @Override
    public boolean handleKind(Text kind) {
      return DelegationTokenIdentifier.MAPREDUCE_DELEGATION_KIND.equals(kind);
    }

    @SuppressWarnings("unchecked")
    @Override
    public long renew(Token<?> token, Configuration conf
                      ) throws IOException, InterruptedException {
      return new Cluster(conf).
        renewDelegationToken((Token<DelegationTokenIdentifier>) token);
    }

    @SuppressWarnings("unchecked")
    @Override
    public void cancel(Token<?> token, Configuration conf
                       ) throws IOException, InterruptedException {
      new Cluster(conf).
        cancelDelegationToken((Token<DelegationTokenIdentifier>) token);
    }

    @Override
    public boolean isManaged(Token<?> token) throws IOException {
      return true;
    }
  }
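
  /*
   * Note (added commentary, hedged): Token.renew() and Token.cancel() locate
   * a TokenRenewer such as the one above through Java's ServiceLoader
   * mechanism, so this class is expected to be registered in
   * META-INF/services/org.apache.hadoop.security.token.TokenRenewer.
   */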

  /**
   * Build a job client, connect to the indicated job tracker.
   *
   * @param jobTrackAddr the job tracker to connect to.
   * @param conf configuration.
   */
  public JobClient(InetSocketAddress jobTrackAddr,
                   Configuration conf) throws IOException {
    cluster = new Cluster(jobTrackAddr, conf);
    clientUgi = UserGroupInformation.getCurrentUser();
  }

  /**
   * Close the <code>JobClient</code>.
   */
  public synchronized void close() throws IOException {
    cluster.close();
  }

  /**
   * Get a filesystem handle. We need this to prepare jobs
   * for submission to the MapReduce system.
   *
   * @return the filesystem handle.
   */
  public synchronized FileSystem getFs() throws IOException {
    try {
      return cluster.getFileSystem();
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Get a handle to the Cluster.
   */
  public Cluster getClusterHandle() {
    return cluster;
  }

  /**
   * Submit a job to the MR system.
   *
   * This returns a handle to the {@link RunningJob} which can be used to track
   * the running-job.
   *
   * @param jobFile the job configuration.
   * @return a handle to the {@link RunningJob} which can be used to track the
   *         running-job.
   * @throws FileNotFoundException
   * @throws InvalidJobConfException
   * @throws IOException
   */
  public RunningJob submitJob(String jobFile) throws FileNotFoundException,
                                                     InvalidJobConfException,
                                                     IOException {
    // Load in the submitted job details
    JobConf job = new JobConf(jobFile);
    return submitJob(job);
  }

  /**
   * Submit a job to the MR system.
   * This returns a handle to the {@link RunningJob} which can be used to track
   * the running-job.
   *
   * @param conf the job configuration.
   * @return a handle to the {@link RunningJob} which can be used to track the
   *         running-job.
   * @throws FileNotFoundException
   * @throws IOException
   */
  public RunningJob submitJob(final JobConf conf) throws FileNotFoundException,
                                                         IOException {
    return submitJobInternal(conf);
  }

  @InterfaceAudience.Private
  public RunningJob submitJobInternal(final JobConf conf)
      throws FileNotFoundException, IOException {
    try {
      conf.setBooleanIfUnset("mapred.mapper.new-api", false);
      conf.setBooleanIfUnset("mapred.reducer.new-api", false);
      if (getDelegationTokenCalled) {
        conf.setBoolean(HS_DELEGATION_TOKEN_REQUIRED,
            getDelegationTokenCalled);
        getDelegationTokenCalled = false;
        conf.set(HS_DELEGATION_TOKEN_RENEWER, dtRenewer);
        dtRenewer = null;
      }
      Job job = clientUgi.doAs(new PrivilegedExceptionAction<Job>() {
        @Override
        public Job run() throws IOException, ClassNotFoundException,
            InterruptedException {
          Job job = Job.getInstance(conf);
          job.submit();
          return job;
        }
      });
      // Update our Cluster instance with the one created by Job for submission
      // (we can't pass our Cluster instance to Job, since Job wraps the config
      // instance, and the two configs would then diverge).
      cluster = job.getCluster();
      return new NetworkedJob(job);
    } catch (InterruptedException ie) {
      throw new IOException("interrupted", ie);
    }
  }

  private Job getJobUsingCluster(final JobID jobid) throws IOException,
      InterruptedException {
    return clientUgi.doAs(new PrivilegedExceptionAction<Job>() {
      public Job run() throws IOException, InterruptedException {
        return cluster.getJob(jobid);
      }
    });
  }

  /**
   * Get a {@link RunningJob} object to track an ongoing job. Returns
   * null if the id does not correspond to any known job.
   *
   * @param jobid the jobid of the job.
   * @return the {@link RunningJob} handle to track the job, null if the
   *         <code>jobid</code> doesn't correspond to any known job.
   * @throws IOException
   */
  public RunningJob getJob(final JobID jobid) throws IOException {
    try {
      Job job = getJobUsingCluster(jobid);
      if (job != null) {
        JobStatus status = JobStatus.downgrade(job.getStatus());
        if (status != null) {
          return new NetworkedJob(status, cluster);
        }
      }
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
    return null;
  }

  /** @deprecated Applications should rather use {@link #getJob(JobID)}. */
  @Deprecated
  public RunningJob getJob(String jobid) throws IOException {
    return getJob(JobID.forName(jobid));
  }

  private static final TaskReport[] EMPTY_TASK_REPORTS = new TaskReport[0];

  /**
   * Get the information of the current state of the map tasks of a job.
   *
   * @param jobId the job to query.
   * @return the list of all of the map tips.
   * @throws IOException
   */
  public TaskReport[] getMapTaskReports(JobID jobId) throws IOException {
    return getTaskReports(jobId, TaskType.MAP);
  }

  private TaskReport[] getTaskReports(final JobID jobId, TaskType type)
      throws IOException {
    try {
      Job j = getJobUsingCluster(jobId);
      if (j == null) {
        return EMPTY_TASK_REPORTS;
      }
      return TaskReport.downgradeArray(j.getTaskReports(type));
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /** @deprecated Applications should rather use
   * {@link #getMapTaskReports(JobID)}. */
  @Deprecated
  public TaskReport[] getMapTaskReports(String jobId) throws IOException {
    return getMapTaskReports(JobID.forName(jobId));
  }

  /**
   * Get the information of the current state of the reduce tasks of a job.
   *
   * @param jobId the job to query.
   * @return the list of all of the reduce tips.
   * @throws IOException
   */
  public TaskReport[] getReduceTaskReports(JobID jobId) throws IOException {
    return getTaskReports(jobId, TaskType.REDUCE);
  }

  /**
   * Get the information of the current state of the cleanup tasks of a job.
   *
   * @param jobId the job to query.
   * @return the list of all of the cleanup tips.
   * @throws IOException
   */
  public TaskReport[] getCleanupTaskReports(JobID jobId) throws IOException {
    return getTaskReports(jobId, TaskType.JOB_CLEANUP);
  }

  /**
   * Get the information of the current state of the setup tasks of a job.
   *
   * @param jobId the job to query.
   * @return the list of all of the setup tips.
   * @throws IOException
   */
  public TaskReport[] getSetupTaskReports(JobID jobId) throws IOException {
    return getTaskReports(jobId, TaskType.JOB_SETUP);
  }

  /** @deprecated Applications should rather use
   * {@link #getReduceTaskReports(JobID)}. */
  @Deprecated
  public TaskReport[] getReduceTaskReports(String jobId) throws IOException {
    return getReduceTaskReports(JobID.forName(jobId));
  }

  /**
   * Display the information about a job's tasks of a particular type and
   * in a particular state.
   *
   * @param jobId the ID of the job
   * @param type the type of the task (map/reduce/setup/cleanup)
   * @param state the state of the task
   *        (pending/running/completed/failed/killed)
   */
  public void displayTasks(final JobID jobId, String type, String state)
      throws IOException {
    try {
      Job job = getJobUsingCluster(jobId);
      super.displayTasks(job, type, state);
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Get status information about the Map-Reduce cluster.
   *
   * @return the status information about the Map-Reduce cluster as an object
   *         of {@link ClusterStatus}.
   * @throws IOException
   */
  public ClusterStatus getClusterStatus() throws IOException {
    try {
      return clientUgi.doAs(new PrivilegedExceptionAction<ClusterStatus>() {
        public ClusterStatus run() throws IOException, InterruptedException {
          ClusterMetrics metrics = cluster.getClusterStatus();
          return new ClusterStatus(metrics.getTaskTrackerCount(),
              metrics.getBlackListedTaskTrackerCount(),
              cluster.getTaskTrackerExpiryInterval(),
              metrics.getOccupiedMapSlots(),
              metrics.getOccupiedReduceSlots(), metrics.getMapSlotCapacity(),
              metrics.getReduceSlotCapacity(),
              cluster.getJobTrackerStatus(),
              metrics.getDecommissionedTaskTrackerCount());
        }
      });
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  private Collection<String> arrayToStringList(TaskTrackerInfo[] objs) {
    Collection<String> list = new ArrayList<String>();
    for (TaskTrackerInfo info : objs) {
      list.add(info.getTaskTrackerName());
    }
    return list;
  }

  private Collection<BlackListInfo> arrayToBlackListInfo(
      TaskTrackerInfo[] objs) {
    Collection<BlackListInfo> list = new ArrayList<BlackListInfo>();
    for (TaskTrackerInfo info : objs) {
      BlackListInfo binfo = new BlackListInfo();
      binfo.setTrackerName(info.getTaskTrackerName());
      binfo.setReasonForBlackListing(info.getReasonForBlacklist());
      binfo.setBlackListReport(info.getBlacklistReport());
      list.add(binfo);
    }
    return list;
  }

  /**
   * Get status information about the Map-Reduce cluster.
   *
   * @param detailed if true then get a detailed status including the
   *        tracker names
   * @return the status information about the Map-Reduce cluster as an object
   *         of {@link ClusterStatus}.
   * @throws IOException
   */
  public ClusterStatus getClusterStatus(boolean detailed) throws IOException {
    try {
      return clientUgi.doAs(new PrivilegedExceptionAction<ClusterStatus>() {
        public ClusterStatus run() throws IOException, InterruptedException {
          ClusterMetrics metrics = cluster.getClusterStatus();
          return new ClusterStatus(
              arrayToStringList(cluster.getActiveTaskTrackers()),
              arrayToBlackListInfo(cluster.getBlackListedTaskTrackers()),
              cluster.getTaskTrackerExpiryInterval(),
              metrics.getOccupiedMapSlots(),
              metrics.getOccupiedReduceSlots(), metrics.getMapSlotCapacity(),
              metrics.getReduceSlotCapacity(),
              cluster.getJobTrackerStatus());
        }
      });
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Get the jobs that are not completed and not failed.
   *
   * @return array of {@link JobStatus} for the running/to-be-run jobs.
   * @throws IOException
   */
  public JobStatus[] jobsToComplete() throws IOException {
    List<JobStatus> stats = new ArrayList<JobStatus>();
    for (JobStatus stat : getAllJobs()) {
      if (!stat.isJobComplete()) {
        stats.add(stat);
      }
    }
    return stats.toArray(new JobStatus[0]);
  }

  /**
   * Get the jobs that are submitted.
   *
   * @return array of {@link JobStatus} for the submitted jobs.
   * @throws IOException
   */
  public JobStatus[] getAllJobs() throws IOException {
    try {
      org.apache.hadoop.mapreduce.JobStatus[] jobs =
          clientUgi.doAs(new PrivilegedExceptionAction<
              org.apache.hadoop.mapreduce.JobStatus[]>() {
            public org.apache.hadoop.mapreduce.JobStatus[] run()
                throws IOException, InterruptedException {
              return cluster.getAllJobStatuses();
            }
          });
      JobStatus[] stats = new JobStatus[jobs.length];
      for (int i = 0; i < jobs.length; i++) {
        stats[i] = JobStatus.downgrade(jobs[i]);
      }
      return stats;
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Utility that submits a job, then polls for progress until the job is
   * complete.
   *
   * @param job the job configuration.
   * @throws IOException if the job fails
   */
  public static RunningJob runJob(JobConf job) throws IOException {
    JobClient jc = new JobClient(job);
    RunningJob rj = jc.submitJob(job);
    try {
      if (!jc.monitorAndPrintJob(job, rj)) {
        throw new IOException("Job failed!");
      }
    } catch (InterruptedException ie) {
      Thread.currentThread().interrupt();
    }
    return rj;
  }

  /**
   * Monitor a job and print status in real-time as progress is made and tasks
   * fail.
   * @param conf the job's configuration
   * @param job the job to track
   * @return true if the job succeeded
   * @throws IOException if communication to the JobTracker fails
   */
  public boolean monitorAndPrintJob(JobConf conf,
                                    RunningJob job
                                    ) throws IOException,
                                             InterruptedException {
    return ((NetworkedJob) job).monitorAndPrintJob();
  }

  static String getTaskLogURL(TaskAttemptID taskId, String baseUrl) {
    return (baseUrl + "/tasklog?plaintext=true&attemptid=" + taskId);
  }

  static Configuration getConfiguration(String jobTrackerSpec) {
    Configuration conf = new Configuration();
    if (jobTrackerSpec != null) {
      if (jobTrackerSpec.indexOf(":") >= 0) {
        conf.set("mapred.job.tracker", jobTrackerSpec);
      } else {
        String classpathFile = "hadoop-" + jobTrackerSpec + ".xml";
        URL validate = conf.getResource(classpathFile);
        if (validate == null) {
          throw new RuntimeException(classpathFile
              + " not found on CLASSPATH");
        }
        conf.addResource(classpathFile);
      }
    }
    return conf;
  }
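
  /*
   * Illustrative note (added commentary, not part of the original code): a
   * spec containing a colon, e.g. getConfiguration("host:8021"), is taken as
   * a JobTracker address, while a bare name such as
   * getConfiguration("cluster1") loads a hadoop-cluster1.xml resource from
   * the classpath.
   */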

  /**
   * Sets the output filter for tasks. Only those tasks whose output matches
   * the filter are printed.
   * @param newValue task filter.
   */
  @Deprecated
  public void setTaskOutputFilter(TaskStatusFilter newValue) {
    this.taskOutputFilter = newValue;
  }

  /**
   * Get the task output filter out of the JobConf.
   *
   * @param job the JobConf to examine.
   * @return the filter level.
   */
  public static TaskStatusFilter getTaskOutputFilter(JobConf job) {
    return TaskStatusFilter.valueOf(job.get("jobclient.output.filter",
                                            "FAILED"));
  }

  /**
   * Modify the JobConf to set the task output filter.
   *
   * @param job the JobConf to modify.
   * @param newValue the value to set.
   */
  public static void setTaskOutputFilter(JobConf job,
                                         TaskStatusFilter newValue) {
    job.set("jobclient.output.filter", newValue.toString());
  }

  /**
   * Returns the task output filter.
   * @return task filter.
   */
  @Deprecated
  public TaskStatusFilter getTaskOutputFilter() {
    return this.taskOutputFilter;
  }

  protected long getCounter(org.apache.hadoop.mapreduce.Counters cntrs,
      String counterGroupName, String counterName) throws IOException {
    Counters counters = Counters.downgrade(cntrs);
    return counters.findCounter(counterGroupName, counterName).getValue();
  }

  /**
   * Get status information about the maximum available Maps in the cluster.
   *
   * @return the maximum available Maps in the cluster
   * @throws IOException
   */
  public int getDefaultMaps() throws IOException {
    try {
      return clientUgi.doAs(new PrivilegedExceptionAction<Integer>() {
        @Override
        public Integer run() throws IOException, InterruptedException {
          return cluster.getClusterStatus().getMapSlotCapacity();
        }
      });
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Get status information about the maximum available Reduces in the
   * cluster.
   *
   * @return the maximum available Reduces in the cluster
   * @throws IOException
   */
  public int getDefaultReduces() throws IOException {
    try {
      return clientUgi.doAs(new PrivilegedExceptionAction<Integer>() {
        @Override
        public Integer run() throws IOException, InterruptedException {
          return cluster.getClusterStatus().getReduceSlotCapacity();
        }
      });
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Grab the jobtracker system directory path where job-specific files are
   * to be placed.
   *
   * @return the system directory where job-specific files are to be placed.
   */
  public Path getSystemDir() {
    try {
      return clientUgi.doAs(new PrivilegedExceptionAction<Path>() {
        @Override
        public Path run() throws IOException, InterruptedException {
          return cluster.getSystemDir();
        }
      });
    } catch (IOException ioe) {
      return null;
    } catch (InterruptedException ie) {
      return null;
    }
  }

  /**
   * Checks if the job directory is clean and has all the required components
   * for (re)starting the job.
   */
  public static boolean isJobDirValid(Path jobDirPath, FileSystem fs)
      throws IOException {
    FileStatus[] contents = fs.listStatus(jobDirPath);
    int matchCount = 0;
    if (contents != null && contents.length >= 2) {
      for (FileStatus status : contents) {
        if ("job.xml".equals(status.getPath().getName())) {
          ++matchCount;
        }
        if ("job.split".equals(status.getPath().getName())) {
          ++matchCount;
        }
      }
      if (matchCount == 2) {
        return true;
      }
    }
    return false;
  }
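
  /*
   * Usage sketch (added commentary; sysDir and jobId are hypothetical names):
   * check a job directory before attempting a restart.
   *
   *   FileSystem fs = FileSystem.get(conf);
   *   Path jobDir = new Path(sysDir, jobId);
   *   if (JobClient.isJobDirValid(jobDir, fs)) {
   *     // both job.xml and job.split are present; safe to (re)start
   *   }
   */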

  /**
   * Fetch the staging area directory for the application.
   *
   * @return path to staging area directory
   * @throws IOException
   */
  public Path getStagingAreaDir() throws IOException {
    try {
      return clientUgi.doAs(new PrivilegedExceptionAction<Path>() {
        @Override
        public Path run() throws IOException, InterruptedException {
          return cluster.getStagingAreaDir();
        }
      });
    } catch (InterruptedException ie) {
      // throw RuntimeException instead for compatibility reasons
      throw new RuntimeException(ie);
    }
  }

  private JobQueueInfo getJobQueueInfo(QueueInfo queue) {
    JobQueueInfo ret = new JobQueueInfo(queue);
    // make sure to convert any children
    if (!queue.getQueueChildren().isEmpty()) {
      List<JobQueueInfo> childQueues = new ArrayList<JobQueueInfo>(
          queue.getQueueChildren().size());
      for (QueueInfo child : queue.getQueueChildren()) {
        childQueues.add(getJobQueueInfo(child));
      }
      ret.setChildren(childQueues);
    }
    return ret;
  }

  private JobQueueInfo[] getJobQueueInfoArray(QueueInfo[] queues)
      throws IOException {
    JobQueueInfo[] ret = new JobQueueInfo[queues.length];
    for (int i = 0; i < queues.length; i++) {
      ret[i] = getJobQueueInfo(queues[i]);
    }
    return ret;
  }

  /**
   * Returns an array of queue information objects about the configured
   * root-level queues.
   *
   * @return the array of root-level JobQueueInfo objects
   * @throws IOException
   */
  public JobQueueInfo[] getRootQueues() throws IOException {
    try {
      return clientUgi.doAs(new PrivilegedExceptionAction<JobQueueInfo[]>() {
        public JobQueueInfo[] run() throws IOException, InterruptedException {
          return getJobQueueInfoArray(cluster.getRootQueues());
        }
      });
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Returns an array of queue information objects about the immediate
   * children of queue queueName.
   *
   * @param queueName the name of the parent queue.
   * @return the array of immediate-children JobQueueInfo objects
   * @throws IOException
   */
  public JobQueueInfo[] getChildQueues(final String queueName)
      throws IOException {
    try {
      return clientUgi.doAs(new PrivilegedExceptionAction<JobQueueInfo[]>() {
        public JobQueueInfo[] run() throws IOException, InterruptedException {
          return getJobQueueInfoArray(cluster.getChildQueues(queueName));
        }
      });
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Return an array of queue information objects about all the Job Queues
   * configured.
   *
   * @return array of JobQueueInfo objects
   * @throws IOException
   */
  public JobQueueInfo[] getQueues() throws IOException {
    try {
      return clientUgi.doAs(new PrivilegedExceptionAction<JobQueueInfo[]>() {
        public JobQueueInfo[] run() throws IOException, InterruptedException {
          return getJobQueueInfoArray(cluster.getQueues());
        }
      });
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Gets all the jobs which were added to a particular Job Queue.
   *
   * @param queueName name of the Job Queue
   * @return array of jobs present in the job queue
   * @throws IOException
   */
  public JobStatus[] getJobsFromQueue(final String queueName)
      throws IOException {
    try {
      QueueInfo queue = clientUgi.doAs(
          new PrivilegedExceptionAction<QueueInfo>() {
        @Override
        public QueueInfo run() throws IOException, InterruptedException {
          return cluster.getQueue(queueName);
        }
      });
      if (queue == null) {
        return null;
      }
      org.apache.hadoop.mapreduce.JobStatus[] stats =
          queue.getJobStatuses();
      JobStatus[] ret = new JobStatus[stats.length];
      for (int i = 0; i < stats.length; i++) {
        ret[i] = JobStatus.downgrade(stats[i]);
      }
      return ret;
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }
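
  /*
   * Usage sketch (added commentary; "default" is an assumed queue name):
   * list the jobs currently in a queue.
   *
   *   JobStatus[] jobs = client.getJobsFromQueue("default");
   *   if (jobs != null) {
   *     for (JobStatus status : jobs) {
   *       System.out.println(status.getJobID() + " " + status.getRunState());
   *     }
   *   }
   */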

  /**
   * Gets the queue information associated with a particular Job Queue.
   *
   * @param queueName name of the job queue.
   * @return queue information associated with the particular queue.
   * @throws IOException
   */
  public JobQueueInfo getQueueInfo(final String queueName)
      throws IOException {
    try {
      QueueInfo queueInfo = clientUgi.doAs(new
          PrivilegedExceptionAction<QueueInfo>() {
        public QueueInfo run() throws IOException, InterruptedException {
          return cluster.getQueue(queueName);
        }
      });
      if (queueInfo != null) {
        return new JobQueueInfo(queueInfo);
      }
      return null;
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Gets the Queue ACLs for the current user.
   * @return array of QueueAclsInfo objects for the current user.
   * @throws IOException
   */
  public QueueAclsInfo[] getQueueAclsForCurrentUser() throws IOException {
    try {
      org.apache.hadoop.mapreduce.QueueAclsInfo[] acls =
          clientUgi.doAs(new PrivilegedExceptionAction
              <org.apache.hadoop.mapreduce.QueueAclsInfo[]>() {
            public org.apache.hadoop.mapreduce.QueueAclsInfo[] run()
                throws IOException, InterruptedException {
              return cluster.getQueueAclsForCurrentUser();
            }
          });
      QueueAclsInfo[] ret = new QueueAclsInfo[acls.length];
      for (int i = 0; i < acls.length; i++) {
        ret[i] = QueueAclsInfo.downgrade(acls[i]);
      }
      return ret;
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }

  /**
   * Get a delegation token for the user from the JobTracker.
   * @param renewer the user who can renew the token
   * @return the new token
   * @throws IOException
   */
  public Token<DelegationTokenIdentifier>
      getDelegationToken(final Text renewer)
      throws IOException, InterruptedException {
    getDelegationTokenCalled = true;
    dtRenewer = renewer.toString();
    return clientUgi.doAs(new
        PrivilegedExceptionAction<Token<DelegationTokenIdentifier>>() {
      public Token<DelegationTokenIdentifier> run() throws IOException,
          InterruptedException {
        return cluster.getDelegationToken(renewer);
      }
    });
  }

  /**
   * Renew a delegation token.
   * @param token the token to renew
   * @return the new expiration time
   * @throws InvalidToken
   * @throws IOException
   * @deprecated Use {@link Token#renew} instead
   */
  public long renewDelegationToken(Token<DelegationTokenIdentifier> token
                                   ) throws InvalidToken, IOException,
                                            InterruptedException {
    return token.renew(getConf());
  }

  /**
   * Cancel a delegation token from the JobTracker.
   * @param token the token to cancel
   * @throws IOException
   * @deprecated Use {@link Token#cancel} instead
   */
  public void cancelDelegationToken(Token<DelegationTokenIdentifier> token
                                    ) throws InvalidToken, IOException,
                                             InterruptedException {
    token.cancel(getConf());
  }

  /**
   * Run the <code>JobClient</code> as a command-line tool.
   */
  public static void main(String[] argv) throws Exception {
    int res = ToolRunner.run(new JobClient(), argv);
    System.exit(res);
  }
}