001 /** 002 * Copyright (c) 2010 Yahoo! Inc. All rights reserved. 003 * Licensed under the Apache License, Version 2.0 (the "License"); 004 * you may not use this file except in compliance with the License. 005 * You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software 010 * distributed under the License is distributed on an "AS IS" BASIS, 011 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 012 * See the License for the specific language governing permissions and 013 * limitations under the License. See accompanying LICENSE file. 014 */ 015 package org.apache.oozie.command.coord; 016 017 import java.io.IOException; 018 import java.io.StringReader; 019 import java.util.ArrayList; 020 import java.util.Date; 021 import java.util.HashSet; 022 import java.util.List; 023 import java.util.Set; 024 025 import org.apache.hadoop.conf.Configuration; 026 import org.apache.hadoop.fs.Path; 027 import org.apache.oozie.CoordinatorActionBean; 028 import org.apache.oozie.CoordinatorActionInfo; 029 import org.apache.oozie.CoordinatorJobBean; 030 import org.apache.oozie.ErrorCode; 031 import org.apache.oozie.XException; 032 import org.apache.oozie.action.ActionExecutorException; 033 import org.apache.oozie.action.hadoop.FsActionExecutor; 034 import org.apache.oozie.client.CoordinatorAction; 035 import org.apache.oozie.client.CoordinatorJob; 036 import org.apache.oozie.client.Job; 037 import org.apache.oozie.client.SLAEvent.SlaAppType; 038 import org.apache.oozie.client.rest.RestConstants; 039 import org.apache.oozie.command.CommandException; 040 import org.apache.oozie.command.PreconditionException; 041 import org.apache.oozie.command.RerunTransitionXCommand; 042 import org.apache.oozie.command.bundle.BundleStatusUpdateXCommand; 043 import org.apache.oozie.coord.CoordELFunctions; 044 import org.apache.oozie.executor.jpa.CoordActionGetJPAExecutor; 045 import org.apache.oozie.executor.jpa.CoordJobGetActionForNominalTimeJPAExecutor; 046 import org.apache.oozie.executor.jpa.CoordJobGetActionsForDatesJPAExecutor; 047 import org.apache.oozie.executor.jpa.CoordJobGetJPAExecutor; 048 import org.apache.oozie.executor.jpa.CoordJobUpdateJPAExecutor; 049 import org.apache.oozie.executor.jpa.JPAExecutorException; 050 import org.apache.oozie.service.JPAService; 051 import org.apache.oozie.service.Services; 052 import org.apache.oozie.util.DateUtils; 053 import org.apache.oozie.util.InstrumentUtils; 054 import org.apache.oozie.util.LogUtils; 055 import org.apache.oozie.util.ParamChecker; 056 import org.apache.oozie.util.StatusUtils; 057 import org.apache.oozie.util.XConfiguration; 058 import org.apache.oozie.util.XLog; 059 import org.apache.oozie.util.XmlUtils; 060 import org.apache.oozie.util.db.SLADbOperations; 061 import org.jdom.Element; 062 import org.jdom.JDOMException; 063 064 /** 065 * Rerun coordinator actions by a list of dates or ids. User can specify if refresh or noCleanup. 066 * <p/> 067 * The "rerunType" can be set as {@link RestConstants.JOB_COORD_RERUN_DATE} or 068 * {@link RestConstants.JOB_COORD_RERUN_ACTION}. 069 * <p/> 070 * The "refresh" is used to indicate if user wants to refresh an action's input and output events. 071 * <p/> 072 * The "noCleanup" is used to indicate if user wants to cleanup output events for given rerun actions 073 */ 074 public class CoordRerunXCommand extends RerunTransitionXCommand<CoordinatorActionInfo> { 075 076 private String rerunType; 077 private String scope; 078 private boolean refresh; 079 private boolean noCleanup; 080 private CoordinatorJobBean coordJob = null; 081 private JPAService jpaService = null; 082 protected boolean prevPending; 083 084 /** 085 * The constructor for class {@link CoordRerunXCommand} 086 * 087 * @param jobId the job id 088 * @param rerunType rerun type {@link RestConstants.JOB_COORD_RERUN_DATE} or {@link RestConstants.JOB_COORD_RERUN_ACTION} 089 * @param scope the rerun scope for given rerunType separated by "," 090 * @param refresh true if user wants to refresh input/output dataset urls 091 * @param noCleanup false if user wants to cleanup output events for given rerun actions 092 */ 093 public CoordRerunXCommand(String jobId, String rerunType, String scope, boolean refresh, boolean noCleanup) { 094 super("coord_rerun", "coord_rerun", 1); 095 this.jobId = ParamChecker.notEmpty(jobId, "jobId"); 096 this.rerunType = ParamChecker.notEmpty(rerunType, "rerunType"); 097 this.scope = ParamChecker.notEmpty(scope, "scope"); 098 this.refresh = refresh; 099 this.noCleanup = noCleanup; 100 } 101 102 /** 103 * Get the list of actions for given id ranges 104 * 105 * @param jobId coordinator job id 106 * @param scope the id range to rerun separated by "," 107 * @return the list of all actions to rerun 108 * @throws CommandException thrown if failed to get coordinator actions by given id range 109 */ 110 private List<CoordinatorActionBean> getCoordActionsFromIds(String jobId, String scope) throws CommandException { 111 ParamChecker.notEmpty(jobId, "jobId"); 112 ParamChecker.notEmpty(scope, "scope"); 113 114 Set<String> actions = new HashSet<String>(); 115 String[] list = scope.split(","); 116 for (String s : list) { 117 s = s.trim(); 118 if (s.contains("-")) { 119 String[] range = s.split("-"); 120 if (range.length != 2) { 121 throw new CommandException(ErrorCode.E0302, "format is wrong for action's range '" + s + "'"); 122 } 123 int start; 124 int end; 125 try { 126 start = Integer.parseInt(range[0].trim()); 127 end = Integer.parseInt(range[1].trim()); 128 if (start > end) { 129 throw new CommandException(ErrorCode.E0302, "format is wrong for action's range '" + s + "'"); 130 } 131 } 132 catch (NumberFormatException ne) { 133 throw new CommandException(ErrorCode.E0302, ne); 134 } 135 for (int i = start; i <= end; i++) { 136 actions.add(jobId + "@" + i); 137 } 138 } 139 else { 140 try { 141 Integer.parseInt(s); 142 } 143 catch (NumberFormatException ne) { 144 throw new CommandException(ErrorCode.E0302, "format is wrong for action id'" + s 145 + "'. Integer only."); 146 } 147 actions.add(jobId + "@" + s); 148 } 149 } 150 151 List<CoordinatorActionBean> coordActions = new ArrayList<CoordinatorActionBean>(); 152 for (String id : actions) { 153 CoordinatorActionBean coordAction; 154 try { 155 coordAction = jpaService.execute(new CoordActionGetJPAExecutor(id)); 156 } 157 catch (JPAExecutorException je) { 158 throw new CommandException(je); 159 } 160 coordActions.add(coordAction); 161 LOG.debug("Rerun coordinator for actionId='" + id + "'"); 162 } 163 return coordActions; 164 } 165 166 /** 167 * Get the list of actions for given date ranges 168 * 169 * @param jobId coordinator job id 170 * @param scope the date range to rerun separated by "," 171 * @return the list of dates to rerun 172 * @throws CommandException thrown if failed to get coordinator actions by given date range 173 */ 174 private List<CoordinatorActionBean> getCoordActionsFromDates(String jobId, String scope) throws CommandException { 175 ParamChecker.notEmpty(jobId, "jobId"); 176 ParamChecker.notEmpty(scope, "scope"); 177 178 Set<CoordinatorActionBean> actionSet = new HashSet<CoordinatorActionBean>(); 179 String[] list = scope.split(","); 180 for (String s : list) { 181 s = s.trim(); 182 if (s.contains("::")) { 183 String[] dateRange = s.split("::"); 184 if (dateRange.length != 2) { 185 throw new CommandException(ErrorCode.E0302, "format is wrong for date's range '" + s + "'"); 186 } 187 Date start; 188 Date end; 189 try { 190 start = DateUtils.parseDateUTC(dateRange[0].trim()); 191 end = DateUtils.parseDateUTC(dateRange[1].trim()); 192 if (start.after(end)) { 193 throw new CommandException(ErrorCode.E0302, "start date is older than end date: '" + s + "'"); 194 } 195 } 196 catch (Exception e) { 197 throw new CommandException(ErrorCode.E0302, e); 198 } 199 200 List<CoordinatorActionBean> listOfActions = getActionIdsFromDateRange(jobId, start, end); 201 actionSet.addAll(listOfActions); 202 } 203 else { 204 try { 205 Date date = DateUtils.parseDateUTC(s.trim()); 206 CoordinatorActionBean coordAction = jpaService 207 .execute(new CoordJobGetActionForNominalTimeJPAExecutor(jobId, date)); 208 actionSet.add(coordAction); 209 } 210 catch (JPAExecutorException e) { 211 throw new CommandException(e); 212 } 213 catch (Exception e) { 214 throw new CommandException(ErrorCode.E0302, e); 215 } 216 } 217 } 218 219 List<CoordinatorActionBean> coordActions = new ArrayList<CoordinatorActionBean>(); 220 for (CoordinatorActionBean coordAction : actionSet) { 221 coordActions.add(coordAction); 222 LOG.debug("Rerun coordinator for actionId='" + coordAction.getId() + "'"); 223 } 224 return coordActions; 225 } 226 227 /** 228 * Get coordinator action ids between given start and end time 229 * 230 * @param jobId coordinator job id 231 * @param start start time 232 * @param end end time 233 * @return a list of coordinator actions belong to the range of start and end time 234 * @throws CommandException thrown if failed to get coordinator actions 235 */ 236 private List<CoordinatorActionBean> getActionIdsFromDateRange(String jobId, Date start, Date end) 237 throws CommandException { 238 List<CoordinatorActionBean> list; 239 try { 240 list = jpaService.execute(new CoordJobGetActionsForDatesJPAExecutor(jobId, start, end)); 241 } 242 catch (JPAExecutorException je) { 243 throw new CommandException(je); 244 } 245 return list; 246 } 247 248 /** 249 * Check if all given actions are eligible to rerun. 250 * 251 * @param actions list of CoordinatorActionBean 252 * @return true if all actions are eligible to rerun 253 */ 254 private boolean checkAllActionsRunnable(List<CoordinatorActionBean> coordActions) { 255 boolean ret = false; 256 for (CoordinatorActionBean coordAction : coordActions) { 257 ret = true; 258 if (!coordAction.isTerminalStatus()) { 259 ret = false; 260 break; 261 } 262 } 263 return ret; 264 } 265 266 /** 267 * Cleanup output-events directories 268 * 269 * @param eAction coordinator action xml 270 * @param user user name 271 * @param group group name 272 */ 273 @SuppressWarnings("unchecked") 274 private void cleanupOutputEvents(Element eAction, String user, String group) { 275 Element outputList = eAction.getChild("output-events", eAction.getNamespace()); 276 if (outputList != null) { 277 for (Element data : (List<Element>) outputList.getChildren("data-out", eAction.getNamespace())) { 278 if (data.getChild("uris", data.getNamespace()) != null) { 279 String uris = data.getChild("uris", data.getNamespace()).getTextTrim(); 280 if (uris != null) { 281 String[] uriArr = uris.split(CoordELFunctions.INSTANCE_SEPARATOR); 282 FsActionExecutor fsAe = new FsActionExecutor(); 283 for (String uri : uriArr) { 284 Path path = new Path(uri); 285 try { 286 fsAe.delete(user, group, path); 287 LOG.debug("Cleanup the output dir " + path); 288 } 289 catch (ActionExecutorException ae) { 290 LOG.warn("Failed to cleanup the output dir " + uri, ae); 291 } 292 } 293 } 294 295 } 296 } 297 } 298 else { 299 LOG.info("No output-events defined in coordinator xml. Therefore nothing to cleanup"); 300 } 301 } 302 303 /** 304 * Refresh an action's input and ouput events. 305 * 306 * @param coordJob coordinator job bean 307 * @param coordAction coordinator action bean 308 * @throws Exception thrown if failed to materialize coordinator action 309 */ 310 private void refreshAction(CoordinatorJobBean coordJob, CoordinatorActionBean coordAction) throws Exception { 311 Configuration jobConf = null; 312 try { 313 jobConf = new XConfiguration(new StringReader(coordJob.getConf())); 314 } 315 catch (IOException ioe) { 316 LOG.warn("Configuration parse error. read from DB :" + coordJob.getConf(), ioe); 317 throw new CommandException(ErrorCode.E1005, ioe); 318 } 319 String jobXml = coordJob.getJobXml(); 320 Element eJob = XmlUtils.parseXml(jobXml); 321 Date actualTime = new Date(); 322 String actionXml = CoordCommandUtils.materializeOneInstance(jobId, dryrun, (Element) eJob.clone(), coordAction 323 .getNominalTime(), actualTime, coordAction.getActionNumber(), jobConf, coordAction); 324 LOG.debug("Refresh Action actionId=" + coordAction.getId() + ", actionXml=" 325 + XmlUtils.prettyPrint(actionXml).toString()); 326 coordAction.setActionXml(actionXml); 327 } 328 329 /** 330 * Update an action into database table 331 * 332 * @param coordJob coordinator job bean 333 * @param coordAction coordinator action bean 334 * @param actionXml coordinator action xml 335 * @throws Exception thrown failed to update coordinator action bean or unable to write sla registration event 336 */ 337 private void updateAction(CoordinatorJobBean coordJob, CoordinatorActionBean coordAction, String actionXml) 338 throws Exception { 339 LOG.debug("updateAction for actionId=" + coordAction.getId()); 340 if (coordAction.getStatus() == CoordinatorAction.Status.TIMEDOUT) { 341 LOG.debug("Updating created time for TIMEDOUT action id =" + coordAction.getId()); 342 coordAction.setCreatedTime(new Date()); 343 } 344 coordAction.setStatus(CoordinatorAction.Status.WAITING); 345 coordAction.setExternalId(""); 346 coordAction.setExternalStatus(""); 347 coordAction.setRerunTime(new Date()); 348 coordAction.setLastModifiedTime(new Date()); 349 jpaService.execute(new org.apache.oozie.executor.jpa.CoordActionUpdateJPAExecutor(coordAction)); 350 writeActionRegistration(coordAction.getActionXml(), coordAction, coordJob.getUser(), coordJob.getGroup()); 351 } 352 353 /** 354 * Create SLA RegistrationEvent 355 * 356 * @param actionXml action xml 357 * @param actionBean coordinator action bean 358 * @param user user name 359 * @param group group name 360 * @throws Exception thrown if unable to write sla registration event 361 */ 362 private void writeActionRegistration(String actionXml, CoordinatorActionBean actionBean, String user, String group) 363 throws Exception { 364 Element eAction = XmlUtils.parseXml(actionXml); 365 Element eSla = eAction.getChild("action", eAction.getNamespace()).getChild("info", eAction.getNamespace("sla")); 366 SLADbOperations.writeSlaRegistrationEvent(eSla, actionBean.getId(), SlaAppType.COORDINATOR_ACTION, user, group, 367 LOG); 368 } 369 370 /* (non-Javadoc) 371 * @see org.apache.oozie.command.XCommand#getEntityKey() 372 */ 373 @Override 374 protected String getEntityKey() { 375 return jobId; 376 } 377 378 /* (non-Javadoc) 379 * @see org.apache.oozie.command.XCommand#isLockRequired() 380 */ 381 @Override 382 protected boolean isLockRequired() { 383 return true; 384 } 385 386 /* (non-Javadoc) 387 * @see org.apache.oozie.command.XCommand#loadState() 388 */ 389 @Override 390 protected void loadState() throws CommandException { 391 jpaService = Services.get().get(JPAService.class); 392 if (jpaService == null) { 393 throw new CommandException(ErrorCode.E0610); 394 } 395 try { 396 coordJob = jpaService.execute(new CoordJobGetJPAExecutor(jobId)); 397 prevPending = coordJob.isPending(); 398 } 399 catch (JPAExecutorException je) { 400 throw new CommandException(je); 401 } 402 LogUtils.setLogInfo(coordJob, logInfo); 403 } 404 405 /* (non-Javadoc) 406 * @see org.apache.oozie.command.XCommand#verifyPrecondition() 407 */ 408 @Override 409 protected void verifyPrecondition() throws CommandException, PreconditionException { 410 if (coordJob.getStatus() == CoordinatorJob.Status.KILLED 411 || coordJob.getStatus() == CoordinatorJob.Status.FAILED) { 412 LOG.info("CoordRerunXCommand is not able to run, job status=" + coordJob.getStatus() + ", jobid=" + jobId); 413 throw new CommandException(ErrorCode.E1018, 414 "coordinator job is killed or failed so all actions are not eligible to rerun!"); 415 } 416 417 // no actioins have been created for PREP job 418 if (coordJob.getStatus() == CoordinatorJob.Status.PREP) { 419 LOG.info("CoordRerunXCommand is not able to run, job status=" + coordJob.getStatus() + ", jobid=" + jobId); 420 throw new CommandException(ErrorCode.E1018, 421 "coordinator job is PREP so no actions are materialized to rerun!"); 422 } 423 } 424 425 @Override 426 protected void eagerVerifyPrecondition() throws CommandException, PreconditionException { 427 verifyPrecondition(); 428 } 429 430 @Override 431 public void rerunChildren() throws CommandException { 432 boolean isError = false; 433 try { 434 CoordinatorActionInfo coordInfo = null; 435 InstrumentUtils.incrJobCounter(getName(), 1, getInstrumentation()); 436 List<CoordinatorActionBean> coordActions; 437 if (rerunType.equals(RestConstants.JOB_COORD_RERUN_DATE)) { 438 coordActions = getCoordActionsFromDates(jobId, scope); 439 } 440 else if (rerunType.equals(RestConstants.JOB_COORD_RERUN_ACTION)) { 441 coordActions = getCoordActionsFromIds(jobId, scope); 442 } 443 else { 444 isError = true; 445 throw new CommandException(ErrorCode.E1018, "date or action expected."); 446 } 447 if (checkAllActionsRunnable(coordActions)) { 448 for (CoordinatorActionBean coordAction : coordActions) { 449 String actionXml = coordAction.getActionXml(); 450 if (!noCleanup) { 451 Element eAction = XmlUtils.parseXml(actionXml); 452 cleanupOutputEvents(eAction, coordJob.getUser(), coordJob.getGroup()); 453 } 454 if (refresh) { 455 refreshAction(coordJob, coordAction); 456 } 457 updateAction(coordJob, coordAction, actionXml); 458 459 queue(new CoordActionNotificationXCommand(coordAction), 100); 460 queue(new CoordActionInputCheckXCommand(coordAction.getId()), 100); 461 } 462 } 463 else { 464 isError = true; 465 throw new CommandException(ErrorCode.E1018, "part or all actions are not eligible to rerun!"); 466 } 467 coordInfo = new CoordinatorActionInfo(coordActions); 468 469 ret = coordInfo; 470 } 471 catch (XException xex) { 472 isError = true; 473 throw new CommandException(xex); 474 } 475 catch (JDOMException jex) { 476 isError = true; 477 throw new CommandException(ErrorCode.E0700, jex); 478 } 479 catch (Exception ex) { 480 isError = true; 481 throw new CommandException(ErrorCode.E1018, ex); 482 } 483 finally{ 484 if(isError){ 485 transitToPrevious(); 486 } 487 } 488 } 489 490 /* 491 * (non-Javadoc) 492 * @see org.apache.oozie.command.TransitionXCommand#getJob() 493 */ 494 @Override 495 public Job getJob() { 496 return coordJob; 497 } 498 499 @Override 500 public void notifyParent() throws CommandException { 501 //update bundle action 502 if (getPrevStatus() != null && coordJob.getBundleId() != null) { 503 BundleStatusUpdateXCommand bundleStatusUpdate = new BundleStatusUpdateXCommand(coordJob, getPrevStatus()); 504 bundleStatusUpdate.call(); 505 } 506 } 507 508 @Override 509 public void updateJob() throws CommandException { 510 try { 511 // rerun a paused coordinator job will keep job status at paused and pending at previous pending 512 if (getPrevStatus()!= null && getPrevStatus().equals(Job.Status.PAUSED)) { 513 coordJob.setStatus(Job.Status.PAUSED); 514 if (prevPending) { 515 coordJob.setPending(); 516 } else { 517 coordJob.resetPending(); 518 } 519 } 520 521 jpaService.execute(new CoordJobUpdateJPAExecutor(coordJob)); 522 } 523 catch (JPAExecutorException je) { 524 throw new CommandException(je); 525 } 526 } 527 528 /* (non-Javadoc) 529 * @see org.apache.oozie.command.RerunTransitionXCommand#getLog() 530 */ 531 @Override 532 public XLog getLog() { 533 return LOG; 534 } 535 536 @Override 537 public final void transitToNext() { 538 prevStatus = coordJob.getStatus(); 539 coordJob.setStatus(Job.Status.RUNNING); 540 // used for backward support of coordinator 0.1 schema 541 coordJob.setStatus(StatusUtils.getStatusForCoordRerun(coordJob, prevStatus)); 542 coordJob.setPending(); 543 } 544 545 private final void transitToPrevious() throws CommandException { 546 coordJob.setStatus(getPrevStatus()); 547 if (!prevPending) { 548 coordJob.resetPending(); 549 } 550 else { 551 coordJob.setPending(); 552 } 553 } 554 }