/**
 * Copyright (c) 2010 Yahoo! Inc. All rights reserved.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License. See accompanying LICENSE file.
 */
package org.apache.oozie.service;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.oozie.BundleActionBean;
import org.apache.oozie.BundleJobBean;
import org.apache.oozie.CoordinatorActionBean;
import org.apache.oozie.CoordinatorJobBean;
import org.apache.oozie.ErrorCode;
import org.apache.oozie.WorkflowActionBean;
import org.apache.oozie.client.Job;
import org.apache.oozie.client.OozieClient;
import org.apache.oozie.command.CommandException;
import org.apache.oozie.command.coord.CoordActionInputCheckCommand;
import org.apache.oozie.command.coord.CoordActionInputCheckXCommand;
import org.apache.oozie.command.coord.CoordActionReadyCommand;
import org.apache.oozie.command.coord.CoordActionReadyXCommand;
import org.apache.oozie.command.coord.CoordActionStartCommand;
import org.apache.oozie.command.coord.CoordActionStartXCommand;
import org.apache.oozie.command.coord.CoordKillXCommand;
import org.apache.oozie.command.coord.CoordResumeXCommand;
import org.apache.oozie.command.coord.CoordSubmitXCommand;
import org.apache.oozie.command.coord.CoordSuspendXCommand;
import org.apache.oozie.command.wf.ActionEndCommand;
import org.apache.oozie.command.wf.ActionEndXCommand;
import org.apache.oozie.command.wf.ActionStartCommand;
import org.apache.oozie.command.wf.ActionStartXCommand;
import org.apache.oozie.command.wf.KillXCommand;
import org.apache.oozie.command.wf.ResumeXCommand;
import org.apache.oozie.command.wf.SignalCommand;
import org.apache.oozie.command.wf.SignalXCommand;
import org.apache.oozie.command.wf.SuspendXCommand;
import org.apache.oozie.executor.jpa.BundleActionsGetWaitingOlderJPAExecutor;
import org.apache.oozie.executor.jpa.BundleJobGetJPAExecutor;
import org.apache.oozie.executor.jpa.CoordActionsGetForRecoveryJPAExecutor;
import org.apache.oozie.executor.jpa.CoordActionsGetReadyGroupbyJobIDJPAExecutor;
import org.apache.oozie.executor.jpa.CoordJobGetJPAExecutor;
import org.apache.oozie.executor.jpa.JPAExecutorException;
import org.apache.oozie.executor.jpa.WorkflowActionsGetPendingJPAExecutor;
import org.apache.oozie.util.JobUtils;
import org.apache.oozie.util.XCallable;
import org.apache.oozie.util.XConfiguration;
import org.apache.oozie.util.XLog;
import org.apache.oozie.util.XmlUtils;
import org.jdom.Attribute;
import org.jdom.Element;
import org.jdom.JDOMException;

/**
 * The Recovery Service periodically looks up workflow actions, coordinator actions and bundle actions that are older
 * than a configured age and queues the commands needed to move them forward again.
 */
public class RecoveryService implements Service {

    public static final String CONF_PREFIX = Service.CONF_PREFIX + "RecoveryService.";
    public static final String CONF_PREFIX_WF_ACTIONS = Service.CONF_PREFIX + "wf.actions.";
    public static final String CONF_PREFIX_COORD = Service.CONF_PREFIX + "coord.";
    public static final String CONF_PREFIX_BUNDLE = Service.CONF_PREFIX + "bundle.";
    /**
     * Time interval, in seconds, at which the recovery service will be scheduled to run.
     */
    public static final String CONF_SERVICE_INTERVAL = CONF_PREFIX + "interval";
    /**
     * The number of callables to be queued in a batch.
     */
    public static final String CONF_CALLABLE_BATCH_SIZE = CONF_PREFIX + "callable.batch.size";
    /**
     * Age of actions to queue, in seconds.
     */
    public static final String CONF_WF_ACTIONS_OLDER_THAN = CONF_PREFIX_WF_ACTIONS + "older.than";
    /**
     * Age of coordinator jobs to recover, in seconds.
     */
    public static final String CONF_COORD_OLDER_THAN = CONF_PREFIX_COORD + "older.than";

    /**
     * Age of Bundle jobs to recover, in seconds.
     */
    public static final String CONF_BUNDLE_OLDER_THAN = CONF_PREFIX_BUNDLE + "older.than";

    private static final String INSTRUMENTATION_GROUP = "recovery";
    private static final String INSTR_RECOVERED_ACTIONS_COUNTER = "actions";
    private static final String INSTR_RECOVERED_COORD_ACTIONS_COUNTER = "coord_actions";
    private static final String INSTR_RECOVERED_BUNDLE_ACTIONS_COUNTER = "bundle_actions";

    /**
     * Selects between the XCommand implementations (<code>true</code>) and the legacy Command implementations
     * (<code>false</code>). Assigned from the configuration in {@link #init(Services)}.
     */
    private static boolean useXCommand = true;

    /**
     * RecoveryRunnable is the Runnable which is scheduled to run with the configured interval, and takes care of the
     * queuing of commands.
     * <p/>
     * The pending batches, the accumulated delay and the log message are kept in plain instance fields without any
     * synchronization.
     */
    static class RecoveryRunnable implements Runnable {
        private final long olderThan;
        private final long coordOlderThan;
        private final long bundleOlderThan;
        private long delay = 0;
        private List<XCallable<?>> callables;
        private List<XCallable<?>> delayedCallables;
        private StringBuilder msg = null;
        private JPAService jpaService = null;

        /**
         * Creates the runnable.
         *
         * @param olderThan age threshold handed to the pending workflow actions query.
         * @param coordOlderThan age threshold handed to the coordinator action queries.
         * @param bundleOlderThan age threshold handed to the bundle action query.
         */
        public RecoveryRunnable(long olderThan, long coordOlderThan, long bundleOlderThan) {
            this.olderThan = olderThan;
            this.coordOlderThan = coordOlderThan;
            this.bundleOlderThan = bundleOlderThan;
        }

        /**
         * Runs one recovery pass: workflow actions, coordinator actions, READY coordinator actions and bundle
         * actions, in that order, and then queues whatever is left in the partially filled batches.
         */
        @Override
        public void run() {
            XLog.Info.get().clear();
            XLog log = XLog.getLog(getClass());
            msg = new StringBuilder();
            jpaService = Services.get().get(JPAService.class);
            runWFRecovery();
            runCoordActionRecovery();
            runCoordActionRecoveryForReady();
            runBundleRecovery();
            log.debug("QUEUING [{0}] for potential recovery", msg.toString());
            // Flush the remainders that did not reach the batch size during the pass.
            if (callables != null) {
                flushCallables();
                callables = null;
            }
            if (delayedCallables != null) {
                flushDelayedCallables();
                delayedCallables = null;
                this.delay = 0;
            }
        }

        /**
         * Recover bundle actions returned by the waiting-older query, issuing the coordinator command that matches
         * the status of each bundle action.
         */
        private void runBundleRecovery() {
            XLog.Info.get().clear();
            XLog log = XLog.getLog(getClass());

            try {
                List<BundleActionBean> bactions = jpaService.execute(new BundleActionsGetWaitingOlderJPAExecutor(
                        bundleOlderThan));
                msg.append(", BUNDLE_ACTIONS : " + bactions.size());
                for (BundleActionBean baction : bactions) {
                    Services.get().get(InstrumentationService.class).get().incr(INSTRUMENTATION_GROUP,
                            INSTR_RECOVERED_BUNDLE_ACTIONS_COUNTER, 1);
                    if (baction.getStatus() == Job.Status.PREP) {
                        recoverPrepBundleAction(baction);
                    }
                    else if (baction.getStatus() == Job.Status.KILLED) {
                        queueCallable(new CoordKillXCommand(baction.getCoordId()));
                    }
                    else if (baction.getStatus() == Job.Status.SUSPENDED) {
                        queueCallable(new CoordSuspendXCommand(baction.getCoordId()));
                    }
                    else if (baction.getStatus() == Job.Status.RUNNING) {
                        queueCallable(new CoordResumeXCommand(baction.getCoordId()));
                    }
                }
            }
            catch (Exception ex) {
                log.error("Exception, {0}", ex.getMessage(), ex);
            }
        }

        /**
         * Queues a coordinator submission for a bundle action in PREP status, using the coordinator element of the
         * bundle definition whose <code>name</code> attribute equals the coordinator name of the bundle action.
         *
         * @param baction the bundle action in PREP status.
         * @throws CommandException thrown if the bundle job cannot be loaded, its XML cannot be parsed or the
         *         configuration cannot be merged.
         */
        private void recoverPrepBundleAction(BundleActionBean baction) throws CommandException {
            try {
                BundleJobBean bundleJob = jpaService.execute(new BundleJobGetJPAExecutor(baction.getBundleId()));
                if (bundleJob == null) {
                    return;
                }
                Element bAppXml = XmlUtils.parseXml(bundleJob.getJobXml());
                // Element.getChildren() is declared with a raw List; every entry is an Element.
                @SuppressWarnings("unchecked")
                List<Element> coordElems = bAppXml.getChildren("coordinator", bAppXml.getNamespace());
                for (Element coordElem : coordElems) {
                    Attribute name = coordElem.getAttribute("name");
                    // A coordinator element without a name cannot match any bundle action; skip it instead of
                    // failing the whole recovery pass with a NullPointerException.
                    if (name != null && name.getValue().equals(baction.getCoordName())) {
                        Configuration coordConf = mergeConfig(coordElem, bundleJob);
                        coordConf.set(OozieClient.BUNDLE_ID, baction.getBundleId());
                        queueCallable(new CoordSubmitXCommand(coordConf, bundleJob.getAuthToken(), bundleJob.getId(),
                                name.getValue()));
                    }
                }
            }
            catch (JDOMException jex) {
                throw new CommandException(ErrorCode.E1301, jex);
            }
            catch (JPAExecutorException je) {
                throw new CommandException(je);
            }
        }

        /**
         * Recover coordinator actions returned by the recovery query. WAITING and SUBMITTED actions get their input
         * check or start command re-queued; SUSPENDED, KILLED and RUNNING actions that have an external id get the
         * matching workflow command queued for that id.
         */
        private void runCoordActionRecovery() {
            XLog.Info.get().clear();
            XLog log = XLog.getLog(getClass());

            try {
                List<CoordinatorActionBean> cactions = jpaService.execute(new CoordActionsGetForRecoveryJPAExecutor(
                        coordOlderThan));
                msg.append(", COORD_ACTIONS : " + cactions.size());
                for (CoordinatorActionBean caction : cactions) {
                    Services.get().get(InstrumentationService.class).get().incr(INSTRUMENTATION_GROUP,
                            INSTR_RECOVERED_COORD_ACTIONS_COUNTER, 1);
                    if (caction.getStatus() == CoordinatorActionBean.Status.WAITING) {
                        if (useXCommand) {
                            queueCallable(new CoordActionInputCheckXCommand(caction.getId()));
                        }
                        else {
                            queueCallable(new CoordActionInputCheckCommand(caction.getId()));
                        }

                        log.info("Recover a WAITTING coord action and resubmit CoordActionInputCheckXCommand :"
                                + caction.getId());
                    }
                    else if (caction.getStatus() == CoordinatorActionBean.Status.SUBMITTED) {
                        CoordinatorJobBean coordJob = jpaService
                                .execute(new CoordJobGetJPAExecutor(caction.getJobId()));

                        if (useXCommand) {
                            queueCallable(new CoordActionStartXCommand(caction.getId(), coordJob.getUser(), coordJob
                                    .getAuthToken()));
                        }
                        else {
                            queueCallable(new CoordActionStartCommand(caction.getId(), coordJob.getUser(), coordJob
                                    .getAuthToken()));
                        }

                        log.info("Recover a SUBMITTED coord action and resubmit CoordActionStartCommand :"
                                + caction.getId());
                    }
                    else if (caction.getStatus() == CoordinatorActionBean.Status.SUSPENDED) {
                        if (caction.getExternalId() != null) {
                            queueCallable(new SuspendXCommand(caction.getExternalId()));
                            log.debug("Recover a SUSPENDED coord action and resubmit SuspendXCommand :"
                                    + caction.getId());
                        }
                    }
                    else if (caction.getStatus() == CoordinatorActionBean.Status.KILLED) {
                        if (caction.getExternalId() != null) {
                            queueCallable(new KillXCommand(caction.getExternalId()));
                            log.debug("Recover a KILLED coord action and resubmit KillXCommand :" + caction.getId());
                        }
                    }
                    else if (caction.getStatus() == CoordinatorActionBean.Status.RUNNING) {
                        if (caction.getExternalId() != null) {
                            queueCallable(new ResumeXCommand(caction.getExternalId()));
                            log.debug("Recover a RUNNING coord action and resubmit ResumeXCommand :"
                                    + caction.getId());
                        }
                    }
                }
            }
            catch (Exception ex) {
                log.error("Exception, {0}", ex.getMessage(), ex);
            }
        }

        /**
         * Recover coordinator actions that are staying in READY too long
         */
        private void runCoordActionRecoveryForReady() {
            XLog.Info.get().clear();
            XLog log = XLog.getLog(getClass());

            try {
                List<String> jobids = jpaService.execute(new CoordActionsGetReadyGroupbyJobIDJPAExecutor(
                        coordOlderThan));
                msg.append(", COORD_READY_JOBS : " + jobids.size());
                for (String jobid : jobids) {
                    if (useXCommand) {
                        queueCallable(new CoordActionReadyXCommand(jobid));
                    }
                    else {
                        queueCallable(new CoordActionReadyCommand(jobid));
                    }

                    log.info("Recover READY coord actions for jobid :" + jobid);
                }
            }
            catch (Exception ex) {
                log.error("Exception, {0}", ex.getMessage(), ex);
            }
        }

        /**
         * Recover wf actions. Depending on the action status a start, end or signal command is queued; for the
         * retry statuses the command is queued with a delay derived from the pending age of the action.
         */
        private void runWFRecovery() {
            XLog.Info.get().clear();
            XLog log = XLog.getLog(getClass());
            // queue command for action recovery
            try {
                List<WorkflowActionBean> actions;
                try {
                    actions = jpaService.execute(new WorkflowActionsGetPendingJPAExecutor(olderThan));
                }
                catch (JPAExecutorException ex) {
                    log.warn("Exception while reading pending actions from storage", ex);
                    // Nothing was read, so there is nothing to recover in this pass. Falling through would only
                    // dereference a null list.
                    return;
                }
                msg.append(" WF_ACTIONS " + actions.size());

                for (WorkflowActionBean action : actions) {
                    Services.get().get(InstrumentationService.class).get().incr(INSTRUMENTATION_GROUP,
                            INSTR_RECOVERED_ACTIONS_COUNTER, 1);
                    if (action.getStatus() == WorkflowActionBean.Status.PREP
                            || action.getStatus() == WorkflowActionBean.Status.START_MANUAL) {
                        if (useXCommand) {
                            queueCallable(new ActionStartXCommand(action.getId(), action.getType()));
                        }
                        else {
                            queueCallable(new ActionStartCommand(action.getId(), action.getType()));
                        }
                    }
                    else if (action.getStatus() == WorkflowActionBean.Status.START_RETRY) {
                        long retryDelay = delayUntil(action.getPendingAge());
                        if (useXCommand) {
                            queueCallable(new ActionStartXCommand(action.getId(), action.getType()), retryDelay);
                        }
                        else {
                            queueCallable(new ActionStartCommand(action.getId(), action.getType()), retryDelay);
                        }
                    }
                    else if (action.getStatus() == WorkflowActionBean.Status.DONE
                            || action.getStatus() == WorkflowActionBean.Status.END_MANUAL) {
                        if (useXCommand) {
                            queueCallable(new ActionEndXCommand(action.getId(), action.getType()));
                        }
                        else {
                            queueCallable(new ActionEndCommand(action.getId(), action.getType()));
                        }
                    }
                    else if (action.getStatus() == WorkflowActionBean.Status.END_RETRY) {
                        long retryDelay = delayUntil(action.getPendingAge());
                        if (useXCommand) {
                            queueCallable(new ActionEndXCommand(action.getId(), action.getType()), retryDelay);
                        }
                        else {
                            queueCallable(new ActionEndCommand(action.getId(), action.getType()), retryDelay);
                        }
                    }
                    else if (action.getStatus() == WorkflowActionBean.Status.OK
                            || action.getStatus() == WorkflowActionBean.Status.ERROR) {
                        if (useXCommand) {
                            queueCallable(new SignalXCommand(action.getJobId(), action.getId()));
                        }
                        else {
                            queueCallable(new SignalCommand(action.getJobId(), action.getId()));
                        }
                    }
                }
            }
            catch (Exception ex) {
                log.error("Exception, {0}", ex.getMessage(), ex);
            }
        }

        /**
         * Computes the number of milliseconds from now until the given time.
         *
         * @param nextRunTime the time the command should run at, may be <code>null</code>.
         * @return the difference between the given time and the current time in milliseconds (negative if the time
         *         is in the past), or <code>0</code> if no time is given.
         */
        private static long delayUntil(Date nextRunTime) {
            if (nextRunTime == null) {
                return 0;
            }
            return nextRunTime.getTime() - System.currentTimeMillis();
        }

        /**
         * Adds callables to a list. If the number of callables in the list reaches {@link
         * RecoveryService#CONF_CALLABLE_BATCH_SIZE}, the entire batch is queued and the callables list is reset.
         *
         * @param callable the callable to queue.
         */
        private void queueCallable(XCallable<?> callable) {
            if (callables == null) {
                callables = new ArrayList<XCallable<?>>();
            }
            callables.add(callable);
            if (callables.size() == Services.get().getConf().getInt(CONF_CALLABLE_BATCH_SIZE, 10)) {
                flushCallables();
                callables = new ArrayList<XCallable<?>>();
            }
        }

        /**
         * Adds callables to a list. If the number of callables in the list reaches {@link
         * RecoveryService#CONF_CALLABLE_BATCH_SIZE}, the entire batch is queued with the delay set to the maximum delay
         * of the callables in the list. The callables list and the delay is reset.
         *
         * @param callable the callable to queue.
         * @param delay the delay for the callable.
         */
        private void queueCallable(XCallable<?> callable, long delay) {
            if (delayedCallables == null) {
                delayedCallables = new ArrayList<XCallable<?>>();
            }
            // The accumulated delay starts at 0, so a negative delay never lowers the batch delay below 0.
            this.delay = Math.max(this.delay, delay);
            delayedCallables.add(callable);
            if (delayedCallables.size() == Services.get().getConf().getInt(CONF_CALLABLE_BATCH_SIZE, 10)) {
                flushDelayedCallables();
                delayedCallables = new ArrayList<XCallable<?>>();
                this.delay = 0;
            }
        }

        /**
         * Hands the current batch of callables to the callable queue and logs a warning if it is not accepted.
         * Resetting the batch is left to the caller.
         */
        private void flushCallables() {
            CallableQueueService queueService = Services.get().get(CallableQueueService.class);
            if (!queueService.queueSerial(callables)) {
                XLog.getLog(getClass()).warn(
                        "Unable to queue the callables commands for RecoveryService. "
                                + "Most possibly command queue is full. Queue size is :"
                                + Services.get().get(CallableQueueService.class).queueSize());
            }
        }

        /**
         * Hands the current batch of delayed callables, together with the accumulated delay, to the callable queue
         * and logs a warning if it is not accepted. Resetting the batch and the delay is left to the caller.
         */
        private void flushDelayedCallables() {
            CallableQueueService queueService = Services.get().get(CallableQueueService.class);
            if (!queueService.queueSerial(delayedCallables, this.delay)) {
                XLog.getLog(getClass()).warn(
                        "Unable to queue the delayedCallables commands for RecoveryService. "
                                + "Most possibly Callable queue is full. Queue size is :"
                                + Services.get().get(CallableQueueService.class).queueSize());
            }
        }
    }

    /**
     * Initializes the RecoveryService: schedules a {@link RecoveryRunnable} with the configured age thresholds and
     * interval, and reads the flag that selects between the XCommand and the legacy Command implementations.
     *
     * @param services services instance.
     */
    @Override
    public void init(Services services) {
        Configuration conf = services.getConf();
        Runnable recoveryRunnable = new RecoveryRunnable(conf.getInt(CONF_WF_ACTIONS_OLDER_THAN, 120), conf.getInt(
                CONF_COORD_OLDER_THAN, 600), conf.getInt(CONF_BUNDLE_OLDER_THAN, 600));
        services.get(SchedulerService.class).schedule(recoveryRunnable, 10, conf.getInt(CONF_SERVICE_INTERVAL, 600),
                SchedulerService.Unit.SEC);

        // Assign the flag in both directions: the field is static, so only ever clearing it would leave a later
        // initialization in the same JVM on the legacy commands even when the flag is enabled again.
        useXCommand = Services.get().getConf().getBoolean(USE_XCOMMAND, true);
    }

    /**
     * Destroy the Recovery Service.
     */
    @Override
    public void destroy() {
    }

    /**
     * Return the public interface for the Recovery Service.
     *
     * @return {@link RecoveryService}.
     */
    @Override
    public Class<? extends Service> getInterface() {
        return RecoveryService.class;
    }

    /**
     * Merge Bundle job config and the configuration from the coord job to pass
     * to Coord Engine
     *
     * @param coordElem the coordinator configuration
     * @param bundleJob the bundle job whose configuration is the base of the merge
     * @return Configuration merged configuration
     * @throws CommandException thrown if failed to merge configuration
     */
    private static Configuration mergeConfig(Element coordElem, BundleJobBean bundleJob) throws CommandException {
        XLog.Info.get().clear();
        XLog log = XLog.getLog("RecoveryService");

        String jobConf = bundleJob.getConf();
        // Step 1: runConf = jobConf
        Configuration runConf = null;
        try {
            runConf = new XConfiguration(new StringReader(jobConf));
        }
        catch (IOException e1) {
            log.warn("Configuration parse error in:" + jobConf);
            throw new CommandException(ErrorCode.E1306, e1.getMessage(), e1);
        }
        // Step 2: Merge local properties into runConf
        // extract 'property' tags under 'configuration' block in the coordElem
        // convert Element to XConfiguration
        Element localConfigElement = coordElem.getChild("configuration", coordElem.getNamespace());

        if (localConfigElement != null) {
            String strConfig = XmlUtils.prettyPrint(localConfigElement).toString();
            Configuration localConf;
            try {
                localConf = new XConfiguration(new StringReader(strConfig));
            }
            catch (IOException e1) {
                log.warn("Configuration parse error in:" + strConfig);
                throw new CommandException(ErrorCode.E1307, e1.getMessage(), e1);
            }

            // copy configuration properties in the coordElem to the runConf
            XConfiguration.copy(localConf, runConf);
        }

        // Step 3: Extract value of 'app-path' in coordElem, save it as a
        // new property called 'oozie.coord.application.path', and normalize.
        Element appPathElement = coordElem.getChild("app-path", coordElem.getNamespace());
        if (appPathElement == null) {
            // Report the malformed definition explicitly instead of failing with a NullPointerException.
            throw new CommandException(ErrorCode.E1001, "missing 'app-path' element in the coordinator definition");
        }
        String appPath = appPathElement.getValue();
        runConf.set(OozieClient.COORDINATOR_APP_PATH, appPath);
        // Normalize coordinator appPath here;
        try {
            JobUtils.normalizeAppPath(runConf.get(OozieClient.USER_NAME), runConf.get(OozieClient.GROUP_NAME), runConf);
        }
        catch (IOException e) {
            // Pass the cause along so the underlying I/O failure is not lost.
            throw new CommandException(ErrorCode.E1001, runConf.get(OozieClient.COORDINATOR_APP_PATH), e);
        }
        return runConf;
    }
}