001 /** 002 * Copyright (c) 2010 Yahoo! Inc. All rights reserved. 003 * Licensed under the Apache License, Version 2.0 (the "License"); 004 * you may not use this file except in compliance with the License. 005 * You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software 010 * distributed under the License is distributed on an "AS IS" BASIS, 011 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 012 * See the License for the specific language governing permissions and 013 * limitations under the License. See accompanying LICENSE file. 014 */ 015 package org.apache.oozie.command.wf; 016 017 import java.io.IOException; 018 import java.io.StringReader; 019 import java.net.URI; 020 import java.net.URISyntaxException; 021 import java.util.Date; 022 import java.util.Properties; 023 024 import org.apache.hadoop.conf.Configuration; 025 import org.apache.hadoop.fs.FileSystem; 026 import org.apache.hadoop.fs.Path; 027 import org.apache.oozie.DagELFunctions; 028 import org.apache.oozie.ErrorCode; 029 import org.apache.oozie.WorkflowActionBean; 030 import org.apache.oozie.WorkflowJobBean; 031 import org.apache.oozie.action.ActionExecutor; 032 import org.apache.oozie.client.WorkflowAction; 033 import org.apache.oozie.client.WorkflowJob; 034 import org.apache.oozie.command.CommandException; 035 import org.apache.oozie.command.coord.CoordActionUpdateXCommand; 036 import org.apache.oozie.service.CallbackService; 037 import org.apache.oozie.service.ELService; 038 import org.apache.oozie.service.HadoopAccessorException; 039 import org.apache.oozie.service.HadoopAccessorService; 040 import org.apache.oozie.service.JPAService; 041 import org.apache.oozie.service.Services; 042 import org.apache.oozie.util.ELEvaluator; 043 import org.apache.oozie.util.InstrumentUtils; 044 import org.apache.oozie.util.Instrumentation; 045 import org.apache.oozie.util.XConfiguration; 046 import org.apache.oozie.workflow.WorkflowException; 047 import org.apache.oozie.workflow.WorkflowInstance; 048 import org.apache.oozie.workflow.lite.LiteWorkflowInstance; 049 050 /** 051 * Base class for Action execution commands. Provides common functionality to handle different types of errors while 052 * attempting to start or end an action. 053 */ 054 public abstract class ActionXCommand<T> extends WorkflowXCommand<Void> { 055 private static final String INSTRUMENTATION_GROUP = "action.executors"; 056 057 protected static final String INSTR_FAILED_JOBS_COUNTER = "failed"; 058 059 protected static final String RECOVERY_ID_SEPARATOR = "@"; 060 061 public ActionXCommand(String name, String type, int priority) { 062 super(name, type, priority); 063 } 064 065 /** 066 * Takes care of Transient failures. Sets the action status to retry and increments the retry count if not enough 067 * attempts have been made. Otherwise returns false. 068 * 069 * @param context the execution context. 070 * @param executor the executor instance being used. 071 * @param status the status to be set for the action. 072 * @return true if the action is scheduled for another retry. false if the number of retries has exceeded the 073 * maximum number of configured retries. 074 * @throws CommandException thrown if unable to handle transient 075 */ 076 protected boolean handleTransient(ActionExecutor.Context context, ActionExecutor executor, 077 WorkflowAction.Status status) throws CommandException { 078 LOG.debug("Attempting to retry"); 079 ActionExecutorContext aContext = (ActionExecutorContext) context; 080 WorkflowActionBean action = (WorkflowActionBean) aContext.getAction(); 081 incrActionErrorCounter(action.getType(), "transient", 1); 082 083 int actionRetryCount = action.getRetries(); 084 if (actionRetryCount >= executor.getMaxRetries()) { 085 LOG.warn("Exceeded max retry count [{0}]. Suspending Job", executor.getMaxRetries()); 086 return false; 087 } 088 else { 089 action.setStatus(status); 090 action.setPending(); 091 action.incRetries(); 092 long retryDelayMillis = executor.getRetryInterval() * 1000; 093 action.setPendingAge(new Date(System.currentTimeMillis() + retryDelayMillis)); 094 LOG.info("Next Retry, Attempt Number [{0}] in [{1}] milliseconds", actionRetryCount + 1, retryDelayMillis); 095 this.resetUsed(); 096 queue(this, retryDelayMillis); 097 return true; 098 } 099 } 100 101 /** 102 * Takes care of non transient failures. The job is suspended, and the state of the action is changed to *MANUAL and 103 * set pending flag of action to false 104 * 105 * @param context the execution context. 106 * @param executor the executor instance being used. 107 * @param status the status to be set for the action. 108 * @throws CommandException thrown if unable to suspend job 109 */ 110 protected void handleNonTransient(ActionExecutor.Context context, ActionExecutor executor, 111 WorkflowAction.Status status) throws CommandException { 112 ActionExecutorContext aContext = (ActionExecutorContext) context; 113 WorkflowActionBean action = (WorkflowActionBean) aContext.getAction(); 114 incrActionErrorCounter(action.getType(), "nontransient", 1); 115 WorkflowJobBean workflow = (WorkflowJobBean) context.getWorkflow(); 116 String id = workflow.getId(); 117 action.setStatus(status); 118 action.resetPendingOnly(); 119 LOG.warn("Suspending Workflow Job id=" + id); 120 try { 121 SuspendXCommand.suspendJob(Services.get().get(JPAService.class), workflow, id, action.getId()); 122 } 123 catch (Exception e) { 124 throw new CommandException(ErrorCode.E0727, e.getMessage()); 125 } 126 finally { 127 // update coordinator action 128 new CoordActionUpdateXCommand(workflow, 3).call(); 129 } 130 } 131 132 /** 133 * Takes care of errors. </p> For errors while attempting to start the action, the job state is updated and an 134 * {@link ActionEndCommand} is queued. </p> For errors while attempting to end the action, the job state is updated. 135 * </p> 136 * 137 * @param context the execution context. 138 * @param executor the executor instance being used. 139 * @param message 140 * @param isStart whether the error was generated while starting or ending an action. 141 * @param status the status to be set for the action. 142 * @throws CommandException thrown if unable to handle action error 143 */ 144 protected void handleError(ActionExecutor.Context context, ActionExecutor executor, String message, 145 boolean isStart, WorkflowAction.Status status) throws CommandException { 146 LOG.warn("Setting Action Status to [{0}]", status); 147 ActionExecutorContext aContext = (ActionExecutorContext) context; 148 WorkflowActionBean action = (WorkflowActionBean) aContext.getAction(); 149 incrActionErrorCounter(action.getType(), "error", 1); 150 action.setPending(); 151 if (isStart) { 152 action.setExecutionData(message, null); 153 queue(new ActionEndXCommand(action.getId(), action.getType())); 154 } 155 else { 156 action.setEndData(status, WorkflowAction.Status.ERROR.toString()); 157 } 158 } 159 160 /** 161 * Fail the job due to failed action 162 * 163 * @param context the execution context. 164 * @throws CommandException thrown if unable to fail job 165 */ 166 public void failJob(ActionExecutor.Context context) throws CommandException { 167 ActionExecutorContext aContext = (ActionExecutorContext) context; 168 WorkflowActionBean action = (WorkflowActionBean) aContext.getAction(); 169 incrActionErrorCounter(action.getType(), "failed", 1); 170 WorkflowJobBean workflow = (WorkflowJobBean) context.getWorkflow(); 171 LOG.warn("Failing Job due to failed action [{0}]", action.getName()); 172 try { 173 workflow.getWorkflowInstance().fail(action.getName()); 174 WorkflowInstance wfInstance = workflow.getWorkflowInstance(); 175 ((LiteWorkflowInstance) wfInstance).setStatus(WorkflowInstance.Status.FAILED); 176 workflow.setWorkflowInstance(wfInstance); 177 workflow.setStatus(WorkflowJob.Status.FAILED); 178 action.setStatus(WorkflowAction.Status.FAILED); 179 action.resetPending(); 180 queue(new NotificationXCommand(workflow, action)); 181 queue(new KillXCommand(workflow.getId())); 182 InstrumentUtils.incrJobCounter(INSTR_FAILED_JOBS_COUNTER, 1, getInstrumentation()); 183 } 184 catch (WorkflowException ex) { 185 throw new CommandException(ex); 186 } 187 } 188 189 private void incrActionErrorCounter(String type, String error, int count) { 190 getInstrumentation().incr(INSTRUMENTATION_GROUP, type + "#ex." + error, count); 191 } 192 193 protected void incrActionCounter(String type, int count) { 194 getInstrumentation().incr(INSTRUMENTATION_GROUP, type + "#" + getName(), count); 195 } 196 197 protected void addActionCron(String type, Instrumentation.Cron cron) { 198 getInstrumentation().addCron(INSTRUMENTATION_GROUP, type + "#" + getName(), cron); 199 } 200 201 /** 202 * Workflow action executor context 203 * 204 */ 205 public static class ActionExecutorContext implements ActionExecutor.Context { 206 private final WorkflowJobBean workflow; 207 private Configuration protoConf; 208 private final WorkflowActionBean action; 209 private final boolean isRetry; 210 private boolean started; 211 private boolean ended; 212 private boolean executed; 213 214 public ActionExecutorContext(WorkflowJobBean workflow, WorkflowActionBean action, boolean isRetry) { 215 this.workflow = workflow; 216 this.action = action; 217 this.isRetry = isRetry; 218 try { 219 protoConf = new XConfiguration(new StringReader(workflow.getProtoActionConf())); 220 } 221 catch (IOException ex) { 222 throw new RuntimeException("It should not happen", ex); 223 } 224 } 225 226 public String getCallbackUrl(String externalStatusVar) { 227 return Services.get().get(CallbackService.class).createCallBackUrl(action.getId(), externalStatusVar); 228 } 229 230 public Configuration getProtoActionConf() { 231 return protoConf; 232 } 233 234 public WorkflowJob getWorkflow() { 235 return workflow; 236 } 237 238 public WorkflowAction getAction() { 239 return action; 240 } 241 242 public ELEvaluator getELEvaluator() { 243 ELEvaluator evaluator = Services.get().get(ELService.class).createEvaluator("workflow"); 244 DagELFunctions.configureEvaluator(evaluator, workflow, action); 245 return evaluator; 246 } 247 248 public void setVar(String name, String value) { 249 name = action.getName() + WorkflowInstance.NODE_VAR_SEPARATOR + name; 250 WorkflowInstance wfInstance = workflow.getWorkflowInstance(); 251 wfInstance.setVar(name, value); 252 workflow.setWorkflowInstance(wfInstance); 253 } 254 255 public String getVar(String name) { 256 name = action.getName() + WorkflowInstance.NODE_VAR_SEPARATOR + name; 257 return workflow.getWorkflowInstance().getVar(name); 258 } 259 260 public void setStartData(String externalId, String trackerUri, String consoleUrl) { 261 action.setStartData(externalId, trackerUri, consoleUrl); 262 started = true; 263 } 264 265 public void setExecutionData(String externalStatus, Properties actionData) { 266 action.setExecutionData(externalStatus, actionData); 267 executed = true; 268 } 269 270 public void setEndData(WorkflowAction.Status status, String signalValue) { 271 action.setEndData(status, signalValue); 272 ended = true; 273 } 274 275 public boolean isRetry() { 276 return isRetry; 277 } 278 279 /** 280 * Returns whether setStartData has been called or not. 281 * 282 * @return true if start completion info has been set. 283 */ 284 public boolean isStarted() { 285 return started; 286 } 287 288 /** 289 * Returns whether setExecutionData has been called or not. 290 * 291 * @return true if execution completion info has been set, otherwise false. 292 */ 293 public boolean isExecuted() { 294 return executed; 295 } 296 297 /** 298 * Returns whether setEndData has been called or not. 299 * 300 * @return true if end completion info has been set. 301 */ 302 public boolean isEnded() { 303 return ended; 304 } 305 306 public void setExternalStatus(String externalStatus) { 307 action.setExternalStatus(externalStatus); 308 } 309 310 @Override 311 public String getRecoveryId() { 312 return action.getId() + RECOVERY_ID_SEPARATOR + workflow.getRun(); 313 } 314 315 /* (non-Javadoc) 316 * @see org.apache.oozie.action.ActionExecutor.Context#getActionDir() 317 */ 318 public Path getActionDir() throws HadoopAccessorException, IOException, URISyntaxException { 319 String name = getWorkflow().getId() + "/" + action.getName() + "--" + action.getType(); 320 FileSystem fs = getAppFileSystem(); 321 String actionDirPath = Services.get().getSystemId() + "/" + name; 322 Path fqActionDir = new Path(fs.getHomeDirectory(), actionDirPath); 323 return fqActionDir; 324 } 325 326 /* (non-Javadoc) 327 * @see org.apache.oozie.action.ActionExecutor.Context#getAppFileSystem() 328 */ 329 public FileSystem getAppFileSystem() throws HadoopAccessorException, IOException, URISyntaxException { 330 WorkflowJob workflow = getWorkflow(); 331 XConfiguration jobConf = new XConfiguration(new StringReader(workflow.getConf())); 332 Configuration fsConf = new Configuration(); 333 XConfiguration.copy(jobConf, fsConf); 334 return Services.get().get(HadoopAccessorService.class).createFileSystem(workflow.getUser(), 335 workflow.getGroup(), new URI(getWorkflow().getAppPath()), fsConf); 336 337 } 338 339 /* (non-Javadoc) 340 * @see org.apache.oozie.action.ActionExecutor.Context#setErrorInfo(java.lang.String, java.lang.String) 341 */ 342 @Override 343 public void setErrorInfo(String str, String exMsg) { 344 action.setErrorInfo(str, exMsg); 345 } 346 } 347 348 }