001    /**
002     * Copyright (c) 2010 Yahoo! Inc. All rights reserved.
003     * Licensed under the Apache License, Version 2.0 (the "License");
004     * you may not use this file except in compliance with the License.
005     * You may obtain a copy of the License at
006     *
007     *   http://www.apache.org/licenses/LICENSE-2.0
008     *
009     *  Unless required by applicable law or agreed to in writing, software
010     *  distributed under the License is distributed on an "AS IS" BASIS,
011     *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
012     *  See the License for the specific language governing permissions and
013     *  limitations under the License. See accompanying LICENSE file.
014     */
015    package org.apache.oozie.service;
016    
017    import java.io.IOException;
018    import java.io.StringReader;
019    import java.util.ArrayList;
020    import java.util.Date;
021    import java.util.List;
022    
023    import org.apache.hadoop.conf.Configuration;
024    import org.apache.oozie.BundleActionBean;
025    import org.apache.oozie.BundleJobBean;
026    import org.apache.oozie.CoordinatorActionBean;
027    import org.apache.oozie.CoordinatorJobBean;
028    import org.apache.oozie.ErrorCode;
029    import org.apache.oozie.WorkflowActionBean;
030    import org.apache.oozie.client.Job;
031    import org.apache.oozie.client.OozieClient;
032    import org.apache.oozie.command.CommandException;
033    import org.apache.oozie.command.coord.CoordActionInputCheckCommand;
034    import org.apache.oozie.command.coord.CoordActionInputCheckXCommand;
035    import org.apache.oozie.command.coord.CoordActionReadyCommand;
036    import org.apache.oozie.command.coord.CoordActionReadyXCommand;
037    import org.apache.oozie.command.coord.CoordActionStartCommand;
038    import org.apache.oozie.command.coord.CoordActionStartXCommand;
039    import org.apache.oozie.command.coord.CoordKillXCommand;
040    import org.apache.oozie.command.coord.CoordResumeXCommand;
041    import org.apache.oozie.command.coord.CoordSubmitXCommand;
042    import org.apache.oozie.command.coord.CoordSuspendXCommand;
043    import org.apache.oozie.command.wf.ActionEndCommand;
044    import org.apache.oozie.command.wf.ActionEndXCommand;
045    import org.apache.oozie.command.wf.ActionStartCommand;
046    import org.apache.oozie.command.wf.ActionStartXCommand;
047    import org.apache.oozie.command.wf.KillXCommand;
048    import org.apache.oozie.command.wf.ResumeXCommand;
049    import org.apache.oozie.command.wf.SignalCommand;
050    import org.apache.oozie.command.wf.SignalXCommand;
051    import org.apache.oozie.command.wf.SuspendXCommand;
052    import org.apache.oozie.executor.jpa.BundleActionsGetWaitingOlderJPAExecutor;
053    import org.apache.oozie.executor.jpa.BundleJobGetJPAExecutor;
054    import org.apache.oozie.executor.jpa.CoordActionsGetForRecoveryJPAExecutor;
055    import org.apache.oozie.executor.jpa.CoordActionsGetReadyGroupbyJobIDJPAExecutor;
056    import org.apache.oozie.executor.jpa.CoordJobGetJPAExecutor;
057    import org.apache.oozie.executor.jpa.JPAExecutorException;
058    import org.apache.oozie.executor.jpa.WorkflowActionsGetPendingJPAExecutor;
059    import org.apache.oozie.util.JobUtils;
060    import org.apache.oozie.util.XCallable;
061    import org.apache.oozie.util.XConfiguration;
062    import org.apache.oozie.util.XLog;
063    import org.apache.oozie.util.XmlUtils;
064    import org.jdom.Attribute;
065    import org.jdom.Element;
066    import org.jdom.JDOMException;
067    
068    /**
069     * The Recovery Service checks for pending actions and premater coordinator jobs older than a configured age and then
070     * queues them for execution.
071     */
072    public class RecoveryService implements Service {
073    
074        public static final String CONF_PREFIX = Service.CONF_PREFIX + "RecoveryService.";
075        public static final String CONF_PREFIX_WF_ACTIONS = Service.CONF_PREFIX + "wf.actions.";
076        public static final String CONF_PREFIX_COORD = Service.CONF_PREFIX + "coord.";
077        public static final String CONF_PREFIX_BUNDLE = Service.CONF_PREFIX + "bundle.";
078        /**
079         * Time interval, in seconds, at which the recovery service will be scheduled to run.
080         */
081        public static final String CONF_SERVICE_INTERVAL = CONF_PREFIX + "interval";
082        /**
083         * The number of callables to be queued in a batch.
084         */
085        public static final String CONF_CALLABLE_BATCH_SIZE = CONF_PREFIX + "callable.batch.size";
086        /**
087         * Age of actions to queue, in seconds.
088         */
089        public static final String CONF_WF_ACTIONS_OLDER_THAN = CONF_PREFIX_WF_ACTIONS + "older.than";
090        /**
091         * Age of coordinator jobs to recover, in seconds.
092         */
093        public static final String CONF_COORD_OLDER_THAN = CONF_PREFIX_COORD + "older.than";
094    
095        /**
096         * Age of Bundle jobs to recover, in seconds.
097         */
098        public static final String CONF_BUNDLE_OLDER_THAN = CONF_PREFIX_BUNDLE + "older.than";
099    
100        private static final String INSTRUMENTATION_GROUP = "recovery";
101        private static final String INSTR_RECOVERED_ACTIONS_COUNTER = "actions";
102        private static final String INSTR_RECOVERED_COORD_ACTIONS_COUNTER = "coord_actions";
103        private static final String INSTR_RECOVERED_BUNDLE_ACTIONS_COUNTER = "bundle_actions";
104    
105        private static boolean useXCommand = true;
106    
107    
108        /**
109         * RecoveryRunnable is the Runnable which is scheduled to run with the configured interval, and takes care of the
110         * queuing of commands.
111         */
112        static class RecoveryRunnable implements Runnable {
113            private final long olderThan;
114            private final long coordOlderThan;
115            private final long bundleOlderThan;
116            private long delay = 0;
117            private List<XCallable<?>> callables;
118            private List<XCallable<?>> delayedCallables;
119            private StringBuilder msg = null;
120            private JPAService jpaService = null;
121    
122            public RecoveryRunnable(long olderThan, long coordOlderThan,long bundleOlderThan) {
123                this.olderThan = olderThan;
124                this.coordOlderThan = coordOlderThan;
125                this.bundleOlderThan = bundleOlderThan;
126            }
127    
128            public void run() {
129                XLog.Info.get().clear();
130                XLog log = XLog.getLog(getClass());
131                msg = new StringBuilder();
132                jpaService = Services.get().get(JPAService.class);
133                runWFRecovery();
134                runCoordActionRecovery();
135                runCoordActionRecoveryForReady();
136                runBundleRecovery();
137                log.debug("QUEUING [{0}] for potential recovery", msg.toString());
138                boolean ret = false;
139                if (null != callables) {
140                    ret = Services.get().get(CallableQueueService.class).queueSerial(callables);
141                    if (ret == false) {
142                        log.warn("Unable to queue the callables commands for RecoveryService. "
143                                + "Most possibly command queue is full. Queue size is :"
144                                + Services.get().get(CallableQueueService.class).queueSize());
145                    }
146                    callables = null;
147                }
148                if (null != delayedCallables) {
149                    ret = Services.get().get(CallableQueueService.class).queueSerial(delayedCallables, this.delay);
150                    if (ret == false) {
151                        log.warn("Unable to queue the delayedCallables commands for RecoveryService. "
152                                + "Most possibly Callable queue is full. Queue size is :"
153                                + Services.get().get(CallableQueueService.class).queueSize());
154                    }
155                    delayedCallables = null;
156                    this.delay = 0;
157                }
158            }
159    
160            private void runBundleRecovery(){
161                XLog.Info.get().clear();
162                XLog log = XLog.getLog(getClass());
163    
164                try {
165                    List<BundleActionBean> bactions = jpaService.execute(new BundleActionsGetWaitingOlderJPAExecutor(bundleOlderThan));
166                    msg.append(", BUNDLE_ACTIONS : " + bactions.size());
167                    for (BundleActionBean baction : bactions) {
168                        Services.get().get(InstrumentationService.class).get().incr(INSTRUMENTATION_GROUP,
169                                INSTR_RECOVERED_BUNDLE_ACTIONS_COUNTER, 1);
170                        if(baction.getStatus() == Job.Status.PREP){
171                            BundleJobBean bundleJob = null;
172                            try {
173                                if (jpaService != null) {
174                                    bundleJob = jpaService.execute(new BundleJobGetJPAExecutor(baction.getBundleId()));
175                                }
176                                if(bundleJob != null){
177                                    Element bAppXml = XmlUtils.parseXml(bundleJob.getJobXml());
178                                    List<Element> coordElems = bAppXml.getChildren("coordinator", bAppXml.getNamespace());
179                                    for (Element coordElem : coordElems) {
180                                        Attribute name = coordElem.getAttribute("name");
181                                        if (name.getValue().equals(baction.getCoordName())) {
182                                            Configuration coordConf = mergeConfig(coordElem,bundleJob);
183                                            coordConf.set(OozieClient.BUNDLE_ID, baction.getBundleId());
184                                            queueCallable(new CoordSubmitXCommand(coordConf, bundleJob.getAuthToken(), bundleJob.getId(), name.getValue()));
185                                        }
186                                    }
187                                }
188                            }
189                            catch (JDOMException jex) {
190                                throw new CommandException(ErrorCode.E1301, jex);
191                            }
192                            catch (JPAExecutorException je) {
193                                throw new CommandException(je);
194                            }
195                        }
196                        else if(baction.getStatus() == Job.Status.KILLED){
197                            queueCallable(new CoordKillXCommand(baction.getCoordId()));
198                        }
199                        else if(baction.getStatus() == Job.Status.SUSPENDED){
200                            queueCallable(new CoordSuspendXCommand(baction.getCoordId()));
201                        }
202                        else if(baction.getStatus() == Job.Status.RUNNING){
203                            queueCallable(new CoordResumeXCommand(baction.getCoordId()));
204                        }
205                    }
206                }
207                catch (Exception ex) {
208                    log.error("Exception, {0}", ex.getMessage(), ex);
209                }
210            }
211    
212            /**
213             * Recover coordinator actions that are staying in WAITING or SUBMITTED too long
214             */
215            private void runCoordActionRecovery() {
216                XLog.Info.get().clear();
217                XLog log = XLog.getLog(getClass());
218    
219                try {
220                    List<CoordinatorActionBean> cactions = jpaService.execute(new CoordActionsGetForRecoveryJPAExecutor(coordOlderThan));
221                    msg.append(", COORD_ACTIONS : " + cactions.size());
222                    for (CoordinatorActionBean caction : cactions) {
223                        Services.get().get(InstrumentationService.class).get().incr(INSTRUMENTATION_GROUP,
224                                                                                    INSTR_RECOVERED_COORD_ACTIONS_COUNTER, 1);
225                        if (caction.getStatus() == CoordinatorActionBean.Status.WAITING) {
226                            if (useXCommand) {
227                                queueCallable(new CoordActionInputCheckXCommand(caction.getId()));
228                            } else {
229                                queueCallable(new CoordActionInputCheckCommand(caction.getId()));
230                            }
231    
232                            log.info("Recover a WAITTING coord action and resubmit CoordActionInputCheckXCommand :" + caction.getId());
233                        }
234                        else if (caction.getStatus() == CoordinatorActionBean.Status.SUBMITTED) {
235                            CoordinatorJobBean coordJob = jpaService.execute(new CoordJobGetJPAExecutor(caction.getJobId()));
236    
237                            if (useXCommand) {
238                                queueCallable(new CoordActionStartXCommand(caction.getId(), coordJob.getUser(), coordJob
239                                        .getAuthToken()));
240                            } else {
241                                queueCallable(new CoordActionStartCommand(caction.getId(), coordJob.getUser(), coordJob
242                                        .getAuthToken()));
243                            }
244    
245                            log.info("Recover a SUBMITTED coord action and resubmit CoordActionStartCommand :" + caction.getId());
246                        }
247                        else if (caction.getStatus() == CoordinatorActionBean.Status.SUSPENDED) {
248                            if (caction.getExternalId() != null) {
249                                queueCallable(new SuspendXCommand(caction.getExternalId()));
250                                log.debug("Recover a SUSPENDED coord action and resubmit SuspendXCommand :" + caction.getId());
251                            }
252                        }
253                        else if (caction.getStatus() == CoordinatorActionBean.Status.KILLED) {
254                            if (caction.getExternalId() != null) {
255                                queueCallable(new KillXCommand(caction.getExternalId()));
256                                log.debug("Recover a KILLED coord action and resubmit KillXCommand :" + caction.getId());
257                            }
258                        }
259                        else if (caction.getStatus() == CoordinatorActionBean.Status.RUNNING) {
260                            if (caction.getExternalId() != null) {
261                                queueCallable(new ResumeXCommand(caction.getExternalId()));
262                                log.debug("Recover a RUNNING coord action and resubmit ResumeXCommand :" + caction.getId());
263                            }
264                        }
265                    }
266                }
267                catch (Exception ex) {
268                    log.error("Exception, {0}", ex.getMessage(), ex);
269                }
270            }
271    
272            /**
273             * Recover coordinator actions that are staying in READY too long
274             */
275            private void runCoordActionRecoveryForReady() {
276                XLog.Info.get().clear();
277                XLog log = XLog.getLog(getClass());
278    
279                try {
280                    List<String> jobids = jpaService.execute(new CoordActionsGetReadyGroupbyJobIDJPAExecutor(coordOlderThan));
281                    msg.append(", COORD_READY_JOBS : " + jobids.size());
282                    for (String jobid : jobids) {
283                        if (useXCommand) {
284                            queueCallable(new CoordActionReadyXCommand(jobid));
285                        } else {
286                            queueCallable(new CoordActionReadyCommand(jobid));
287                        }
288    
289                        log.info("Recover READY coord actions for jobid :" + jobid);
290                    }
291                }
292                catch (Exception ex) {
293                    log.error("Exception, {0}", ex.getMessage(), ex);
294                }
295            }
296    
297            /**
298             * Recover wf actions
299             */
300            private void runWFRecovery() {
301                XLog.Info.get().clear();
302                XLog log = XLog.getLog(getClass());
303                // queue command for action recovery
304                try {
305                    List<WorkflowActionBean> actions = null;
306                    try {
307                        actions = jpaService.execute(new WorkflowActionsGetPendingJPAExecutor(olderThan));
308                    }
309                    catch (JPAExecutorException ex) {
310                        log.warn("Exception while reading pending actions from storage", ex);
311                    }
312                    //log.debug("QUEUING[{0}] pending wf actions for potential recovery", actions.size());
313                    msg.append(" WF_ACTIONS " + actions.size());
314    
315                    for (WorkflowActionBean action : actions) {
316                        Services.get().get(InstrumentationService.class).get().incr(INSTRUMENTATION_GROUP,
317                                INSTR_RECOVERED_ACTIONS_COUNTER, 1);
318                        if (action.getStatus() == WorkflowActionBean.Status.PREP
319                                || action.getStatus() == WorkflowActionBean.Status.START_MANUAL) {
320    
321                            if (useXCommand) {
322                                queueCallable(new ActionStartXCommand(action.getId(), action.getType()));
323                            } else {
324                                queueCallable(new ActionStartCommand(action.getId(), action.getType()));
325                            }
326    
327                        }
328                        else if (action.getStatus() == WorkflowActionBean.Status.START_RETRY) {
329                            Date nextRunTime = action.getPendingAge();
330                            if (useXCommand) {
331                                queueCallable(new ActionStartXCommand(action.getId(), action.getType()), nextRunTime.getTime()
332                                        - System.currentTimeMillis());
333                            } else {
334                                queueCallable(new ActionStartCommand(action.getId(), action.getType()), nextRunTime.getTime()
335                                        - System.currentTimeMillis());
336                            }
337    
338                        }
339                        else if (action.getStatus() == WorkflowActionBean.Status.DONE
340                                || action.getStatus() == WorkflowActionBean.Status.END_MANUAL) {
341                            if (useXCommand) {
342                                queueCallable(new ActionEndXCommand(action.getId(), action.getType()));
343                            } else {
344                                queueCallable(new ActionEndCommand(action.getId(), action.getType()));
345                            }
346    
347                        }
348                        else if (action.getStatus() == WorkflowActionBean.Status.END_RETRY) {
349                            Date nextRunTime = action.getPendingAge();
350                            if (useXCommand) {
351                                queueCallable(new ActionEndXCommand(action.getId(), action.getType()), nextRunTime.getTime()
352                                        - System.currentTimeMillis());
353                            } else {
354                                queueCallable(new ActionEndCommand(action.getId(), action.getType()), nextRunTime.getTime()
355                                        - System.currentTimeMillis());
356                            }
357    
358                        }
359                        else if (action.getStatus() == WorkflowActionBean.Status.OK
360                                || action.getStatus() == WorkflowActionBean.Status.ERROR) {
361                            if (useXCommand) {
362                                queueCallable(new SignalXCommand(action.getJobId(), action.getId()));
363                            } else {
364                                queueCallable(new SignalCommand(action.getJobId(), action.getId()));
365                            }
366    
367                        }
368                    }
369                }
370                catch (Exception ex) {
371                    log.error("Exception, {0}", ex.getMessage(), ex);
372                }
373            }
374    
375            /**
376             * Adds callables to a list. If the number of callables in the list reaches {@link
377             * RecoveryService#CONF_CALLABLE_BATCH_SIZE}, the entire batch is queued and the callables list is reset.
378             *
379             * @param callable the callable to queue.
380             */
381            private void queueCallable(XCallable<?> callable) {
382                if (callables == null) {
383                    callables = new ArrayList<XCallable<?>>();
384                }
385                callables.add(callable);
386                if (callables.size() == Services.get().getConf().getInt(CONF_CALLABLE_BATCH_SIZE, 10)) {
387                    boolean ret = Services.get().get(CallableQueueService.class).queueSerial(callables);
388                    if (ret == false) {
389                        XLog.getLog(getClass()).warn(
390                                "Unable to queue the callables commands for RecoveryService. "
391                                        + "Most possibly command queue is full. Queue size is :"
392                                        + Services.get().get(CallableQueueService.class).queueSize());
393                    }
394                    callables = new ArrayList<XCallable<?>>();
395                }
396            }
397    
398            /**
399             * Adds callables to a list. If the number of callables in the list reaches {@link
400             * RecoveryService#CONF_CALLABLE_BATCH_SIZE}, the entire batch is queued with the delay set to the maximum delay
401             * of the callables in the list. The callables list and the delay is reset.
402             *
403             * @param callable the callable to queue.
404             * @param delay the delay for the callable.
405             */
406            private void queueCallable(XCallable<?> callable, long delay) {
407                if (delayedCallables == null) {
408                    delayedCallables = new ArrayList<XCallable<?>>();
409                }
410                this.delay = Math.max(this.delay, delay);
411                delayedCallables.add(callable);
412                if (delayedCallables.size() == Services.get().getConf().getInt(CONF_CALLABLE_BATCH_SIZE, 10)) {
413                    boolean ret = Services.get().get(CallableQueueService.class).queueSerial(delayedCallables, this.delay);
414                    if (ret == false) {
415                        XLog.getLog(getClass()).warn("Unable to queue the delayedCallables commands for RecoveryService. "
416                                + "Most possibly Callable queue is full. Queue size is :"
417                                + Services.get().get(CallableQueueService.class).queueSize());
418                    }
419                    delayedCallables = new ArrayList<XCallable<?>>();
420                    this.delay = 0;
421                }
422            }
423        }
424    
425        /**
426         * Initializes the RecoveryService.
427         *
428         * @param services services instance.
429         */
430        @Override
431        public void init(Services services) {
432            Configuration conf = services.getConf();
433            Runnable recoveryRunnable = new RecoveryRunnable(conf.getInt(CONF_WF_ACTIONS_OLDER_THAN, 120), conf.getInt(
434                    CONF_COORD_OLDER_THAN, 600),conf.getInt(CONF_BUNDLE_OLDER_THAN, 600));
435            services.get(SchedulerService.class).schedule(recoveryRunnable, 10, conf.getInt(CONF_SERVICE_INTERVAL, 600),
436                                                          SchedulerService.Unit.SEC);
437    
438            if (Services.get().getConf().getBoolean(USE_XCOMMAND, true) == false) {
439                useXCommand = false;
440            }
441        }
442    
443        /**
444         * Destroy the Recovery Service.
445         */
446        @Override
447        public void destroy() {
448        }
449    
450        /**
451         * Return the public interface for the Recovery Service.
452         *
453         * @return {@link RecoveryService}.
454         */
455        @Override
456        public Class<? extends Service> getInterface() {
457            return RecoveryService.class;
458        }
459    
460        /**
461         * Merge Bundle job config and the configuration from the coord job to pass
462         * to Coord Engine
463         *
464         * @param coordElem the coordinator configuration
465         * @return Configuration merged configuration
466         * @throws CommandException thrown if failed to merge configuration
467         */
468        private static Configuration mergeConfig(Element coordElem,BundleJobBean bundleJob) throws CommandException {
469            XLog.Info.get().clear();
470            XLog log = XLog.getLog("RecoveryService");
471    
472            String jobConf = bundleJob.getConf();
473            // Step 1: runConf = jobConf
474            Configuration runConf = null;
475            try {
476                runConf = new XConfiguration(new StringReader(jobConf));
477            }
478            catch (IOException e1) {
479                log.warn("Configuration parse error in:" + jobConf);
480                throw new CommandException(ErrorCode.E1306, e1.getMessage(), e1);
481            }
482            // Step 2: Merge local properties into runConf
483            // extract 'property' tags under 'configuration' block in the coordElem
484            // convert Element to XConfiguration
485            Element localConfigElement = coordElem.getChild("configuration", coordElem.getNamespace());
486    
487            if (localConfigElement != null) {
488                String strConfig = XmlUtils.prettyPrint(localConfigElement).toString();
489                Configuration localConf;
490                try {
491                    localConf = new XConfiguration(new StringReader(strConfig));
492                }
493                catch (IOException e1) {
494                    log.warn("Configuration parse error in:" + strConfig);
495                    throw new CommandException(ErrorCode.E1307, e1.getMessage(), e1);
496                }
497    
498                // copy configuration properties in the coordElem to the runConf
499                XConfiguration.copy(localConf, runConf);
500            }
501    
502            // Step 3: Extract value of 'app-path' in coordElem, save it as a
503            // new property called 'oozie.coord.application.path', and normalize.
504            String appPath = coordElem.getChild("app-path", coordElem.getNamespace()).getValue();
505            runConf.set(OozieClient.COORDINATOR_APP_PATH, appPath);
506            // Normalize coordinator appPath here;
507            try {
508                JobUtils.normalizeAppPath(runConf.get(OozieClient.USER_NAME), runConf.get(OozieClient.GROUP_NAME), runConf);
509            }
510            catch (IOException e) {
511                throw new CommandException(ErrorCode.E1001, runConf.get(OozieClient.COORDINATOR_APP_PATH));
512            }
513            return runConf;
514        }
515    }