001/**
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.activemq.store.kahadb.scheduler;
018
019import java.io.DataInput;
020import java.io.DataOutput;
021import java.io.File;
022import java.io.FilenameFilter;
023import java.io.IOException;
024import java.util.ArrayList;
025import java.util.Collection;
026import java.util.HashMap;
027import java.util.HashSet;
028import java.util.Iterator;
029import java.util.List;
030import java.util.Map;
031import java.util.Map.Entry;
032import java.util.Set;
033import java.util.TreeSet;
034import java.util.UUID;
035
036import org.apache.activemq.broker.scheduler.JobScheduler;
037import org.apache.activemq.broker.scheduler.JobSchedulerStore;
038import org.apache.activemq.protobuf.Buffer;
039import org.apache.activemq.store.kahadb.AbstractKahaDBStore;
040import org.apache.activemq.store.kahadb.JournalCommand;
041import org.apache.activemq.store.kahadb.KahaDBMetaData;
042import org.apache.activemq.store.kahadb.Visitor;
043import org.apache.activemq.store.kahadb.data.KahaAddScheduledJobCommand;
044import org.apache.activemq.store.kahadb.data.KahaDestroySchedulerCommand;
045import org.apache.activemq.store.kahadb.data.KahaRemoveScheduledJobCommand;
046import org.apache.activemq.store.kahadb.data.KahaRemoveScheduledJobsCommand;
047import org.apache.activemq.store.kahadb.data.KahaRescheduleJobCommand;
048import org.apache.activemq.store.kahadb.data.KahaTraceCommand;
049import org.apache.activemq.store.kahadb.disk.index.BTreeVisitor;
050import org.apache.activemq.store.kahadb.disk.journal.DataFile;
051import org.apache.activemq.store.kahadb.disk.journal.Location;
052import org.apache.activemq.store.kahadb.disk.page.Page;
053import org.apache.activemq.store.kahadb.disk.page.PageFile;
054import org.apache.activemq.store.kahadb.disk.page.Transaction;
055import org.apache.activemq.store.kahadb.disk.util.VariableMarshaller;
056import org.apache.activemq.store.kahadb.scheduler.legacy.LegacyStoreReplayer;
057import org.apache.activemq.util.ByteSequence;
058import org.apache.activemq.util.IOHelper;
059import org.slf4j.Logger;
060import org.slf4j.LoggerFactory;
061
062/*
063 * @org.apache.xbean.XBean element="kahaDBJobScheduler"
064 */
065
066public class JobSchedulerStoreImpl extends AbstractKahaDBStore implements JobSchedulerStore {
067
    private static final Logger LOG = LoggerFactory.getLogger(JobSchedulerStoreImpl.class);

    // In-memory image of the store meta-data page; replaced with a fresh instance
    // on unload and when recovering from a corrupted index.
    private JobSchedulerKahaDBMetaData metaData = new JobSchedulerKahaDBMetaData(this);
    // Marshals the meta-data to and from page 0 of the index page file.
    private final MetaDataMarshaller metaDataMarshaller = new MetaDataMarshaller(this);
    // Active schedulers keyed by name; added and removed while holding the index write lock.
    private final Map<String, JobSchedulerImpl> schedulers = new HashMap<String, JobSchedulerImpl>();
    // Archive target for legacy store files; lazily defaulted in getLegacyStoreArchiveDirectory().
    private File legacyStoreArchiveDirectory;

    /**
     * The Scheduler Token is used to identify base revisions of the Scheduler store.  A store
     * based on the initial scheduler design will not have this tag in its meta-data and will
     * indicate an update is needed.  Later versions of the scheduler can also change this value
     * to indicate incompatible store bases which require complete meta-data and journal rewrites
     * instead of simpler meta-data updates.
     */
    static final UUID SCHEDULER_STORE_TOKEN = UUID.fromString("57ed642b-1ee3-47b3-be6d-b7297d500409");

    /**
     * The default scheduler store version.  All new store instances will be given this version and
     * earlier versions will be updated to this version.
     */
    static final int CURRENT_VERSION = 1;
089
090    @Override
091    public JobScheduler getJobScheduler(final String name) throws Exception {
092        this.indexLock.writeLock().lock();
093        try {
094            JobSchedulerImpl result = this.schedulers.get(name);
095            if (result == null) {
096                final JobSchedulerImpl js = new JobSchedulerImpl(this);
097                js.setName(name);
098                getPageFile().tx().execute(new Transaction.Closure<IOException>() {
099                    @Override
100                    public void execute(Transaction tx) throws IOException {
101                        js.createIndexes(tx);
102                        js.load(tx);
103                        metaData.getJobSchedulers().put(tx, name, js);
104                    }
105                });
106                result = js;
107                this.schedulers.put(name, js);
108                if (isStarted()) {
109                    result.start();
110                }
111                this.pageFile.flush();
112            }
113            return result;
114        } finally {
115            this.indexLock.writeLock().unlock();
116        }
117    }
118
119    @Override
120    public boolean removeJobScheduler(final String name) throws Exception {
121        boolean result = false;
122
123        this.indexLock.writeLock().lock();
124        try {
125            final JobSchedulerImpl js = this.schedulers.remove(name);
126            result = js != null;
127            if (result) {
128                js.stop();
129                getPageFile().tx().execute(new Transaction.Closure<IOException>() {
130                    @Override
131                    public void execute(Transaction tx) throws IOException {
132                        metaData.getJobSchedulers().remove(tx, name);
133                        js.removeAll(tx);
134                    }
135                });
136            }
137        } finally {
138            this.indexLock.writeLock().unlock();
139        }
140        return result;
141    }
142
143    /**
144     * Sets the directory where the legacy scheduler store files are archived before an
145     * update attempt is made.  Both the legacy index files and the journal files are moved
146     * to this folder prior to an upgrade attempt.
147     *
148     * @param directory
149     *      The directory to move the legacy Scheduler Store files to.
150     */
151    public void setLegacyStoreArchiveDirectory(File directory) {
152        this.legacyStoreArchiveDirectory = directory;
153    }
154
155    /**
156     * Gets the directory where the legacy Scheduler Store files will be archived if the
157     * broker is started and an existing Job Scheduler Store from an old version is detected.
158     *
159     * @return the directory where scheduler store legacy files are archived on upgrade.
160     */
161    public File getLegacyStoreArchiveDirectory() {
162        if (this.legacyStoreArchiveDirectory == null) {
163            this.legacyStoreArchiveDirectory = new File(getDirectory(), "legacySchedulerStore");
164        }
165
166        return this.legacyStoreArchiveDirectory.getAbsoluteFile();
167    }
168
    /**
     * Loads the scheduler store, transitioning it to the opened state.  When the on-disk
     * store is from a legacy version it is upgraded first; when the index cannot be loaded
     * it is discarded (or archived) and rebuilt by replaying the journal via recover().
     *
     * @throws IOException if an error occurs starting the journal or loading the index.
     */
    @Override
    public void load() throws IOException {
        if (opened.compareAndSet(false, true)) {
            getJournal().start();
            try {
                loadPageFile();
            } catch (UnknownStoreVersionException ex) {
                // Store pre-dates the versioned format: archive it and replay into a new store.
                LOG.info("Can't start until store update is performed.");
                upgradeFromLegacy();
                // Restart with the updated store
                getJournal().start();
                loadPageFile();
                LOG.info("Update from legacy Scheduler store completed successfully.");
            } catch (Throwable t) {
                LOG.warn("Index corrupted. Recovering the index through journal replay. Cause: {}", t.toString());
                LOG.debug("Index load failure", t);

                // try to recover index
                try {
                    pageFile.unload();
                } catch (Exception ignore) {
                }
                // Either preserve the corrupted index for analysis or drop it outright.
                if (isArchiveCorruptedIndex()) {
                    pageFile.archive();
                } else {
                    pageFile.delete();
                }
                // Start from fresh meta-data; recover() below rebuilds state from the journal.
                metaData = new JobSchedulerKahaDBMetaData(this);
                pageFile = null;
                loadPageFile();
            }
            startCheckpoint();
            recover();
        }
        LOG.info("{} started.", this);
    }
205
206    @Override
207    public void unload() throws IOException {
208        if (opened.compareAndSet(true, false)) {
209            for (JobSchedulerImpl js : this.schedulers.values()) {
210                try {
211                    js.stop();
212                } catch (Exception e) {
213                    throw new IOException(e);
214                }
215            }
216            this.indexLock.writeLock().lock();
217            try {
218                if (pageFile != null && pageFile.isLoaded()) {
219                    metaData.setState(KahaDBMetaData.CLOSED_STATE);
220
221                    if (metaData.getPage() != null) {
222                        pageFile.tx().execute(new Transaction.Closure<IOException>() {
223                            @Override
224                            public void execute(Transaction tx) throws IOException {
225                                tx.store(metaData.getPage(), metaDataMarshaller, true);
226                            }
227                        });
228                    }
229                }
230            } finally {
231                this.indexLock.writeLock().unlock();
232            }
233
234            checkpointLock.writeLock().lock();
235            try {
236                if (metaData.getPage() != null) {
237                    checkpointUpdate(getCleanupOnStop());
238                }
239            } finally {
240                checkpointLock.writeLock().unlock();
241            }
242            synchronized (checkpointThreadLock) {
243                if (checkpointThread != null) {
244                    try {
245                        checkpointThread.join();
246                        checkpointThread = null;
247                    } catch (InterruptedException e) {
248                    }
249                }
250            }
251
252            if (pageFile != null) {
253                pageFile.unload();
254                pageFile = null;
255            }
256            if (this.journal != null) {
257                journal.close();
258                journal = null;
259            }
260
261            metaData = new JobSchedulerKahaDBMetaData(this);
262        }
263        LOG.info("{} stopped.", this);
264    }
265
    /**
     * Loads (or initializes) the index page file and the meta-data stored in page 0,
     * then loads and starts all Job Schedulers found in the meta-data.
     *
     * @throws IOException if an error occurs loading the page file or the meta-data.
     */
    private void loadPageFile() throws IOException {
        this.indexLock.writeLock().lock();
        try {
            // Local final so the Transaction.Closure below can reference it.
            final PageFile pageFile = getPageFile();
            pageFile.load();
            pageFile.tx().execute(new Transaction.Closure<IOException>() {
                @Override
                public void execute(Transaction tx) throws IOException {
                    if (pageFile.getPageCount() == 0) {
                        // Brand new store: allocate page 0 and persist fresh meta-data into it.
                        Page<JobSchedulerKahaDBMetaData> page = tx.allocate();
                        assert page.getPageId() == 0;
                        page.set(metaData);
                        metaData.setPage(page);
                        metaData.setState(KahaDBMetaData.CLOSED_STATE);
                        metaData.initialize(tx);
                        tx.store(metaData.getPage(), metaDataMarshaller, true);
                    } else {
                        // Existing store: the meta-data always lives in page 0.
                        Page<JobSchedulerKahaDBMetaData> page = null;
                        page = tx.load(0, metaDataMarshaller);
                        metaData = page.get();
                        metaData.setPage(page);
                    }
                    metaData.load(tx);
                    metaData.loadScheduler(tx, schedulers);
                    // Start each recovered scheduler; a failure to start one scheduler
                    // is logged but does not abort loading the others.
                    for (JobSchedulerImpl js : schedulers.values()) {
                        try {
                            js.start();
                        } catch (Exception e) {
                            JobSchedulerStoreImpl.LOG.error("Failed to load " + js.getName(), e);
                        }
                    }
                }
            });

            pageFile.flush();
        } finally {
            this.indexLock.writeLock().unlock();
        }
    }
305
    /**
     * Upgrades a legacy (pre-versioned) scheduler store.  The old index and journal files
     * are archived into the legacy archive directory, a fresh store is created in the data
     * directory, and the archived store is replayed into it as a series of add commands.
     * On return the journal and page file are closed; the caller restarts them.
     *
     * @throws IOException if an error occurs archiving or replaying the legacy store.
     */
    private void upgradeFromLegacy() throws IOException {

        // Close the current journal and index so their files can be moved.
        journal.close();
        journal = null;
        try {
            pageFile.unload();
            pageFile = null;
        } catch (Exception ignore) {}

        File storeDir = getDirectory().getAbsoluteFile();
        File storeArchiveDir = getLegacyStoreArchiveDirectory();

        LOG.info("Attempting to move old store files from {} to {}", storeDir, storeArchiveDir);

        // Move only the known store files, locks and other items left in place.
        IOHelper.moveFiles(storeDir, storeArchiveDir, new FilenameFilter() {

            @Override
            public boolean accept(File dir, String name) {
                if (name.endsWith(".data") || name.endsWith(".redo") || name.endsWith(".log") || name.endsWith(".free")) {
                    return true;
                }
                return false;
            }
        });

        // We reset everything to clean state, then we can read from the old
        // scheduler store and replay the scheduled jobs into this one as adds.
        getJournal().start();
        metaData = new JobSchedulerKahaDBMetaData(this);
        pageFile = null;
        loadPageFile();

        LegacyStoreReplayer replayer = new LegacyStoreReplayer(getLegacyStoreArchiveDirectory());
        replayer.load();
        replayer.startReplay(this);

        // Cleanup after replay and store what we've done.
        pageFile.tx().execute(new Transaction.Closure<IOException>() {
            @Override
            public void execute(Transaction tx) throws IOException {
                tx.store(metaData.getPage(), metaDataMarshaller, true);
            }
        });

        checkpointUpdate(true);
        getJournal().close();
        getPageFile().unload();
    }
355
    /**
     * Performs a checkpoint of the store: persists the meta-data page and, when cleanup
     * is requested, computes the set of journal log files that are no longer referenced
     * (by the journal reference counts, the last update location, or pending remove
     * tracking) and deletes them.
     *
     * @param tx
     *      The TX under which the checkpoint is performed.
     * @param cleanup
     *      true to also garbage collect unreferenced journal log files.
     *
     * @throws IOException if an error occurs while updating the index or journal.
     */
    @Override
    protected void checkpointUpdate(Transaction tx, boolean cleanup) throws IOException {
        LOG.debug("Job Scheduler Store Checkpoint started.");

        // reflect last update exclusive of current checkpoint
        Location lastUpdate = metaData.getLastUpdateLocation();
        metaData.setState(KahaDBMetaData.OPEN_STATE);
        tx.store(metaData.getPage(), metaDataMarshaller, true);
        pageFile.flush();

        if (cleanup) {
            // Start with every journal file as a GC candidate, then remove any that
            // are still referenced.
            final TreeSet<Integer> completeFileSet = new TreeSet<Integer>(journal.getFileMap().keySet());
            final TreeSet<Integer> gcCandidateSet = new TreeSet<Integer>(completeFileSet);

            LOG.trace("Last update: {}, full gc candidates set: {}", lastUpdate, gcCandidateSet);

            // The log holding the most recent update must always be retained.
            if (lastUpdate != null) {
                gcCandidateSet.remove(lastUpdate.getDataFileId());
            }

            // Retain every log that still has a positive journal reference count.
            this.metaData.getJournalRC().visit(tx, new BTreeVisitor<Integer, Integer>() {

                @Override
                public void visit(List<Integer> keys, List<Integer> values) {
                    for (Integer key : keys) {
                        if (gcCandidateSet.remove(key)) {
                            LOG.trace("Removed referenced file: {} from GC set", key);
                        }
                    }
                }

                @Override
                public boolean isInterestedInKeysBetween(Integer first, Integer second) {
                    return true;
                }
            });

            LOG.trace("gc candidates after reference check: {}", gcCandidateSet);

            // If there are GC candidates then check the remove command location to see
            // if any of them can go or if they must stay in order to ensure proper recover.
            //
            // A log containing any remove commands must be kept until all the logs with the
            // add commands for all the removed jobs have been dropped.
            if (!gcCandidateSet.isEmpty()) {
                Iterator<Entry<Integer, List<Integer>>> removals = metaData.getRemoveLocationTracker().iterator(tx);
                List<Integer> orphans = new ArrayList<Integer>();
                while (removals.hasNext()) {
                    boolean orphanedRemove = true;
                    Entry<Integer, List<Integer>> entry = removals.next();

                    // If this log is not a GC candidate then there's no need to do a check to rule it out
                    if (gcCandidateSet.contains(entry.getKey())) {
                        for (Integer addLocation : entry.getValue()) {
                            if (completeFileSet.contains(addLocation)) {
                                LOG.trace("A remove in log {} has an add still in existance in {}.", entry.getKey(), addLocation);
                                orphanedRemove = false;
                                break;
                            }
                        }

                        // If it's not orphaned then we can't remove it; otherwise we stop
                        // tracking it and its log will get deleted on the next check.
                        if (!orphanedRemove) {
                            gcCandidateSet.remove(entry.getKey());
                        } else {
                            LOG.trace("All removes in log {} are orphaned, file can be GC'd", entry.getKey());
                            orphans.add(entry.getKey());
                        }
                    }
                }

                // Drop all orphaned removes from the tracker.
                for (Integer orphan : orphans) {
                    metaData.getRemoveLocationTracker().remove(tx, orphan);
                }
            }

            LOG.trace("gc candidates after removals check: {}", gcCandidateSet);
            if (!gcCandidateSet.isEmpty()) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Cleanup removing the data files: " + gcCandidateSet);
                }
                journal.removeDataFiles(gcCandidateSet);
            }
        }

        LOG.debug("Job Scheduler Store Checkpoint complete.");
    }
445
446    /**
447     * Adds a reference for the journal log file pointed to by the given Location value.
448     *
449     * To prevent log files in the journal that still contain valid data that needs to be
450     * kept in order to allow for recovery the logs must have active references.  Each Job
451     * scheduler should ensure that the logs are accurately referenced.
452     *
453     * @param tx
454     *      The TX under which the update is to be performed.
455     * @param location
456     *      The location value to update the reference count of.
457     *
458     * @throws IOException if an error occurs while updating the journal references table.
459     */
460    protected void incrementJournalCount(Transaction tx, Location location) throws IOException {
461        int logId = location.getDataFileId();
462        Integer val = metaData.getJournalRC().get(tx, logId);
463        int refCount = val != null ? val.intValue() + 1 : 1;
464        metaData.getJournalRC().put(tx, logId, refCount);
465    }
466
467    /**
468     * Removes one reference for the Journal log file indicated in the given Location value.
469     *
470     * The references are used to track which log files cannot be GC'd.  When the reference count
471     * on a log file reaches zero the file id is removed from the tracker and the log will be
472     * removed on the next check point update.
473     *
474     * @param tx
475     *      The TX under which the update is to be performed.
476     * @param location
477     *      The location value to update the reference count of.
478     *
479     * @throws IOException if an error occurs while updating the journal references table.
480     */
481    protected void decrementJournalCount(Transaction tx, Location location) throws IOException {
482        int logId = location.getDataFileId();
483        Integer refCount = metaData.getJournalRC().get(tx, logId);
484        if (refCount != null) {
485            int refCountValue = refCount;
486            refCountValue--;
487            if (refCountValue <= 0) {
488                metaData.getJournalRC().remove(tx, logId);
489            } else {
490                metaData.getJournalRC().put(tx, logId, refCountValue);
491            }
492        }
493    }
494
495    /**
496     * Removes multiple references for the Journal log file indicated in the given Location map.
497     *
498     * The references are used to track which log files cannot be GC'd.  When the reference count
499     * on a log file reaches zero the file id is removed from the tracker and the log will be
500     * removed on the next check point update.
501     *
502     * @param tx
503     *      The TX under which the update is to be performed.
504     * @param decrementsByFileIds
505     *      Map indicating how many decrements per fileId.
506     *
507     * @throws IOException if an error occurs while updating the journal references table.
508     */
509    protected void decrementJournalCount(Transaction tx, HashMap<Integer, Integer> decrementsByFileIds) throws IOException {
510        for(Map.Entry<Integer, Integer> entry : decrementsByFileIds.entrySet()) {
511            int logId = entry.getKey();
512            Integer refCount = metaData.getJournalRC().get(tx, logId);
513
514            if (refCount != null) {
515                int refCountValue = refCount;
516                refCountValue -= entry.getValue();
517                if (refCountValue <= 0) {
518                    metaData.getJournalRC().remove(tx, logId);
519                } else {
520                    metaData.getJournalRC().put(tx, logId, refCountValue);
521                }
522            }
523        }
524    }
525
526    /**
527     * Updates the Job removal tracking index with the location of a remove command and the
528     * original JobLocation entry.
529     *
530     * The JobLocation holds the locations in the logs where the add and update commands for
531     * a job stored.  The log file containing the remove command can only be discarded after
532     * both the add and latest update log files have also been discarded.
533     *
534     * @param tx
535     *      The TX under which the update is to be performed.
536     * @param location
537     *      The location value to reference a remove command.
538     * @param removedJob
539     *      The original JobLocation instance that holds the add and update locations
540     *
541     * @throws IOException if an error occurs while updating the remove location tracker.
542     */
543    protected void referenceRemovedLocation(Transaction tx, Location location, JobLocation removedJob) throws IOException {
544        int logId = location.getDataFileId();
545        List<Integer> removed = this.metaData.getRemoveLocationTracker().get(tx, logId);
546        if (removed == null) {
547            removed = new ArrayList<Integer>();
548        }
549        removed.add(removedJob.getLocation().getDataFileId());
550        this.metaData.getRemoveLocationTracker().put(tx, logId, removed);
551    }
552
553    /**
554     * Updates the Job removal tracking index with the location of a remove command and the
555     * original JobLocation entry.
556     *
557     * The JobLocation holds the locations in the logs where the add and update commands for
558     * a job stored.  The log file containing the remove command can only be discarded after
559     * both the add and latest update log files have also been discarded.
560     *
561     * @param tx
562     *      The TX under which the update is to be performed.
563     * @param location
564     *      The location value to reference a remove command.
565     * @param removedJobsFileId
566     *      List of the original JobLocation instances that holds the add and update locations
567     *
568     * @throws IOException if an error occurs while updating the remove location tracker.
569     */
570    protected void referenceRemovedLocation(Transaction tx, Location location, List<Integer> removedJobsFileId) throws IOException {
571        int logId = location.getDataFileId();
572        List<Integer> removed = this.metaData.getRemoveLocationTracker().get(tx, logId);
573        if (removed == null) {
574            removed = new ArrayList<Integer>();
575        }
576        removed.addAll(removedJobsFileId);
577        this.metaData.getRemoveLocationTracker().put(tx, logId, removed);
578    }
579
580    /**
581     * Retrieve the scheduled Job's byte blob from the journal.
582     *
583     * @param location
584     *      The location of the KahaAddScheduledJobCommand that originated the Job.
585     *
586     * @return a ByteSequence containing the payload of the scheduled Job.
587     *
588     * @throws IOException if an error occurs while reading the payload value.
589     */
590    protected ByteSequence getPayload(Location location) throws IOException {
591        KahaAddScheduledJobCommand job = (KahaAddScheduledJobCommand) this.load(location);
592        Buffer payload = job.getPayload();
593        return new ByteSequence(payload.getData(), payload.getOffset(), payload.getLength());
594    }
595
    /** Acquires the index read lock; pair with {@link #readUnlockIndex()}. */
    public void readLockIndex() {
        this.indexLock.readLock().lock();
    }

    /** Releases the index read lock acquired via {@link #readLockIndex()}. */
    public void readUnlockIndex() {
        this.indexLock.readLock().unlock();
    }

    /** Acquires the index write lock; pair with {@link #writeUnlockIndex()}. */
    public void writeLockIndex() {
        this.indexLock.writeLock().lock();
    }

    /** Releases the index write lock acquired via {@link #writeLockIndex()}. */
    public void writeUnlockIndex() {
        this.indexLock.writeLock().unlock();
    }
611
612    @Override
613    public String toString() {
614        return "JobSchedulerStore: " + getDirectory();
615    }
616
617    @Override
618    protected String getPageFileName() {
619        return "scheduleDB";
620    }
621
622    @Override
623    protected File getDefaultDataDirectory() {
624        return new File(IOHelper.getDefaultDataDirectory(), "delayedDB");
625    }
626
627    private class MetaDataMarshaller extends VariableMarshaller<JobSchedulerKahaDBMetaData> {
628
629        private final JobSchedulerStoreImpl store;
630
631        MetaDataMarshaller(JobSchedulerStoreImpl store) {
632            this.store = store;
633        }
634
635        @Override
636        public JobSchedulerKahaDBMetaData readPayload(DataInput dataIn) throws IOException {
637            JobSchedulerKahaDBMetaData rc = new JobSchedulerKahaDBMetaData(store);
638            rc.read(dataIn);
639            return rc;
640        }
641
642        @Override
643        public void writePayload(JobSchedulerKahaDBMetaData object, DataOutput dataOut) throws IOException {
644            object.write(dataOut);
645        }
646    }
647
648    /**
649     * Called during index recovery to rebuild the index from the last known good location.  For
650     * entries that occur before the last known good position we just ignore then and move on.
651     *
652     * @param data
653     *        the command read from the Journal which should be used to update the index.
654     * @param location
655     *        the location in the index where the command was read.
656     * @param inDoubtlocation
657     *        the location in the index known to be the last time the index was valid.
658     *
659     * @throws IOException if an error occurs while recovering the index.
660     */
661    protected void doRecover(JournalCommand<?> data, final Location location, final Location inDoubtlocation) throws IOException {
662        if (inDoubtlocation != null && location.compareTo(inDoubtlocation) >= 0) {
663            process(data, location);
664        }
665    }
666
667    /**
668     * Called during recovery to allow the store to rebuild from scratch.
669     *
670     * @param data
671     *      The command to process, which was read from the Journal.
672     * @param location
673     *      The location of the command in the Journal.
674     *
675     * @throws IOException if an error occurs during command processing.
676     */
677    @Override
678    protected void process(JournalCommand<?> data, final Location location) throws IOException {
679        data.visit(new Visitor() {
680            @Override
681            public void visit(final KahaAddScheduledJobCommand command) throws IOException {
682                final JobSchedulerImpl scheduler;
683
684                indexLock.writeLock().lock();
685                try {
686                    try {
687                        scheduler = (JobSchedulerImpl) getJobScheduler(command.getScheduler());
688                    } catch (Exception e) {
689                        throw new IOException(e);
690                    }
691                    getPageFile().tx().execute(new Transaction.Closure<IOException>() {
692                        @Override
693                        public void execute(Transaction tx) throws IOException {
694                            scheduler.process(tx, command, location);
695                        }
696                    });
697
698                    processLocation(location);
699                } finally {
700                    indexLock.writeLock().unlock();
701                }
702            }
703
704            @Override
705            public void visit(final KahaRemoveScheduledJobCommand command) throws IOException {
706                final JobSchedulerImpl scheduler;
707
708                indexLock.writeLock().lock();
709                try {
710                    try {
711                        scheduler = (JobSchedulerImpl) getJobScheduler(command.getScheduler());
712                    } catch (Exception e) {
713                        throw new IOException(e);
714                    }
715                    getPageFile().tx().execute(new Transaction.Closure<IOException>() {
716                        @Override
717                        public void execute(Transaction tx) throws IOException {
718                            scheduler.process(tx, command, location);
719                        }
720                    });
721
722                    processLocation(location);
723                } finally {
724                    indexLock.writeLock().unlock();
725                }
726            }
727
728            @Override
729            public void visit(final KahaRemoveScheduledJobsCommand command) throws IOException {
730                final JobSchedulerImpl scheduler;
731
732                indexLock.writeLock().lock();
733                try {
734                    try {
735                        scheduler = (JobSchedulerImpl) getJobScheduler(command.getScheduler());
736                    } catch (Exception e) {
737                        throw new IOException(e);
738                    }
739                    getPageFile().tx().execute(new Transaction.Closure<IOException>() {
740                        @Override
741                        public void execute(Transaction tx) throws IOException {
742                            scheduler.process(tx, command, location);
743                        }
744                    });
745
746                    processLocation(location);
747                } finally {
748                    indexLock.writeLock().unlock();
749                }
750            }
751
752            @Override
753            public void visit(final KahaRescheduleJobCommand command) throws IOException {
754                final JobSchedulerImpl scheduler;
755
756                indexLock.writeLock().lock();
757                try {
758                    try {
759                        scheduler = (JobSchedulerImpl) getJobScheduler(command.getScheduler());
760                    } catch (Exception e) {
761                        throw new IOException(e);
762                    }
763                    getPageFile().tx().execute(new Transaction.Closure<IOException>() {
764                        @Override
765                        public void execute(Transaction tx) throws IOException {
766                            scheduler.process(tx, command, location);
767                        }
768                    });
769
770                    processLocation(location);
771                } finally {
772                    indexLock.writeLock().unlock();
773                }
774            }
775
776            @Override
777            public void visit(final KahaDestroySchedulerCommand command) {
778                try {
779                    removeJobScheduler(command.getScheduler());
780                } catch (Exception e) {
781                    LOG.warn("Failed to remove scheduler: {}", command.getScheduler());
782                }
783
784                processLocation(location);
785            }
786
            @Override
            public void visit(KahaTraceCommand command) {
                // Trace entries carry no index state; just advance the last-update marker.
                processLocation(location);
            }
791        });
792    }
793
    /**
     * Records the given journal {@link Location} as the most recent update
     * applied to the index, guarded by the index write lock so concurrent
     * readers/writers of the metadata see a consistent value.
     *
     * @param location the journal position that was just processed
     */
    protected void processLocation(final Location location) {
        indexLock.writeLock().lock();
        try {
            this.metaData.setLastUpdateLocation(location);
        } finally {
            indexLock.writeLock().unlock();
        }
    }
802
803    /**
804     * We recover from the Journal logs as needed to restore the index.
805     *
806     * @throws IllegalStateException
807     * @throws IOException
808     */
809    private void recover() throws IllegalStateException, IOException {
810        this.indexLock.writeLock().lock();
811        try {
812            long start = System.currentTimeMillis();
813            Location lastIndoubtPosition = getRecoveryPosition();
814            Location recoveryPosition = lastIndoubtPosition;
815
816            if (recoveryPosition != null) {
817                int redoCounter = 0;
818                LOG.info("Recovering from the scheduled job journal @" + recoveryPosition);
819                while (recoveryPosition != null) {
820                    try {
821                        JournalCommand<?> message = load(recoveryPosition);
822                        metaData.setLastUpdateLocation(recoveryPosition);
823                        doRecover(message, recoveryPosition, lastIndoubtPosition);
824                        redoCounter++;
825                    } catch (IOException failedRecovery) {
826                        if (isIgnoreMissingJournalfiles()) {
827                            LOG.debug("Failed to recover data at position:" + recoveryPosition, failedRecovery);
828                            // track this dud location
829                            journal.corruptRecoveryLocation(recoveryPosition);
830                        } else {
831                            throw new IOException("Failed to recover data at position:" + recoveryPosition, failedRecovery);
832                        }
833                    }
834                    recoveryPosition = journal.getNextLocation(recoveryPosition);
835                     if (LOG.isInfoEnabled() && redoCounter % 100000 == 0) {
836                         LOG.info("@ {}, {} entries recovered ..", recoveryPosition, redoCounter);
837                     }
838                }
839                long end = System.currentTimeMillis();
840                LOG.info("Recovery replayed {} operations from the journal in {} seconds.",
841                         redoCounter, ((end - start) / 1000.0f));
842            }
843
844            // We may have to undo some index updates.
845            pageFile.tx().execute(new Transaction.Closure<IOException>() {
846                @Override
847                public void execute(Transaction tx) throws IOException {
848                    recoverIndex(tx);
849                }
850            });
851
852        } finally {
853            this.indexLock.writeLock().unlock();
854        }
855    }
856
857    private Location getRecoveryPosition() throws IOException {
858        // This loads the first position and we completely rebuild the index if we
859        // do not override it with some known recovery start location.
860        Location result = null;
861
862        if (!isForceRecoverIndex()) {
863            if (metaData.getLastUpdateLocation() != null) {
864                result = metaData.getLastUpdateLocation();
865            }
866        }
867
868        return journal.getNextLocation(result);
869    }
870
871    private void recoverIndex(Transaction tx) throws IOException {
872        long start = System.currentTimeMillis();
873
874        // It is possible index updates got applied before the journal updates..
875        // in that case we need to removed references to Jobs that are not in the journal
876        final Location lastAppendLocation = journal.getLastAppendLocation();
877        long undoCounter = 0;
878
879        // Go through all the jobs in each scheduler and check if any are added after
880        // the last appended location and remove those.  For now we ignore the update
881        // location since the scheduled job will update itself after the next fire and
882        // a new update will replace any existing update.
883        for (Iterator<Map.Entry<String, JobSchedulerImpl>> i = metaData.getJobSchedulers().iterator(tx); i.hasNext();) {
884            Map.Entry<String, JobSchedulerImpl> entry = i.next();
885            JobSchedulerImpl scheduler = entry.getValue();
886
887            for (Iterator<JobLocation> jobLocationIterator = scheduler.getAllScheduledJobs(tx); jobLocationIterator.hasNext();) {
888                final JobLocation job = jobLocationIterator.next();
889                if (job.getLocation().compareTo(lastAppendLocation) >= 0) {
890                    if (scheduler.removeJobAtTime(tx, job.getJobId(), job.getNextTime())) {
891                        LOG.trace("Removed Job past last appened in the journal: {}", job.getJobId());
892                        undoCounter++;
893                    }
894                }
895            }
896        }
897
898        if (undoCounter > 0) {
899            // The rolled back operations are basically in flight journal writes.  To avoid getting
900            // these the end user should do sync writes to the journal.
901            long end = System.currentTimeMillis();
902            LOG.info("Rolled back {} messages from the index in {} seconds.", undoCounter, ((end - start) / 1000.0f));
903            undoCounter = 0;
904        }
905
906        // Now we check for missing and corrupt journal files.
907
908        // 1. Collect the set of all referenced journal files based on the Location of the
909        //    the scheduled jobs and the marked last update field.
910        HashSet<Integer> missingJournalFiles = new HashSet<Integer>();
911        for (Iterator<Map.Entry<String, JobSchedulerImpl>> i = metaData.getJobSchedulers().iterator(tx); i.hasNext();) {
912            Map.Entry<String, JobSchedulerImpl> entry = i.next();
913            JobSchedulerImpl scheduler = entry.getValue();
914
915            for (Iterator<JobLocation> jobLocationIterator = scheduler.getAllScheduledJobs(tx); jobLocationIterator.hasNext();) {
916                final JobLocation job = jobLocationIterator.next();
917                missingJournalFiles.add(job.getLocation().getDataFileId());
918                if (job.getLastUpdate() != null) {
919                    missingJournalFiles.add(job.getLastUpdate().getDataFileId());
920                }
921            }
922        }
923
924        // 2. Remove from that set all known data file Id's in the journal and what's left
925        //    is the missing set which will soon also contain the corrupted set.
926        missingJournalFiles.removeAll(journal.getFileMap().keySet());
927        if (!missingJournalFiles.isEmpty()) {
928            LOG.info("Some journal files are missing: {}", missingJournalFiles);
929        }
930
931        // 3. Now check all references in the journal logs for corruption and add any
932        //    corrupt journal files to the missing set.
933        HashSet<Location> corruptedLocations = new HashSet<Location>();
934
935        if (isCheckForCorruptJournalFiles()) {
936            Collection<DataFile> dataFiles = journal.getFileMap().values();
937            for (DataFile dataFile : dataFiles) {
938                int id = dataFile.getDataFileId();
939                for (long offset : dataFile.getCorruptedBlocks()) {
940                    corruptedLocations.add(new Location(id, (int) offset));
941                }
942            }
943
944            if (!corruptedLocations.isEmpty()) {
945                LOG.debug("Found some corrupted data blocks in the journal: {}", corruptedLocations.size());
946            }
947        }
948
949        // 4. Now we either fail or we remove all references to missing or corrupt journal
950        //    files from the various JobSchedulerImpl instances.  We only remove the Job if
951        //    the initial Add operation is missing when the ignore option is set, the updates
952        //    could be lost but that's price you pay when ignoring the missing logs.
953        if (!missingJournalFiles.isEmpty() || !corruptedLocations.isEmpty()) {
954            if (!isIgnoreMissingJournalfiles()) {
955                throw new IOException("Detected missing/corrupt journal files.");
956            }
957
958            // Remove all Jobs that reference an Location that is either missing or corrupt.
959            undoCounter = removeJobsInMissingOrCorruptJounralFiles(tx, missingJournalFiles, corruptedLocations);
960
961            // Clean up the Journal Reference count Map.
962            removeJournalRCForMissingFiles(tx, missingJournalFiles);
963        }
964
965        if (undoCounter > 0) {
966            long end = System.currentTimeMillis();
967            LOG.info("Detected missing/corrupt journal files.  Dropped {} jobs from the " +
968                     "index in {} seconds.", undoCounter, ((end - start) / 1000.0f));
969        }
970    }
971
972    private void removeJournalRCForMissingFiles(Transaction tx, Set<Integer> missing) throws IOException {
973        List<Integer> matches = new ArrayList<Integer>();
974
975        Iterator<Entry<Integer, Integer>> references = metaData.getJournalRC().iterator(tx);
976        while (references.hasNext()) {
977            int dataFileId = references.next().getKey();
978            if (missing.contains(dataFileId)) {
979                matches.add(dataFileId);
980            }
981        }
982
983        for (Integer match : matches) {
984            metaData.getJournalRC().remove(tx, match);
985        }
986    }
987
988    private int removeJobsInMissingOrCorruptJounralFiles(Transaction tx, Set<Integer> missing, Set<Location> corrupted) throws IOException {
989        int removed = 0;
990
991        // Remove Jobs that reference missing or corrupt files.
992        // Remove Reference counts to missing or corrupt files.
993        // Remove and remove command markers to missing or corrupt files.
994        for (Iterator<Map.Entry<String, JobSchedulerImpl>> i = metaData.getJobSchedulers().iterator(tx); i.hasNext();) {
995            Map.Entry<String, JobSchedulerImpl> entry = i.next();
996            JobSchedulerImpl scheduler = entry.getValue();
997
998            for (Iterator<JobLocation> jobLocationIterator = scheduler.getAllScheduledJobs(tx); jobLocationIterator.hasNext();) {
999                final JobLocation job = jobLocationIterator.next();
1000
1001                // Remove all jobs in missing log files.
1002                if (missing.contains(job.getLocation().getDataFileId())) {
1003                    scheduler.removeJobAtTime(tx, job.getJobId(), job.getNextTime());
1004                    removed++;
1005                    continue;
1006                }
1007
1008                // Remove all jobs in corrupted parts of log files.
1009                if (corrupted.contains(job.getLocation())) {
1010                    scheduler.removeJobAtTime(tx, job.getJobId(), job.getNextTime());
1011                    removed++;
1012                }
1013            }
1014        }
1015
1016        return removed;
1017    }
1018}