001/** 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.activemq.store.kahadb.scheduler; 018 019import java.io.DataInput; 020import java.io.DataOutput; 021import java.io.File; 022import java.io.FilenameFilter; 023import java.io.IOException; 024import java.util.ArrayList; 025import java.util.Collection; 026import java.util.HashMap; 027import java.util.HashSet; 028import java.util.Iterator; 029import java.util.List; 030import java.util.Map; 031import java.util.Map.Entry; 032import java.util.Set; 033import java.util.TreeSet; 034import java.util.UUID; 035 036import org.apache.activemq.broker.scheduler.JobScheduler; 037import org.apache.activemq.broker.scheduler.JobSchedulerStore; 038import org.apache.activemq.protobuf.Buffer; 039import org.apache.activemq.store.kahadb.AbstractKahaDBStore; 040import org.apache.activemq.store.kahadb.JournalCommand; 041import org.apache.activemq.store.kahadb.KahaDBMetaData; 042import org.apache.activemq.store.kahadb.Visitor; 043import org.apache.activemq.store.kahadb.data.KahaAddScheduledJobCommand; 044import org.apache.activemq.store.kahadb.data.KahaDestroySchedulerCommand; 045import 
org.apache.activemq.store.kahadb.data.KahaRemoveScheduledJobCommand; 046import org.apache.activemq.store.kahadb.data.KahaRemoveScheduledJobsCommand; 047import org.apache.activemq.store.kahadb.data.KahaRescheduleJobCommand; 048import org.apache.activemq.store.kahadb.data.KahaTraceCommand; 049import org.apache.activemq.store.kahadb.disk.index.BTreeVisitor; 050import org.apache.activemq.store.kahadb.disk.journal.DataFile; 051import org.apache.activemq.store.kahadb.disk.journal.Location; 052import org.apache.activemq.store.kahadb.disk.page.Page; 053import org.apache.activemq.store.kahadb.disk.page.PageFile; 054import org.apache.activemq.store.kahadb.disk.page.Transaction; 055import org.apache.activemq.store.kahadb.disk.util.VariableMarshaller; 056import org.apache.activemq.store.kahadb.scheduler.legacy.LegacyStoreReplayer; 057import org.apache.activemq.util.ByteSequence; 058import org.apache.activemq.util.IOHelper; 059import org.slf4j.Logger; 060import org.slf4j.LoggerFactory; 061 062/* 063 * @org.apache.xbean.XBean element="kahaDBJobScheduler" 064 */ 065 066public class JobSchedulerStoreImpl extends AbstractKahaDBStore implements JobSchedulerStore { 067 068 private static final Logger LOG = LoggerFactory.getLogger(JobSchedulerStoreImpl.class); 069 070 private JobSchedulerKahaDBMetaData metaData = new JobSchedulerKahaDBMetaData(this); 071 private final MetaDataMarshaller metaDataMarshaller = new MetaDataMarshaller(this); 072 private final Map<String, JobSchedulerImpl> schedulers = new HashMap<String, JobSchedulerImpl>(); 073 private File legacyStoreArchiveDirectory; 074 075 /** 076 * The Scheduler Token is used to identify base revisions of the Scheduler store. A store 077 * based on the initial scheduler design will not have this tag in it's meta-data and will 078 * indicate an update is needed. 
Later versions of the scheduler can also change this value 079 * to indicate incompatible store bases which require complete meta-data and journal rewrites 080 * instead of simpler meta-data updates. 081 */ 082 static final UUID SCHEDULER_STORE_TOKEN = UUID.fromString("57ed642b-1ee3-47b3-be6d-b7297d500409"); 083 084 /** 085 * The default scheduler store version. All new store instance will be given this version and 086 * earlier versions will be updated to this version. 087 */ 088 static final int CURRENT_VERSION = 1; 089 090 @Override 091 public JobScheduler getJobScheduler(final String name) throws Exception { 092 this.indexLock.writeLock().lock(); 093 try { 094 JobSchedulerImpl result = this.schedulers.get(name); 095 if (result == null) { 096 final JobSchedulerImpl js = new JobSchedulerImpl(this); 097 js.setName(name); 098 getPageFile().tx().execute(new Transaction.Closure<IOException>() { 099 @Override 100 public void execute(Transaction tx) throws IOException { 101 js.createIndexes(tx); 102 js.load(tx); 103 metaData.getJobSchedulers().put(tx, name, js); 104 } 105 }); 106 result = js; 107 this.schedulers.put(name, js); 108 if (isStarted()) { 109 result.start(); 110 } 111 this.pageFile.flush(); 112 } 113 return result; 114 } finally { 115 this.indexLock.writeLock().unlock(); 116 } 117 } 118 119 @Override 120 public boolean removeJobScheduler(final String name) throws Exception { 121 boolean result = false; 122 123 this.indexLock.writeLock().lock(); 124 try { 125 final JobSchedulerImpl js = this.schedulers.remove(name); 126 result = js != null; 127 if (result) { 128 js.stop(); 129 getPageFile().tx().execute(new Transaction.Closure<IOException>() { 130 @Override 131 public void execute(Transaction tx) throws IOException { 132 metaData.getJobSchedulers().remove(tx, name); 133 js.removeAll(tx); 134 } 135 }); 136 } 137 } finally { 138 this.indexLock.writeLock().unlock(); 139 } 140 return result; 141 } 142 143 /** 144 * Sets the directory where the legacy scheduler 
store files are archived before an 145 * update attempt is made. Both the legacy index files and the journal files are moved 146 * to this folder prior to an upgrade attempt. 147 * 148 * @param directory 149 * The directory to move the legacy Scheduler Store files to. 150 */ 151 public void setLegacyStoreArchiveDirectory(File directory) { 152 this.legacyStoreArchiveDirectory = directory; 153 } 154 155 /** 156 * Gets the directory where the legacy Scheduler Store files will be archived if the 157 * broker is started and an existing Job Scheduler Store from an old version is detected. 158 * 159 * @return the directory where scheduler store legacy files are archived on upgrade. 160 */ 161 public File getLegacyStoreArchiveDirectory() { 162 if (this.legacyStoreArchiveDirectory == null) { 163 this.legacyStoreArchiveDirectory = new File(getDirectory(), "legacySchedulerStore"); 164 } 165 166 return this.legacyStoreArchiveDirectory.getAbsoluteFile(); 167 } 168 169 @Override 170 public void load() throws IOException { 171 if (opened.compareAndSet(false, true)) { 172 getJournal().start(); 173 try { 174 loadPageFile(); 175 } catch (UnknownStoreVersionException ex) { 176 LOG.info("Can't start until store update is performed."); 177 upgradeFromLegacy(); 178 // Restart with the updated store 179 getJournal().start(); 180 loadPageFile(); 181 LOG.info("Update from legacy Scheduler store completed successfully."); 182 } catch (Throwable t) { 183 LOG.warn("Index corrupted. Recovering the index through journal replay. 
Cause: {}", t.toString()); 184 LOG.debug("Index load failure", t); 185 186 // try to recover index 187 try { 188 pageFile.unload(); 189 } catch (Exception ignore) { 190 } 191 if (isArchiveCorruptedIndex()) { 192 pageFile.archive(); 193 } else { 194 pageFile.delete(); 195 } 196 metaData = new JobSchedulerKahaDBMetaData(this); 197 pageFile = null; 198 loadPageFile(); 199 } 200 startCheckpoint(); 201 recover(); 202 } 203 LOG.info("{} started.", this); 204 } 205 206 @Override 207 public void unload() throws IOException { 208 if (opened.compareAndSet(true, false)) { 209 for (JobSchedulerImpl js : this.schedulers.values()) { 210 try { 211 js.stop(); 212 } catch (Exception e) { 213 throw new IOException(e); 214 } 215 } 216 this.indexLock.writeLock().lock(); 217 try { 218 if (pageFile != null && pageFile.isLoaded()) { 219 metaData.setState(KahaDBMetaData.CLOSED_STATE); 220 221 if (metaData.getPage() != null) { 222 pageFile.tx().execute(new Transaction.Closure<IOException>() { 223 @Override 224 public void execute(Transaction tx) throws IOException { 225 tx.store(metaData.getPage(), metaDataMarshaller, true); 226 } 227 }); 228 } 229 } 230 } finally { 231 this.indexLock.writeLock().unlock(); 232 } 233 234 checkpointLock.writeLock().lock(); 235 try { 236 if (metaData.getPage() != null) { 237 checkpointUpdate(getCleanupOnStop()); 238 } 239 } finally { 240 checkpointLock.writeLock().unlock(); 241 } 242 synchronized (checkpointThreadLock) { 243 if (checkpointThread != null) { 244 try { 245 checkpointThread.join(); 246 checkpointThread = null; 247 } catch (InterruptedException e) { 248 } 249 } 250 } 251 252 if (pageFile != null) { 253 pageFile.unload(); 254 pageFile = null; 255 } 256 if (this.journal != null) { 257 journal.close(); 258 journal = null; 259 } 260 261 metaData = new JobSchedulerKahaDBMetaData(this); 262 } 263 LOG.info("{} stopped.", this); 264 } 265 266 private void loadPageFile() throws IOException { 267 this.indexLock.writeLock().lock(); 268 try { 269 final 
PageFile pageFile = getPageFile(); 270 pageFile.load(); 271 pageFile.tx().execute(new Transaction.Closure<IOException>() { 272 @Override 273 public void execute(Transaction tx) throws IOException { 274 if (pageFile.getPageCount() == 0) { 275 Page<JobSchedulerKahaDBMetaData> page = tx.allocate(); 276 assert page.getPageId() == 0; 277 page.set(metaData); 278 metaData.setPage(page); 279 metaData.setState(KahaDBMetaData.CLOSED_STATE); 280 metaData.initialize(tx); 281 tx.store(metaData.getPage(), metaDataMarshaller, true); 282 } else { 283 Page<JobSchedulerKahaDBMetaData> page = null; 284 page = tx.load(0, metaDataMarshaller); 285 metaData = page.get(); 286 metaData.setPage(page); 287 } 288 metaData.load(tx); 289 metaData.loadScheduler(tx, schedulers); 290 for (JobSchedulerImpl js : schedulers.values()) { 291 try { 292 js.start(); 293 } catch (Exception e) { 294 JobSchedulerStoreImpl.LOG.error("Failed to load " + js.getName(), e); 295 } 296 } 297 } 298 }); 299 300 pageFile.flush(); 301 } finally { 302 this.indexLock.writeLock().unlock(); 303 } 304 } 305 306 private void upgradeFromLegacy() throws IOException { 307 308 journal.close(); 309 journal = null; 310 try { 311 pageFile.unload(); 312 pageFile = null; 313 } catch (Exception ignore) {} 314 315 File storeDir = getDirectory().getAbsoluteFile(); 316 File storeArchiveDir = getLegacyStoreArchiveDirectory(); 317 318 LOG.info("Attempting to move old store files from {} to {}", storeDir, storeArchiveDir); 319 320 // Move only the known store files, locks and other items left in place. 
321 IOHelper.moveFiles(storeDir, storeArchiveDir, new FilenameFilter() { 322 323 @Override 324 public boolean accept(File dir, String name) { 325 if (name.endsWith(".data") || name.endsWith(".redo") || name.endsWith(".log") || name.endsWith(".free")) { 326 return true; 327 } 328 return false; 329 } 330 }); 331 332 // We reset everything to clean state, then we can read from the old 333 // scheduler store and replay the scheduled jobs into this one as adds. 334 getJournal().start(); 335 metaData = new JobSchedulerKahaDBMetaData(this); 336 pageFile = null; 337 loadPageFile(); 338 339 LegacyStoreReplayer replayer = new LegacyStoreReplayer(getLegacyStoreArchiveDirectory()); 340 replayer.load(); 341 replayer.startReplay(this); 342 343 // Cleanup after replay and store what we've done. 344 pageFile.tx().execute(new Transaction.Closure<IOException>() { 345 @Override 346 public void execute(Transaction tx) throws IOException { 347 tx.store(metaData.getPage(), metaDataMarshaller, true); 348 } 349 }); 350 351 checkpointUpdate(true); 352 getJournal().close(); 353 getPageFile().unload(); 354 } 355 356 @Override 357 protected void checkpointUpdate(Transaction tx, boolean cleanup) throws IOException { 358 LOG.debug("Job Scheduler Store Checkpoint started."); 359 360 // reflect last update exclusive of current checkpoint 361 Location lastUpdate = metaData.getLastUpdateLocation(); 362 metaData.setState(KahaDBMetaData.OPEN_STATE); 363 tx.store(metaData.getPage(), metaDataMarshaller, true); 364 pageFile.flush(); 365 366 if (cleanup) { 367 final TreeSet<Integer> completeFileSet = new TreeSet<Integer>(journal.getFileMap().keySet()); 368 final TreeSet<Integer> gcCandidateSet = new TreeSet<Integer>(completeFileSet); 369 370 LOG.trace("Last update: {}, full gc candidates set: {}", lastUpdate, gcCandidateSet); 371 372 if (lastUpdate != null) { 373 gcCandidateSet.remove(lastUpdate.getDataFileId()); 374 } 375 376 this.metaData.getJournalRC().visit(tx, new BTreeVisitor<Integer, Integer>() { 
377 378 @Override 379 public void visit(List<Integer> keys, List<Integer> values) { 380 for (Integer key : keys) { 381 if (gcCandidateSet.remove(key)) { 382 LOG.trace("Removed referenced file: {} from GC set", key); 383 } 384 } 385 } 386 387 @Override 388 public boolean isInterestedInKeysBetween(Integer first, Integer second) { 389 return true; 390 } 391 }); 392 393 LOG.trace("gc candidates after reference check: {}", gcCandidateSet); 394 395 // If there are GC candidates then check the remove command location to see 396 // if any of them can go or if they must stay in order to ensure proper recover. 397 // 398 // A log containing any remove commands must be kept until all the logs with the 399 // add commands for all the removed jobs have been dropped. 400 if (!gcCandidateSet.isEmpty()) { 401 Iterator<Entry<Integer, List<Integer>>> removals = metaData.getRemoveLocationTracker().iterator(tx); 402 List<Integer> orphans = new ArrayList<Integer>(); 403 while (removals.hasNext()) { 404 boolean orphanedRemove = true; 405 Entry<Integer, List<Integer>> entry = removals.next(); 406 407 // If this log is not a GC candidate then there's no need to do a check to rule it out 408 if (gcCandidateSet.contains(entry.getKey())) { 409 for (Integer addLocation : entry.getValue()) { 410 if (completeFileSet.contains(addLocation)) { 411 LOG.trace("A remove in log {} has an add still in existance in {}.", entry.getKey(), addLocation); 412 orphanedRemove = false; 413 break; 414 } 415 } 416 417 // If it's not orphaned than we can't remove it, otherwise we 418 // stop tracking it it's log will get deleted on the next check. 419 if (!orphanedRemove) { 420 gcCandidateSet.remove(entry.getKey()); 421 } else { 422 LOG.trace("All removes in log {} are orphaned, file can be GC'd", entry.getKey()); 423 orphans.add(entry.getKey()); 424 } 425 } 426 } 427 428 // Drop all orphaned removes from the tracker. 
429 for (Integer orphan : orphans) { 430 metaData.getRemoveLocationTracker().remove(tx, orphan); 431 } 432 } 433 434 LOG.trace("gc candidates after removals check: {}", gcCandidateSet); 435 if (!gcCandidateSet.isEmpty()) { 436 if (LOG.isDebugEnabled()) { 437 LOG.debug("Cleanup removing the data files: " + gcCandidateSet); 438 } 439 journal.removeDataFiles(gcCandidateSet); 440 } 441 } 442 443 LOG.debug("Job Scheduler Store Checkpoint complete."); 444 } 445 446 /** 447 * Adds a reference for the journal log file pointed to by the given Location value. 448 * 449 * To prevent log files in the journal that still contain valid data that needs to be 450 * kept in order to allow for recovery the logs must have active references. Each Job 451 * scheduler should ensure that the logs are accurately referenced. 452 * 453 * @param tx 454 * The TX under which the update is to be performed. 455 * @param location 456 * The location value to update the reference count of. 457 * 458 * @throws IOException if an error occurs while updating the journal references table. 459 */ 460 protected void incrementJournalCount(Transaction tx, Location location) throws IOException { 461 int logId = location.getDataFileId(); 462 Integer val = metaData.getJournalRC().get(tx, logId); 463 int refCount = val != null ? val.intValue() + 1 : 1; 464 metaData.getJournalRC().put(tx, logId, refCount); 465 } 466 467 /** 468 * Removes one reference for the Journal log file indicated in the given Location value. 469 * 470 * The references are used to track which log files cannot be GC'd. When the reference count 471 * on a log file reaches zero the file id is removed from the tracker and the log will be 472 * removed on the next check point update. 473 * 474 * @param tx 475 * The TX under which the update is to be performed. 476 * @param location 477 * The location value to update the reference count of. 478 * 479 * @throws IOException if an error occurs while updating the journal references table. 
480 */ 481 protected void decrementJournalCount(Transaction tx, Location location) throws IOException { 482 int logId = location.getDataFileId(); 483 Integer refCount = metaData.getJournalRC().get(tx, logId); 484 if (refCount != null) { 485 int refCountValue = refCount; 486 refCountValue--; 487 if (refCountValue <= 0) { 488 metaData.getJournalRC().remove(tx, logId); 489 } else { 490 metaData.getJournalRC().put(tx, logId, refCountValue); 491 } 492 } 493 } 494 495 /** 496 * Removes multiple references for the Journal log file indicated in the given Location map. 497 * 498 * The references are used to track which log files cannot be GC'd. When the reference count 499 * on a log file reaches zero the file id is removed from the tracker and the log will be 500 * removed on the next check point update. 501 * 502 * @param tx 503 * The TX under which the update is to be performed. 504 * @param decrementsByFileIds 505 * Map indicating how many decrements per fileId. 506 * 507 * @throws IOException if an error occurs while updating the journal references table. 508 */ 509 protected void decrementJournalCount(Transaction tx, HashMap<Integer, Integer> decrementsByFileIds) throws IOException { 510 for(Map.Entry<Integer, Integer> entry : decrementsByFileIds.entrySet()) { 511 int logId = entry.getKey(); 512 Integer refCount = metaData.getJournalRC().get(tx, logId); 513 514 if (refCount != null) { 515 int refCountValue = refCount; 516 refCountValue -= entry.getValue(); 517 if (refCountValue <= 0) { 518 metaData.getJournalRC().remove(tx, logId); 519 } else { 520 metaData.getJournalRC().put(tx, logId, refCountValue); 521 } 522 } 523 } 524 } 525 526 /** 527 * Updates the Job removal tracking index with the location of a remove command and the 528 * original JobLocation entry. 529 * 530 * The JobLocation holds the locations in the logs where the add and update commands for 531 * a job stored. 
The log file containing the remove command can only be discarded after 532 * both the add and latest update log files have also been discarded. 533 * 534 * @param tx 535 * The TX under which the update is to be performed. 536 * @param location 537 * The location value to reference a remove command. 538 * @param removedJob 539 * The original JobLocation instance that holds the add and update locations 540 * 541 * @throws IOException if an error occurs while updating the remove location tracker. 542 */ 543 protected void referenceRemovedLocation(Transaction tx, Location location, JobLocation removedJob) throws IOException { 544 int logId = location.getDataFileId(); 545 List<Integer> removed = this.metaData.getRemoveLocationTracker().get(tx, logId); 546 if (removed == null) { 547 removed = new ArrayList<Integer>(); 548 } 549 removed.add(removedJob.getLocation().getDataFileId()); 550 this.metaData.getRemoveLocationTracker().put(tx, logId, removed); 551 } 552 553 /** 554 * Updates the Job removal tracking index with the location of a remove command and the 555 * original JobLocation entry. 556 * 557 * The JobLocation holds the locations in the logs where the add and update commands for 558 * a job stored. The log file containing the remove command can only be discarded after 559 * both the add and latest update log files have also been discarded. 560 * 561 * @param tx 562 * The TX under which the update is to be performed. 563 * @param location 564 * The location value to reference a remove command. 565 * @param removedJobsFileId 566 * List of the original JobLocation instances that holds the add and update locations 567 * 568 * @throws IOException if an error occurs while updating the remove location tracker. 
569 */ 570 protected void referenceRemovedLocation(Transaction tx, Location location, List<Integer> removedJobsFileId) throws IOException { 571 int logId = location.getDataFileId(); 572 List<Integer> removed = this.metaData.getRemoveLocationTracker().get(tx, logId); 573 if (removed == null) { 574 removed = new ArrayList<Integer>(); 575 } 576 removed.addAll(removedJobsFileId); 577 this.metaData.getRemoveLocationTracker().put(tx, logId, removed); 578 } 579 580 /** 581 * Retrieve the scheduled Job's byte blob from the journal. 582 * 583 * @param location 584 * The location of the KahaAddScheduledJobCommand that originated the Job. 585 * 586 * @return a ByteSequence containing the payload of the scheduled Job. 587 * 588 * @throws IOException if an error occurs while reading the payload value. 589 */ 590 protected ByteSequence getPayload(Location location) throws IOException { 591 KahaAddScheduledJobCommand job = (KahaAddScheduledJobCommand) this.load(location); 592 Buffer payload = job.getPayload(); 593 return new ByteSequence(payload.getData(), payload.getOffset(), payload.getLength()); 594 } 595 596 public void readLockIndex() { 597 this.indexLock.readLock().lock(); 598 } 599 600 public void readUnlockIndex() { 601 this.indexLock.readLock().unlock(); 602 } 603 604 public void writeLockIndex() { 605 this.indexLock.writeLock().lock(); 606 } 607 608 public void writeUnlockIndex() { 609 this.indexLock.writeLock().unlock(); 610 } 611 612 @Override 613 public String toString() { 614 return "JobSchedulerStore: " + getDirectory(); 615 } 616 617 @Override 618 protected String getPageFileName() { 619 return "scheduleDB"; 620 } 621 622 @Override 623 protected File getDefaultDataDirectory() { 624 return new File(IOHelper.getDefaultDataDirectory(), "delayedDB"); 625 } 626 627 private class MetaDataMarshaller extends VariableMarshaller<JobSchedulerKahaDBMetaData> { 628 629 private final JobSchedulerStoreImpl store; 630 631 MetaDataMarshaller(JobSchedulerStoreImpl store) { 632 
this.store = store; 633 } 634 635 @Override 636 public JobSchedulerKahaDBMetaData readPayload(DataInput dataIn) throws IOException { 637 JobSchedulerKahaDBMetaData rc = new JobSchedulerKahaDBMetaData(store); 638 rc.read(dataIn); 639 return rc; 640 } 641 642 @Override 643 public void writePayload(JobSchedulerKahaDBMetaData object, DataOutput dataOut) throws IOException { 644 object.write(dataOut); 645 } 646 } 647 648 /** 649 * Called during index recovery to rebuild the index from the last known good location. For 650 * entries that occur before the last known good position we just ignore then and move on. 651 * 652 * @param data 653 * the command read from the Journal which should be used to update the index. 654 * @param location 655 * the location in the index where the command was read. 656 * @param inDoubtlocation 657 * the location in the index known to be the last time the index was valid. 658 * 659 * @throws IOException if an error occurs while recovering the index. 660 */ 661 protected void doRecover(JournalCommand<?> data, final Location location, final Location inDoubtlocation) throws IOException { 662 if (inDoubtlocation != null && location.compareTo(inDoubtlocation) >= 0) { 663 process(data, location); 664 } 665 } 666 667 /** 668 * Called during recovery to allow the store to rebuild from scratch. 669 * 670 * @param data 671 * The command to process, which was read from the Journal. 672 * @param location 673 * The location of the command in the Journal. 674 * 675 * @throws IOException if an error occurs during command processing. 
676 */ 677 @Override 678 protected void process(JournalCommand<?> data, final Location location) throws IOException { 679 data.visit(new Visitor() { 680 @Override 681 public void visit(final KahaAddScheduledJobCommand command) throws IOException { 682 final JobSchedulerImpl scheduler; 683 684 indexLock.writeLock().lock(); 685 try { 686 try { 687 scheduler = (JobSchedulerImpl) getJobScheduler(command.getScheduler()); 688 } catch (Exception e) { 689 throw new IOException(e); 690 } 691 getPageFile().tx().execute(new Transaction.Closure<IOException>() { 692 @Override 693 public void execute(Transaction tx) throws IOException { 694 scheduler.process(tx, command, location); 695 } 696 }); 697 698 processLocation(location); 699 } finally { 700 indexLock.writeLock().unlock(); 701 } 702 } 703 704 @Override 705 public void visit(final KahaRemoveScheduledJobCommand command) throws IOException { 706 final JobSchedulerImpl scheduler; 707 708 indexLock.writeLock().lock(); 709 try { 710 try { 711 scheduler = (JobSchedulerImpl) getJobScheduler(command.getScheduler()); 712 } catch (Exception e) { 713 throw new IOException(e); 714 } 715 getPageFile().tx().execute(new Transaction.Closure<IOException>() { 716 @Override 717 public void execute(Transaction tx) throws IOException { 718 scheduler.process(tx, command, location); 719 } 720 }); 721 722 processLocation(location); 723 } finally { 724 indexLock.writeLock().unlock(); 725 } 726 } 727 728 @Override 729 public void visit(final KahaRemoveScheduledJobsCommand command) throws IOException { 730 final JobSchedulerImpl scheduler; 731 732 indexLock.writeLock().lock(); 733 try { 734 try { 735 scheduler = (JobSchedulerImpl) getJobScheduler(command.getScheduler()); 736 } catch (Exception e) { 737 throw new IOException(e); 738 } 739 getPageFile().tx().execute(new Transaction.Closure<IOException>() { 740 @Override 741 public void execute(Transaction tx) throws IOException { 742 scheduler.process(tx, command, location); 743 } 744 }); 745 746 
processLocation(location); 747 } finally { 748 indexLock.writeLock().unlock(); 749 } 750 } 751 752 @Override 753 public void visit(final KahaRescheduleJobCommand command) throws IOException { 754 final JobSchedulerImpl scheduler; 755 756 indexLock.writeLock().lock(); 757 try { 758 try { 759 scheduler = (JobSchedulerImpl) getJobScheduler(command.getScheduler()); 760 } catch (Exception e) { 761 throw new IOException(e); 762 } 763 getPageFile().tx().execute(new Transaction.Closure<IOException>() { 764 @Override 765 public void execute(Transaction tx) throws IOException { 766 scheduler.process(tx, command, location); 767 } 768 }); 769 770 processLocation(location); 771 } finally { 772 indexLock.writeLock().unlock(); 773 } 774 } 775 776 @Override 777 public void visit(final KahaDestroySchedulerCommand command) { 778 try { 779 removeJobScheduler(command.getScheduler()); 780 } catch (Exception e) { 781 LOG.warn("Failed to remove scheduler: {}", command.getScheduler()); 782 } 783 784 processLocation(location); 785 } 786 787 @Override 788 public void visit(KahaTraceCommand command) { 789 processLocation(location); 790 } 791 }); 792 } 793 794 protected void processLocation(final Location location) { 795 indexLock.writeLock().lock(); 796 try { 797 this.metaData.setLastUpdateLocation(location); 798 } finally { 799 indexLock.writeLock().unlock(); 800 } 801 } 802 803 /** 804 * We recover from the Journal logs as needed to restore the index. 
805 * 806 * @throws IllegalStateException 807 * @throws IOException 808 */ 809 private void recover() throws IllegalStateException, IOException { 810 this.indexLock.writeLock().lock(); 811 try { 812 long start = System.currentTimeMillis(); 813 Location lastIndoubtPosition = getRecoveryPosition(); 814 Location recoveryPosition = lastIndoubtPosition; 815 816 if (recoveryPosition != null) { 817 int redoCounter = 0; 818 LOG.info("Recovering from the scheduled job journal @" + recoveryPosition); 819 while (recoveryPosition != null) { 820 try { 821 JournalCommand<?> message = load(recoveryPosition); 822 metaData.setLastUpdateLocation(recoveryPosition); 823 doRecover(message, recoveryPosition, lastIndoubtPosition); 824 redoCounter++; 825 } catch (IOException failedRecovery) { 826 if (isIgnoreMissingJournalfiles()) { 827 LOG.debug("Failed to recover data at position:" + recoveryPosition, failedRecovery); 828 // track this dud location 829 journal.corruptRecoveryLocation(recoveryPosition); 830 } else { 831 throw new IOException("Failed to recover data at position:" + recoveryPosition, failedRecovery); 832 } 833 } 834 recoveryPosition = journal.getNextLocation(recoveryPosition); 835 if (LOG.isInfoEnabled() && redoCounter % 100000 == 0) { 836 LOG.info("@ {}, {} entries recovered ..", recoveryPosition, redoCounter); 837 } 838 } 839 long end = System.currentTimeMillis(); 840 LOG.info("Recovery replayed {} operations from the journal in {} seconds.", 841 redoCounter, ((end - start) / 1000.0f)); 842 } 843 844 // We may have to undo some index updates. 
pageFile.tx().execute(new Transaction.Closure<IOException>() {
                @Override
                public void execute(Transaction tx) throws IOException {
                    recoverIndex(tx);
                }
            });

        } finally {
            this.indexLock.writeLock().unlock();
        }
    }

    /**
     * Determines the journal location from which recovery should begin.
     *
     * Unless a forced index recovery was requested, the last update location recorded
     * in the store meta-data is used as the known recovery start location. Otherwise
     * (or when no such location is recorded) <code>null</code> is handed to the journal,
     * which loads the first position so that the index is completely rebuilt.
     *
     * @return the location returned by the journal as following the chosen start point.
     *
     * @throws IOException if the journal fails while looking up the next location.
     */
    private Location getRecoveryPosition() throws IOException {
        // A null start location means "no known recovery point": rebuild from the beginning.
        final Location knownStart = isForceRecoverIndex() ? null : metaData.getLastUpdateLocation();
        return journal.getNextLocation(knownStart);
    }

    /**
     * Brings the scheduler index back in line with the journal.
     *
     * The recovery runs in two phases:
     * <ol>
     *   <li>Jobs whose add location is at or past the journal's last append location are
     *       removed from the index, since the index may have been updated before the
     *       journal write completed.</li>
     *   <li>The journal files referenced by the remaining jobs are compared against the
     *       files the journal knows about and (optionally) against the corrupted blocks
     *       reported by each data file. If anything is missing or corrupt the method
     *       either fails or, when missing journal files are ignored, drops the affected
     *       jobs and their journal reference counts.</li>
     * </ol>
     *
     * @param tx
     *      the index transaction under which all reads and removals are performed.
     *
     * @throws IOException if an index operation fails, or if missing/corrupt journal files
     *                     are detected and ignoring them is not enabled.
     */
    private void recoverIndex(Transaction tx) throws IOException {
        final long start = System.currentTimeMillis();

        // It is possible index updates got applied before the journal updates; in that
        // case we need to remove references to Jobs that are not in the journal.
        final Location lastAppendLocation = journal.getLastAppendLocation();
        long undoCounter = rollbackJobsAtOrPastLastAppend(tx, lastAppendLocation);

        if (undoCounter > 0) {
            // The rolled back operations are basically in flight journal writes. To avoid getting
            // these the end user should do sync writes to the journal.
            long end = System.currentTimeMillis();
            LOG.info("Rolled back {} jobs from the index in {} seconds.", undoCounter, ((end - start) / 1000.0f));
            undoCounter = 0;
        }

        // Now we check for missing and corrupt journal files.

        // 1. Collect the set of all referenced journal files based on the Location of
        //    the scheduled jobs and the marked last update field.
        final Set<Integer> missingJournalFiles = collectReferencedJournalFileIds(tx);

        // 2. Remove from that set all known data file Id's in the journal; what's left
        //    is the set of journal files that are referenced but missing.
        missingJournalFiles.removeAll(journal.getFileMap().keySet());
        if (!missingJournalFiles.isEmpty()) {
            LOG.info("Some journal files are missing: {}", missingJournalFiles);
        }

        // 3. Now check the journal data files for corrupted blocks.
        final Set<Location> corruptedLocations = findCorruptedJournalLocations();

        // 4. Now we either fail or we remove all references to missing or corrupt journal
        //    files from the various JobSchedulerImpl instances. We only remove the Job if
        //    the initial Add operation is missing when the ignore option is set, the updates
        //    could be lost but that's the price you pay when ignoring the missing logs.
        if (!missingJournalFiles.isEmpty() || !corruptedLocations.isEmpty()) {
            if (!isIgnoreMissingJournalfiles()) {
                throw new IOException("Detected missing/corrupt journal files.");
            }

            // Remove all Jobs that reference a Location that is either missing or corrupt.
            undoCounter = removeJobsInMissingOrCorruptJounralFiles(tx, missingJournalFiles, corruptedLocations);

            // Clean up the Journal Reference count Map.
            removeJournalRCForMissingFiles(tx, missingJournalFiles);
        }

        if (undoCounter > 0) {
            long end = System.currentTimeMillis();
            LOG.info("Detected missing/corrupt journal files. Dropped {} jobs from the " +
                     "index in {} seconds.", undoCounter, ((end - start) / 1000.0f));
        }
    }

    /**
     * Removes every scheduled job whose add location is at or after the given last
     * append location of the journal.
     *
     * For now the update location is ignored since the scheduled job will update itself
     * after the next fire and a new update will replace any existing update.
     *
     * @param tx
     *      the index transaction to operate under.
     * @param lastAppendLocation
     *      the journal's last append location that job locations are compared against.
     *
     * @return the number of jobs for which the scheduler reported a successful removal.
     *
     * @throws IOException if an index operation fails.
     */
    private long rollbackJobsAtOrPastLastAppend(Transaction tx, Location lastAppendLocation) throws IOException {
        long rolledBack = 0;

        for (Iterator<Map.Entry<String, JobSchedulerImpl>> i = metaData.getJobSchedulers().iterator(tx); i.hasNext();) {
            final JobSchedulerImpl scheduler = i.next().getValue();

            for (Iterator<JobLocation> jobs = scheduler.getAllScheduledJobs(tx); jobs.hasNext();) {
                final JobLocation job = jobs.next();
                if (job.getLocation().compareTo(lastAppendLocation) >= 0) {
                    if (scheduler.removeJobAtTime(tx, job.getJobId(), job.getNextTime())) {
                        LOG.trace("Removed Job past last appened in the journal: {}", job.getJobId());
                        rolledBack++;
                    }
                }
            }
        }

        return rolledBack;
    }

    /**
     * Gathers the data file ids of every journal file referenced by a scheduled job,
     * using both the job's add location and, when present, its last update location.
     *
     * @param tx
     *      the index transaction to operate under.
     *
     * @return a new, modifiable set of the referenced journal data file ids.
     *
     * @throws IOException if an index operation fails.
     */
    private Set<Integer> collectReferencedJournalFileIds(Transaction tx) throws IOException {
        final Set<Integer> referenced = new HashSet<Integer>();

        for (Iterator<Map.Entry<String, JobSchedulerImpl>> i = metaData.getJobSchedulers().iterator(tx); i.hasNext();) {
            final JobSchedulerImpl scheduler = i.next().getValue();

            for (Iterator<JobLocation> jobs = scheduler.getAllScheduledJobs(tx); jobs.hasNext();) {
                final JobLocation job = jobs.next();
                referenced.add(job.getLocation().getDataFileId());
                if (job.getLastUpdate() != null) {
                    referenced.add(job.getLastUpdate().getDataFileId());
                }
            }
        }

        return referenced;
    }

    /**
     * Builds the set of journal locations that the journal's data files report as
     * corrupted. The scan is only performed when checking for corrupt journal files is
     * enabled; otherwise the returned set is empty.
     *
     * @return a new set holding one Location per corrupted block offset that was reported.
     */
    private Set<Location> findCorruptedJournalLocations() {
        final Set<Location> corruptedLocations = new HashSet<Location>();

        if (isCheckForCorruptJournalFiles()) {
            Collection<DataFile> dataFiles = journal.getFileMap().values();
            for (DataFile dataFile : dataFiles) {
                int id = dataFile.getDataFileId();
                for (long offset : dataFile.getCorruptedBlocks()) {
                    corruptedLocations.add(new Location(id, (int) offset));
                }
            }

            if (!corruptedLocations.isEmpty()) {
                LOG.debug("Found some corrupted data blocks in the journal: {}", corruptedLocations.size());
            }
        }

        return corruptedLocations;
    }

    /**
     * Removes the journal reference count entries that belong to missing journal files.
     *
     * The matching keys are collected first and removed in a second pass, so the
     * reference count index is not modified while it is being iterated.
     *
     * @param tx
     *      the index transaction to operate under.
     * @param missing
     *      the data file ids of the journal files that are missing.
     *
     * @throws IOException if an index operation fails.
     */
    private void removeJournalRCForMissingFiles(Transaction tx, Set<Integer> missing) throws IOException {
        List<Integer> matches = new ArrayList<Integer>();

        Iterator<Entry<Integer, Integer>> references = metaData.getJournalRC().iterator(tx);
        while (references.hasNext()) {
            int dataFileId = references.next().getKey();
            if (missing.contains(dataFileId)) {
                matches.add(dataFileId);
            }
        }

        for (Integer match : matches) {
            metaData.getJournalRC().remove(tx, match);
        }
    }

    /**
     * Removes every scheduled job whose add location lies in a missing journal file or
     * at a corrupted journal location.
     *
     * Only the add location of a job is considered here; a job whose last update lives
     * in a missing file is kept.
     *
     * @param tx
     *      the index transaction to operate under.
     * @param missing
     *      the data file ids of the journal files that are missing.
     * @param corrupted
     *      the journal locations that were reported as corrupted.
     *
     * @return the number of jobs for which the scheduler reported a successful removal.
     *
     * @throws IOException if an index operation fails.
     */
    private int removeJobsInMissingOrCorruptJounralFiles(Transaction tx, Set<Integer> missing, Set<Location> corrupted) throws IOException {
        int removed = 0;

        for (Iterator<Map.Entry<String, JobSchedulerImpl>> i = metaData.getJobSchedulers().iterator(tx); i.hasNext();) {
            final JobSchedulerImpl scheduler = i.next().getValue();

            for (Iterator<JobLocation> jobs = scheduler.getAllScheduledJobs(tx); jobs.hasNext();) {
                final JobLocation job = jobs.next();
                final Location addLocation = job.getLocation();

                // A job is dropped when its add command sits in a missing log file or in a
                // corrupted part of an existing log file.
                if (missing.contains(addLocation.getDataFileId()) || corrupted.contains(addLocation)) {
                    // Count only what the scheduler reports as removed, matching the
                    // rollback accounting done during index recovery.
                    if (scheduler.removeJobAtTime(tx, job.getJobId(), job.getNextTime())) {
                        removed++;
                    }
                }
            }
        }

        return removed;
    }
}