001/** 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.activemq.store.kahadb.disk.page; 018 019import java.io.ByteArrayInputStream; 020import java.io.ByteArrayOutputStream; 021import java.io.DataInputStream; 022import java.io.DataOutputStream; 023import java.io.File; 024import java.io.FileInputStream; 025import java.io.FileOutputStream; 026import java.io.IOException; 027import java.io.InterruptedIOException; 028import java.io.RandomAccessFile; 029import java.util.ArrayList; 030import java.util.Arrays; 031import java.util.Collection; 032import java.util.Collections; 033import java.util.HashMap; 034import java.util.Iterator; 035import java.util.LinkedHashMap; 036import java.util.Map; 037import java.util.Map.Entry; 038import java.util.Properties; 039import java.util.TreeMap; 040import java.util.concurrent.CountDownLatch; 041import java.util.concurrent.atomic.AtomicBoolean; 042import java.util.concurrent.atomic.AtomicLong; 043import java.util.concurrent.atomic.AtomicReference; 044import java.util.zip.Adler32; 045import java.util.zip.Checksum; 046 047import org.apache.activemq.store.kahadb.disk.util.Sequence; 048import org.apache.activemq.store.kahadb.disk.util.SequenceSet; 
049import org.apache.activemq.util.DataByteArrayOutputStream; 050import org.apache.activemq.util.IOExceptionSupport; 051import org.apache.activemq.util.IOHelper; 052import org.apache.activemq.util.IntrospectionSupport; 053import org.apache.activemq.util.LFUCache; 054import org.apache.activemq.util.LRUCache; 055import org.apache.activemq.util.RecoverableRandomAccessFile; 056import org.slf4j.Logger; 057import org.slf4j.LoggerFactory; 058 059/** 060 * A PageFile provides you random access to fixed sized disk pages. This object is not thread safe and therefore access to it should 061 * be externally synchronized. 062 * <p/> 063 * The file has 3 parts: 064 * Metadata Space: 4k : Reserved metadata area. Used to store persistent config about the file. 065 * Recovery Buffer Space: Page Size * 1000 : This is a redo log used to prevent partial page writes from making the file inconsistent 066 * Page Space: The pages in the page file. 067 */ 068public class PageFile { 069 070 private static final String PAGEFILE_SUFFIX = ".data"; 071 private static final String RECOVERY_FILE_SUFFIX = ".redo"; 072 private static final String FREE_FILE_SUFFIX = ".free"; 073 074 // 4k Default page size. 075 public static final int DEFAULT_PAGE_SIZE = Integer.getInteger("defaultPageSize", 1024*4); 076 public static final int DEFAULT_WRITE_BATCH_SIZE = Integer.getInteger("defaultWriteBatchSize", 1000); 077 public static final int DEFAULT_PAGE_CACHE_SIZE = Integer.getInteger("defaultPageCacheSize", 100);; 078 079 private static final int RECOVERY_FILE_HEADER_SIZE = 1024 * 4; 080 private static final int PAGE_FILE_HEADER_SIZE = 1024 * 4; 081 082 // Recovery header is (long offset) 083 private static final Logger LOG = LoggerFactory.getLogger(PageFile.class); 084 085 // A PageFile will use a couple of files in this directory 086 private final File directory; 087 // And the file names in that directory will be based on this name. 
    private final String name;

    // File handle used for reading pages..
    private RecoverableRandomAccessFile readFile;
    // File handle used for writing pages..
    private RecoverableRandomAccessFile writeFile;
    // File handle used for writing to the recovery (redo) file..
    private RecoverableRandomAccessFile recoveryFile;

    // The size of pages
    private int pageSize = DEFAULT_PAGE_SIZE;

    // The minimum amount of space allocated to the recovery file, in number of pages.
    private int recoveryFileMinPageCount = 1000;
    // The max size that we let the recovery file grow to.. it may exceed the max, but the file will be resized
    // back down to this max size as soon as possible.
    private int recoveryFileMaxPageCount = 10000;
    // The number of pages in the current recovery buffer
    private int recoveryPageCount;

    // Guards the load()/unload() lifecycle; true while the page file is usable.
    private final AtomicBoolean loaded = new AtomicBoolean();
    // The number of pages we are aiming to write every time we
    // write to disk.
    int writeBatchSize = DEFAULT_WRITE_BATCH_SIZE;

    // Cache of recently used pages, keyed by page id. Only allocated when
    // enablePageCaching is true at load() time.
    private Map<Long, Page> pageCache;
    // Is the read page cache enabled?
    private boolean enablePageCaching = true;
    // How many pages will we keep in the cache?
    private int pageCacheSize = DEFAULT_PAGE_CACHE_SIZE;

    // Should first log the page write to the recovery buffer? Avoids partial
    // page write failures..
    private boolean enableRecoveryFile = true;
    // Will we sync writes to disk. Ensures that data will not be lost after a checkpoint()
    private boolean enableDiskSyncs = true;
    // Will writes be done in an async thread?
    private boolean enabledWriteThread = false;

    // These are used if enabledWriteThread==true
    private final AtomicBoolean stopWriter = new AtomicBoolean();
    private Thread writerThread;
    private CountDownLatch checkpointLatch;

    // Keeps track of writes that are being written to disk.
134 private final TreeMap<Long, PageWrite> writes = new TreeMap<Long, PageWrite>(); 135 136 // Keeps track of free pages. 137 private final AtomicLong nextFreePageId = new AtomicLong(); 138 private SequenceSet freeList = new SequenceSet(); 139 140 private AtomicReference<SequenceSet> recoveredFreeList = new AtomicReference<SequenceSet>(); 141 private AtomicReference<SequenceSet> trackingFreeDuringRecovery = new AtomicReference<SequenceSet>(); 142 143 private final AtomicLong nextTxid = new AtomicLong(); 144 145 // Persistent settings stored in the page file. 146 private MetaData metaData; 147 148 private final HashMap<File, RandomAccessFile> tmpFilesForRemoval = new HashMap<>(); 149 150 private boolean useLFRUEviction = false; 151 private float LFUEvictionFactor = 0.2f; 152 153 /** 154 * Use to keep track of updated pages which have not yet been committed. 155 */ 156 static class PageWrite { 157 Page page; 158 byte[] current; 159 byte[] diskBound; 160 long currentLocation = -1; 161 long diskBoundLocation = -1; 162 File tmpFile; 163 int length; 164 165 public PageWrite(Page page, byte[] data) { 166 this.page = page; 167 current = data; 168 } 169 170 public PageWrite(Page page, long currentLocation, int length, File tmpFile) { 171 this.page = page; 172 this.currentLocation = currentLocation; 173 this.tmpFile = tmpFile; 174 this.length = length; 175 } 176 177 public void setCurrent(Page page, byte[] data) { 178 this.page = page; 179 current = data; 180 currentLocation = -1; 181 diskBoundLocation = -1; 182 } 183 184 public void setCurrentLocation(Page page, long location, int length) { 185 this.page = page; 186 this.currentLocation = location; 187 this.length = length; 188 this.current = null; 189 } 190 191 @Override 192 public String toString() { 193 return "[PageWrite:" + page.getPageId() + "-" + page.getType() + "]"; 194 } 195 196 @SuppressWarnings("unchecked") 197 public Page getPage() { 198 return page; 199 } 200 201 public byte[] getDiskBound(HashMap<File, 
RandomAccessFile> tmpFiles) throws IOException { 202 if (diskBound == null && diskBoundLocation != -1) { 203 diskBound = new byte[length]; 204 if (tmpFiles.containsKey(tmpFile) && tmpFiles.get(tmpFile).getChannel().isOpen()) { 205 RandomAccessFile file = tmpFiles.get(tmpFile); 206 file.seek(diskBoundLocation); 207 file.read(diskBound); 208 } else { 209 try (RandomAccessFile file = new RandomAccessFile(tmpFile, "r")) { 210 file.seek(diskBoundLocation); 211 file.read(diskBound); 212 } 213 } 214 diskBoundLocation = -1; 215 } 216 return diskBound; 217 } 218 219 void begin() { 220 if (currentLocation != -1) { 221 diskBoundLocation = currentLocation; 222 } else { 223 diskBound = current; 224 } 225 current = null; 226 currentLocation = -1; 227 } 228 229 /** 230 * @return true if there is no pending writes to do. 231 */ 232 boolean done() { 233 diskBoundLocation = -1; 234 diskBound = null; 235 return current == null || currentLocation == -1; 236 } 237 238 boolean isDone() { 239 return diskBound == null && diskBoundLocation == -1 && current == null && currentLocation == -1; 240 } 241 } 242 243 /** 244 * The MetaData object hold the persistent data associated with a PageFile object. 
     */
    public static class MetaData {

        // NOTE: fields are read/written reflectively via IntrospectionSupport
        // as java.util.Properties; names must match the stored property keys.
        String fileType;
        String fileTypeVersion;

        // Monotonically increasing revision of this metadata record; -1 means never stored.
        long metaDataTxId = -1;
        int pageSize;
        boolean cleanShutdown;
        long lastTxId;
        long freePages;

        public String getFileType() {
            return fileType;
        }

        public void setFileType(String fileType) {
            this.fileType = fileType;
        }

        public String getFileTypeVersion() {
            return fileTypeVersion;
        }

        public void setFileTypeVersion(String version) {
            this.fileTypeVersion = version;
        }

        public long getMetaDataTxId() {
            return metaDataTxId;
        }

        public void setMetaDataTxId(long metaDataTxId) {
            this.metaDataTxId = metaDataTxId;
        }

        public int getPageSize() {
            return pageSize;
        }

        public void setPageSize(int pageSize) {
            this.pageSize = pageSize;
        }

        public boolean isCleanShutdown() {
            return cleanShutdown;
        }

        public void setCleanShutdown(boolean cleanShutdown) {
            this.cleanShutdown = cleanShutdown;
        }

        public long getLastTxId() {
            return lastTxId;
        }

        public void setLastTxId(long lastTxId) {
            this.lastTxId = lastTxId;
        }

        public long getFreePages() {
            return freePages;
        }

        public void setFreePages(long value) {
            this.freePages = value;
        }
    }

    /**
     * @return a new Transaction bound to this page file.
     * @throws IllegalStateException if the page file is not loaded.
     */
    public Transaction tx() {
        assertLoaded();
        return new Transaction(this);
    }

    /**
     * Creates a PageFile in the specified directory whose data files are named by name.
     */
    public PageFile(File directory, String name) {
        this.directory = directory;
        this.name = name;
    }

    /**
     * Deletes the files used by the PageFile object. This method can only be used when this object is not loaded.
     *
     * @throws IOException if the files cannot be deleted.
331 * @throws IllegalStateException if this PageFile is loaded 332 */ 333 public void delete() throws IOException { 334 if (loaded.get()) { 335 throw new IllegalStateException("Cannot delete page file data when the page file is loaded"); 336 } 337 delete(getMainPageFile()); 338 delete(getFreeFile()); 339 delete(getRecoveryFile()); 340 } 341 342 public void archive() throws IOException { 343 if (loaded.get()) { 344 throw new IllegalStateException("Cannot delete page file data when the page file is loaded"); 345 } 346 long timestamp = System.currentTimeMillis(); 347 archive(getMainPageFile(), String.valueOf(timestamp)); 348 archive(getFreeFile(), String.valueOf(timestamp)); 349 archive(getRecoveryFile(), String.valueOf(timestamp)); 350 } 351 352 /** 353 * @param file 354 * @throws IOException 355 */ 356 private void delete(File file) throws IOException { 357 if (file.exists() && !file.delete()) { 358 throw new IOException("Could not delete: " + file.getPath()); 359 } 360 } 361 362 private void archive(File file, String suffix) throws IOException { 363 if (file.exists()) { 364 File archive = new File(file.getPath() + "-" + suffix); 365 if (!file.renameTo(archive)) { 366 throw new IOException("Could not archive: " + file.getPath() + " to " + file.getPath()); 367 } 368 } 369 } 370 371 /** 372 * Loads the page file so that it can be accessed for read/write purposes. This allocates OS resources. If this is the 373 * first time the page file is loaded, then this creates the page file in the file system. 374 * 375 * @throws IOException If the page file cannot be loaded. This could be cause the existing page file is corrupt is a bad version or if 376 * there was a disk error. 377 * @throws IllegalStateException If the page file was already loaded. 
     */
    public void load() throws IOException, IllegalStateException {
        if (loaded.compareAndSet(false, true)) {

            // Choose the cache eviction policy before any pages are read.
            if (enablePageCaching) {
                if (isUseLFRUEviction()) {
                    pageCache = Collections.synchronizedMap(new LFUCache<Long, Page>(pageCacheSize, getLFUEvictionFactor()));
                } else {
                    pageCache = Collections.synchronizedMap(new LRUCache<Long, Page>(pageCacheSize, pageCacheSize, 0.75f, true));
                }
            }

            File file = getMainPageFile();
            IOHelper.mkdirs(file.getParentFile());
            writeFile = new RecoverableRandomAccessFile(file, "rw", false);
            readFile = new RecoverableRandomAccessFile(file, "r");

            if (readFile.length() > 0) {
                // Load the page size setting cause that can't change once the file is created.
                loadMetaData();
                pageSize = metaData.getPageSize();
            } else {
                // Store the page size setting cause that can't change once the file is created.
                metaData = new MetaData();
                metaData.setFileType(PageFile.class.getName());
                metaData.setFileTypeVersion("1");
                metaData.setPageSize(getPageSize());
                metaData.setCleanShutdown(true);
                metaData.setFreePages(-1);
                metaData.setLastTxId(0);
                storeMetaData();
            }

            if (enableRecoveryFile) {
                recoveryFile = new RecoverableRandomAccessFile(getRecoveryFile(), "rw");
            }

            if (metaData.isCleanShutdown()) {
                nextTxid.set(metaData.getLastTxId() + 1);
                if (metaData.getFreePages() > 0) {
                    loadFreeList();
                }
            } else {
                // Unclean shutdown: replay the redo log and rebuild the free
                // list asynchronously (see trackingFreeDuringRecovery).
                LOG.debug(toString() + ", Recovering page file...");
                nextTxid.set(redoRecoveryUpdates());
                trackingFreeDuringRecovery.set(new SequenceSet());
            }

            if (writeFile.length() < PAGE_FILE_HEADER_SIZE) {
                writeFile.setLength(PAGE_FILE_HEADER_SIZE);
            }
            // Next page id to hand out = number of pages currently in the file.
            nextFreePageId.set((writeFile.length() - PAGE_FILE_HEADER_SIZE) / pageSize);

            // Mark "dirty" on disk while loaded; unload() rewrites it as clean.
            metaData.setCleanShutdown(false);
            storeMetaData();
            getFreeFile().delete();
            startWriter();
            if (trackingFreeDuringRecovery.get() != null) {
                asyncFreePageRecovery(nextFreePageId.get());
            }
        } else {
            throw new IllegalStateException("Cannot load the page file when it is already loaded.");
        }
    }

    // Spawns a daemon thread to rebuild the free list in the background;
    // errors are only logged while the page file remains loaded.
    private void asyncFreePageRecovery(final long lastRecoveryPage) {
        Thread thread = new Thread("KahaDB Index Free Page Recovery") {
            @Override
            public void run() {
                try {
                    recoverFreePages(lastRecoveryPage);
                } catch (Throwable e) {
                    if (loaded.get()) {
                        LOG.warn("Error recovering index free page list", e);
                    }
                }
            }
        };
        thread.setPriority(Thread.NORM_PRIORITY);
        thread.setDaemon(true);
        thread.start();
    }

    // Scans pages [0, lastRecoveryPage) through a private read-only PageFile
    // instance and publishes the set of free pages for flush() to merge.
    private void recoverFreePages(final long lastRecoveryPage) throws Exception {
        LOG.info(toString() + ". Recovering pageFile free list due to prior unclean shutdown..");
        SequenceSet newFreePages = new SequenceSet();
        // need new pageFile instance to get unshared readFile
        PageFile recoveryPageFile = new PageFile(directory, name);
        recoveryPageFile.loadForRecovery(nextFreePageId.get());
        try {
            for (Iterator<Page> i = new Transaction(recoveryPageFile).iterator(true); i.hasNext(); ) {
                Page page = i.next();

                if (page.getPageId() >= lastRecoveryPage) {
                    break;
                }

                if (page.getType() == Page.PAGE_FREE_TYPE) {
                    newFreePages.add(page.getPageId());
                }
            }
        } finally {
            recoveryPageFile.readFile.close();
        }

        LOG.info(toString() + ". Recovered pageFile free list of size: " + newFreePages.rangeSize());
        if (!newFreePages.isEmpty()) {

            // allow flush (with index lock held) to merge eventually
            recoveredFreeList.lazySet(newFreePages);
        } else {
            // If there are no free pages, clear trackingFreeDuringRecovery to allow the broker to have a clean shutdown
            trackingFreeDuringRecovery.set(null);
        }
    }

    // Minimal read-only load used by recoverFreePages(): no cache, no recovery
    // file, metadata only. Never call on a PageFile that will serve writes.
    private void loadForRecovery(long nextFreePageIdSnap) throws Exception {
        loaded.set(true);
        enablePageCaching = false;
        File file = getMainPageFile();
        readFile = new RecoverableRandomAccessFile(file, "r");
        loadMetaData();
        pageSize = metaData.getPageSize();
        enableRecoveryFile = false;
        nextFreePageId.set(nextFreePageIdSnap);
    }


    /**
     * Unloads a previously loaded PageFile. This deallocates OS related resources like file handles.
     * Once unloaded, you can no longer use the page file to read or write Pages.
     *
     * @throws IOException if a disk error occurred while closing down the page file.
511 * @throws IllegalStateException if the PageFile is not loaded 512 */ 513 public void unload() throws IOException { 514 if (loaded.compareAndSet(true, false)) { 515 flush(); 516 try { 517 stopWriter(); 518 } catch (InterruptedException e) { 519 throw new InterruptedIOException(); 520 } 521 522 if (freeList.isEmpty()) { 523 metaData.setFreePages(0); 524 } else { 525 storeFreeList(); 526 metaData.setFreePages(freeList.size()); 527 } 528 529 metaData.setLastTxId(nextTxid.get() - 1); 530 if (trackingFreeDuringRecovery.get() != null) { 531 // async recovery incomplete, will have to try again 532 metaData.setCleanShutdown(false); 533 } else { 534 metaData.setCleanShutdown(true); 535 } 536 storeMetaData(); 537 538 if (readFile != null) { 539 readFile.close(); 540 readFile = null; 541 writeFile.close(); 542 writeFile = null; 543 if (enableRecoveryFile) { 544 recoveryFile.close(); 545 recoveryFile = null; 546 } 547 freeList.clear(); 548 if (pageCache != null) { 549 pageCache = null; 550 } 551 synchronized (writes) { 552 writes.clear(); 553 } 554 } 555 } else { 556 throw new IllegalStateException("Cannot unload the page file when it is not loaded"); 557 } 558 } 559 560 public boolean isLoaded() { 561 return loaded.get(); 562 } 563 564 public boolean isCleanShutdown() { 565 return metaData != null && metaData.isCleanShutdown(); 566 } 567 568 public void allowIOResumption() { 569 loaded.set(true); 570 } 571 572 /** 573 * Flush and sync all write buffers to disk. 574 * 575 * @throws IOException If an disk error occurred. 
576 */ 577 public void flush() throws IOException { 578 579 if (enabledWriteThread && stopWriter.get()) { 580 throw new IOException("Page file already stopped: checkpointing is not allowed"); 581 } 582 583 SequenceSet recovered = recoveredFreeList.get(); 584 if (recovered != null) { 585 recoveredFreeList.lazySet(null); 586 SequenceSet inUse = trackingFreeDuringRecovery.get(); 587 recovered.remove(inUse); 588 freeList.merge(recovered); 589 590 // all set for clean shutdown 591 trackingFreeDuringRecovery.set(null); 592 inUse.clear(); 593 } 594 595 // Setup a latch that gets notified when all buffered writes hits the disk. 596 CountDownLatch checkpointLatch; 597 synchronized (writes) { 598 if (writes.isEmpty()) { 599 return; 600 } 601 if (enabledWriteThread) { 602 if (this.checkpointLatch == null) { 603 this.checkpointLatch = new CountDownLatch(1); 604 } 605 checkpointLatch = this.checkpointLatch; 606 writes.notify(); 607 } else { 608 writeBatch(); 609 return; 610 } 611 } 612 try { 613 checkpointLatch.await(); 614 } catch (InterruptedException e) { 615 InterruptedIOException ioe = new InterruptedIOException(); 616 ioe.initCause(e); 617 throw ioe; 618 } 619 } 620 621 622 @Override 623 public String toString() { 624 return "Page File: " + getMainPageFile(); 625 } 626 627 /////////////////////////////////////////////////////////////////// 628 // Private Implementation Methods 629 /////////////////////////////////////////////////////////////////// 630 private File getMainPageFile() { 631 return new File(directory, IOHelper.toFileSystemSafeName(name) + PAGEFILE_SUFFIX); 632 } 633 634 public File getFreeFile() { 635 return new File(directory, IOHelper.toFileSystemSafeName(name) + FREE_FILE_SUFFIX); 636 } 637 638 public File getRecoveryFile() { 639 return new File(directory, IOHelper.toFileSystemSafeName(name) + RECOVERY_FILE_SUFFIX); 640 } 641 642 public long toOffset(long pageId) { 643 return PAGE_FILE_HEADER_SIZE + (pageId * pageSize); 644 } 645 646 private void 
loadMetaData() throws IOException { 647 648 ByteArrayInputStream is; 649 MetaData v1 = new MetaData(); 650 MetaData v2 = new MetaData(); 651 try { 652 Properties p = new Properties(); 653 byte[] d = new byte[PAGE_FILE_HEADER_SIZE / 2]; 654 readFile.seek(0); 655 readFile.readFully(d); 656 is = new ByteArrayInputStream(d); 657 p.load(is); 658 IntrospectionSupport.setProperties(v1, p); 659 } catch (IOException e) { 660 v1 = null; 661 } 662 663 try { 664 Properties p = new Properties(); 665 byte[] d = new byte[PAGE_FILE_HEADER_SIZE / 2]; 666 readFile.seek(PAGE_FILE_HEADER_SIZE / 2); 667 readFile.readFully(d); 668 is = new ByteArrayInputStream(d); 669 p.load(is); 670 IntrospectionSupport.setProperties(v2, p); 671 } catch (IOException e) { 672 v2 = null; 673 } 674 675 if (v1 == null && v2 == null) { 676 throw new IOException("Could not load page file meta data"); 677 } 678 679 if (v1 == null || v1.metaDataTxId < 0) { 680 metaData = v2; 681 } else if (v2 == null || v1.metaDataTxId < 0) { 682 metaData = v1; 683 } else if (v1.metaDataTxId == v2.metaDataTxId) { 684 metaData = v1; // use the first since the 2nd could be a partial.. 685 } else { 686 metaData = v2; // use the second cause the first is probably a partial. 687 } 688 } 689 690 private void storeMetaData() throws IOException { 691 // Convert the metadata into a property format 692 metaData.metaDataTxId++; 693 Properties p = new Properties(); 694 IntrospectionSupport.getProperties(metaData, p, null); 695 696 ByteArrayOutputStream os = new ByteArrayOutputStream(PAGE_FILE_HEADER_SIZE); 697 p.store(os, ""); 698 if (os.size() > PAGE_FILE_HEADER_SIZE / 2) { 699 throw new IOException("Configuation is larger than: " + PAGE_FILE_HEADER_SIZE / 2); 700 } 701 // Fill the rest with space... 702 byte[] filler = new byte[(PAGE_FILE_HEADER_SIZE / 2) - os.size()]; 703 Arrays.fill(filler, (byte) ' '); 704 os.write(filler); 705 os.flush(); 706 707 byte[] d = os.toByteArray(); 708 709 // So we don't loose it.. write it 2 times... 
710 writeFile.seek(0); 711 writeFile.write(d); 712 writeFile.sync(); 713 writeFile.seek(PAGE_FILE_HEADER_SIZE / 2); 714 writeFile.write(d); 715 writeFile.sync(); 716 } 717 718 private void storeFreeList() throws IOException { 719 FileOutputStream os = new FileOutputStream(getFreeFile()); 720 DataOutputStream dos = new DataOutputStream(os); 721 SequenceSet.Marshaller.INSTANCE.writePayload(freeList, dos); 722 dos.close(); 723 } 724 725 private void loadFreeList() throws IOException { 726 freeList.clear(); 727 FileInputStream is = new FileInputStream(getFreeFile()); 728 DataInputStream dis = new DataInputStream(is); 729 freeList = SequenceSet.Marshaller.INSTANCE.readPayload(dis); 730 dis.close(); 731 } 732 733 /////////////////////////////////////////////////////////////////// 734 // Property Accessors 735 /////////////////////////////////////////////////////////////////// 736 737 /** 738 * Is the recovery buffer used to double buffer page writes. Enabled by default. 739 * 740 * @return is the recovery buffer enabled. 741 */ 742 public boolean isEnableRecoveryFile() { 743 return enableRecoveryFile; 744 } 745 746 /** 747 * Sets if the recovery buffer uses to double buffer page writes. Enabled by default. Disabling this 748 * may potentially cause partial page writes which can lead to page file corruption. 749 */ 750 public void setEnableRecoveryFile(boolean doubleBuffer) { 751 assertNotLoaded(); 752 this.enableRecoveryFile = doubleBuffer; 753 } 754 755 /** 756 * @return Are page writes synced to disk? 757 */ 758 public boolean isEnableDiskSyncs() { 759 return enableDiskSyncs; 760 } 761 762 /** 763 * Allows you enable syncing writes to disk. 764 */ 765 public void setEnableDiskSyncs(boolean syncWrites) { 766 assertNotLoaded(); 767 this.enableDiskSyncs = syncWrites; 768 } 769 770 /** 771 * @return the page size 772 */ 773 public int getPageSize() { 774 return this.pageSize; 775 } 776 777 /** 778 * @return the amount of content data that a page can hold. 
     */
    public int getPageContentSize() {
        // Usable payload per page = page size minus the fixed page header.
        return this.pageSize - Page.PAGE_HEADER_SIZE;
    }

    /**
     * Configures the page size used by the page file. By default it is 4k. Once a page file is created on disk,
     * subsequent loads of that file will use the original pageSize. Once the PageFile is loaded, this setting
     * can no longer be changed.
     *
     * @param pageSize the pageSize to set
     * @throws IllegalStateException once the page file is loaded.
     */
    public void setPageSize(int pageSize) throws IllegalStateException {
        assertNotLoaded();
        this.pageSize = pageSize;
    }

    /**
     * @return true if read page caching is enabled
     */
    public boolean isEnablePageCaching() {
        return this.enablePageCaching;
    }

    /**
     * @param enablePageCaching allows you to enable read page caching
     * @throws IllegalStateException once the page file is loaded.
     */
    public void setEnablePageCaching(boolean enablePageCaching) {
        assertNotLoaded();
        this.enablePageCaching = enablePageCaching;
    }

    /**
     * @return the maximum number of pages that will get stored in the read page cache.
     */
    public int getPageCacheSize() {
        return this.pageCacheSize;
    }

    /**
     * @param pageCacheSize Sets the maximum number of pages that will get stored in the read page cache.
     */
    public void setPageCacheSize(int pageCacheSize) {
        assertNotLoaded();
        this.pageCacheSize = pageCacheSize;
    }

    public boolean isEnabledWriteThread() {
        return enabledWriteThread;
    }

    // Enables the background writer thread; must be set before load().
    public void setEnableWriteThread(boolean enableAsyncWrites) {
        assertNotLoaded();
        this.enabledWriteThread = enableAsyncWrites;
    }

    // Size of the page space in bytes, derived from the next page id to allocate.
    public long getDiskSize() throws IOException {
        return toOffset(nextFreePageId.get());
    }

    public boolean isFreePage(long pageId) {
        return freeList.contains(pageId);
    }

    /**
     * @return the number of pages allocated in the PageFile
     */
    public long getPageCount() {
        return nextFreePageId.get();
    }

    public int getRecoveryFileMinPageCount() {
        return recoveryFileMinPageCount;
    }

    public long getFreePageCount() {
        assertLoaded();
        return freeList.rangeSize();
    }

    public void setRecoveryFileMinPageCount(int recoveryFileMinPageCount) {
        assertNotLoaded();
        this.recoveryFileMinPageCount = recoveryFileMinPageCount;
    }

    public int getRecoveryFileMaxPageCount() {
        return recoveryFileMaxPageCount;
    }

    public void setRecoveryFileMaxPageCount(int recoveryFileMaxPageCount) {
        assertNotLoaded();
        this.recoveryFileMaxPageCount = recoveryFileMaxPageCount;
    }

    public int getWriteBatchSize() {
        return writeBatchSize;
    }

    // NOTE(review): unlike the other setters this one does not call
    // assertNotLoaded() — it appears intentionally changeable at runtime.
    public void setWriteBatchSize(int writeBatchSize) {
        this.writeBatchSize = writeBatchSize;
    }

    public float getLFUEvictionFactor() {
        return LFUEvictionFactor;
    }

    public void setLFUEvictionFactor(float LFUEvictionFactor) {
        this.LFUEvictionFactor = LFUEvictionFactor;
    }

    public boolean isUseLFRUEviction() {
        return useLFRUEviction;
    }

    public void setUseLFRUEviction(boolean useLFRUEviction) {
        this.useLFRUEviction = useLFRUEviction;
    }

    ///////////////////////////////////////////////////////////////////
    // Package Protected Methods exposed to Transaction
    ///////////////////////////////////////////////////////////////////

    /**
     * @throws IllegalStateException if the page file is not loaded.
     */
    void assertLoaded() throws IllegalStateException {
        if (!loaded.get()) {
            throw new IllegalStateException("PageFile is not loaded");
        }
    }

    /**
     * @throws IllegalStateException if the page file is loaded.
     */
    void assertNotLoaded() throws IllegalStateException {
        if (loaded.get()) {
            throw new IllegalStateException("PageFile is loaded");
        }
    }

    /**
     * Allocates a block of free pages that you can write data to.
     *
     * @param count the number of sequential pages to allocate
     * @return the first page of the sequential set.
     * @throws IOException If an disk error occurred.
     * @throws IllegalStateException if the PageFile is not loaded
     */
    <T> Page<T> allocate(int count) throws IOException {
        assertLoaded();
        if (count <= 0) {
            throw new IllegalArgumentException("The allocation count must be larger than zero");
        }

        // Try to satisfy the request from the free list first.
        Sequence seq = freeList.removeFirstSequence(count);

        // We may need to create new free pages...
        if (seq == null) {

            Page<T> first = null;
            int c = count;

            // Reserve the whole range of page ids / tx ids in one atomic step.
            long pageId = nextFreePageId.getAndAdd(count);
            long writeTxnId = nextTxid.getAndAdd(count);

            while (c-- > 0) {
                Page<T> page = new Page<T>(pageId++);
                page.makeFree(writeTxnId++);

                if (first == null) {
                    first = page;
                }

                addToCache(page);
                // Serialize the page header and queue it for a disk write so the file
                // grows to cover the newly allocated range.
                DataByteArrayOutputStream out = new DataByteArrayOutputStream(pageSize);
                page.write(out);
                write(page, out.getData());

                // LOG.debug("allocate writing: "+page.getPageId());
            }

            return first;
        }

        // Recycled page: no disk write needed, the page is already part of the file.
        Page<T> page = new Page<T>(seq.getFirst());
        page.makeFree(0);
        // LOG.debug("allocated: "+page.getPageId());
        return page;
    }

    /**
     * @return the next write transaction id (monotonically increasing).
     */
    long getNextWriteTransactionId() {
        return nextTxid.incrementAndGet();
    }

    /**
     * Reads one page's worth of bytes for the given page id into {@code data}.
     * Synchronized because seek+read on the shared readFile must be atomic.
     */
    synchronized void readPage(long pageId, byte[] data) throws IOException {
        readFile.seek(toOffset(pageId));
        readFile.readFully(data);
    }

    /**
     * Returns a page to the free list and drops it from the read cache.
     */
    public void freePage(long pageId) {
        freeList.add(pageId);
        removeFromCache(pageId);

        // While a recovery redo is in progress, also record the free so the
        // recovery pass can account for it.
        SequenceSet trackFreeDuringRecovery = trackingFreeDuringRecovery.get();
        if (trackFreeDuringRecovery != null) {
            trackFreeDuringRecovery.add(pageId);
        }
    }

    // Queues a single page write by wrapping it as a one-element update collection.
    @SuppressWarnings("unchecked")
    private <T> void write(Page<T> page, byte[] data) throws IOException {
        final PageWrite write = new PageWrite(page, data);
        Entry<Long, PageWrite> entry = new Entry<Long, PageWrite>() {
            @Override
            public Long getKey() {
                return write.getPage().getPageId();
            }

            @Override
            public PageWrite getValue() {
                return write;
            }

            @Override
            public PageWrite setValue(PageWrite value) {
                // Never called by write(Collection); this entry is read-only.
                return null;
            }
        };
        Entry<Long, PageWrite>[] entries = new Map.Entry[]{entry};
        write(Arrays.asList(entries));
    }

    /**
     * Merges a batch of page updates into the pending write cache, blocking for
     * backpressure when the async writer is enabled and the cache is full.
     */
    void write(Collection<Map.Entry<Long, PageWrite>> updates) throws IOException {
        synchronized (writes) {
            if (enabledWriteThread) {
                // Backpressure: wait until the writer thread drains below the batch size.
                while (writes.size() >= writeBatchSize && !stopWriter.get()) {
                    try {
                        writes.wait();
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                        throw new InterruptedIOException();
                    }
                }
            }

            boolean longTx = false;

            for (Map.Entry<Long, PageWrite> entry : updates) {
                Long key = entry.getKey();
                PageWrite value = entry.getValue();
                PageWrite write = writes.get(key);
                if (write == null) {
                    writes.put(key, value);
                } else {
                    // Page already pending: fold the newer update into the existing
                    // PageWrite. currentLocation != -1 means the data lives in a
                    // long-transaction temp file rather than in memory.
                    if (value.currentLocation != -1) {
                        write.setCurrentLocation(value.page, value.currentLocation, value.length);
                        write.tmpFile = value.tmpFile;
                        longTx = true;
                    } else {
                        write.setCurrent(value.page, value.current);
                    }
                }
            }

            // Once we start approaching capacity, notify the writer to start writing.
            // Sync immediately for long txs.
            if (longTx || canStartWriteBatch()) {

                if (enabledWriteThread) {
                    writes.notify();
                } else {
                    writeBatch();
                }
            }
        }
    }

    // Decides whether the pending-write cache is full enough to flush.
    // Caller must hold the 'writes' monitor.
    private boolean canStartWriteBatch() {
        int capacityUsed = ((writes.size() * 100) / writeBatchSize);
        if (enabledWriteThread) {
            // The constant 10 here controls how soon write batches start going to disk..
            // would be nice to figure out how to auto tune that value. Made too small and
            // we reduce throughput because we are locking the write mutex too often doing writes
            return capacityUsed >= 10 || checkpointLatch != null;
        } else {
            return capacityUsed >= 80 || checkpointLatch != null;
        }
    }

    ///////////////////////////////////////////////////////////////////
    // Cache Related operations
    ///////////////////////////////////////////////////////////////////

    // Pending writes take precedence over the read cache so readers always see
    // the most recent (possibly not-yet-flushed) version of a page.
    @SuppressWarnings("unchecked")
    <T> Page<T> getFromCache(long pageId) {
        synchronized (writes) {
            PageWrite pageWrite = writes.get(pageId);
            if (pageWrite != null) {
                return pageWrite.page;
            }
        }

        Page<T> result = null;
        if (enablePageCaching) {
            result = pageCache.get(pageId);
        }
        return result;
    }

    void addToCache(Page page) {
        if (enablePageCaching) {
            pageCache.put(page.getPageId(), page);
        }
    }

    void removeFromCache(long pageId) {
        if (enablePageCaching) {
            pageCache.remove(pageId);
        }
    }

    ///////////////////////////////////////////////////////////////////
    // Internal Double write implementation follows...
    ///////////////////////////////////////////////////////////////////

    // Main loop of the async writer thread: wake periodically or on notify,
    // flush pending writes, and release any checkpoint waiter when idle.
    private void pollWrites() {
        try {
            while (!stopWriter.get()) {
                // Wait for a notification...
                synchronized (writes) {
                    // Wake any producers blocked on backpressure in write(Collection).
                    writes.notifyAll();

                    // If there is not enough to write, wait for a notification...
                    while (writes.isEmpty() && checkpointLatch == null && !stopWriter.get()) {
                        writes.wait(100);
                    }

                    if (writes.isEmpty()) {
                        releaseCheckpointWaiter();
                    }
                }
                writeBatch();
            }
        } catch (Throwable e) {
            LOG.info("An exception was raised while performing poll writes", e);
        } finally {
            releaseCheckpointWaiter();
        }
    }

    /**
     * Flushes the current pending-write cache to disk using a double-write
     * strategy: writes land in the recovery file (with a checksum header)
     * before being applied to the main page file, so a crash mid-flush can be
     * redone by redoRecoveryUpdates().
     */
    private void writeBatch() throws IOException {

        CountDownLatch checkpointLatch;
        ArrayList<PageWrite> batch;
        synchronized (writes) {
            // If there is not enough to write, wait for a notification...

            batch = new ArrayList<PageWrite>(writes.size());
            // build a write batch from the current write cache.
            for (PageWrite write : writes.values()) {
                batch.add(write);
                // Move the current write to the diskBound write, this lets folks update the
                // page again without blocking for this write.
                write.begin();
                if (write.diskBound == null && write.diskBoundLocation == -1) {
                    // Nothing actually bound for disk; drop it from this batch.
                    batch.remove(write);
                }
            }

            // Grab on to the existing checkpoint latch cause once we do this write we can
            // release the folks that were waiting for those writes to hit disk.
            checkpointLatch = this.checkpointLatch;
            this.checkpointLatch = null;
        }

        try {

            // First land the writes in the recovery file
            if (enableRecoveryFile) {
                Checksum checksum = new Adler32();

                recoveryFile.seek(RECOVERY_FILE_HEADER_SIZE);

                for (PageWrite w : batch) {
                    try {
                        checksum.update(w.getDiskBound(tmpFilesForRemoval), 0, pageSize);
                    } catch (Throwable t) {
                        throw IOExceptionSupport.create("Cannot create recovery file. Reason: " + t, t);
                    }
                    recoveryFile.writeLong(w.page.getPageId());
                    recoveryFile.write(w.getDiskBound(tmpFilesForRemoval), 0, pageSize);
                }

                // Can we shrink the recovery buffer??
                if (recoveryPageCount > recoveryFileMaxPageCount) {
                    int t = Math.max(recoveryFileMinPageCount, batch.size());
                    recoveryFile.setLength(recoveryFileSizeForPages(t));
                }

                // Record the page writes in the recovery buffer.
                recoveryFile.seek(0);
                // Store the next tx id...
                recoveryFile.writeLong(nextTxid.get());
                // Store the checksum for the write batch so that on recovery we
                // know if we have a consistent
                // write batch on disk.
                recoveryFile.writeLong(checksum.getValue());
                // Write the # of pages that will follow
                recoveryFile.writeInt(batch.size());

                if (enableDiskSyncs) {
                    recoveryFile.sync();
                }
            }

            // Now apply the batch to the main page file.
            for (PageWrite w : batch) {
                writeFile.seek(toOffset(w.page.getPageId()));
                writeFile.write(w.getDiskBound(tmpFilesForRemoval), 0, pageSize);
                w.done();
            }

            if (enableDiskSyncs) {
                writeFile.sync();
            }

        } catch (IOException ioError) {
            LOG.info("Unexpected io error on pagefile write of " + batch.size() + " pages.", ioError);
            // any subsequent write needs to be prefaced with a considered call to redoRecoveryUpdates
            // to ensure disk image is self consistent
            loaded.set(false);
            throw ioError;
        } finally {
            synchronized (writes) {
                for (PageWrite w : batch) {
                    // If there are no more pending writes, then remove it from
                    // the write cache.
                    if (w.isDone()) {
                        writes.remove(w.page.getPageId());
                        if (w.tmpFile != null && tmpFilesForRemoval.containsKey(w.tmpFile)) {
                            tmpFilesForRemoval.get(w.tmpFile).close();
                            // NOTE(review): throwing from a finally block can mask an
                            // exception raised in the try block above — confirm intended.
                            if (!w.tmpFile.delete()) {
                                throw new IOException("Can't delete temporary KahaDB transaction file:" + w.tmpFile);
                            }
                            tmpFilesForRemoval.remove(w.tmpFile);
                        }
                    }
                }
            }

            if (checkpointLatch != null) {
                checkpointLatch.countDown();
            }
        }
    }

    /**
     * Registers a long-transaction temp file for deletion once its pending
     * writes complete; closes the handle immediately if already registered.
     */
    public void removeTmpFile(File file, RandomAccessFile randomAccessFile) throws IOException {
        if (!tmpFilesForRemoval.containsKey(file)) {
            tmpFilesForRemoval.put(file, randomAccessFile);
        } else {
            randomAccessFile.close();
        }
    }

    // Recovery file layout: fixed header + per-page records of (8-byte page id + page data).
    private long recoveryFileSizeForPages(int pageCount) {
        return RECOVERY_FILE_HEADER_SIZE + ((pageSize + 8L) * pageCount);
    }

    // Counts down and clears the checkpoint latch, unblocking checkpoint waiters.
    // NOTE(review): checkpointLatch is read/cleared here without holding the
    // 'writes' monitor that other accesses use — confirm callers guarantee safety.
    private void releaseCheckpointWaiter() {
        if (checkpointLatch != null) {
            checkpointLatch.countDown();
            checkpointLatch = null;
        }
    }

    /**
     * Inspects the recovery buffer and re-applies any
     * partially applied page writes.
     *
     * @return the next transaction id that can be used.
     */
    private long redoRecoveryUpdates() throws IOException {
        if (!enableRecoveryFile) {
            return 0;
        }
        recoveryPageCount = 0;

        // Are we initializing the recovery file?
        if (recoveryFile.length() == 0) {
            // Write an empty header..
            recoveryFile.write(new byte[RECOVERY_FILE_HEADER_SIZE]);
            // Preallocate the minimum size for better performance.
            recoveryFile.setLength(recoveryFileSizeForPages(recoveryFileMinPageCount));
            return 0;
        }

        // How many recovery pages do we have in the recovery buffer?
        recoveryFile.seek(0);
        long nextTxId = recoveryFile.readLong();
        long expectedChecksum = recoveryFile.readLong();
        int pageCounter = recoveryFile.readInt();

        recoveryFile.seek(RECOVERY_FILE_HEADER_SIZE);
        Checksum checksum = new Adler32();
        LinkedHashMap<Long, byte[]> batch = new LinkedHashMap<Long, byte[]>();
        try {
            for (int i = 0; i < pageCounter; i++) {
                // NOTE(review): despite its name, 'offset' holds a page id (it is
                // later passed through toOffset() when re-applied below).
                long offset = recoveryFile.readLong();
                byte[] data = new byte[pageSize];
                if (recoveryFile.read(data, 0, pageSize) != pageSize) {
                    // Invalid recovery record: could not fully read the data.
                    // Probably due to a partial write to the recovery buffer.
                    return nextTxId;
                }
                checksum.update(data, 0, pageSize);
                batch.put(offset, data);
            }
        } catch (Exception e) {
            // An error here means the redo buffer was not fully written out, so don't
            // redo it — the main pages should still be consistent.
            LOG.debug("Redo buffer was not fully intact: ", e);
            return nextTxId;
        }

        recoveryPageCount = pageCounter;

        // If the checksum is not valid then the recovery buffer was partially written to disk.
        if (checksum.getValue() != expectedChecksum) {
            return nextTxId;
        }

        // Re-apply all the writes in the recovery buffer.
        for (Map.Entry<Long, byte[]> e : batch.entrySet()) {
            writeFile.seek(toOffset(e.getKey()));
            writeFile.write(e.getValue());
        }

        // And sync it to disk
        writeFile.sync();
        return nextTxId;
    }

    // Starts the async writer thread (if enabled); daemon + max priority so
    // flushes keep up with producers without blocking JVM shutdown.
    private void startWriter() {
        synchronized (writes) {
            if (enabledWriteThread) {
                stopWriter.set(false);
                writerThread = new Thread("KahaDB Page Writer") {
                    @Override
                    public void run() {
                        pollWrites();
                    }
                };
                writerThread.setPriority(Thread.MAX_PRIORITY);
                writerThread.setDaemon(true);
                writerThread.start();
            }
        }
    }

    // Signals the writer loop to exit and waits for it to finish.
    private void stopWriter() throws InterruptedException {
        if (enabledWriteThread) {
            stopWriter.set(true);
            writerThread.join();
        }
    }

    public File getFile() {
        return getMainPageFile();
    }

    public File getDirectory() {
        return directory;
    }
}