001/**
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.activemq.store.kahadb.disk.page;
018
019import java.io.ByteArrayInputStream;
020import java.io.ByteArrayOutputStream;
021import java.io.DataInputStream;
022import java.io.DataOutputStream;
023import java.io.File;
024import java.io.FileInputStream;
025import java.io.FileOutputStream;
026import java.io.IOException;
027import java.io.InterruptedIOException;
028import java.io.RandomAccessFile;
029import java.util.ArrayList;
030import java.util.Arrays;
031import java.util.Collection;
032import java.util.Collections;
033import java.util.HashMap;
034import java.util.Iterator;
035import java.util.LinkedHashMap;
036import java.util.Map;
037import java.util.Map.Entry;
038import java.util.Properties;
039import java.util.TreeMap;
040import java.util.concurrent.CountDownLatch;
041import java.util.concurrent.atomic.AtomicBoolean;
042import java.util.concurrent.atomic.AtomicLong;
043import java.util.concurrent.atomic.AtomicReference;
044import java.util.zip.Adler32;
045import java.util.zip.Checksum;
046
047import org.apache.activemq.store.kahadb.disk.util.Sequence;
048import org.apache.activemq.store.kahadb.disk.util.SequenceSet;
049import org.apache.activemq.util.DataByteArrayOutputStream;
050import org.apache.activemq.util.IOExceptionSupport;
051import org.apache.activemq.util.IOHelper;
052import org.apache.activemq.util.IntrospectionSupport;
053import org.apache.activemq.util.LFUCache;
054import org.apache.activemq.util.LRUCache;
055import org.apache.activemq.util.RecoverableRandomAccessFile;
056import org.slf4j.Logger;
057import org.slf4j.LoggerFactory;
058
059/**
060 * A PageFile provides you random access to fixed sized disk pages. This object is not thread safe and therefore access to it should
061 * be externally synchronized.
062 * <p/>
063 * The file has 3 parts:
064 * Metadata Space: 4k : Reserved metadata area. Used to store persistent config about the file.
065 * Recovery Buffer Space: Page Size * 1000 : This is a redo log used to prevent partial page writes from making the file inconsistent
066 * Page Space: The pages in the page file.
067 */
068public class PageFile {
069
070    private static final String PAGEFILE_SUFFIX = ".data";
071    private static final String RECOVERY_FILE_SUFFIX = ".redo";
072    private static final String FREE_FILE_SUFFIX = ".free";
073
074    // 4k Default page size.
075    public static final int DEFAULT_PAGE_SIZE = Integer.getInteger("defaultPageSize", 1024*4);
076    public static final int DEFAULT_WRITE_BATCH_SIZE = Integer.getInteger("defaultWriteBatchSize", 1000);
077    public static final int DEFAULT_PAGE_CACHE_SIZE = Integer.getInteger("defaultPageCacheSize", 100);;
078
079    private static final int RECOVERY_FILE_HEADER_SIZE = 1024 * 4;
080    private static final int PAGE_FILE_HEADER_SIZE = 1024 * 4;
081
082    // Recovery header is (long offset)
083    private static final Logger LOG = LoggerFactory.getLogger(PageFile.class);
084
085    // A PageFile will use a couple of files in this directory
086    private final File directory;
087    // And the file names in that directory will be based on this name.
088    private final String name;
089
090    // File handle used for reading pages..
091    private RecoverableRandomAccessFile readFile;
092    // File handle used for writing pages..
093    private RecoverableRandomAccessFile writeFile;
094    // File handle used for writing pages..
095    private RecoverableRandomAccessFile recoveryFile;
096
097    // The size of pages
098    private int pageSize = DEFAULT_PAGE_SIZE;
099
100    // The minimum number of space allocated to the recovery file in number of pages.
101    private int recoveryFileMinPageCount = 1000;
102    // The max size that we let the recovery file grow to.. ma exceed the max, but the file will get resize
103    // to this max size as soon as  possible.
104    private int recoveryFileMaxPageCount = 10000;
105    // The number of pages in the current recovery buffer
106    private int recoveryPageCount;
107
108    private final AtomicBoolean loaded = new AtomicBoolean();
109    // The number of pages we are aiming to write every time we
110    // write to disk.
111    int writeBatchSize = DEFAULT_WRITE_BATCH_SIZE;
112
113    // We keep a cache of pages recently used?
114    private Map<Long, Page> pageCache;
115    // The cache of recently used pages.
116    private boolean enablePageCaching = true;
117    // How many pages will we keep in the cache?
118    private int pageCacheSize = DEFAULT_PAGE_CACHE_SIZE;
119
120    // Should first log the page write to the recovery buffer? Avoids partial
121    // page write failures..
122    private boolean enableRecoveryFile = true;
123    // Will we sync writes to disk. Ensures that data will not be lost after a checkpoint()
124    private boolean enableDiskSyncs = true;
125    // Will writes be done in an async thread?
126    private boolean enabledWriteThread = false;
127
128    // These are used if enableAsyncWrites==true
129    private final AtomicBoolean stopWriter = new AtomicBoolean();
130    private Thread writerThread;
131    private CountDownLatch checkpointLatch;
132
133    // Keeps track of writes that are being written to disk.
134    private final TreeMap<Long, PageWrite> writes = new TreeMap<Long, PageWrite>();
135
136    // Keeps track of free pages.
137    private final AtomicLong nextFreePageId = new AtomicLong();
138    private SequenceSet freeList = new SequenceSet();
139
140    private AtomicReference<SequenceSet> recoveredFreeList = new AtomicReference<SequenceSet>();
141    private AtomicReference<SequenceSet> trackingFreeDuringRecovery = new AtomicReference<SequenceSet>();
142
143    private final AtomicLong nextTxid = new AtomicLong();
144
145    // Persistent settings stored in the page file.
146    private MetaData metaData;
147
148    private final HashMap<File, RandomAccessFile> tmpFilesForRemoval = new HashMap<>();
149
150    private boolean useLFRUEviction = false;
151    private float LFUEvictionFactor = 0.2f;
152
153    /**
154     * Use to keep track of updated pages which have not yet been committed.
155     */
156    static class PageWrite {
157        Page page;
158        byte[] current;
159        byte[] diskBound;
160        long currentLocation = -1;
161        long diskBoundLocation = -1;
162        File tmpFile;
163        int length;
164
165        public PageWrite(Page page, byte[] data) {
166            this.page = page;
167            current = data;
168        }
169
170        public PageWrite(Page page, long currentLocation, int length, File tmpFile) {
171            this.page = page;
172            this.currentLocation = currentLocation;
173            this.tmpFile = tmpFile;
174            this.length = length;
175        }
176
177        public void setCurrent(Page page, byte[] data) {
178            this.page = page;
179            current = data;
180            currentLocation = -1;
181            diskBoundLocation = -1;
182        }
183
184        public void setCurrentLocation(Page page, long location, int length) {
185            this.page = page;
186            this.currentLocation = location;
187            this.length = length;
188            this.current = null;
189        }
190
191        @Override
192        public String toString() {
193            return "[PageWrite:" + page.getPageId() + "-" + page.getType() + "]";
194        }
195
196        @SuppressWarnings("unchecked")
197        public Page getPage() {
198            return page;
199        }
200
201        public byte[] getDiskBound(HashMap<File, RandomAccessFile> tmpFiles) throws IOException {
202            if (diskBound == null && diskBoundLocation != -1) {
203                diskBound = new byte[length];
204                if (tmpFiles.containsKey(tmpFile) && tmpFiles.get(tmpFile).getChannel().isOpen()) {
205                    RandomAccessFile file = tmpFiles.get(tmpFile);
206                    file.seek(diskBoundLocation);
207                    file.read(diskBound);
208                } else {
209                    try (RandomAccessFile file = new RandomAccessFile(tmpFile, "r")) {
210                        file.seek(diskBoundLocation);
211                        file.read(diskBound);
212                    }
213                }
214                diskBoundLocation = -1;
215            }
216            return diskBound;
217        }
218
219        void begin() {
220            if (currentLocation != -1) {
221                diskBoundLocation = currentLocation;
222            } else {
223                diskBound = current;
224            }
225            current = null;
226            currentLocation = -1;
227        }
228
229        /**
230         * @return true if there is no pending writes to do.
231         */
232        boolean done() {
233            diskBoundLocation = -1;
234            diskBound = null;
235            return current == null || currentLocation == -1;
236        }
237
238        boolean isDone() {
239            return diskBound == null && diskBoundLocation == -1 && current == null && currentLocation == -1;
240        }
241    }
242
243    /**
244     * The MetaData object hold the persistent data associated with a PageFile object.
245     */
246    public static class MetaData {
247
248        String fileType;
249        String fileTypeVersion;
250
251        long metaDataTxId = -1;
252        int pageSize;
253        boolean cleanShutdown;
254        long lastTxId;
255        long freePages;
256
257        public String getFileType() {
258            return fileType;
259        }
260
261        public void setFileType(String fileType) {
262            this.fileType = fileType;
263        }
264
265        public String getFileTypeVersion() {
266            return fileTypeVersion;
267        }
268
269        public void setFileTypeVersion(String version) {
270            this.fileTypeVersion = version;
271        }
272
273        public long getMetaDataTxId() {
274            return metaDataTxId;
275        }
276
277        public void setMetaDataTxId(long metaDataTxId) {
278            this.metaDataTxId = metaDataTxId;
279        }
280
281        public int getPageSize() {
282            return pageSize;
283        }
284
285        public void setPageSize(int pageSize) {
286            this.pageSize = pageSize;
287        }
288
289        public boolean isCleanShutdown() {
290            return cleanShutdown;
291        }
292
293        public void setCleanShutdown(boolean cleanShutdown) {
294            this.cleanShutdown = cleanShutdown;
295        }
296
297        public long getLastTxId() {
298            return lastTxId;
299        }
300
301        public void setLastTxId(long lastTxId) {
302            this.lastTxId = lastTxId;
303        }
304
305        public long getFreePages() {
306            return freePages;
307        }
308
309        public void setFreePages(long value) {
310            this.freePages = value;
311        }
312    }
313
    /**
     * Creates a new transaction scoped to this page file.
     *
     * @throws IllegalStateException if the page file is not loaded.
     */
    public Transaction tx() {
        assertLoaded();
        return new Transaction(this);
    }

    /**
     * Creates a PageFile in the specified directory whose data files are named by name.
     */
    public PageFile(File directory, String name) {
        this.directory = directory;
        this.name = name;
    }
326
327    /**
328     * Deletes the files used by the PageFile object.  This method can only be used when this object is not loaded.
329     *
330     * @throws IOException           if the files cannot be deleted.
331     * @throws IllegalStateException if this PageFile is loaded
332     */
333    public void delete() throws IOException {
334        if (loaded.get()) {
335            throw new IllegalStateException("Cannot delete page file data when the page file is loaded");
336        }
337        delete(getMainPageFile());
338        delete(getFreeFile());
339        delete(getRecoveryFile());
340    }
341
342    public void archive() throws IOException {
343        if (loaded.get()) {
344            throw new IllegalStateException("Cannot delete page file data when the page file is loaded");
345        }
346        long timestamp = System.currentTimeMillis();
347        archive(getMainPageFile(), String.valueOf(timestamp));
348        archive(getFreeFile(), String.valueOf(timestamp));
349        archive(getRecoveryFile(), String.valueOf(timestamp));
350    }
351
352    /**
353     * @param file
354     * @throws IOException
355     */
356    private void delete(File file) throws IOException {
357        if (file.exists() && !file.delete()) {
358            throw new IOException("Could not delete: " + file.getPath());
359        }
360    }
361
362    private void archive(File file, String suffix) throws IOException {
363        if (file.exists()) {
364            File archive = new File(file.getPath() + "-" + suffix);
365            if (!file.renameTo(archive)) {
366                throw new IOException("Could not archive: " + file.getPath() + " to " + file.getPath());
367            }
368        }
369    }
370
371    /**
372     * Loads the page file so that it can be accessed for read/write purposes.  This allocates OS resources.  If this is the
373     * first time the page file is loaded, then this creates the page file in the file system.
374     *
375     * @throws IOException           If the page file cannot be loaded. This could be cause the existing page file is corrupt is a bad version or if
376     *                               there was a disk error.
377     * @throws IllegalStateException If the page file was already loaded.
378     */
379    public void load() throws IOException, IllegalStateException {
380        if (loaded.compareAndSet(false, true)) {
381
382            if (enablePageCaching) {
383                if (isUseLFRUEviction()) {
384                    pageCache = Collections.synchronizedMap(new LFUCache<Long, Page>(pageCacheSize, getLFUEvictionFactor()));
385                } else {
386                    pageCache = Collections.synchronizedMap(new LRUCache<Long, Page>(pageCacheSize, pageCacheSize, 0.75f, true));
387                }
388            }
389
390            File file = getMainPageFile();
391            IOHelper.mkdirs(file.getParentFile());
392            writeFile = new RecoverableRandomAccessFile(file, "rw", false);
393            readFile = new RecoverableRandomAccessFile(file, "r");
394
395            if (readFile.length() > 0) {
396                // Load the page size setting cause that can't change once the file is created.
397                loadMetaData();
398                pageSize = metaData.getPageSize();
399            } else {
400                // Store the page size setting cause that can't change once the file is created.
401                metaData = new MetaData();
402                metaData.setFileType(PageFile.class.getName());
403                metaData.setFileTypeVersion("1");
404                metaData.setPageSize(getPageSize());
405                metaData.setCleanShutdown(true);
406                metaData.setFreePages(-1);
407                metaData.setLastTxId(0);
408                storeMetaData();
409            }
410
411            if (enableRecoveryFile) {
412                recoveryFile = new RecoverableRandomAccessFile(getRecoveryFile(), "rw");
413            }
414
415            if (metaData.isCleanShutdown()) {
416                nextTxid.set(metaData.getLastTxId() + 1);
417                if (metaData.getFreePages() > 0) {
418                    loadFreeList();
419                }
420            } else {
421                LOG.debug(toString() + ", Recovering page file...");
422                nextTxid.set(redoRecoveryUpdates());
423                trackingFreeDuringRecovery.set(new SequenceSet());
424            }
425
426            if (writeFile.length() < PAGE_FILE_HEADER_SIZE) {
427                writeFile.setLength(PAGE_FILE_HEADER_SIZE);
428            }
429            nextFreePageId.set((writeFile.length() - PAGE_FILE_HEADER_SIZE) / pageSize);
430
431            metaData.setCleanShutdown(false);
432            storeMetaData();
433            getFreeFile().delete();
434            startWriter();
435            if (trackingFreeDuringRecovery.get() != null) {
436                asyncFreePageRecovery(nextFreePageId.get());
437            }
438        } else {
439            throw new IllegalStateException("Cannot load the page file when it is already loaded.");
440        }
441    }
442
443    private void asyncFreePageRecovery(final long lastRecoveryPage) {
444        Thread thread = new Thread("KahaDB Index Free Page Recovery") {
445            @Override
446            public void run() {
447                try {
448                    recoverFreePages(lastRecoveryPage);
449                } catch (Throwable e) {
450                    if (loaded.get()) {
451                        LOG.warn("Error recovering index free page list", e);
452                    }
453                }
454            }
455        };
456        thread.setPriority(Thread.NORM_PRIORITY);
457        thread.setDaemon(true);
458        thread.start();
459    }
460
    /**
     * Rebuilds the free page list after an unclean shutdown by scanning every
     * page below lastRecoveryPage.  Runs on the background thread started by
     * asyncFreePageRecovery(); the result is handed off to flush() for merging.
     */
    private void recoverFreePages(final long lastRecoveryPage) throws Exception {
        LOG.info(toString() + ". Recovering pageFile free list due to prior unclean shutdown..");
        SequenceSet newFreePages = new SequenceSet();
        // need new pageFile instance to get unshared readFile
        PageFile recoveryPageFile = new PageFile(directory, name);
        recoveryPageFile.loadForRecovery(nextFreePageId.get());
        try {
            for (Iterator<Page> i = new Transaction(recoveryPageFile).iterator(true); i.hasNext(); ) {
                Page page = i.next();

                // Pages allocated after the scan started are not covered by this pass.
                if (page.getPageId() >= lastRecoveryPage) {
                    break;
                }

                if (page.getType() == Page.PAGE_FREE_TYPE) {
                    newFreePages.add(page.getPageId());
                }
            }
        } finally {
            recoveryPageFile.readFile.close();
        }

        LOG.info(toString() + ". Recovered pageFile free list of size: " + newFreePages.rangeSize());
        if (!newFreePages.isEmpty()) {

            // allow flush (with index lock held) to merge eventually
            recoveredFreeList.lazySet(newFreePages);
        } else {
            // If there is no free pages, set trackingFreeDuringRecovery to allow the broker to have a clean shutdown
            trackingFreeDuringRecovery.set(null);
        }
    }
493
    /**
     * Minimal read-only load used by recoverFreePages(): opens only the read
     * handle, disables page caching and the recovery file, and pins the next
     * free page id to the supplied snapshot.
     */
    private void loadForRecovery(long nextFreePageIdSnap) throws Exception {
        loaded.set(true);
        enablePageCaching = false;
        File file = getMainPageFile();
        readFile = new RecoverableRandomAccessFile(file, "r");
        loadMetaData();
        pageSize = metaData.getPageSize();
        enableRecoveryFile = false;
        nextFreePageId.set(nextFreePageIdSnap);
    }
504
505
506    /**
507     * Unloads a previously loaded PageFile.  This deallocates OS related resources like file handles.
508     * once unloaded, you can no longer use the page file to read or write Pages.
509     *
510     * @throws IOException           if there was a disk error occurred while closing the down the page file.
511     * @throws IllegalStateException if the PageFile is not loaded
512     */
513    public void unload() throws IOException {
514        if (loaded.compareAndSet(true, false)) {
515            flush();
516            try {
517                stopWriter();
518            } catch (InterruptedException e) {
519                throw new InterruptedIOException();
520            }
521
522            if (freeList.isEmpty()) {
523                metaData.setFreePages(0);
524            } else {
525                storeFreeList();
526                metaData.setFreePages(freeList.size());
527            }
528
529            metaData.setLastTxId(nextTxid.get() - 1);
530            if (trackingFreeDuringRecovery.get() != null) {
531                // async recovery incomplete, will have to try again
532                metaData.setCleanShutdown(false);
533            } else {
534                metaData.setCleanShutdown(true);
535            }
536            storeMetaData();
537
538            if (readFile != null) {
539                readFile.close();
540                readFile = null;
541                writeFile.close();
542                writeFile = null;
543                if (enableRecoveryFile) {
544                    recoveryFile.close();
545                    recoveryFile = null;
546                }
547                freeList.clear();
548                if (pageCache != null) {
549                    pageCache = null;
550                }
551                synchronized (writes) {
552                    writes.clear();
553                }
554            }
555        } else {
556            throw new IllegalStateException("Cannot unload the page file when it is not loaded");
557        }
558    }
559
    // @return true if the page file is currently loaded.
    public boolean isLoaded() {
        return loaded.get();
    }

    // @return true if metadata has been read and records a clean shutdown.
    public boolean isCleanShutdown() {
        return metaData != null && metaData.isCleanShutdown();
    }

    // Marks the page file as loaded again so I/O can resume after a forced stop.
    public void allowIOResumption() {
        loaded.set(true);
    }
571
572    /**
573     * Flush and sync all write buffers to disk.
574     *
575     * @throws IOException If an disk error occurred.
576     */
577    public void flush() throws IOException {
578
579        if (enabledWriteThread && stopWriter.get()) {
580            throw new IOException("Page file already stopped: checkpointing is not allowed");
581        }
582
583        SequenceSet recovered = recoveredFreeList.get();
584        if (recovered != null) {
585            recoveredFreeList.lazySet(null);
586            SequenceSet inUse = trackingFreeDuringRecovery.get();
587            recovered.remove(inUse);
588            freeList.merge(recovered);
589
590            // all set for clean shutdown
591            trackingFreeDuringRecovery.set(null);
592            inUse.clear();
593        }
594
595        // Setup a latch that gets notified when all buffered writes hits the disk.
596        CountDownLatch checkpointLatch;
597        synchronized (writes) {
598            if (writes.isEmpty()) {
599                return;
600            }
601            if (enabledWriteThread) {
602                if (this.checkpointLatch == null) {
603                    this.checkpointLatch = new CountDownLatch(1);
604                }
605                checkpointLatch = this.checkpointLatch;
606                writes.notify();
607            } else {
608                writeBatch();
609                return;
610            }
611        }
612        try {
613            checkpointLatch.await();
614        } catch (InterruptedException e) {
615            InterruptedIOException ioe = new InterruptedIOException();
616            ioe.initCause(e);
617            throw ioe;
618        }
619    }
620
621
    @Override
    public String toString() {
        return "Page File: " + getMainPageFile();
    }

    ///////////////////////////////////////////////////////////////////
    // Private Implementation Methods
    ///////////////////////////////////////////////////////////////////
    // Main data file: <name>.data in the configured directory.
    private File getMainPageFile() {
        return new File(directory, IOHelper.toFileSystemSafeName(name) + PAGEFILE_SUFFIX);
    }

    // Serialized free page list: <name>.free (written on clean shutdown).
    public File getFreeFile() {
        return new File(directory, IOHelper.toFileSystemSafeName(name) + FREE_FILE_SUFFIX);
    }

    // Redo log guarding against partial page writes: <name>.redo.
    public File getRecoveryFile() {
        return new File(directory, IOHelper.toFileSystemSafeName(name) + RECOVERY_FILE_SUFFIX);
    }

    // Converts a page id to its byte offset within the main data file.
    public long toOffset(long pageId) {
        return PAGE_FILE_HEADER_SIZE + (pageId * pageSize);
    }
645
646    private void loadMetaData() throws IOException {
647
648        ByteArrayInputStream is;
649        MetaData v1 = new MetaData();
650        MetaData v2 = new MetaData();
651        try {
652            Properties p = new Properties();
653            byte[] d = new byte[PAGE_FILE_HEADER_SIZE / 2];
654            readFile.seek(0);
655            readFile.readFully(d);
656            is = new ByteArrayInputStream(d);
657            p.load(is);
658            IntrospectionSupport.setProperties(v1, p);
659        } catch (IOException e) {
660            v1 = null;
661        }
662
663        try {
664            Properties p = new Properties();
665            byte[] d = new byte[PAGE_FILE_HEADER_SIZE / 2];
666            readFile.seek(PAGE_FILE_HEADER_SIZE / 2);
667            readFile.readFully(d);
668            is = new ByteArrayInputStream(d);
669            p.load(is);
670            IntrospectionSupport.setProperties(v2, p);
671        } catch (IOException e) {
672            v2 = null;
673        }
674
675        if (v1 == null && v2 == null) {
676            throw new IOException("Could not load page file meta data");
677        }
678
679        if (v1 == null || v1.metaDataTxId < 0) {
680            metaData = v2;
681        } else if (v2 == null || v1.metaDataTxId < 0) {
682            metaData = v1;
683        } else if (v1.metaDataTxId == v2.metaDataTxId) {
684            metaData = v1; // use the first since the 2nd could be a partial..
685        } else {
686            metaData = v2; // use the second cause the first is probably a partial.
687        }
688    }
689
690    private void storeMetaData() throws IOException {
691        // Convert the metadata into a property format
692        metaData.metaDataTxId++;
693        Properties p = new Properties();
694        IntrospectionSupport.getProperties(metaData, p, null);
695
696        ByteArrayOutputStream os = new ByteArrayOutputStream(PAGE_FILE_HEADER_SIZE);
697        p.store(os, "");
698        if (os.size() > PAGE_FILE_HEADER_SIZE / 2) {
699            throw new IOException("Configuation is larger than: " + PAGE_FILE_HEADER_SIZE / 2);
700        }
701        // Fill the rest with space...
702        byte[] filler = new byte[(PAGE_FILE_HEADER_SIZE / 2) - os.size()];
703        Arrays.fill(filler, (byte) ' ');
704        os.write(filler);
705        os.flush();
706
707        byte[] d = os.toByteArray();
708
709        // So we don't loose it.. write it 2 times...
710        writeFile.seek(0);
711        writeFile.write(d);
712        writeFile.sync();
713        writeFile.seek(PAGE_FILE_HEADER_SIZE / 2);
714        writeFile.write(d);
715        writeFile.sync();
716    }
717
718    private void storeFreeList() throws IOException {
719        FileOutputStream os = new FileOutputStream(getFreeFile());
720        DataOutputStream dos = new DataOutputStream(os);
721        SequenceSet.Marshaller.INSTANCE.writePayload(freeList, dos);
722        dos.close();
723    }
724
725    private void loadFreeList() throws IOException {
726        freeList.clear();
727        FileInputStream is = new FileInputStream(getFreeFile());
728        DataInputStream dis = new DataInputStream(is);
729        freeList = SequenceSet.Marshaller.INSTANCE.readPayload(dis);
730        dis.close();
731    }
732
733    ///////////////////////////////////////////////////////////////////
734    // Property Accessors
735    ///////////////////////////////////////////////////////////////////
736
737    /**
738     * Is the recovery buffer used to double buffer page writes.  Enabled by default.
739     *
740     * @return is the recovery buffer enabled.
741     */
742    public boolean isEnableRecoveryFile() {
743        return enableRecoveryFile;
744    }
745
746    /**
747     * Sets if the recovery buffer uses to double buffer page writes.  Enabled by default.  Disabling this
748     * may potentially cause partial page writes which can lead to page file corruption.
749     */
750    public void setEnableRecoveryFile(boolean doubleBuffer) {
751        assertNotLoaded();
752        this.enableRecoveryFile = doubleBuffer;
753    }
754
755    /**
756     * @return Are page writes synced to disk?
757     */
758    public boolean isEnableDiskSyncs() {
759        return enableDiskSyncs;
760    }
761
762    /**
763     * Allows you enable syncing writes to disk.
764     */
765    public void setEnableDiskSyncs(boolean syncWrites) {
766        assertNotLoaded();
767        this.enableDiskSyncs = syncWrites;
768    }
769
770    /**
771     * @return the page size
772     */
773    public int getPageSize() {
774        return this.pageSize;
775    }
776
777    /**
778     * @return the amount of content data that a page can hold.
779     */
780    public int getPageContentSize() {
781        return this.pageSize - Page.PAGE_HEADER_SIZE;
782    }
783
784    /**
785     * Configures the page size used by the page file.  By default it is 4k.  Once a page file is created on disk,
786     * subsequent loads of that file will use the original pageSize.  Once the PageFile is loaded, this setting
787     * can no longer be changed.
788     *
789     * @param pageSize the pageSize to set
790     * @throws IllegalStateException once the page file is loaded.
791     */
792    public void setPageSize(int pageSize) throws IllegalStateException {
793        assertNotLoaded();
794        this.pageSize = pageSize;
795    }
796
797    /**
798     * @return true if read page caching is enabled
799     */
800    public boolean isEnablePageCaching() {
801        return this.enablePageCaching;
802    }
803
804    /**
805     * @param enablePageCaching allows you to enable read page caching
806     */
807    public void setEnablePageCaching(boolean enablePageCaching) {
808        assertNotLoaded();
809        this.enablePageCaching = enablePageCaching;
810    }
811
812    /**
813     * @return the maximum number of pages that will get stored in the read page cache.
814     */
815    public int getPageCacheSize() {
816        return this.pageCacheSize;
817    }
818
819    /**
820     * @param pageCacheSize Sets the maximum number of pages that will get stored in the read page cache.
821     */
822    public void setPageCacheSize(int pageCacheSize) {
823        assertNotLoaded();
824        this.pageCacheSize = pageCacheSize;
825    }
826
827    public boolean isEnabledWriteThread() {
828        return enabledWriteThread;
829    }
830
831    public void setEnableWriteThread(boolean enableAsyncWrites) {
832        assertNotLoaded();
833        this.enabledWriteThread = enableAsyncWrites;
834    }
835
836    public long getDiskSize() throws IOException {
837        return toOffset(nextFreePageId.get());
838    }
839
840    public boolean isFreePage(long pageId) {
841        return freeList.contains(pageId);
842    }
843    /**
844     * @return the number of pages allocated in the PageFile
845     */
846    public long getPageCount() {
847        return nextFreePageId.get();
848    }
849
    /**
     * @return the minimum number of pages the recovery file is sized to hold
     */
    public int getRecoveryFileMinPageCount() {
        return recoveryFileMinPageCount;
    }
853
    /**
     * @return the number of pages currently on the free list
     * @throws IllegalStateException if the page file is not loaded
     */
    public long getFreePageCount() {
        assertLoaded();
        return freeList.rangeSize();
    }
858
    /**
     * Sets the minimum number of pages the recovery file is sized to hold.
     *
     * @param recoveryFileMinPageCount the minimum recovery file capacity in pages
     * @throws IllegalStateException if the page file has already been loaded
     */
    public void setRecoveryFileMinPageCount(int recoveryFileMinPageCount) {
        assertNotLoaded();
        this.recoveryFileMinPageCount = recoveryFileMinPageCount;
    }
863
    /**
     * @return the page count above which the recovery file gets shrunk back down
     */
    public int getRecoveryFileMaxPageCount() {
        return recoveryFileMaxPageCount;
    }
867
    /**
     * Sets the page count above which the recovery file gets shrunk back down.
     *
     * @param recoveryFileMaxPageCount the maximum recovery file capacity in pages
     * @throws IllegalStateException if the page file has already been loaded
     */
    public void setRecoveryFileMaxPageCount(int recoveryFileMaxPageCount) {
        assertNotLoaded();
        this.recoveryFileMaxPageCount = recoveryFileMaxPageCount;
    }
872
    /**
     * @return the maximum number of pending page writes accumulated before a
     *         batch is forced to disk
     */
    public int getWriteBatchSize() {
        return writeBatchSize;
    }
876
    /**
     * Sets the maximum number of pending page writes accumulated before a
     * batch is forced to disk.
     *
     * @param writeBatchSize the write batch capacity in pages
     */
    // NOTE(review): unlike the sibling setters this does not call
    // assertNotLoaded(); confirm whether changing the batch size after
    // load() is intentional.
    public void setWriteBatchSize(int writeBatchSize) {
        this.writeBatchSize = writeBatchSize;
    }
880
    /**
     * @return the eviction factor used when LFRU page-cache eviction is enabled
     */
    public float getLFUEvictionFactor() {
        return LFUEvictionFactor;
    }
884
    /**
     * Sets the eviction factor used when LFRU page-cache eviction is enabled.
     *
     * @param LFUEvictionFactor the new eviction factor
     */
    public void setLFUEvictionFactor(float LFUEvictionFactor) {
        this.LFUEvictionFactor = LFUEvictionFactor;
    }
888
    /**
     * @return true if the page cache uses LFRU eviction rather than plain LRU
     */
    public boolean isUseLFRUEviction() {
        return useLFRUEviction;
    }
892
    /**
     * Selects LFRU eviction for the page cache instead of plain LRU.
     *
     * @param useLFRUEviction true to enable LFRU eviction
     */
    public void setUseLFRUEviction(boolean useLFRUEviction) {
        this.useLFRUEviction = useLFRUEviction;
    }
896
897    ///////////////////////////////////////////////////////////////////
898    // Package Protected Methods exposed to Transaction
899    ///////////////////////////////////////////////////////////////////
900
901    /**
902     * @throws IllegalStateException if the page file is not loaded.
903     */
904    void assertLoaded() throws IllegalStateException {
905        if (!loaded.get()) {
906            throw new IllegalStateException("PageFile is not loaded");
907        }
908    }
909
    /**
     * Guard for configuration mutators that must run before load().
     *
     * @throws IllegalStateException if the page file is already loaded.
     */
    void assertNotLoaded() throws IllegalStateException {
        if (loaded.get()) {
            throw new IllegalStateException("PageFile is loaded");
        }
    }
915
916    /**
917     * Allocates a block of free pages that you can write data to.
918     *
919     * @param count the number of sequential pages to allocate
920     * @return the first page of the sequential set.
921     * @throws IOException           If an disk error occurred.
922     * @throws IllegalStateException if the PageFile is not loaded
923     */
924    <T> Page<T> allocate(int count) throws IOException {
925        assertLoaded();
926        if (count <= 0) {
927            throw new IllegalArgumentException("The allocation count must be larger than zero");
928        }
929
930        Sequence seq = freeList.removeFirstSequence(count);
931
932        // We may need to create new free pages...
933        if (seq == null) {
934
935            Page<T> first = null;
936            int c = count;
937
938            // Perform the id's only once....
939            long pageId = nextFreePageId.getAndAdd(count);
940            long writeTxnId = nextTxid.getAndAdd(count);
941
942            while (c-- > 0) {
943                Page<T> page = new Page<T>(pageId++);
944                page.makeFree(writeTxnId++);
945
946                if (first == null) {
947                    first = page;
948                }
949
950                addToCache(page);
951                DataByteArrayOutputStream out = new DataByteArrayOutputStream(pageSize);
952                page.write(out);
953                write(page, out.getData());
954
955                // LOG.debug("allocate writing: "+page.getPageId());
956            }
957
958            return first;
959        }
960
961        Page<T> page = new Page<T>(seq.getFirst());
962        page.makeFree(0);
963        // LOG.debug("allocated: "+page.getPageId());
964        return page;
965    }
966
    /**
     * @return a fresh, monotonically increasing write transaction id
     */
    long getNextWriteTransactionId() {
        return nextTxid.incrementAndGet();
    }
970
    /**
     * Reads the raw contents of a page from the read file into {@code data}.
     * Synchronized because seek() mutates the shared read file's position.
     *
     * @param pageId the id of the page to read
     * @param data   destination buffer; filled completely by readFully
     * @throws IOException if the seek or read fails
     */
    synchronized void readPage(long pageId, byte[] data) throws IOException {
        readFile.seek(toOffset(pageId));
        readFile.readFully(data);
    }
975
976    public void freePage(long pageId) {
977        freeList.add(pageId);
978        removeFromCache(pageId);
979
980        SequenceSet trackFreeDuringRecovery = trackingFreeDuringRecovery.get();
981        if (trackFreeDuringRecovery != null) {
982            trackFreeDuringRecovery.add(pageId);
983        }
984    }
985
986    @SuppressWarnings("unchecked")
987    private <T> void write(Page<T> page, byte[] data) throws IOException {
988        final PageWrite write = new PageWrite(page, data);
989        Entry<Long, PageWrite> entry = new Entry<Long, PageWrite>() {
990            @Override
991            public Long getKey() {
992                return write.getPage().getPageId();
993            }
994
995            @Override
996            public PageWrite getValue() {
997                return write;
998            }
999
1000            @Override
1001            public PageWrite setValue(PageWrite value) {
1002                return null;
1003            }
1004        };
1005        Entry<Long, PageWrite>[] entries = new Map.Entry[]{entry};
1006        write(Arrays.asList(entries));
1007    }
1008
    /**
     * Adds a set of page updates to the pending write cache, merging each with
     * any update already queued for the same page id. In async mode this call
     * blocks while the cache is at capacity. Once enough writes accumulate
     * (or a long-transaction overflow write arrives) the batch is flushed
     * synchronously, or the writer thread is notified.
     *
     * @param updates page id to PageWrite entries to queue
     * @throws IOException            if a synchronous batch flush fails
     * @throws InterruptedIOException if interrupted while waiting for cache space
     */
    void write(Collection<Map.Entry<Long, PageWrite>> updates) throws IOException {
        synchronized (writes) {
            if (enabledWriteThread) {
                // Back-pressure: block producers until the writer drains the cache.
                while (writes.size() >= writeBatchSize && !stopWriter.get()) {
                    try {
                        writes.wait();
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                        throw new InterruptedIOException();
                    }
                }
            }

            boolean longTx = false;

            for (Map.Entry<Long, PageWrite> entry : updates) {
                Long key = entry.getKey();
                PageWrite value = entry.getValue();
                PageWrite write = writes.get(key);
                if (write == null) {
                    writes.put(key, value);
                } else {
                    // A write for this page is already queued; merge the newer
                    // content into it rather than queueing a second write.
                    if (value.currentLocation != -1) {
                        // Long-transaction update whose data lives in a temp file.
                        write.setCurrentLocation(value.page, value.currentLocation, value.length);
                        write.tmpFile = value.tmpFile;
                        longTx = true;
                    } else {
                        write.setCurrent(value.page, value.current);
                    }
                }
            }

            // Once we start approaching capacity, notify the writer to start writing
            // sync immediately for long txs
            if (longTx || canStartWriteBatch()) {

                if (enabledWriteThread) {
                    writes.notify();
                } else {
                    writeBatch();
                }
            }
        }
    }
1053
1054    private boolean canStartWriteBatch() {
1055        int capacityUsed = ((writes.size() * 100) / writeBatchSize);
1056        if (enabledWriteThread) {
1057            // The constant 10 here controls how soon write batches start going to disk..
1058            // would be nice to figure out how to auto tune that value.  Make to small and
1059            // we reduce through put because we are locking the write mutex too often doing writes
1060            return capacityUsed >= 10 || checkpointLatch != null;
1061        } else {
1062            return capacityUsed >= 80 || checkpointLatch != null;
1063        }
1064    }
1065
1066    ///////////////////////////////////////////////////////////////////
1067    // Cache Related operations
1068    ///////////////////////////////////////////////////////////////////
1069    @SuppressWarnings("unchecked")
1070    <T> Page<T> getFromCache(long pageId) {
1071        synchronized (writes) {
1072            PageWrite pageWrite = writes.get(pageId);
1073            if (pageWrite != null) {
1074                return pageWrite.page;
1075            }
1076        }
1077
1078        Page<T> result = null;
1079        if (enablePageCaching) {
1080            result = pageCache.get(pageId);
1081        }
1082        return result;
1083    }
1084
1085    void addToCache(Page page) {
1086        if (enablePageCaching) {
1087            pageCache.put(page.getPageId(), page);
1088        }
1089    }
1090
1091    void removeFromCache(long pageId) {
1092        if (enablePageCaching) {
1093            pageCache.remove(pageId);
1094        }
1095    }
1096
1097    ///////////////////////////////////////////////////////////////////
1098    // Internal Double write implementation follows...
1099    ///////////////////////////////////////////////////////////////////
1100
    /**
     * Main loop of the asynchronous writer thread: waits for queued page
     * writes and flushes them in batches until {@code stopWriter} is set.
     * Any checkpoint waiter is always released, even on failure.
     */
    private void pollWrites() {
        try {
            while (!stopWriter.get()) {
                // Wait for a notification...
                synchronized (writes) {
                    // Wake any producers blocked on a full write cache.
                    writes.notifyAll();

                    // If there is not enough to write, wait for a notification...
                    // The 100ms timeout also lets the loop re-check stopWriter.
                    while (writes.isEmpty() && checkpointLatch == null && !stopWriter.get()) {
                        writes.wait(100);
                    }

                    // Nothing to flush: a pending checkpoint is trivially satisfied.
                    if (writes.isEmpty()) {
                        releaseCheckpointWaiter();
                    }
                }
                writeBatch();
            }
        } catch (Throwable e) {
            LOG.info("An exception was raised while performing poll writes", e);
        } finally {
            releaseCheckpointWaiter();
        }
    }
1125
    /**
     * Flushes the current write cache to the main page file as one batch.
     * When the recovery file is enabled, the batch is first journaled there
     * together with an Adler32 checksum, so a partially applied batch can be
     * redone by redoRecoveryUpdates() after a crash. Any checkpoint waiter
     * captured at the start of the batch is released on completion or failure.
     *
     * @throws IOException if the disk write fails; the page file is marked
     *                     not loaded so a recovery pass runs before further use
     */
    private void writeBatch() throws IOException {

        CountDownLatch checkpointLatch;
        ArrayList<PageWrite> batch;
        synchronized (writes) {
            // If there is not enough to write, wait for a notification...

            batch = new ArrayList<PageWrite>(writes.size());
            // build a write batch from the current write cache.
            for (PageWrite write : writes.values()) {
                batch.add(write);
                // Move the current write to the diskBound write, this lets folks update the
                // page again without blocking for this write.
                write.begin();
                if (write.diskBound == null && write.diskBoundLocation == -1) {
                    // begin() bound nothing for disk; drop this write from the batch.
                    batch.remove(write);
                }
            }

            // Grab on to the existing checkpoint latch cause once we do this write we can
            // release the folks that were waiting for those writes to hit disk.
            checkpointLatch = this.checkpointLatch;
            this.checkpointLatch = null;
        }

        try {

            // First land the writes in the recovery file
            if (enableRecoveryFile) {
                Checksum checksum = new Adler32();

                recoveryFile.seek(RECOVERY_FILE_HEADER_SIZE);

                for (PageWrite w : batch) {
                    try {
                        checksum.update(w.getDiskBound(tmpFilesForRemoval), 0, pageSize);
                    } catch (Throwable t) {
                        throw IOExceptionSupport.create("Cannot create recovery file. Reason: " + t, t);
                    }
                    recoveryFile.writeLong(w.page.getPageId());
                    recoveryFile.write(w.getDiskBound(tmpFilesForRemoval), 0, pageSize);
                }

                // Can we shrink the recovery buffer??
                if (recoveryPageCount > recoveryFileMaxPageCount) {
                    int t = Math.max(recoveryFileMinPageCount, batch.size());
                    recoveryFile.setLength(recoveryFileSizeForPages(t));
                }

                // Record the page writes in the recovery buffer.
                recoveryFile.seek(0);
                // Store the next tx id...
                recoveryFile.writeLong(nextTxid.get());
                // Store the checksum for the write batch so that on recovery we
                // know if we have a consistent
                // write batch on disk.
                recoveryFile.writeLong(checksum.getValue());
                // Write the # of pages that will follow
                recoveryFile.writeInt(batch.size());

                if (enableDiskSyncs) {
                    recoveryFile.sync();
                }
            }

            // Now apply the batch to the main page file.
            for (PageWrite w : batch) {
                writeFile.seek(toOffset(w.page.getPageId()));
                writeFile.write(w.getDiskBound(tmpFilesForRemoval), 0, pageSize);
                w.done();
            }

            if (enableDiskSyncs) {
                writeFile.sync();
            }

        } catch (IOException ioError) {
            LOG.info("Unexpected io error on pagefile write of " + batch.size() + " pages.", ioError);
            // any subsequent write needs to be prefaced with a considered call to redoRecoveryUpdates
            // to ensure disk image is self consistent
            loaded.set(false);
            throw  ioError;
        } finally {
            synchronized (writes) {
                for (PageWrite w : batch) {
                    // If there are no more pending writes, then remove it from
                    // the write cache.
                    if (w.isDone()) {
                        writes.remove(w.page.getPageId());
                        // Clean up any long-transaction temp file now that its
                        // pages are on disk.
                        if (w.tmpFile != null && tmpFilesForRemoval.containsKey(w.tmpFile)) {
                            tmpFilesForRemoval.get(w.tmpFile).close();
                            if (!w.tmpFile.delete()) {
                                throw new IOException("Can't delete temporary KahaDB transaction file:" + w.tmpFile);
                            }
                            tmpFilesForRemoval.remove(w.tmpFile);
                        }
                    }
                }
            }

            if (checkpointLatch != null) {
                checkpointLatch.countDown();
            }
        }
    }
1230
1231    public void removeTmpFile(File file, RandomAccessFile randomAccessFile) throws IOException {
1232        if (!tmpFilesForRemoval.containsKey(file)) {
1233            tmpFilesForRemoval.put(file, randomAccessFile);
1234        } else {
1235            randomAccessFile.close();
1236        }
1237    }
1238
    /**
     * Computes the recovery file length needed to journal {@code pageCount}
     * pages: the fixed header plus, per page, an 8 byte page id and the page
     * body.
     *
     * @param pageCount the number of pages the recovery file should hold
     * @return the required file length in bytes
     */
    private long recoveryFileSizeForPages(int pageCount) {
        return RECOVERY_FILE_HEADER_SIZE + ((pageSize + 8L) * pageCount);
    }
1242
    /**
     * Releases the thread (if any) blocked waiting on a checkpoint by counting
     * down and clearing the current checkpoint latch.
     */
    private void releaseCheckpointWaiter() {
        if (checkpointLatch != null) {
            checkpointLatch.countDown();
            checkpointLatch = null;
        }
    }
1249
1250    /**
1251     * Inspects the recovery buffer and re-applies any
1252     * partially applied page writes.
1253     *
1254     * @return the next transaction id that can be used.
1255     */
1256    private long redoRecoveryUpdates() throws IOException {
1257        if (!enableRecoveryFile) {
1258            return 0;
1259        }
1260        recoveryPageCount = 0;
1261
1262        // Are we initializing the recovery file?
1263        if (recoveryFile.length() == 0) {
1264            // Write an empty header..
1265            recoveryFile.write(new byte[RECOVERY_FILE_HEADER_SIZE]);
1266            // Preallocate the minium size for better performance.
1267            recoveryFile.setLength(recoveryFileSizeForPages(recoveryFileMinPageCount));
1268            return 0;
1269        }
1270
1271        // How many recovery pages do we have in the recovery buffer?
1272        recoveryFile.seek(0);
1273        long nextTxId = recoveryFile.readLong();
1274        long expectedChecksum = recoveryFile.readLong();
1275        int pageCounter = recoveryFile.readInt();
1276
1277        recoveryFile.seek(RECOVERY_FILE_HEADER_SIZE);
1278        Checksum checksum = new Adler32();
1279        LinkedHashMap<Long, byte[]> batch = new LinkedHashMap<Long, byte[]>();
1280        try {
1281            for (int i = 0; i < pageCounter; i++) {
1282                long offset = recoveryFile.readLong();
1283                byte[] data = new byte[pageSize];
1284                if (recoveryFile.read(data, 0, pageSize) != pageSize) {
1285                    // Invalid recovery record, Could not fully read the data". Probably due to a partial write to the recovery buffer
1286                    return nextTxId;
1287                }
1288                checksum.update(data, 0, pageSize);
1289                batch.put(offset, data);
1290            }
1291        } catch (Exception e) {
1292            // If an error occurred it was cause the redo buffer was not full written out correctly.. so don't redo it.
1293            // as the pages should still be consistent.
1294            LOG.debug("Redo buffer was not fully intact: ", e);
1295            return nextTxId;
1296        }
1297
1298        recoveryPageCount = pageCounter;
1299
1300        // If the checksum is not valid then the recovery buffer was partially written to disk.
1301        if (checksum.getValue() != expectedChecksum) {
1302            return nextTxId;
1303        }
1304
1305        // Re-apply all the writes in the recovery buffer.
1306        for (Map.Entry<Long, byte[]> e : batch.entrySet()) {
1307            writeFile.seek(toOffset(e.getKey()));
1308            writeFile.write(e.getValue());
1309        }
1310
1311        // And sync it to disk
1312        writeFile.sync();
1313        return nextTxId;
1314    }
1315
1316    private void startWriter() {
1317        synchronized (writes) {
1318            if (enabledWriteThread) {
1319                stopWriter.set(false);
1320                writerThread = new Thread("KahaDB Page Writer") {
1321                    @Override
1322                    public void run() {
1323                        pollWrites();
1324                    }
1325                };
1326                writerThread.setPriority(Thread.MAX_PRIORITY);
1327                writerThread.setDaemon(true);
1328                writerThread.start();
1329            }
1330        }
1331    }
1332
    /**
     * Signals the background writer thread to stop and waits for it to exit.
     * A no-op when asynchronous writes are disabled.
     *
     * @throws InterruptedException if interrupted while joining the writer thread
     */
    private void stopWriter() throws InterruptedException {
        if (enabledWriteThread) {
            stopWriter.set(true);
            writerThread.join();
        }
    }
1339
    /**
     * @return the main page file on disk
     */
    public File getFile() {
        return getMainPageFile();
    }
1343
    /**
     * @return the directory containing the page file
     */
    public File getDirectory() {
        return directory;
    }
1347}