001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.shortcircuit;
019    
020    import java.io.BufferedOutputStream;
021    import java.io.Closeable;
022    import java.io.DataInputStream;
023    import java.io.DataOutputStream;
024    import java.io.IOException;
025    import java.nio.MappedByteBuffer;
026    import java.util.HashMap;
027    import java.util.Map;
028    import java.util.Map.Entry;
029    import java.util.TreeMap;
030    import java.util.concurrent.ScheduledFuture;
031    import java.util.concurrent.ScheduledThreadPoolExecutor;
032    import java.util.concurrent.TimeUnit;
033    import java.util.concurrent.locks.Condition;
034    import java.util.concurrent.locks.ReentrantLock;
035    
036    import org.apache.commons.lang.mutable.MutableBoolean;
037    import org.apache.commons.logging.Log;
038    import org.apache.commons.logging.LogFactory;
039    import org.apache.hadoop.classification.InterfaceAudience;
040    import org.apache.hadoop.conf.Configuration;
041    import org.apache.hadoop.hdfs.DFSConfigKeys;
042    import org.apache.hadoop.hdfs.ExtendedBlockId;
043    import org.apache.hadoop.hdfs.net.DomainPeer;
044    import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
045    import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
046    import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ReleaseShortCircuitAccessResponseProto;
047    import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;
048    import org.apache.hadoop.hdfs.protocolPB.PBHelper;
049    import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot;
050    import org.apache.hadoop.io.IOUtils;
051    import org.apache.hadoop.ipc.RetriableException;
052    import org.apache.hadoop.net.unix.DomainSocket;
053    import org.apache.hadoop.net.unix.DomainSocketWatcher;
054    import org.apache.hadoop.security.token.SecretManager.InvalidToken;
055    import org.apache.hadoop.util.StringUtils;
056    import org.apache.hadoop.util.Time;
057    import org.apache.hadoop.util.Waitable;
058    
059    import com.google.common.annotations.VisibleForTesting;
060    import com.google.common.base.Preconditions;
061    import com.google.common.util.concurrent.ThreadFactoryBuilder;
062    
063    /**
064     * The ShortCircuitCache tracks things which the client needs to access
065     * HDFS block files via short-circuit.
066     *
067     * These things include: memory-mapped regions, file descriptors, and shared
068     * memory areas for communicating with the DataNode.
069     */
070    @InterfaceAudience.Private
071    public class ShortCircuitCache implements Closeable {
072      public static final Log LOG = LogFactory.getLog(ShortCircuitCache.class);
073    
074      /**
075       * Expiry thread which makes sure that the file descriptors get closed
076       * after a while.
077       */
078      private class CacheCleaner implements Runnable, Closeable {
079        private ScheduledFuture<?> future;
080    
081        /**
082         * Run the CacheCleaner thread.
083         *
084         * Whenever a thread requests a ShortCircuitReplica object, we will make
085         * sure it gets one.  That ShortCircuitReplica object can then be re-used
086         * when another thread requests a ShortCircuitReplica object for the same
087         * block.  So in that sense, there is no maximum size to the cache.
088         *
089         * However, when a ShortCircuitReplica object is unreferenced by the
090         * thread(s) that are using it, it becomes evictable.  There are two
091         * separate eviction lists-- one for mmaped objects, and another for
092         * non-mmaped objects.  We do this in order to avoid having the regular
093         * files kick the mmaped files out of the cache too quickly.  Reusing
094         * an already-existing mmap gives a huge performance boost, since the
095         * page table entries don't have to be re-populated.  Both the mmap
096         * and non-mmap evictable lists have maximum sizes and maximum lifespans.
097         */
098        @Override
099        public void run() {
100          ShortCircuitCache.this.lock.lock();
101          try {
102            if (ShortCircuitCache.this.closed) return;
103            long curMs = Time.monotonicNow();
104    
105            if (LOG.isDebugEnabled()) {
106              LOG.debug(this + ": cache cleaner running at " + curMs);
107            }
108    
109            int numDemoted = demoteOldEvictableMmaped(curMs);
110            int numPurged = 0;
111            Long evictionTimeNs = Long.valueOf(0);
112            while (true) {
113              Entry<Long, ShortCircuitReplica> entry = 
114                  evictable.ceilingEntry(evictionTimeNs);
115              if (entry == null) break;
116              evictionTimeNs = entry.getKey();
117              long evictionTimeMs = 
118                  TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
119              if (evictionTimeMs + maxNonMmappedEvictableLifespanMs >= curMs) break;
120              ShortCircuitReplica replica = entry.getValue();
121              if (LOG.isTraceEnabled()) {
122                LOG.trace("CacheCleaner: purging " + replica + ": " + 
123                      StringUtils.getStackTrace(Thread.currentThread()));
124              }
125              purge(replica);
126              numPurged++;
127            }
128    
129            if (LOG.isDebugEnabled()) {
130              LOG.debug(this + ": finishing cache cleaner run started at " +
131                curMs + ".  Demoted " + numDemoted + " mmapped replicas; " +
132                "purged " + numPurged + " replicas.");
133            }
134          } finally {
135            ShortCircuitCache.this.lock.unlock();
136          }
137        }
138    
139        @Override
140        public void close() throws IOException {
141          if (future != null) {
142            future.cancel(false);
143          }
144        }
145    
146        public void setFuture(ScheduledFuture<?> future) {
147          this.future = future;
148        }
149    
150        /**
151         * Get the rate at which this cleaner thread should be scheduled.
152         *
153         * We do this by taking the minimum expiration time and dividing by 4.
154         *
155         * @return the rate in milliseconds at which this thread should be
156         *         scheduled.
157         */
158        public long getRateInMs() {
159          long minLifespanMs =
160              Math.min(maxNonMmappedEvictableLifespanMs,
161                  maxEvictableMmapedLifespanMs);
162          long sampleTimeMs = minLifespanMs / 4;
163          return (sampleTimeMs < 1) ? 1 : sampleTimeMs;
164        }
165      }
166    
167      /**
168       * A task which asks the DataNode to release a short-circuit shared memory
169       * slot.  If successful, this will tell the DataNode to stop monitoring
170       * changes to the mlock status of the replica associated with the slot.
171       * It will also allow us (the client) to re-use this slot for another
172       * replica.  If we can't communicate with the DataNode for some reason,
173       * we tear down the shared memory segment to avoid being in an inconsistent
174       * state.
175       */
176      private class SlotReleaser implements Runnable {
177        /**
178         * The slot that we need to release.
179         */
180        private final Slot slot;
181    
182        SlotReleaser(Slot slot) {
183          this.slot = slot;
184        }
185    
186        @Override
187        public void run() {
188          if (LOG.isTraceEnabled()) {
189            LOG.trace(ShortCircuitCache.this + ": about to release " + slot);
190          }
191          final DfsClientShm shm = (DfsClientShm)slot.getShm();
192          final DomainSocket shmSock = shm.getPeer().getDomainSocket();
193          DomainSocket sock = null;
194          DataOutputStream out = null;
195          final String path = shmSock.getPath();
196          boolean success = false;
197          try {
198            sock = DomainSocket.connect(path);
199            out = new DataOutputStream(
200                new BufferedOutputStream(sock.getOutputStream()));
201            new Sender(out).releaseShortCircuitFds(slot.getSlotId());
202            DataInputStream in = new DataInputStream(sock.getInputStream());
203            ReleaseShortCircuitAccessResponseProto resp =
204                ReleaseShortCircuitAccessResponseProto.parseFrom(
205                    PBHelper.vintPrefixed(in));
206            if (resp.getStatus() != Status.SUCCESS) {
207              String error = resp.hasError() ? resp.getError() : "(unknown)";
208              throw new IOException(resp.getStatus().toString() + ": " + error);
209            }
210            if (LOG.isTraceEnabled()) {
211              LOG.trace(ShortCircuitCache.this + ": released " + slot);
212            }
213            success = true;
214          } catch (IOException e) {
215            LOG.error(ShortCircuitCache.this + ": failed to release " +
216                "short-circuit shared memory slot " + slot + " by sending " +
217                "ReleaseShortCircuitAccessRequestProto to " + path +
218                ".  Closing shared memory segment.", e);
219          } finally {
220            if (success) {
221              shmManager.freeSlot(slot);
222            } else {
223              shm.getEndpointShmManager().shutdown(shm);
224            }
225            IOUtils.cleanup(LOG, sock, out);
226          }
227        }
228      }
229    
230      public interface ShortCircuitReplicaCreator {
231        /**
232         * Attempt to create a ShortCircuitReplica object.
233         *
234         * This callback will be made without holding any locks.
235         *
236         * @return a non-null ShortCircuitReplicaInfo object.
237         */
238        ShortCircuitReplicaInfo createShortCircuitReplicaInfo();
239      }
240    
241      /**
242       * Lock protecting the cache.
243       */
244      private final ReentrantLock lock = new ReentrantLock();
245    
246      /**
247       * The executor service that runs the cacheCleaner.
248       */
249      private final ScheduledThreadPoolExecutor cleanerExecutor
250      = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
251              setDaemon(true).setNameFormat("ShortCircuitCache_Cleaner").
252              build());
253    
254      /**
255       * The executor service for slot-release work (ShortCircuitCache_SlotReleaser).
256       */
257      private final ScheduledThreadPoolExecutor releaserExecutor
258          = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
259              setDaemon(true).setNameFormat("ShortCircuitCache_SlotReleaser").
260              build());
261    
262      /**
263       * A map containing all ShortCircuitReplicaInfo objects, organized by Key.
264       * ShortCircuitReplicaInfo objects may contain a replica, or an InvalidToken
265       * exception.
266       */
267      private final HashMap<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>> 
268          replicaInfoMap = new HashMap<ExtendedBlockId,
269              Waitable<ShortCircuitReplicaInfo>>();
270    
271      /**
272       * The CacheCleaner.  We don't create this and schedule it until it becomes
273       * necessary.
274       */
275      private CacheCleaner cacheCleaner;
276    
277      /**
278       * Tree of evictable elements.
279       *
280       * Maps (unique) insertion time in nanoseconds to the element.
281       */
282      private final TreeMap<Long, ShortCircuitReplica> evictable =
283          new TreeMap<Long, ShortCircuitReplica>();
284    
285      /**
286       * Maximum total size of the cache, including both mmapped and
287       * non-mmapped elements.
288       */
289      private final int maxTotalSize;
290    
291      /**
292       * Non-mmaped elements older than this will be closed.
293       */
294      private long maxNonMmappedEvictableLifespanMs;
295    
296      /**
297       * Tree of mmaped evictable elements.
298       *
299       * Maps (unique) insertion time in nanoseconds to the element.
300       */
301      private final TreeMap<Long, ShortCircuitReplica> evictableMmapped =
302          new TreeMap<Long, ShortCircuitReplica>();
303    
304      /**
305       * Maximum number of mmaped evictable elements.
306       */
307      private int maxEvictableMmapedSize;
308    
309      /**
310       * Mmaped elements older than this are munmapped and moved to evictable.
311       */
312      private final long maxEvictableMmapedLifespanMs;
313    
314      /**
315       * The minimum number of milliseconds we'll wait after an unsuccessful
316       * mmap attempt before trying again.
317       */
318      private final long mmapRetryTimeoutMs;
319    
320      /**
321       * How long we will keep replicas in the cache before declaring them
322       * to be stale.
323       */
324      private final long staleThresholdMs;
325    
326      /**
327       * True if the ShortCircuitCache is closed.
328       */
329      private boolean closed = false;
330    
331      /**
332       * Number of existing mmaps associated with this cache.
333       */
334      private int outstandingMmapCount = 0;
335    
336      /**
337       * Manages short-circuit shared memory segments for the client.
338       */
339      private final DfsClientShmManager shmManager;
340    
341      /**
342       * Create a {@link ShortCircuitCache} object from a {@link Configuration}
343       */
344      public static ShortCircuitCache fromConf(Configuration conf) {
345        return new ShortCircuitCache(
346            conf.getInt(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_KEY,
347                DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_DEFAULT),
348            conf.getLong(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_KEY,
349                DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_DEFAULT),
350            conf.getInt(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE,
351                DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT),
352            conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS,
353                DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT),
354            conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS,
355                DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS_DEFAULT),
356            conf.getLong(DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS,
357                DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS_DEFAULT),
358            conf.getInt(DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS,
359                DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT));
360      }
361    
362      public ShortCircuitCache(int maxTotalSize, long maxNonMmappedEvictableLifespanMs,
363          int maxEvictableMmapedSize, long maxEvictableMmapedLifespanMs,
364          long mmapRetryTimeoutMs, long staleThresholdMs, int shmInterruptCheckMs) {
365        Preconditions.checkArgument(maxTotalSize >= 0);
366        this.maxTotalSize = maxTotalSize;
367        Preconditions.checkArgument(maxNonMmappedEvictableLifespanMs >= 0);
368        this.maxNonMmappedEvictableLifespanMs = maxNonMmappedEvictableLifespanMs;
369        Preconditions.checkArgument(maxEvictableMmapedSize >= 0);
370        this.maxEvictableMmapedSize = maxEvictableMmapedSize;
371        Preconditions.checkArgument(maxEvictableMmapedLifespanMs >= 0);
372        this.maxEvictableMmapedLifespanMs = maxEvictableMmapedLifespanMs;
373        this.mmapRetryTimeoutMs = mmapRetryTimeoutMs;
374        this.staleThresholdMs = staleThresholdMs;
375        DfsClientShmManager shmManager = null;
376        if ((shmInterruptCheckMs > 0) &&
377            (DomainSocketWatcher.getLoadingFailureReason() == null)) {
378          try {
379            shmManager = new DfsClientShmManager(shmInterruptCheckMs);
380          } catch (IOException e) {
381            LOG.error("failed to create ShortCircuitShmManager", e);
382          }
383        }
384        this.shmManager = shmManager;
385      }
386    
387      public long getStaleThresholdMs() {
388        return staleThresholdMs;
389      }
390    
391      /**
392       * Increment the reference count of a replica, and remove it from any free
393       * list it may be in.
394       *
395       * You must hold the cache lock while calling this function.
396       *
397       * @param replica      The replica we're removing.
398       */
399      private void ref(ShortCircuitReplica replica) {
400        lock.lock();
401        try {
402          Preconditions.checkArgument(replica.refCount > 0,
403              "can't ref " + replica + " because its refCount reached " +
404              replica.refCount);
405          Long evictableTimeNs = replica.getEvictableTimeNs();
406          replica.refCount++;
407          if (evictableTimeNs != null) {
408            String removedFrom = removeEvictable(replica);
409            if (LOG.isTraceEnabled()) {
410              LOG.trace(this + ": " + removedFrom +
411                  " no longer contains " + replica + ".  refCount " +
412                  (replica.refCount - 1) + " -> " + replica.refCount +
413                  StringUtils.getStackTrace(Thread.currentThread()));
414    
415            }
416          } else if (LOG.isTraceEnabled()) {
417            LOG.trace(this + ": replica  refCount " +
418                (replica.refCount - 1) + " -> " + replica.refCount +
419                StringUtils.getStackTrace(Thread.currentThread()));
420          }
421        } finally {
422          lock.unlock();
423        }
424      }
425    
426      /**
427       * Unreference a replica.
428       *
429       * You must hold the cache lock while calling this function.
430       *
431       * @param replica   The replica being unreferenced.
432       */
433      void unref(ShortCircuitReplica replica) {
434        lock.lock();
435        try {
436          // If the replica is stale or unusable, but we haven't purged it yet,
437          // let's do that.  It would be a shame to evict a non-stale replica so
438          // that we could put a stale or unusable one into the cache.
439          if (!replica.purged) {
440            String purgeReason = null;
441            if (!replica.getDataStream().getChannel().isOpen()) {
442              purgeReason = "purging replica because its data channel is closed.";
443            } else if (!replica.getMetaStream().getChannel().isOpen()) {
444              purgeReason = "purging replica because its meta channel is closed.";
445            } else if (replica.isStale()) {
446              purgeReason = "purging replica because it is stale.";
447            }
448            if (purgeReason != null) {
449              LOG.debug(this + ": " + purgeReason);
450              purge(replica);
451            }
452          }
453          String addedString = "";
454          boolean shouldTrimEvictionMaps = false;
455          int newRefCount = --replica.refCount;
456          if (newRefCount == 0) {
457            // Close replica, since there are no remaining references to it.
458            Preconditions.checkArgument(replica.purged,
459                "Replica " + replica + " reached a refCount of 0 without " +
460                "being purged");
461            replica.close();
462          } else if (newRefCount == 1) {
463            Preconditions.checkState(null == replica.getEvictableTimeNs(),
464                "Replica " + replica + " had a refCount higher than 1, " +
465                  "but was still evictable (evictableTimeNs = " +
466                    replica.getEvictableTimeNs() + ")");
467            if (!replica.purged) {
468              // Add the replica to the end of an eviction list.
469              // Eviction lists are sorted by time.
470              if (replica.hasMmap()) {
471                insertEvictable(System.nanoTime(), replica, evictableMmapped);
472                addedString = "added to evictableMmapped, ";
473              } else {
474                insertEvictable(System.nanoTime(), replica, evictable);
475                addedString = "added to evictable, ";
476              }
477              shouldTrimEvictionMaps = true;
478            }
479          } else {
480            Preconditions.checkArgument(replica.refCount >= 0,
481                "replica's refCount went negative (refCount = " +
482                replica.refCount + " for " + replica + ")");
483          }
484          if (LOG.isTraceEnabled()) {
485            LOG.trace(this + ": unref replica " + replica +
486                ": " + addedString + " refCount " +
487                (newRefCount + 1) + " -> " + newRefCount +
488                StringUtils.getStackTrace(Thread.currentThread()));
489          }
490          if (shouldTrimEvictionMaps) {
491            trimEvictionMaps();
492          }
493        } finally {
494          lock.unlock();
495        }
496      }
497    
498      /**
499       * Demote old evictable mmaps into the regular eviction map.
500       *
501       * You must hold the cache lock while calling this function.
502       *
503       * @param now   Current time in monotonic milliseconds.
504       * @return      Number of replicas demoted.
505       */
506      private int demoteOldEvictableMmaped(long now) {
507        int numDemoted = 0;
508        boolean needMoreSpace = false;
509        Long evictionTimeNs = Long.valueOf(0);
510    
511        while (true) {
512          Entry<Long, ShortCircuitReplica> entry = 
513              evictableMmapped.ceilingEntry(evictionTimeNs);
514          if (entry == null) break;
515          evictionTimeNs = entry.getKey();
516          long evictionTimeMs = 
517              TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
518          if (evictionTimeMs + maxEvictableMmapedLifespanMs >= now) {
519            if (evictableMmapped.size() < maxEvictableMmapedSize) {
520              break;
521            }
522            needMoreSpace = true;
523          }
524          ShortCircuitReplica replica = entry.getValue();
525          if (LOG.isTraceEnabled()) {
526            String rationale = needMoreSpace ? "because we need more space" : 
527                "because it's too old";
528            LOG.trace("demoteOldEvictable: demoting " + replica + ": " +
529                rationale + ": " +
530                StringUtils.getStackTrace(Thread.currentThread()));
531          }
532          removeEvictable(replica, evictableMmapped);
533          munmap(replica);
534          insertEvictable(evictionTimeNs, replica, evictable);
535          numDemoted++;
536        }
537        return numDemoted;
538      }
539    
540      /**
541       * Trim the eviction lists.
542       */
543      private void trimEvictionMaps() {
544        long now = Time.monotonicNow();
545        demoteOldEvictableMmaped(now);
546    
547        while (true) {
548          long evictableSize = evictable.size();
549          long evictableMmappedSize = evictableMmapped.size();
550          if (evictableSize + evictableMmappedSize <= maxTotalSize) {
551            return;
552          }
553          ShortCircuitReplica replica;
554          if (evictableSize == 0) {
555           replica = evictableMmapped.firstEntry().getValue();
556          } else {
557           replica = evictable.firstEntry().getValue();
558          }
559          if (LOG.isTraceEnabled()) {
560            LOG.trace(this + ": trimEvictionMaps is purging " + replica +
561              StringUtils.getStackTrace(Thread.currentThread()));
562          }
563          purge(replica);
564        }
565      }
566    
567      /**
568       * Munmap a replica, updating outstandingMmapCount.
569       *
570       * @param replica  The replica to munmap.
571       */
572      private void munmap(ShortCircuitReplica replica) {
573        replica.munmap();
574        outstandingMmapCount--;
575      }
576    
577      /**
578       * Remove a replica from an evictable map.
579       *
580       * @param replica   The replica to remove.
581       * @return          The map it was removed from.
582       */
583      private String removeEvictable(ShortCircuitReplica replica) {
584        if (replica.hasMmap()) {
585          removeEvictable(replica, evictableMmapped);
586          return "evictableMmapped";
587        } else {
588          removeEvictable(replica, evictable);
589          return "evictable";
590        }
591      }
592    
593      /**
594       * Remove a replica from an evictable map.
595       *
596       * @param replica   The replica to remove.
597       * @param map       The map to remove it from.
598       */
599      private void removeEvictable(ShortCircuitReplica replica,
600          TreeMap<Long, ShortCircuitReplica> map) {
601        Long evictableTimeNs = replica.getEvictableTimeNs();
602        Preconditions.checkNotNull(evictableTimeNs);
603        ShortCircuitReplica removed = map.remove(evictableTimeNs);
604        Preconditions.checkState(removed == replica,
605            "failed to make " + replica + " unevictable");
606        replica.setEvictableTimeNs(null);
607      }
608    
609      /**
610       * Insert a replica into an evictable map.
611       *
612       * If an element already exists with this eviction time, we add a nanosecond
613       * to it until we find an unused key.
614       *
615       * @param evictionTimeNs   The eviction time in absolute nanoseconds.
616       * @param replica          The replica to insert.
617       * @param map              The map to insert it into.
618       */
619      private void insertEvictable(Long evictionTimeNs,
620          ShortCircuitReplica replica, TreeMap<Long, ShortCircuitReplica> map) {
621        while (map.containsKey(evictionTimeNs)) {
622          evictionTimeNs++;
623        }
624        Preconditions.checkState(null == replica.getEvictableTimeNs());
625        replica.setEvictableTimeNs(evictionTimeNs);
626        map.put(evictionTimeNs, replica);
627      }
628    
629      /**
630       * Purge a replica from the cache.
631       *
632       * This doesn't necessarily close the replica, since there may be
633       * outstanding references to it.  However, it does mean the cache won't
634       * hand it out to anyone after this.
635       *
636       * You must hold the cache lock while calling this function.
637       *
638       * @param replica   The replica being removed.
639       */
640      private void purge(ShortCircuitReplica replica) {
641        boolean removedFromInfoMap = false;
642        String evictionMapName = null;
643        Preconditions.checkArgument(!replica.purged);
644        replica.purged = true;
645        Waitable<ShortCircuitReplicaInfo> val = replicaInfoMap.get(replica.key);
646        if (val != null) {
647          ShortCircuitReplicaInfo info = val.getVal();
648          if ((info != null) && (info.getReplica() == replica)) {
649            replicaInfoMap.remove(replica.key);
650            removedFromInfoMap = true;
651          }
652        }
653        Long evictableTimeNs = replica.getEvictableTimeNs();
654        if (evictableTimeNs != null) {
655          evictionMapName = removeEvictable(replica);
656        }
657        if (LOG.isTraceEnabled()) {
658          StringBuilder builder = new StringBuilder();
659          builder.append(this).append(": ").append(": purged ").
660              append(replica).append(" from the cache.");
661          if (removedFromInfoMap) {
662            builder.append("  Removed from the replicaInfoMap.");
663          }
664          if (evictionMapName != null) {
665            builder.append("  Removed from ").append(evictionMapName);
666          }
667          LOG.trace(builder.toString());
668        }
669        unref(replica);
670      }
671    
672      /**
673       * Fetch or create a replica.
674       *
675       * You must hold the cache lock while calling this function.
676       *
677       * @param key          Key to use for lookup.
678       * @param creator      Replica creator callback.  Will be called without
679       *                     the cache lock being held.
680       *
681       * @return             Null if no replica could be found or created.
682       *                     The replica, otherwise.
683       */
684      public ShortCircuitReplicaInfo fetchOrCreate(ExtendedBlockId key,
685          ShortCircuitReplicaCreator creator) {
686        Waitable<ShortCircuitReplicaInfo> newWaitable = null;
687        lock.lock();
688        try {
689          ShortCircuitReplicaInfo info = null;
690          do {
691            if (closed) {
692              if (LOG.isTraceEnabled()) {
693                LOG.trace(this + ": can't fetchOrCreate " + key +
694                    " because the cache is closed.");
695              }
696              return null;
697            }
698            Waitable<ShortCircuitReplicaInfo> waitable = replicaInfoMap.get(key);
699            if (waitable != null) {
700              try {
701                info = fetch(key, waitable);
702              } catch (RetriableException e) {
703                if (LOG.isDebugEnabled()) {
704                  LOG.debug(this + ": retrying " + e.getMessage());
705                }
706                continue;
707              }
708            }
709          } while (false);
710          if (info != null) return info;
711          // We need to load the replica ourselves.
712          newWaitable = new Waitable<ShortCircuitReplicaInfo>(lock.newCondition());
713          replicaInfoMap.put(key, newWaitable);
714        } finally {
715          lock.unlock();
716        }
717        return create(key, creator, newWaitable);
718      }
719    
720      /**
721       * Fetch an existing ReplicaInfo object.
722       *
723       * @param key       The key that we're using.
724       * @param waitable  The waitable object to wait on.
725       * @return          The existing ReplicaInfo object, or null if there is
726       *                  none.
727       *
728       * @throws RetriableException   If the caller needs to retry.
729       */
730      private ShortCircuitReplicaInfo fetch(ExtendedBlockId key,
731          Waitable<ShortCircuitReplicaInfo> waitable) throws RetriableException {
732        // Another thread is already in the process of loading this
733        // ShortCircuitReplica.  So we simply wait for it to complete.
734        ShortCircuitReplicaInfo info;
735        try {
736          if (LOG.isTraceEnabled()) {
737            LOG.trace(this + ": found waitable for " + key);
738          }
739          info = waitable.await();
740        } catch (InterruptedException e) {
741          LOG.info(this + ": interrupted while waiting for " + key);
742          Thread.currentThread().interrupt();
743          throw new RetriableException("interrupted");
744        }
745        if (info.getInvalidTokenException() != null) {
746          LOG.warn(this + ": could not get " + key + " due to InvalidToken " +
747                "exception.", info.getInvalidTokenException());
748          return info;
749        }
750        ShortCircuitReplica replica = info.getReplica();
751        if (replica == null) {
752          LOG.warn(this + ": failed to get " + key);
753          return info;
754        }
755        if (replica.purged) {
756          // Ignore replicas that have already been purged from the cache.
757          throw new RetriableException("Ignoring purged replica " +
758              replica + ".  Retrying.");
759        }
760        // Check if the replica is stale before using it.
761        // If it is, purge it and retry.
762        if (replica.isStale()) {
763          LOG.info(this + ": got stale replica " + replica + ".  Removing " +
764              "this replica from the replicaInfoMap and retrying.");
765          // Remove the cache's reference to the replica.  This may or may not
766          // trigger a close.
767          purge(replica);
768          throw new RetriableException("ignoring stale replica " + replica);
769        }
770        ref(replica);
771        return info;
772      }
773    
774      private ShortCircuitReplicaInfo create(ExtendedBlockId key,
775          ShortCircuitReplicaCreator creator,
776          Waitable<ShortCircuitReplicaInfo> newWaitable) {
777        // Handle loading a new replica.
778        ShortCircuitReplicaInfo info = null;
779        try {
780          if (LOG.isTraceEnabled()) {
781            LOG.trace(this + ": loading " + key);
782          }
783          info = creator.createShortCircuitReplicaInfo();
784        } catch (RuntimeException e) {
785          LOG.warn(this + ": failed to load " + key, e);
786        }
787        if (info == null) info = new ShortCircuitReplicaInfo();
788        lock.lock();
789        try {
790          if (info.getReplica() != null) {
791            // On success, make sure the cache cleaner thread is running.
792            if (LOG.isTraceEnabled()) {
793              LOG.trace(this + ": successfully loaded " + info.getReplica());
794            }
795            startCacheCleanerThreadIfNeeded();
796            // Note: new ShortCircuitReplicas start with a refCount of 2,
797            // indicating that both this cache and whoever requested the 
798            // creation of the replica hold a reference.  So we don't need
799            // to increment the reference count here.
800          } else {
801            // On failure, remove the waitable from the replicaInfoMap.
802            Waitable<ShortCircuitReplicaInfo> waitableInMap = replicaInfoMap.get(key);
803            if (waitableInMap == newWaitable) replicaInfoMap.remove(key);
804            if (info.getInvalidTokenException() != null) {
805              LOG.warn(this + ": could not load " + key + " due to InvalidToken " +
806                  "exception.", info.getInvalidTokenException());
807            } else {
808              LOG.warn(this + ": failed to load " + key);
809            }
810          }
811          newWaitable.provide(info);
812        } finally {
813          lock.unlock();
814        }
815        return info;
816      }
817    
818      private void startCacheCleanerThreadIfNeeded() {
819        if (cacheCleaner == null) {
820          cacheCleaner = new CacheCleaner();
821          long rateMs = cacheCleaner.getRateInMs();
822          ScheduledFuture<?> future =
823              cleanerExecutor.scheduleAtFixedRate(cacheCleaner, rateMs, rateMs,
824                  TimeUnit.MILLISECONDS);
825          cacheCleaner.setFuture(future);
826          if (LOG.isDebugEnabled()) {
827            LOG.debug(this + ": starting cache cleaner thread which will run " +
828              "every " + rateMs + " ms");
829          }
830        }
831      }
832    
833      ClientMmap getOrCreateClientMmap(ShortCircuitReplica replica,
834          boolean anchored) {
835        Condition newCond;
836        lock.lock();
837        try {
838          while (replica.mmapData != null) {
839            if (replica.mmapData instanceof MappedByteBuffer) {
840              ref(replica);
841              MappedByteBuffer mmap = (MappedByteBuffer)replica.mmapData;
842              return new ClientMmap(replica, mmap, anchored);
843            } else if (replica.mmapData instanceof Long) {
844              long lastAttemptTimeMs = (Long)replica.mmapData;
845              long delta = Time.monotonicNow() - lastAttemptTimeMs;
846              if (delta < mmapRetryTimeoutMs) {
847                if (LOG.isTraceEnabled()) {
848                  LOG.trace(this + ": can't create client mmap for " +
849                      replica + " because we failed to " +
850                      "create one just " + delta + "ms ago.");
851                }
852                return null;
853              }
854              if (LOG.isTraceEnabled()) {
855                LOG.trace(this + ": retrying client mmap for " + replica +
856                    ", " + delta + " ms after the previous failure.");
857              }
858            } else if (replica.mmapData instanceof Condition) {
859              Condition cond = (Condition)replica.mmapData;
860              cond.awaitUninterruptibly();
861            } else {
862              Preconditions.checkState(false, "invalid mmapData type " +
863                  replica.mmapData.getClass().getName());
864            }
865          }
866          newCond = lock.newCondition();
867          replica.mmapData = newCond;
868        } finally {
869          lock.unlock();
870        }
871        MappedByteBuffer map = replica.loadMmapInternal();
872        lock.lock();
873        try {
874          if (map == null) {
875            replica.mmapData = Long.valueOf(Time.monotonicNow());
876            newCond.signalAll();
877            return null;
878          } else {
879            outstandingMmapCount++;
880            replica.mmapData = map;
881            ref(replica);
882            newCond.signalAll();
883            return new ClientMmap(replica, map, anchored);
884          }
885        } finally {
886          lock.unlock();
887        }
888      }
889    
890      /**
891       * Close the cache and free all associated resources.
892       */
893      @Override
894      public void close() {
895        try {
896          lock.lock();
897          if (closed) return;
898          closed = true;
899          LOG.info(this + ": closing");
900          maxNonMmappedEvictableLifespanMs = 0;
901          maxEvictableMmapedSize = 0;
902          // Close and join cacheCleaner thread.
903          IOUtils.cleanup(LOG, cacheCleaner);
904          // Purge all replicas.
905          while (true) {
906            Entry<Long, ShortCircuitReplica> entry = evictable.firstEntry();
907            if (entry == null) break;
908            purge(entry.getValue());
909          }
910          while (true) {
911            Entry<Long, ShortCircuitReplica> entry = evictableMmapped.firstEntry();
912            if (entry == null) break;
913            purge(entry.getValue());
914          }
915        } finally {
916          lock.unlock();
917        }
918        IOUtils.cleanup(LOG, shmManager);
919      }
920    
  /**
   * Callback through which tests can inspect the internal state of the
   * cache.  Instances are passed to accept().
   */
  @VisibleForTesting // ONLY for testing
  public interface CacheVisitor {
    /**
     * Examine the state of the cache.
     *
     * @param numOutstandingMmaps  The cache's outstanding mmap count.
     * @param replicas             The successfully loaded replicas, by
     *                               block id.
     * @param failedLoads          The failed loads, by block id.  A value
     *                               may be null when the failure was not
     *                               caused by an invalid block token.
     * @param evictable            The cache's evictable map.
     * @param evictableMmapped     The cache's evictableMmapped map.
     */
    void visit(int numOutstandingMmaps,
        Map<ExtendedBlockId, ShortCircuitReplica> replicas,
        Map<ExtendedBlockId, InvalidToken> failedLoads,
        Map<Long, ShortCircuitReplica> evictable,
        Map<Long, ShortCircuitReplica> evictableMmapped);
  }
929    
930      @VisibleForTesting // ONLY for testing
931      public void accept(CacheVisitor visitor) {
932        lock.lock();
933        try {
934          Map<ExtendedBlockId, ShortCircuitReplica> replicas =
935              new HashMap<ExtendedBlockId, ShortCircuitReplica>();
936          Map<ExtendedBlockId, InvalidToken> failedLoads =
937              new HashMap<ExtendedBlockId, InvalidToken>();
938          for (Entry<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>> entry :
939                replicaInfoMap.entrySet()) {
940            Waitable<ShortCircuitReplicaInfo> waitable = entry.getValue();
941            if (waitable.hasVal()) {
942              if (waitable.getVal().getReplica() != null) {
943                replicas.put(entry.getKey(), waitable.getVal().getReplica());
944              } else {
945                // The exception may be null here, indicating a failed load that
946                // isn't the result of an invalid block token.
947                failedLoads.put(entry.getKey(),
948                    waitable.getVal().getInvalidTokenException());
949              }
950            }
951          }
952          if (LOG.isDebugEnabled()) {
953            StringBuilder builder = new StringBuilder();
954            builder.append("visiting ").append(visitor.getClass().getName()).
955                append("with outstandingMmapCount=").append(outstandingMmapCount).
956                append(", replicas=");
957            String prefix = "";
958            for (Entry<ExtendedBlockId, ShortCircuitReplica> entry : replicas.entrySet()) {
959              builder.append(prefix).append(entry.getValue());
960              prefix = ",";
961            }
962            prefix = "";
963            builder.append(", failedLoads=");
964            for (Entry<ExtendedBlockId, InvalidToken> entry : failedLoads.entrySet()) {
965              builder.append(prefix).append(entry.getValue());
966              prefix = ",";
967            }
968            prefix = "";
969            builder.append(", evictable=");
970            for (Entry<Long, ShortCircuitReplica> entry : evictable.entrySet()) {
971              builder.append(prefix).append(entry.getKey()).
972                  append(":").append(entry.getValue());
973              prefix = ",";
974            }
975            prefix = "";
976            builder.append(", evictableMmapped=");
977            for (Entry<Long, ShortCircuitReplica> entry : evictableMmapped.entrySet()) {
978              builder.append(prefix).append(entry.getKey()).
979                  append(":").append(entry.getValue());
980              prefix = ",";
981            }
982            LOG.debug(builder.toString());
983          }
984          visitor.visit(outstandingMmapCount, replicas, failedLoads,
985                evictable, evictableMmapped);
986        } finally {
987          lock.unlock();
988        }
989      }
990    
991      @Override
992      public String toString() {
993        return "ShortCircuitCache(0x" +
994            Integer.toHexString(System.identityHashCode(this)) + ")";
995      }
996    
997      /**
998       * Allocate a new shared memory slot.
999       *
1000       * @param datanode       The datanode to allocate a shm slot with.
1001       * @param peer           A peer connected to the datanode.
1002       * @param usedPeer       Will be set to true if we use up the provided peer.
1003       * @param blockId        The block id and block pool id of the block we're 
1004       *                         allocating this slot for.
1005       * @param clientName     The name of the DFSClient allocating the shared
1006       *                         memory.
1007       * @return               Null if short-circuit shared memory is disabled;
1008       *                         a short-circuit memory slot otherwise.
1009       * @throws IOException   An exception if there was an error talking to 
1010       *                         the datanode.
1011       */
1012      public Slot allocShmSlot(DatanodeInfo datanode,
1013            DomainPeer peer, MutableBoolean usedPeer,
1014            ExtendedBlockId blockId, String clientName) throws IOException {
1015        if (shmManager != null) {
1016          return shmManager.allocSlot(datanode, peer, usedPeer,
1017              blockId, clientName);
1018        } else {
1019          return null;
1020        }
1021      }
1022    
1023      /**
1024       * Free a slot immediately.
1025       *
1026       * ONLY use this if the DataNode is not yet aware of the slot.
1027       * 
1028       * @param slot           The slot to free.
1029       */
1030      public void freeSlot(Slot slot) {
1031        Preconditions.checkState(shmManager != null);
1032        slot.makeInvalid();
1033        shmManager.freeSlot(slot);
1034      }
1035      
1036      /**
1037       * Schedule a shared memory slot to be released.
1038       *
1039       * @param slot           The slot to release.
1040       */
1041      public void scheduleSlotReleaser(Slot slot) {
1042        Preconditions.checkState(shmManager != null);
1043        releaserExecutor.execute(new SlotReleaser(slot));
1044      }
1045    
  /**
   * Get the shared memory manager used by this cache.
   *
   * @return               The shmManager field.  It may be null; other
   *                         methods of this class check it for null before
   *                         using it.
   */
  @VisibleForTesting
  public DfsClientShmManager getDfsClientShmManager() {
    return shmManager;
  }
1050    }