001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.hdfs.shortcircuit;
019
020 import java.io.BufferedOutputStream;
021 import java.io.Closeable;
022 import java.io.DataInputStream;
023 import java.io.DataOutputStream;
024 import java.io.IOException;
025 import java.nio.MappedByteBuffer;
026 import java.util.HashMap;
027 import java.util.Map;
028 import java.util.Map.Entry;
029 import java.util.TreeMap;
030 import java.util.concurrent.ScheduledFuture;
031 import java.util.concurrent.ScheduledThreadPoolExecutor;
032 import java.util.concurrent.TimeUnit;
033 import java.util.concurrent.locks.Condition;
034 import java.util.concurrent.locks.ReentrantLock;
035
036 import org.apache.commons.lang.mutable.MutableBoolean;
037 import org.apache.commons.logging.Log;
038 import org.apache.commons.logging.LogFactory;
039 import org.apache.hadoop.classification.InterfaceAudience;
040 import org.apache.hadoop.conf.Configuration;
041 import org.apache.hadoop.hdfs.DFSConfigKeys;
042 import org.apache.hadoop.hdfs.ExtendedBlockId;
043 import org.apache.hadoop.hdfs.net.DomainPeer;
044 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
045 import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
046 import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ReleaseShortCircuitAccessResponseProto;
047 import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;
048 import org.apache.hadoop.hdfs.protocolPB.PBHelper;
049 import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot;
050 import org.apache.hadoop.io.IOUtils;
051 import org.apache.hadoop.ipc.RetriableException;
052 import org.apache.hadoop.net.unix.DomainSocket;
053 import org.apache.hadoop.net.unix.DomainSocketWatcher;
054 import org.apache.hadoop.security.token.SecretManager.InvalidToken;
055 import org.apache.hadoop.util.StringUtils;
056 import org.apache.hadoop.util.Time;
057 import org.apache.hadoop.util.Waitable;
058
059 import com.google.common.annotations.VisibleForTesting;
060 import com.google.common.base.Preconditions;
061 import com.google.common.util.concurrent.ThreadFactoryBuilder;
062
063 /**
064 * The ShortCircuitCache tracks things which the client needs to access
065 * HDFS block files via short-circuit.
066 *
067 * These things include: memory-mapped regions, file descriptors, and shared
068 * memory areas for communicating with the DataNode.
069 */
070 @InterfaceAudience.Private
071 public class ShortCircuitCache implements Closeable {
072 public static final Log LOG = LogFactory.getLog(ShortCircuitCache.class);
073
074 /**
075 * Expiry thread which makes sure that the file descriptors get closed
076 * after a while.
077 */
078 private class CacheCleaner implements Runnable, Closeable {
079 private ScheduledFuture<?> future;
080
081 /**
082 * Run the CacheCleaner thread.
083 *
084 * Whenever a thread requests a ShortCircuitReplica object, we will make
085 * sure it gets one. That ShortCircuitReplica object can then be re-used
086 * when another thread requests a ShortCircuitReplica object for the same
087 * block. So in that sense, there is no maximum size to the cache.
088 *
089 * However, when a ShortCircuitReplica object is unreferenced by the
090 * thread(s) that are using it, it becomes evictable. There are two
091 * separate eviction lists-- one for mmaped objects, and another for
092 * non-mmaped objects. We do this in order to avoid having the regular
093 * files kick the mmaped files out of the cache too quickly. Reusing
094 * an already-existing mmap gives a huge performance boost, since the
095 * page table entries don't have to be re-populated. Both the mmap
096 * and non-mmap evictable lists have maximum sizes and maximum lifespans.
097 */
098 @Override
099 public void run() {
100 ShortCircuitCache.this.lock.lock();
101 try {
102 if (ShortCircuitCache.this.closed) return;
103 long curMs = Time.monotonicNow();
104
105 if (LOG.isDebugEnabled()) {
106 LOG.debug(this + ": cache cleaner running at " + curMs);
107 }
108
109 int numDemoted = demoteOldEvictableMmaped(curMs);
110 int numPurged = 0;
111 Long evictionTimeNs = Long.valueOf(0);
112 while (true) {
113 Entry<Long, ShortCircuitReplica> entry =
114 evictable.ceilingEntry(evictionTimeNs);
115 if (entry == null) break;
116 evictionTimeNs = entry.getKey();
117 long evictionTimeMs =
118 TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
119 if (evictionTimeMs + maxNonMmappedEvictableLifespanMs >= curMs) break;
120 ShortCircuitReplica replica = entry.getValue();
121 if (LOG.isTraceEnabled()) {
122 LOG.trace("CacheCleaner: purging " + replica + ": " +
123 StringUtils.getStackTrace(Thread.currentThread()));
124 }
125 purge(replica);
126 numPurged++;
127 }
128
129 if (LOG.isDebugEnabled()) {
130 LOG.debug(this + ": finishing cache cleaner run started at " +
131 curMs + ". Demoted " + numDemoted + " mmapped replicas; " +
132 "purged " + numPurged + " replicas.");
133 }
134 } finally {
135 ShortCircuitCache.this.lock.unlock();
136 }
137 }
138
139 @Override
140 public void close() throws IOException {
141 if (future != null) {
142 future.cancel(false);
143 }
144 }
145
146 public void setFuture(ScheduledFuture<?> future) {
147 this.future = future;
148 }
149
150 /**
151 * Get the rate at which this cleaner thread should be scheduled.
152 *
153 * We do this by taking the minimum expiration time and dividing by 4.
154 *
155 * @return the rate in milliseconds at which this thread should be
156 * scheduled.
157 */
158 public long getRateInMs() {
159 long minLifespanMs =
160 Math.min(maxNonMmappedEvictableLifespanMs,
161 maxEvictableMmapedLifespanMs);
162 long sampleTimeMs = minLifespanMs / 4;
163 return (sampleTimeMs < 1) ? 1 : sampleTimeMs;
164 }
165 }
166
167 /**
168 * A task which asks the DataNode to release a short-circuit shared memory
169 * slot. If successful, this will tell the DataNode to stop monitoring
170 * changes to the mlock status of the replica associated with the slot.
171 * It will also allow us (the client) to re-use this slot for another
172 * replica. If we can't communicate with the DataNode for some reason,
173 * we tear down the shared memory segment to avoid being in an inconsistent
174 * state.
175 */
176 private class SlotReleaser implements Runnable {
177 /**
178 * The slot that we need to release.
179 */
180 private final Slot slot;
181
182 SlotReleaser(Slot slot) {
183 this.slot = slot;
184 }
185
186 @Override
187 public void run() {
188 if (LOG.isTraceEnabled()) {
189 LOG.trace(ShortCircuitCache.this + ": about to release " + slot);
190 }
191 final DfsClientShm shm = (DfsClientShm)slot.getShm();
192 final DomainSocket shmSock = shm.getPeer().getDomainSocket();
193 DomainSocket sock = null;
194 DataOutputStream out = null;
195 final String path = shmSock.getPath();
196 boolean success = false;
197 try {
198 sock = DomainSocket.connect(path);
199 out = new DataOutputStream(
200 new BufferedOutputStream(sock.getOutputStream()));
201 new Sender(out).releaseShortCircuitFds(slot.getSlotId());
202 DataInputStream in = new DataInputStream(sock.getInputStream());
203 ReleaseShortCircuitAccessResponseProto resp =
204 ReleaseShortCircuitAccessResponseProto.parseFrom(
205 PBHelper.vintPrefixed(in));
206 if (resp.getStatus() != Status.SUCCESS) {
207 String error = resp.hasError() ? resp.getError() : "(unknown)";
208 throw new IOException(resp.getStatus().toString() + ": " + error);
209 }
210 if (LOG.isTraceEnabled()) {
211 LOG.trace(ShortCircuitCache.this + ": released " + slot);
212 }
213 success = true;
214 } catch (IOException e) {
215 LOG.error(ShortCircuitCache.this + ": failed to release " +
216 "short-circuit shared memory slot " + slot + " by sending " +
217 "ReleaseShortCircuitAccessRequestProto to " + path +
218 ". Closing shared memory segment.", e);
219 } finally {
220 if (success) {
221 shmManager.freeSlot(slot);
222 } else {
223 shm.getEndpointShmManager().shutdown(shm);
224 }
225 IOUtils.cleanup(LOG, sock, out);
226 }
227 }
228 }
229
230 public interface ShortCircuitReplicaCreator {
231 /**
232 * Attempt to create a ShortCircuitReplica object.
233 *
234 * This callback will be made without holding any locks.
235 *
236 * @return a non-null ShortCircuitReplicaInfo object.
237 */
238 ShortCircuitReplicaInfo createShortCircuitReplicaInfo();
239 }
240
241 /**
242 * Lock protecting the cache.
243 */
244 private final ReentrantLock lock = new ReentrantLock();
245
246 /**
247 * The executor service that runs the cacheCleaner.
248 */
249 private final ScheduledThreadPoolExecutor cleanerExecutor
250 = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
251 setDaemon(true).setNameFormat("ShortCircuitCache_Cleaner").
252 build());
253
254 /**
255 * The executor service that runs the cacheCleaner.
256 */
257 private final ScheduledThreadPoolExecutor releaserExecutor
258 = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
259 setDaemon(true).setNameFormat("ShortCircuitCache_SlotReleaser").
260 build());
261
262 /**
263 * A map containing all ShortCircuitReplicaInfo objects, organized by Key.
264 * ShortCircuitReplicaInfo objects may contain a replica, or an InvalidToken
265 * exception.
266 */
267 private final HashMap<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>>
268 replicaInfoMap = new HashMap<ExtendedBlockId,
269 Waitable<ShortCircuitReplicaInfo>>();
270
271 /**
272 * The CacheCleaner. We don't create this and schedule it until it becomes
273 * necessary.
274 */
275 private CacheCleaner cacheCleaner;
276
277 /**
278 * Tree of evictable elements.
279 *
280 * Maps (unique) insertion time in nanoseconds to the element.
281 */
282 private final TreeMap<Long, ShortCircuitReplica> evictable =
283 new TreeMap<Long, ShortCircuitReplica>();
284
285 /**
286 * Maximum total size of the cache, including both mmapped and
287 * no$-mmapped elements.
288 */
289 private final int maxTotalSize;
290
291 /**
292 * Non-mmaped elements older than this will be closed.
293 */
294 private long maxNonMmappedEvictableLifespanMs;
295
296 /**
297 * Tree of mmaped evictable elements.
298 *
299 * Maps (unique) insertion time in nanoseconds to the element.
300 */
301 private final TreeMap<Long, ShortCircuitReplica> evictableMmapped =
302 new TreeMap<Long, ShortCircuitReplica>();
303
304 /**
305 * Maximum number of mmaped evictable elements.
306 */
307 private int maxEvictableMmapedSize;
308
309 /**
310 * Mmaped elements older than this will be closed.
311 */
312 private final long maxEvictableMmapedLifespanMs;
313
314 /**
315 * The minimum number of milliseconds we'll wait after an unsuccessful
316 * mmap attempt before trying again.
317 */
318 private final long mmapRetryTimeoutMs;
319
320 /**
321 * How long we will keep replicas in the cache before declaring them
322 * to be stale.
323 */
324 private final long staleThresholdMs;
325
326 /**
327 * True if the ShortCircuitCache is closed.
328 */
329 private boolean closed = false;
330
331 /**
332 * Number of existing mmaps associated with this cache.
333 */
334 private int outstandingMmapCount = 0;
335
336 /**
337 * Manages short-circuit shared memory segments for the client.
338 */
339 private final DfsClientShmManager shmManager;
340
341 /**
342 * Create a {@link ShortCircuitCache} object from a {@link Configuration}
343 */
344 public static ShortCircuitCache fromConf(Configuration conf) {
345 return new ShortCircuitCache(
346 conf.getInt(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_KEY,
347 DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_DEFAULT),
348 conf.getLong(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_KEY,
349 DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_DEFAULT),
350 conf.getInt(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE,
351 DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT),
352 conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS,
353 DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT),
354 conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS,
355 DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS_DEFAULT),
356 conf.getLong(DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS,
357 DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS_DEFAULT),
358 conf.getInt(DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS,
359 DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT));
360 }
361
362 public ShortCircuitCache(int maxTotalSize, long maxNonMmappedEvictableLifespanMs,
363 int maxEvictableMmapedSize, long maxEvictableMmapedLifespanMs,
364 long mmapRetryTimeoutMs, long staleThresholdMs, int shmInterruptCheckMs) {
365 Preconditions.checkArgument(maxTotalSize >= 0);
366 this.maxTotalSize = maxTotalSize;
367 Preconditions.checkArgument(maxNonMmappedEvictableLifespanMs >= 0);
368 this.maxNonMmappedEvictableLifespanMs = maxNonMmappedEvictableLifespanMs;
369 Preconditions.checkArgument(maxEvictableMmapedSize >= 0);
370 this.maxEvictableMmapedSize = maxEvictableMmapedSize;
371 Preconditions.checkArgument(maxEvictableMmapedLifespanMs >= 0);
372 this.maxEvictableMmapedLifespanMs = maxEvictableMmapedLifespanMs;
373 this.mmapRetryTimeoutMs = mmapRetryTimeoutMs;
374 this.staleThresholdMs = staleThresholdMs;
375 DfsClientShmManager shmManager = null;
376 if ((shmInterruptCheckMs > 0) &&
377 (DomainSocketWatcher.getLoadingFailureReason() == null)) {
378 try {
379 shmManager = new DfsClientShmManager(shmInterruptCheckMs);
380 } catch (IOException e) {
381 LOG.error("failed to create ShortCircuitShmManager", e);
382 }
383 }
384 this.shmManager = shmManager;
385 }
386
387 public long getStaleThresholdMs() {
388 return staleThresholdMs;
389 }
390
391 /**
392 * Increment the reference count of a replica, and remove it from any free
393 * list it may be in.
394 *
395 * You must hold the cache lock while calling this function.
396 *
397 * @param replica The replica we're removing.
398 */
399 private void ref(ShortCircuitReplica replica) {
400 lock.lock();
401 try {
402 Preconditions.checkArgument(replica.refCount > 0,
403 "can't ref " + replica + " because its refCount reached " +
404 replica.refCount);
405 Long evictableTimeNs = replica.getEvictableTimeNs();
406 replica.refCount++;
407 if (evictableTimeNs != null) {
408 String removedFrom = removeEvictable(replica);
409 if (LOG.isTraceEnabled()) {
410 LOG.trace(this + ": " + removedFrom +
411 " no longer contains " + replica + ". refCount " +
412 (replica.refCount - 1) + " -> " + replica.refCount +
413 StringUtils.getStackTrace(Thread.currentThread()));
414
415 }
416 } else if (LOG.isTraceEnabled()) {
417 LOG.trace(this + ": replica refCount " +
418 (replica.refCount - 1) + " -> " + replica.refCount +
419 StringUtils.getStackTrace(Thread.currentThread()));
420 }
421 } finally {
422 lock.unlock();
423 }
424 }
425
426 /**
427 * Unreference a replica.
428 *
429 * You must hold the cache lock while calling this function.
430 *
431 * @param replica The replica being unreferenced.
432 */
433 void unref(ShortCircuitReplica replica) {
434 lock.lock();
435 try {
436 // If the replica is stale or unusable, but we haven't purged it yet,
437 // let's do that. It would be a shame to evict a non-stale replica so
438 // that we could put a stale or unusable one into the cache.
439 if (!replica.purged) {
440 String purgeReason = null;
441 if (!replica.getDataStream().getChannel().isOpen()) {
442 purgeReason = "purging replica because its data channel is closed.";
443 } else if (!replica.getMetaStream().getChannel().isOpen()) {
444 purgeReason = "purging replica because its meta channel is closed.";
445 } else if (replica.isStale()) {
446 purgeReason = "purging replica because it is stale.";
447 }
448 if (purgeReason != null) {
449 LOG.debug(this + ": " + purgeReason);
450 purge(replica);
451 }
452 }
453 String addedString = "";
454 boolean shouldTrimEvictionMaps = false;
455 int newRefCount = --replica.refCount;
456 if (newRefCount == 0) {
457 // Close replica, since there are no remaining references to it.
458 Preconditions.checkArgument(replica.purged,
459 "Replica " + replica + " reached a refCount of 0 without " +
460 "being purged");
461 replica.close();
462 } else if (newRefCount == 1) {
463 Preconditions.checkState(null == replica.getEvictableTimeNs(),
464 "Replica " + replica + " had a refCount higher than 1, " +
465 "but was still evictable (evictableTimeNs = " +
466 replica.getEvictableTimeNs() + ")");
467 if (!replica.purged) {
468 // Add the replica to the end of an eviction list.
469 // Eviction lists are sorted by time.
470 if (replica.hasMmap()) {
471 insertEvictable(System.nanoTime(), replica, evictableMmapped);
472 addedString = "added to evictableMmapped, ";
473 } else {
474 insertEvictable(System.nanoTime(), replica, evictable);
475 addedString = "added to evictable, ";
476 }
477 shouldTrimEvictionMaps = true;
478 }
479 } else {
480 Preconditions.checkArgument(replica.refCount >= 0,
481 "replica's refCount went negative (refCount = " +
482 replica.refCount + " for " + replica + ")");
483 }
484 if (LOG.isTraceEnabled()) {
485 LOG.trace(this + ": unref replica " + replica +
486 ": " + addedString + " refCount " +
487 (newRefCount + 1) + " -> " + newRefCount +
488 StringUtils.getStackTrace(Thread.currentThread()));
489 }
490 if (shouldTrimEvictionMaps) {
491 trimEvictionMaps();
492 }
493 } finally {
494 lock.unlock();
495 }
496 }
497
498 /**
499 * Demote old evictable mmaps into the regular eviction map.
500 *
501 * You must hold the cache lock while calling this function.
502 *
503 * @param now Current time in monotonic milliseconds.
504 * @return Number of replicas demoted.
505 */
506 private int demoteOldEvictableMmaped(long now) {
507 int numDemoted = 0;
508 boolean needMoreSpace = false;
509 Long evictionTimeNs = Long.valueOf(0);
510
511 while (true) {
512 Entry<Long, ShortCircuitReplica> entry =
513 evictableMmapped.ceilingEntry(evictionTimeNs);
514 if (entry == null) break;
515 evictionTimeNs = entry.getKey();
516 long evictionTimeMs =
517 TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
518 if (evictionTimeMs + maxEvictableMmapedLifespanMs >= now) {
519 if (evictableMmapped.size() < maxEvictableMmapedSize) {
520 break;
521 }
522 needMoreSpace = true;
523 }
524 ShortCircuitReplica replica = entry.getValue();
525 if (LOG.isTraceEnabled()) {
526 String rationale = needMoreSpace ? "because we need more space" :
527 "because it's too old";
528 LOG.trace("demoteOldEvictable: demoting " + replica + ": " +
529 rationale + ": " +
530 StringUtils.getStackTrace(Thread.currentThread()));
531 }
532 removeEvictable(replica, evictableMmapped);
533 munmap(replica);
534 insertEvictable(evictionTimeNs, replica, evictable);
535 numDemoted++;
536 }
537 return numDemoted;
538 }
539
540 /**
541 * Trim the eviction lists.
542 */
543 private void trimEvictionMaps() {
544 long now = Time.monotonicNow();
545 demoteOldEvictableMmaped(now);
546
547 while (true) {
548 long evictableSize = evictable.size();
549 long evictableMmappedSize = evictableMmapped.size();
550 if (evictableSize + evictableMmappedSize <= maxTotalSize) {
551 return;
552 }
553 ShortCircuitReplica replica;
554 if (evictableSize == 0) {
555 replica = evictableMmapped.firstEntry().getValue();
556 } else {
557 replica = evictable.firstEntry().getValue();
558 }
559 if (LOG.isTraceEnabled()) {
560 LOG.trace(this + ": trimEvictionMaps is purging " + replica +
561 StringUtils.getStackTrace(Thread.currentThread()));
562 }
563 purge(replica);
564 }
565 }
566
567 /**
568 * Munmap a replica, updating outstandingMmapCount.
569 *
570 * @param replica The replica to munmap.
571 */
572 private void munmap(ShortCircuitReplica replica) {
573 replica.munmap();
574 outstandingMmapCount--;
575 }
576
577 /**
578 * Remove a replica from an evictable map.
579 *
580 * @param replica The replica to remove.
581 * @return The map it was removed from.
582 */
583 private String removeEvictable(ShortCircuitReplica replica) {
584 if (replica.hasMmap()) {
585 removeEvictable(replica, evictableMmapped);
586 return "evictableMmapped";
587 } else {
588 removeEvictable(replica, evictable);
589 return "evictable";
590 }
591 }
592
593 /**
594 * Remove a replica from an evictable map.
595 *
596 * @param replica The replica to remove.
597 * @param map The map to remove it from.
598 */
599 private void removeEvictable(ShortCircuitReplica replica,
600 TreeMap<Long, ShortCircuitReplica> map) {
601 Long evictableTimeNs = replica.getEvictableTimeNs();
602 Preconditions.checkNotNull(evictableTimeNs);
603 ShortCircuitReplica removed = map.remove(evictableTimeNs);
604 Preconditions.checkState(removed == replica,
605 "failed to make " + replica + " unevictable");
606 replica.setEvictableTimeNs(null);
607 }
608
609 /**
610 * Insert a replica into an evictable map.
611 *
612 * If an element already exists with this eviction time, we add a nanosecond
613 * to it until we find an unused key.
614 *
615 * @param evictionTimeNs The eviction time in absolute nanoseconds.
616 * @param replica The replica to insert.
617 * @param map The map to insert it into.
618 */
619 private void insertEvictable(Long evictionTimeNs,
620 ShortCircuitReplica replica, TreeMap<Long, ShortCircuitReplica> map) {
621 while (map.containsKey(evictionTimeNs)) {
622 evictionTimeNs++;
623 }
624 Preconditions.checkState(null == replica.getEvictableTimeNs());
625 replica.setEvictableTimeNs(evictionTimeNs);
626 map.put(evictionTimeNs, replica);
627 }
628
629 /**
630 * Purge a replica from the cache.
631 *
632 * This doesn't necessarily close the replica, since there may be
633 * outstanding references to it. However, it does mean the cache won't
634 * hand it out to anyone after this.
635 *
636 * You must hold the cache lock while calling this function.
637 *
638 * @param replica The replica being removed.
639 */
640 private void purge(ShortCircuitReplica replica) {
641 boolean removedFromInfoMap = false;
642 String evictionMapName = null;
643 Preconditions.checkArgument(!replica.purged);
644 replica.purged = true;
645 Waitable<ShortCircuitReplicaInfo> val = replicaInfoMap.get(replica.key);
646 if (val != null) {
647 ShortCircuitReplicaInfo info = val.getVal();
648 if ((info != null) && (info.getReplica() == replica)) {
649 replicaInfoMap.remove(replica.key);
650 removedFromInfoMap = true;
651 }
652 }
653 Long evictableTimeNs = replica.getEvictableTimeNs();
654 if (evictableTimeNs != null) {
655 evictionMapName = removeEvictable(replica);
656 }
657 if (LOG.isTraceEnabled()) {
658 StringBuilder builder = new StringBuilder();
659 builder.append(this).append(": ").append(": purged ").
660 append(replica).append(" from the cache.");
661 if (removedFromInfoMap) {
662 builder.append(" Removed from the replicaInfoMap.");
663 }
664 if (evictionMapName != null) {
665 builder.append(" Removed from ").append(evictionMapName);
666 }
667 LOG.trace(builder.toString());
668 }
669 unref(replica);
670 }
671
672 /**
673 * Fetch or create a replica.
674 *
675 * You must hold the cache lock while calling this function.
676 *
677 * @param key Key to use for lookup.
678 * @param creator Replica creator callback. Will be called without
679 * the cache lock being held.
680 *
681 * @return Null if no replica could be found or created.
682 * The replica, otherwise.
683 */
684 public ShortCircuitReplicaInfo fetchOrCreate(ExtendedBlockId key,
685 ShortCircuitReplicaCreator creator) {
686 Waitable<ShortCircuitReplicaInfo> newWaitable = null;
687 lock.lock();
688 try {
689 ShortCircuitReplicaInfo info = null;
690 do {
691 if (closed) {
692 if (LOG.isTraceEnabled()) {
693 LOG.trace(this + ": can't fetchOrCreate " + key +
694 " because the cache is closed.");
695 }
696 return null;
697 }
698 Waitable<ShortCircuitReplicaInfo> waitable = replicaInfoMap.get(key);
699 if (waitable != null) {
700 try {
701 info = fetch(key, waitable);
702 } catch (RetriableException e) {
703 if (LOG.isDebugEnabled()) {
704 LOG.debug(this + ": retrying " + e.getMessage());
705 }
706 continue;
707 }
708 }
709 } while (false);
710 if (info != null) return info;
711 // We need to load the replica ourselves.
712 newWaitable = new Waitable<ShortCircuitReplicaInfo>(lock.newCondition());
713 replicaInfoMap.put(key, newWaitable);
714 } finally {
715 lock.unlock();
716 }
717 return create(key, creator, newWaitable);
718 }
719
720 /**
721 * Fetch an existing ReplicaInfo object.
722 *
723 * @param key The key that we're using.
724 * @param waitable The waitable object to wait on.
725 * @return The existing ReplicaInfo object, or null if there is
726 * none.
727 *
728 * @throws RetriableException If the caller needs to retry.
729 */
730 private ShortCircuitReplicaInfo fetch(ExtendedBlockId key,
731 Waitable<ShortCircuitReplicaInfo> waitable) throws RetriableException {
732 // Another thread is already in the process of loading this
733 // ShortCircuitReplica. So we simply wait for it to complete.
734 ShortCircuitReplicaInfo info;
735 try {
736 if (LOG.isTraceEnabled()) {
737 LOG.trace(this + ": found waitable for " + key);
738 }
739 info = waitable.await();
740 } catch (InterruptedException e) {
741 LOG.info(this + ": interrupted while waiting for " + key);
742 Thread.currentThread().interrupt();
743 throw new RetriableException("interrupted");
744 }
745 if (info.getInvalidTokenException() != null) {
746 LOG.warn(this + ": could not get " + key + " due to InvalidToken " +
747 "exception.", info.getInvalidTokenException());
748 return info;
749 }
750 ShortCircuitReplica replica = info.getReplica();
751 if (replica == null) {
752 LOG.warn(this + ": failed to get " + key);
753 return info;
754 }
755 if (replica.purged) {
756 // Ignore replicas that have already been purged from the cache.
757 throw new RetriableException("Ignoring purged replica " +
758 replica + ". Retrying.");
759 }
760 // Check if the replica is stale before using it.
761 // If it is, purge it and retry.
762 if (replica.isStale()) {
763 LOG.info(this + ": got stale replica " + replica + ". Removing " +
764 "this replica from the replicaInfoMap and retrying.");
765 // Remove the cache's reference to the replica. This may or may not
766 // trigger a close.
767 purge(replica);
768 throw new RetriableException("ignoring stale replica " + replica);
769 }
770 ref(replica);
771 return info;
772 }
773
774 private ShortCircuitReplicaInfo create(ExtendedBlockId key,
775 ShortCircuitReplicaCreator creator,
776 Waitable<ShortCircuitReplicaInfo> newWaitable) {
777 // Handle loading a new replica.
778 ShortCircuitReplicaInfo info = null;
779 try {
780 if (LOG.isTraceEnabled()) {
781 LOG.trace(this + ": loading " + key);
782 }
783 info = creator.createShortCircuitReplicaInfo();
784 } catch (RuntimeException e) {
785 LOG.warn(this + ": failed to load " + key, e);
786 }
787 if (info == null) info = new ShortCircuitReplicaInfo();
788 lock.lock();
789 try {
790 if (info.getReplica() != null) {
791 // On success, make sure the cache cleaner thread is running.
792 if (LOG.isTraceEnabled()) {
793 LOG.trace(this + ": successfully loaded " + info.getReplica());
794 }
795 startCacheCleanerThreadIfNeeded();
796 // Note: new ShortCircuitReplicas start with a refCount of 2,
797 // indicating that both this cache and whoever requested the
798 // creation of the replica hold a reference. So we don't need
799 // to increment the reference count here.
800 } else {
801 // On failure, remove the waitable from the replicaInfoMap.
802 Waitable<ShortCircuitReplicaInfo> waitableInMap = replicaInfoMap.get(key);
803 if (waitableInMap == newWaitable) replicaInfoMap.remove(key);
804 if (info.getInvalidTokenException() != null) {
805 LOG.warn(this + ": could not load " + key + " due to InvalidToken " +
806 "exception.", info.getInvalidTokenException());
807 } else {
808 LOG.warn(this + ": failed to load " + key);
809 }
810 }
811 newWaitable.provide(info);
812 } finally {
813 lock.unlock();
814 }
815 return info;
816 }
817
818 private void startCacheCleanerThreadIfNeeded() {
819 if (cacheCleaner == null) {
820 cacheCleaner = new CacheCleaner();
821 long rateMs = cacheCleaner.getRateInMs();
822 ScheduledFuture<?> future =
823 cleanerExecutor.scheduleAtFixedRate(cacheCleaner, rateMs, rateMs,
824 TimeUnit.MILLISECONDS);
825 cacheCleaner.setFuture(future);
826 if (LOG.isDebugEnabled()) {
827 LOG.debug(this + ": starting cache cleaner thread which will run " +
828 "every " + rateMs + " ms");
829 }
830 }
831 }
832
833 ClientMmap getOrCreateClientMmap(ShortCircuitReplica replica,
834 boolean anchored) {
835 Condition newCond;
836 lock.lock();
837 try {
838 while (replica.mmapData != null) {
839 if (replica.mmapData instanceof MappedByteBuffer) {
840 ref(replica);
841 MappedByteBuffer mmap = (MappedByteBuffer)replica.mmapData;
842 return new ClientMmap(replica, mmap, anchored);
843 } else if (replica.mmapData instanceof Long) {
844 long lastAttemptTimeMs = (Long)replica.mmapData;
845 long delta = Time.monotonicNow() - lastAttemptTimeMs;
846 if (delta < mmapRetryTimeoutMs) {
847 if (LOG.isTraceEnabled()) {
848 LOG.trace(this + ": can't create client mmap for " +
849 replica + " because we failed to " +
850 "create one just " + delta + "ms ago.");
851 }
852 return null;
853 }
854 if (LOG.isTraceEnabled()) {
855 LOG.trace(this + ": retrying client mmap for " + replica +
856 ", " + delta + " ms after the previous failure.");
857 }
858 } else if (replica.mmapData instanceof Condition) {
859 Condition cond = (Condition)replica.mmapData;
860 cond.awaitUninterruptibly();
861 } else {
862 Preconditions.checkState(false, "invalid mmapData type " +
863 replica.mmapData.getClass().getName());
864 }
865 }
866 newCond = lock.newCondition();
867 replica.mmapData = newCond;
868 } finally {
869 lock.unlock();
870 }
871 MappedByteBuffer map = replica.loadMmapInternal();
872 lock.lock();
873 try {
874 if (map == null) {
875 replica.mmapData = Long.valueOf(Time.monotonicNow());
876 newCond.signalAll();
877 return null;
878 } else {
879 outstandingMmapCount++;
880 replica.mmapData = map;
881 ref(replica);
882 newCond.signalAll();
883 return new ClientMmap(replica, map, anchored);
884 }
885 } finally {
886 lock.unlock();
887 }
888 }
889
890 /**
891 * Close the cache and free all associated resources.
892 */
893 @Override
894 public void close() {
895 try {
896 lock.lock();
897 if (closed) return;
898 closed = true;
899 LOG.info(this + ": closing");
900 maxNonMmappedEvictableLifespanMs = 0;
901 maxEvictableMmapedSize = 0;
902 // Close and join cacheCleaner thread.
903 IOUtils.cleanup(LOG, cacheCleaner);
904 // Purge all replicas.
905 while (true) {
906 Entry<Long, ShortCircuitReplica> entry = evictable.firstEntry();
907 if (entry == null) break;
908 purge(entry.getValue());
909 }
910 while (true) {
911 Entry<Long, ShortCircuitReplica> entry = evictableMmapped.firstEntry();
912 if (entry == null) break;
913 purge(entry.getValue());
914 }
915 } finally {
916 lock.unlock();
917 }
918 IOUtils.cleanup(LOG, shmManager);
919 }
920
921 @VisibleForTesting // ONLY for testing
922 public interface CacheVisitor {
923 void visit(int numOutstandingMmaps,
924 Map<ExtendedBlockId, ShortCircuitReplica> replicas,
925 Map<ExtendedBlockId, InvalidToken> failedLoads,
926 Map<Long, ShortCircuitReplica> evictable,
927 Map<Long, ShortCircuitReplica> evictableMmapped);
928 }
929
930 @VisibleForTesting // ONLY for testing
931 public void accept(CacheVisitor visitor) {
932 lock.lock();
933 try {
934 Map<ExtendedBlockId, ShortCircuitReplica> replicas =
935 new HashMap<ExtendedBlockId, ShortCircuitReplica>();
936 Map<ExtendedBlockId, InvalidToken> failedLoads =
937 new HashMap<ExtendedBlockId, InvalidToken>();
938 for (Entry<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>> entry :
939 replicaInfoMap.entrySet()) {
940 Waitable<ShortCircuitReplicaInfo> waitable = entry.getValue();
941 if (waitable.hasVal()) {
942 if (waitable.getVal().getReplica() != null) {
943 replicas.put(entry.getKey(), waitable.getVal().getReplica());
944 } else {
945 // The exception may be null here, indicating a failed load that
946 // isn't the result of an invalid block token.
947 failedLoads.put(entry.getKey(),
948 waitable.getVal().getInvalidTokenException());
949 }
950 }
951 }
952 if (LOG.isDebugEnabled()) {
953 StringBuilder builder = new StringBuilder();
954 builder.append("visiting ").append(visitor.getClass().getName()).
955 append("with outstandingMmapCount=").append(outstandingMmapCount).
956 append(", replicas=");
957 String prefix = "";
958 for (Entry<ExtendedBlockId, ShortCircuitReplica> entry : replicas.entrySet()) {
959 builder.append(prefix).append(entry.getValue());
960 prefix = ",";
961 }
962 prefix = "";
963 builder.append(", failedLoads=");
964 for (Entry<ExtendedBlockId, InvalidToken> entry : failedLoads.entrySet()) {
965 builder.append(prefix).append(entry.getValue());
966 prefix = ",";
967 }
968 prefix = "";
969 builder.append(", evictable=");
970 for (Entry<Long, ShortCircuitReplica> entry : evictable.entrySet()) {
971 builder.append(prefix).append(entry.getKey()).
972 append(":").append(entry.getValue());
973 prefix = ",";
974 }
975 prefix = "";
976 builder.append(", evictableMmapped=");
977 for (Entry<Long, ShortCircuitReplica> entry : evictableMmapped.entrySet()) {
978 builder.append(prefix).append(entry.getKey()).
979 append(":").append(entry.getValue());
980 prefix = ",";
981 }
982 LOG.debug(builder.toString());
983 }
984 visitor.visit(outstandingMmapCount, replicas, failedLoads,
985 evictable, evictableMmapped);
986 } finally {
987 lock.unlock();
988 }
989 }
990
991 @Override
992 public String toString() {
993 return "ShortCircuitCache(0x" +
994 Integer.toHexString(System.identityHashCode(this)) + ")";
995 }
996
997 /**
998 * Allocate a new shared memory slot.
999 *
1000 * @param datanode The datanode to allocate a shm slot with.
1001 * @param peer A peer connected to the datanode.
1002 * @param usedPeer Will be set to true if we use up the provided peer.
1003 * @param blockId The block id and block pool id of the block we're
1004 * allocating this slot for.
1005 * @param clientName The name of the DFSClient allocating the shared
1006 * memory.
1007 * @return Null if short-circuit shared memory is disabled;
1008 * a short-circuit memory slot otherwise.
1009 * @throws IOException An exception if there was an error talking to
1010 * the datanode.
1011 */
1012 public Slot allocShmSlot(DatanodeInfo datanode,
1013 DomainPeer peer, MutableBoolean usedPeer,
1014 ExtendedBlockId blockId, String clientName) throws IOException {
1015 if (shmManager != null) {
1016 return shmManager.allocSlot(datanode, peer, usedPeer,
1017 blockId, clientName);
1018 } else {
1019 return null;
1020 }
1021 }
1022
1023 /**
1024 * Free a slot immediately.
1025 *
1026 * ONLY use this if the DataNode is not yet aware of the slot.
1027 *
1028 * @param slot The slot to free.
1029 */
1030 public void freeSlot(Slot slot) {
1031 Preconditions.checkState(shmManager != null);
1032 slot.makeInvalid();
1033 shmManager.freeSlot(slot);
1034 }
1035
1036 /**
1037 * Schedule a shared memory slot to be released.
1038 *
1039 * @param slot The slot to release.
1040 */
1041 public void scheduleSlotReleaser(Slot slot) {
1042 Preconditions.checkState(shmManager != null);
1043 releaserExecutor.execute(new SlotReleaser(slot));
1044 }
1045
1046 @VisibleForTesting
1047 public DfsClientShmManager getDfsClientShmManager() {
1048 return shmManager;
1049 }
1050 }