001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 package org.apache.hadoop.hdfs.server.datanode.fsdataset.impl;
020
021 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_CACHE_REVOCATION_TIMEOUT_MS;
022 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_CACHE_REVOCATION_TIMEOUT_MS_DEFAULT;
023 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_CACHE_REVOCATION_POLLING_MS;
024 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_CACHE_REVOCATION_POLLING_MS_DEFAULT;
025
026 import com.google.common.base.Preconditions;
027 import com.google.common.util.concurrent.ThreadFactoryBuilder;
028
029 import java.io.FileInputStream;
030 import java.io.FileNotFoundException;
031 import java.io.IOException;
032 import java.util.ArrayList;
033 import java.util.HashMap;
034 import java.util.Iterator;
035 import java.util.List;
036 import java.util.Map.Entry;
037 import java.util.concurrent.Executor;
038 import java.util.concurrent.LinkedBlockingQueue;
039 import java.util.concurrent.ThreadFactory;
040 import java.util.concurrent.ThreadPoolExecutor;
041 import java.util.concurrent.ScheduledThreadPoolExecutor;
042 import java.util.concurrent.TimeUnit;
043 import java.util.concurrent.atomic.AtomicLong;
044
045 import org.apache.commons.io.IOUtils;
046 import org.apache.commons.lang.time.DurationFormatUtils;
047 import org.apache.hadoop.classification.InterfaceAudience;
048 import org.apache.hadoop.classification.InterfaceStability;
049 import org.apache.hadoop.fs.ChecksumException;
050 import org.apache.hadoop.hdfs.ExtendedBlockId;
051 import org.apache.hadoop.hdfs.DFSConfigKeys;
052 import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
053 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
054 import org.apache.hadoop.hdfs.server.datanode.DatanodeUtil;
055 import org.apache.hadoop.io.nativeio.NativeIO;
056 import org.apache.hadoop.util.Time;
057 import org.slf4j.Logger;
058 import org.slf4j.LoggerFactory;
059
060 /**
061 * Manages caching for an FsDatasetImpl by using the mmap(2) and mlock(2)
062 * system calls to lock blocks into memory. Block checksums are verified upon
063 * entry into the cache.
064 */
065 @InterfaceAudience.Private
066 @InterfaceStability.Unstable
067 public class FsDatasetCache {
068 /**
069 * MappableBlocks that we know about.
070 */
071 private static final class Value {
072 final State state;
073 final MappableBlock mappableBlock;
074
075 Value(MappableBlock mappableBlock, State state) {
076 this.mappableBlock = mappableBlock;
077 this.state = state;
078 }
079 }
080
081 private enum State {
082 /**
083 * The MappableBlock is in the process of being cached.
084 */
085 CACHING,
086
087 /**
088 * The MappableBlock was in the process of being cached, but it was
089 * cancelled. Only the FsDatasetCache#WorkerTask can remove cancelled
090 * MappableBlock objects.
091 */
092 CACHING_CANCELLED,
093
094 /**
095 * The MappableBlock is in the cache.
096 */
097 CACHED,
098
099 /**
100 * The MappableBlock is in the process of uncaching.
101 */
102 UNCACHING;
103
104 /**
105 * Whether we should advertise this block as cached to the NameNode and
106 * clients.
107 */
108 public boolean shouldAdvertise() {
109 return (this == CACHED);
110 }
111 }
112
  private static final Logger LOG = LoggerFactory.getLogger(FsDatasetCache
      .class);

  /**
   * Stores MappableBlock objects and the states they're in.
   * Guarded by the FsDatasetCache monitor (synchronized methods/blocks).
   */
  private final HashMap<ExtendedBlockId, Value> mappableBlockMap =
      new HashMap<ExtendedBlockId, Value>();

  // Count of blocks currently in State.CACHED; exposed via
  // getNumBlocksCached() for metrics.
  private final AtomicLong numBlocksCached = new AtomicLong(0);

  private final FsDatasetImpl dataset;

  // Runs immediate uncaching work off the caller's thread.
  private final ThreadPoolExecutor uncachingExecutor;

  // Re-polls blocks whose uncaching was deferred because short-circuit
  // clients still had them anchored.
  private final ScheduledThreadPoolExecutor deferredUncachingExecutor;

  // How long (ms) to wait for clients to release a block before the
  // uncache is forced.
  private final long revocationMs;

  // How often (ms) to re-check whether an anchored block can be uncached.
  private final long revocationPollingMs;

  /**
   * The approximate amount of cache space in use.
   *
   * This number is an overestimate, counting bytes that will be used only
   * if pending caching operations succeed. It does not take into account
   * pending uncaching operations.
   *
   * This overestimate is more useful to the NameNode than an underestimate,
   * since we don't want the NameNode to assign us more replicas than
   * we can cache, because of the current batch of operations.
   */
  private final UsedBytesCount usedBytesCount;
146
147 public static class PageRounder {
148 private final long osPageSize =
149 NativeIO.POSIX.getCacheManipulator().getOperatingSystemPageSize();
150
151 /**
152 * Round up a number to the operating system page size.
153 */
154 public long round(long count) {
155 long newCount =
156 (count + (osPageSize - 1)) / osPageSize;
157 return newCount * osPageSize;
158 }
159 }
160
161 private class UsedBytesCount {
162 private final AtomicLong usedBytes = new AtomicLong(0);
163
164 private final PageRounder rounder = new PageRounder();
165
166 /**
167 * Try to reserve more bytes.
168 *
169 * @param count The number of bytes to add. We will round this
170 * up to the page size.
171 *
172 * @return The new number of usedBytes if we succeeded;
173 * -1 if we failed.
174 */
175 long reserve(long count) {
176 count = rounder.round(count);
177 while (true) {
178 long cur = usedBytes.get();
179 long next = cur + count;
180 if (next > maxBytes) {
181 return -1;
182 }
183 if (usedBytes.compareAndSet(cur, next)) {
184 return next;
185 }
186 }
187 }
188
189 /**
190 * Release some bytes that we're using.
191 *
192 * @param count The number of bytes to release. We will round this
193 * up to the page size.
194 *
195 * @return The new number of usedBytes.
196 */
197 long release(long count) {
198 count = rounder.round(count);
199 return usedBytes.addAndGet(-count);
200 }
201
202 long get() {
203 return usedBytes.get();
204 }
205 }
206
207 /**
208 * The total cache capacity in bytes.
209 */
210 private final long maxBytes;
211
212 /**
213 * Number of cache commands that could not be completed successfully
214 */
215 final AtomicLong numBlocksFailedToCache = new AtomicLong(0);
216 /**
217 * Number of uncache commands that could not be completed successfully
218 */
219 final AtomicLong numBlocksFailedToUncache = new AtomicLong(0);
220
221 public FsDatasetCache(FsDatasetImpl dataset) {
222 this.dataset = dataset;
223 this.maxBytes = dataset.datanode.getDnConf().getMaxLockedMemory();
224 ThreadFactory workerFactory = new ThreadFactoryBuilder()
225 .setDaemon(true)
226 .setNameFormat("FsDatasetCache-%d-" + dataset.toString())
227 .build();
228 this.usedBytesCount = new UsedBytesCount();
229 this.uncachingExecutor = new ThreadPoolExecutor(
230 0, 1,
231 60, TimeUnit.SECONDS,
232 new LinkedBlockingQueue<Runnable>(),
233 workerFactory);
234 this.uncachingExecutor.allowCoreThreadTimeOut(true);
235 this.deferredUncachingExecutor = new ScheduledThreadPoolExecutor(
236 1, workerFactory);
237 this.revocationMs = dataset.datanode.getConf().getLong(
238 DFS_DATANODE_CACHE_REVOCATION_TIMEOUT_MS,
239 DFS_DATANODE_CACHE_REVOCATION_TIMEOUT_MS_DEFAULT);
240 long confRevocationPollingMs = dataset.datanode.getConf().getLong(
241 DFS_DATANODE_CACHE_REVOCATION_POLLING_MS,
242 DFS_DATANODE_CACHE_REVOCATION_POLLING_MS_DEFAULT);
243 long minRevocationPollingMs = revocationMs / 2;
244 if (minRevocationPollingMs < confRevocationPollingMs) {
245 throw new RuntimeException("configured value " +
246 confRevocationPollingMs + "for " +
247 DFS_DATANODE_CACHE_REVOCATION_POLLING_MS +
248 " is too high. It must not be more than half of the " +
249 "value of " + DFS_DATANODE_CACHE_REVOCATION_TIMEOUT_MS +
250 ". Reconfigure this to " + minRevocationPollingMs);
251 }
252 this.revocationPollingMs = confRevocationPollingMs;
253 }
254
255 /**
256 * @return List of cached blocks suitable for translation into a
257 * {@link BlockListAsLongs} for a cache report.
258 */
259 synchronized List<Long> getCachedBlocks(String bpid) {
260 List<Long> blocks = new ArrayList<Long>();
261 for (Iterator<Entry<ExtendedBlockId, Value>> iter =
262 mappableBlockMap.entrySet().iterator(); iter.hasNext(); ) {
263 Entry<ExtendedBlockId, Value> entry = iter.next();
264 if (entry.getKey().getBlockPoolId().equals(bpid)) {
265 if (entry.getValue().state.shouldAdvertise()) {
266 blocks.add(entry.getKey().getBlockId());
267 }
268 }
269 }
270 return blocks;
271 }
272
273 /**
274 * Attempt to begin caching a block.
275 */
276 synchronized void cacheBlock(long blockId, String bpid,
277 String blockFileName, long length, long genstamp,
278 Executor volumeExecutor) {
279 ExtendedBlockId key = new ExtendedBlockId(blockId, bpid);
280 Value prevValue = mappableBlockMap.get(key);
281 if (prevValue != null) {
282 LOG.debug("Block with id {}, pool {} already exists in the "
283 + "FsDatasetCache with state {}", blockId, bpid, prevValue.state
284 );
285 numBlocksFailedToCache.incrementAndGet();
286 return;
287 }
288 mappableBlockMap.put(key, new Value(null, State.CACHING));
289 volumeExecutor.execute(
290 new CachingTask(key, blockFileName, length, genstamp));
291 LOG.debug("Initiating caching for Block with id {}, pool {}", blockId,
292 bpid);
293 }
294
  /**
   * Attempt to uncache a block. Depending on the block's current state
   * this either cancels an in-flight caching attempt, schedules an
   * immediate or deferred UncachingTask, or records a failure.
   */
  synchronized void uncacheBlock(String bpid, long blockId) {
    ExtendedBlockId key = new ExtendedBlockId(blockId, bpid);
    Value prevValue = mappableBlockMap.get(key);
    boolean deferred = false;

    // NOTE: this munlock request must be issued even if the block turns
    // out not to be in the map, because it has the side effect of telling
    // the short-circuit registry to stop allowing new anchors. A false
    // return means some client still has the block anchored, so the
    // actual uncache must be deferred.
    if (!dataset.datanode.getShortCircuitRegistry().
        processBlockMunlockRequest(key)) {
      deferred = true;
    }
    if (prevValue == null) {
      LOG.debug("Block with id {}, pool {} does not need to be uncached, "
          + "because it is not currently in the mappableBlockMap.", blockId,
          bpid);
      numBlocksFailedToUncache.incrementAndGet();
      return;
    }
    switch (prevValue.state) {
    case CACHING:
      // Caching is still in flight; mark it cancelled and let the
      // CachingTask remove the entry when it observes the new state.
      LOG.debug("Cancelling caching for block with id {}, pool {}.", blockId,
          bpid);
      mappableBlockMap.put(key,
          new Value(prevValue.mappableBlock, State.CACHING_CANCELLED));
      break;
    case CACHED:
      mappableBlockMap.put(key,
          new Value(prevValue.mappableBlock, State.UNCACHING));
      if (deferred) {
        // A client still has the block anchored; poll again after
        // revocationPollingMs, forcing the uncache after revocationMs.
        LOG.debug("{} is anchored, and can't be uncached now. Scheduling it " +
            "for uncaching in {} ",
            key, DurationFormatUtils.formatDurationHMS(revocationPollingMs));
        deferredUncachingExecutor.schedule(
            new UncachingTask(key, revocationMs),
            revocationPollingMs, TimeUnit.MILLISECONDS);
      } else {
        LOG.debug("{} has been scheduled for immediate uncaching.", key);
        uncachingExecutor.execute(new UncachingTask(key, 0));
      }
      break;
    default:
      // CACHING_CANCELLED or UNCACHING: an uncache is already underway.
      LOG.debug("Block with id {}, pool {} does not need to be uncached, "
          + "because it is in state {}.", blockId, bpid, prevValue.state);
      numBlocksFailedToUncache.incrementAndGet();
      break;
    }
  }
340
341 /**
342 * Background worker that mmaps, mlocks, and checksums a block
343 */
  /**
   * Background worker that mmaps, mlocks, and checksums a block.
   *
   * Runs on the volume executor supplied to cacheBlock(). On any failure
   * it rolls back the byte reservation, closes the partially-loaded
   * mapping, bumps numBlocksFailedToCache, and removes the map entry.
   */
  private class CachingTask implements Runnable {
    private final ExtendedBlockId key;
    private final String blockFileName;
    private final long length;
    private final long genstamp;

    CachingTask(ExtendedBlockId key, String blockFileName, long length, long genstamp) {
      this.key = key;
      this.blockFileName = blockFileName;
      this.length = length;
      this.genstamp = genstamp;
    }

    @Override
    public void run() {
      boolean success = false;
      FileInputStream blockIn = null, metaIn = null;
      MappableBlock mappableBlock = null;
      ExtendedBlock extBlk = new ExtendedBlock(key.getBlockPoolId(),
          key.getBlockId(), length, genstamp);
      // Reserve cache space up front; released in the finally block if
      // anything below fails.
      long newUsedBytes = usedBytesCount.reserve(length);
      boolean reservedBytes = false;
      try {
        if (newUsedBytes < 0) {
          LOG.warn("Failed to cache " + key + ": could not reserve " + length +
              " more bytes in the cache: " +
              DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_KEY +
              " of " + maxBytes + " exceeded.");
          return;
        }
        reservedBytes = true;
        try {
          // getBlockInputStream must yield a FileInputStream for mmap to
          // work; a ClassCastException means the block isn't file-backed.
          blockIn = (FileInputStream)dataset.getBlockInputStream(extBlk, 0);
          metaIn = DatanodeUtil.getMetaDataInputStream(extBlk, dataset);
        } catch (ClassCastException e) {
          LOG.warn("Failed to cache " + key +
              ": Underlying blocks are not backed by files.", e);
          return;
        } catch (FileNotFoundException e) {
          LOG.info("Failed to cache " + key + ": failed to find backing " +
              "files.");
          return;
        } catch (IOException e) {
          LOG.warn("Failed to cache " + key + ": failed to open file", e);
          return;
        }
        try {
          // mmap + mlock + checksum verification happen here.
          mappableBlock = MappableBlock.
              load(length, blockIn, metaIn, blockFileName);
        } catch (ChecksumException e) {
          // Exception message is bogus since this wasn't caused by a file read
          LOG.warn("Failed to cache " + key + ": checksum verification failed.");
          return;
        } catch (IOException e) {
          LOG.warn("Failed to cache " + key, e);
          return;
        }
        synchronized (FsDatasetCache.this) {
          Value value = mappableBlockMap.get(key);
          Preconditions.checkNotNull(value);
          Preconditions.checkState(value.state == State.CACHING ||
                                   value.state == State.CACHING_CANCELLED);
          // uncacheBlock() may have cancelled us while we were loading;
          // only this task may remove a CACHING_CANCELLED entry.
          if (value.state == State.CACHING_CANCELLED) {
            mappableBlockMap.remove(key);
            LOG.warn("Caching of " + key + " was cancelled.");
            return;
          }
          mappableBlockMap.put(key, new Value(mappableBlock, State.CACHED));
        }
        LOG.debug("Successfully cached {}. We are now caching {} bytes in"
            + " total.", key, newUsedBytes);
        dataset.datanode.getShortCircuitRegistry().processBlockMlockEvent(key);
        numBlocksCached.addAndGet(1);
        dataset.datanode.getMetrics().incrBlocksCached(1);
        success = true;
      } finally {
        IOUtils.closeQuietly(blockIn);
        IOUtils.closeQuietly(metaIn);
        if (!success) {
          // Roll back: release the reservation, unmap the block, count
          // the failure, and drop the map entry (covers both the failure
          // paths above and the cancellation path).
          if (reservedBytes) {
            usedBytesCount.release(length);
          }
          LOG.debug("Caching of {} was aborted. We are now caching only {} "
              + "bytes in total.", key, usedBytesCount.get());
          if (mappableBlock != null) {
            mappableBlock.close();
          }
          numBlocksFailedToCache.incrementAndGet();

          synchronized (FsDatasetCache.this) {
            mappableBlockMap.remove(key);
          }
        }
      }
    }
  }
440
441 private class UncachingTask implements Runnable {
442 private final ExtendedBlockId key;
443 private final long revocationTimeMs;
444
445 UncachingTask(ExtendedBlockId key, long revocationDelayMs) {
446 this.key = key;
447 if (revocationDelayMs == 0) {
448 this.revocationTimeMs = 0;
449 } else {
450 this.revocationTimeMs = revocationDelayMs + Time.monotonicNow();
451 }
452 }
453
454 private boolean shouldDefer() {
455 /* If revocationTimeMs == 0, this is an immediate uncache request.
456 * No clients were anchored at the time we made the request. */
457 if (revocationTimeMs == 0) {
458 return false;
459 }
460 /* Let's check if any clients still have this block anchored. */
461 boolean anchored =
462 !dataset.datanode.getShortCircuitRegistry().
463 processBlockMunlockRequest(key);
464 if (!anchored) {
465 LOG.debug("Uncaching {} now that it is no longer in use " +
466 "by any clients.", key);
467 return false;
468 }
469 long delta = revocationTimeMs - Time.monotonicNow();
470 if (delta < 0) {
471 LOG.warn("Forcibly uncaching {} after {} " +
472 "because client(s) {} refused to stop using it.", key,
473 DurationFormatUtils.formatDurationHMS(revocationTimeMs),
474 dataset.datanode.getShortCircuitRegistry().getClientNames(key));
475 return false;
476 }
477 LOG.info("Replica {} still can't be uncached because some " +
478 "clients continue to use it. Will wait for {}", key,
479 DurationFormatUtils.formatDurationHMS(delta));
480 return true;
481 }
482
483 @Override
484 public void run() {
485 Value value;
486
487 if (shouldDefer()) {
488 deferredUncachingExecutor.schedule(
489 this, revocationPollingMs, TimeUnit.MILLISECONDS);
490 return;
491 }
492
493 synchronized (FsDatasetCache.this) {
494 value = mappableBlockMap.get(key);
495 }
496 Preconditions.checkNotNull(value);
497 Preconditions.checkArgument(value.state == State.UNCACHING);
498
499 IOUtils.closeQuietly(value.mappableBlock);
500 synchronized (FsDatasetCache.this) {
501 mappableBlockMap.remove(key);
502 }
503 long newUsedBytes =
504 usedBytesCount.release(value.mappableBlock.getLength());
505 numBlocksCached.addAndGet(-1);
506 dataset.datanode.getMetrics().incrBlocksUncached(1);
507 if (revocationTimeMs != 0) {
508 LOG.debug("Uncaching of {} completed. usedBytes = {}",
509 key, newUsedBytes);
510 } else {
511 LOG.debug("Deferred uncaching of {} completed. usedBytes = {}",
512 key, newUsedBytes);
513 }
514 }
515 }
516
  // Stats related methods for FSDatasetMBean

  /**
   * Get the approximate amount of cache space used.
   * This is an overestimate; see the usedBytesCount field comment.
   */
  public long getCacheUsed() {
    return usedBytesCount.get();
  }

  /**
   * Get the maximum amount of bytes we can cache. This is a constant.
   */
  public long getCacheCapacity() {
    return maxBytes;
  }

  /** @return Number of cache commands that could not be completed. */
  public long getNumBlocksFailedToCache() {
    return numBlocksFailedToCache.get();
  }

  /** @return Number of uncache commands that could not be completed. */
  public long getNumBlocksFailedToUncache() {
    return numBlocksFailedToUncache.get();
  }

  /** @return Number of blocks currently cached (State.CACHED). */
  public long getNumBlocksCached() {
    return numBlocksCached.get();
  }
544
545 public synchronized boolean isCached(String bpid, long blockId) {
546 ExtendedBlockId block = new ExtendedBlockId(blockId, bpid);
547 Value val = mappableBlockMap.get(block);
548 return (val != null) && val.state.shouldAdvertise();
549 }
550 }