001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.hdfs.shortcircuit;
019
020 import java.io.BufferedOutputStream;
021 import java.io.Closeable;
022 import java.io.DataOutputStream;
023 import java.io.EOFException;
024 import java.io.FileInputStream;
025 import java.io.IOException;
026 import java.util.HashMap;
027 import java.util.Map.Entry;
028 import java.util.TreeMap;
029 import java.util.concurrent.locks.Condition;
030 import java.util.concurrent.locks.ReentrantLock;
031
032 import org.apache.commons.lang.mutable.MutableBoolean;
033 import org.apache.commons.logging.Log;
034 import org.apache.commons.logging.LogFactory;
035 import org.apache.hadoop.classification.InterfaceAudience;
036 import org.apache.hadoop.hdfs.ExtendedBlockId;
037 import org.apache.hadoop.hdfs.net.DomainPeer;
038 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
039 import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferProtocol;
040 import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
041 import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ShortCircuitShmResponseProto;
042 import org.apache.hadoop.hdfs.protocolPB.PBHelper;
043 import org.apache.hadoop.hdfs.server.datanode.ShortCircuitRegistry;
044 import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.ShmId;
045 import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot;
046 import org.apache.hadoop.io.IOUtils;
047 import org.apache.hadoop.net.unix.DomainSocket;
048 import org.apache.hadoop.net.unix.DomainSocketWatcher;
049
050 import com.google.common.annotations.VisibleForTesting;
051 import com.google.common.base.Preconditions;
052
053 /**
054 * Manages short-circuit memory segments for an HDFS client.
055 *
 * Clients are responsible for requesting and releasing shared memory segments used
 * for communicating with the DataNode. The client will try to allocate new slots
 * in the set of existing segments, falling back to requesting a new segment from the
 * DataNode via {@link DataTransferProtocol#requestShortCircuitShm}.
060 *
061 * The counterpart to this class on the DataNode is {@link ShortCircuitRegistry}.
062 * See {@link ShortCircuitRegistry} for more information on the communication protocol.
063 */
064 @InterfaceAudience.Private
065 public class DfsClientShmManager implements Closeable {
  // Shared by the manager and its per-datanode EndpointShmManagers.
  private static final Log LOG = LogFactory.getLog(DfsClientShmManager.class);
067
068 /**
069 * Manages short-circuit memory segments that pertain to a given DataNode.
070 */
071 class EndpointShmManager {
    /**
     * The datanode we're managing.
     */
    private final DatanodeInfo datanode;

    /**
     * Shared memory segments which have no empty slots.
     *
     * Protected by the manager lock.
     */
    private final TreeMap<ShmId, DfsClientShm> full =
        new TreeMap<ShmId, DfsClientShm>();

    /**
     * Shared memory segments which have at least one empty slot.
     *
     * Sorted by ShmId so that allocation always draws from the segment with
     * the lowest ID (see allocSlotFromExistingShm), which limits
     * fragmentation.
     *
     * Protected by the manager lock.
     */
    private final TreeMap<ShmId, DfsClientShm> notFull =
        new TreeMap<ShmId, DfsClientShm>();

    /**
     * True if this datanode doesn't support short-circuit shared memory
     * segments.  Once set (by requestNewShm), we stop asking.
     *
     * Protected by the manager lock.
     */
    private boolean disabled = false;

    /**
     * True if we're in the process of loading a shared memory segment from
     * this DataNode.  Other threads wait on finishedLoading while this is set.
     *
     * Protected by the manager lock.
     */
    private boolean loading = false;
108
    /**
     * Create an EndpointShmManager.
     *
     * @param datanode The DataNode whose shared memory segments we manage.
     */
    EndpointShmManager (DatanodeInfo datanode) {
      this.datanode = datanode;
    }
112
113 /**
114 * Pull a slot out of a preexisting shared memory segment.
115 *
116 * Must be called with the manager lock held.
117 *
118 * @param blockId The blockId to put inside the Slot object.
119 *
120 * @return null if none of our shared memory segments contain a
121 * free slot; the slot object otherwise.
122 */
123 private Slot allocSlotFromExistingShm(ExtendedBlockId blockId) {
124 if (notFull.isEmpty()) {
125 return null;
126 }
127 Entry<ShmId, DfsClientShm> entry = notFull.firstEntry();
128 DfsClientShm shm = entry.getValue();
129 ShmId shmId = shm.getShmId();
130 Slot slot = shm.allocAndRegisterSlot(blockId);
131 if (shm.isFull()) {
132 if (LOG.isTraceEnabled()) {
133 LOG.trace(this + ": pulled the last slot " + slot.getSlotIdx() +
134 " out of " + shm);
135 }
136 DfsClientShm removedShm = notFull.remove(shmId);
137 Preconditions.checkState(removedShm == shm);
138 full.put(shmId, shm);
139 } else {
140 if (LOG.isTraceEnabled()) {
141 LOG.trace(this + ": pulled slot " + slot.getSlotIdx() +
142 " out of " + shm);
143 }
144 }
145 return slot;
146 }
147
    /**
     * Ask the DataNode for a new shared memory segment.  This function must be
     * called with the manager lock held.  We will release the lock while
     * communicating with the DataNode.
     *
     * @param clientName    The current client name.
     * @param peer          The peer to use to talk to the DataNode.
     *
     * @return              Null if the DataNode does not support shared memory
     *                        segments, or experienced an error creating the
     *                        shm.  The shared memory segment itself on success.
     * @throws IOException  If there was an error communicating over the socket.
     *                        We will not throw an IOException unless the socket
     *                        itself (or the network) is the problem.
     */
    private DfsClientShm requestNewShm(String clientName, DomainPeer peer)
        throws IOException {
      final DataOutputStream out =
          new DataOutputStream(
              new BufferedOutputStream(peer.getOutputStream()));
      new Sender(out).requestShortCircuitShm(clientName);
      ShortCircuitShmResponseProto resp =
          ShortCircuitShmResponseProto.parseFrom(
              PBHelper.vintPrefixed(peer.getInputStream()));
      String error = resp.hasError() ? resp.getError() : "(unknown)";
      switch (resp.getStatus()) {
      case SUCCESS:
        // On success, the DataNode passes the file descriptor of the shared
        // memory segment over the UNIX domain socket, along with one byte of
        // in-band data.
        DomainSocket sock = peer.getDomainSocket();
        byte buf[] = new byte[1];
        FileInputStream fis[] = new FileInputStream[1];
        if (sock.recvFileInputStreams(fis, buf, 0, buf.length) < 0) {
          throw new EOFException("got EOF while trying to transfer the " +
              "file descriptor for the shared memory segment.");
        }
        if (fis[0] == null) {
          // recvFileInputStreams returned data but no descriptor.
          throw new IOException("the datanode " + datanode + " failed to " +
              "pass a file descriptor for the shared memory segment.");
        }
        try {
          DfsClientShm shm =
              new DfsClientShm(PBHelper.convert(resp.getId()),
                  fis[0], this, peer);
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": createNewShm: created " + shm);
          }
          return shm;
        } finally {
          // NOTE(review): the stream is closed here even on success, which
          // presumably is safe because the DfsClientShm constructor maps or
          // duplicates the descriptor -- confirm against DfsClientShm.
          IOUtils.cleanup(LOG, fis[0]);
        }
      case ERROR_UNSUPPORTED:
        // The DataNode just does not support short-circuit shared memory
        // access, and we should stop asking.
        LOG.info(this + ": datanode does not support short-circuit " +
            "shared memory access: " + error);
        disabled = true;
        return null;
      default:
        // The datanode experienced some kind of unexpected error when trying to
        // create the short-circuit shared memory segment.
        LOG.warn(this + ": error requesting short-circuit shared memory " +
            "access: " + error);
        return null;
      }
    }
212
    /**
     * Allocate a new shared memory slot connected to this datanode.
     *
     * Must be called with the manager lock held.  The lock is temporarily
     * released while communicating with the DataNode (see the
     * lock.unlock()/lock.lock() pair around requestNewShm below).
     *
     * @param peer The peer to use to talk to the DataNode.
     * @param usedPeer (out param) Will be set to true if we used the peer.
     *          When a peer is used, it becomes the responsibility of this
     *          class and should not be closed by the caller.
     * @param clientName The client name.
     * @param blockId The blockId to put inside the Slot object.
     *
     * @return null if the DataNode does not support shared memory
     *             segments, or experienced an error creating the
     *             shm.  The shared memory segment itself on success.
     * @throws IOException If there was an error communicating over the socket.
     */
    Slot allocSlot(DomainPeer peer, MutableBoolean usedPeer,
        String clientName, ExtendedBlockId blockId) throws IOException {
      while (true) {
        // Re-check termination conditions on every iteration, since the
        // manager lock may have been dropped and re-acquired while waiting
        // or loading.
        if (closed) {
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": the DfsClientShmManager has been closed.");
          }
          return null;
        }
        if (disabled) {
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": shared memory segment access is disabled.");
          }
          return null;
        }
        // Try to use an existing slot.
        Slot slot = allocSlotFromExistingShm(blockId);
        if (slot != null) {
          return slot;
        }
        // There are no free slots.  If someone is loading more slots, wait
        // for that to finish.
        if (loading) {
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": waiting for loading to finish...");
          }
          // Releases the manager lock while waiting; on wakeup we loop back
          // and retry allocSlotFromExistingShm.
          finishedLoading.awaitUninterruptibly();
        } else {
          // Otherwise, load the slot ourselves.
          loading = true;
          lock.unlock();
          DfsClientShm shm;
          try {
            shm = requestNewShm(clientName, peer);
            if (shm == null) continue;
            // See #{DfsClientShmManager#domainSocketWatcher} for details
            // about why we do this before retaking the manager lock.
            domainSocketWatcher.add(peer.getDomainSocket(), shm);
            // The DomainPeer is now our responsibility, and should not be
            // closed by the caller.
            usedPeer.setValue(true);
          } finally {
            // Re-acquire the manager lock and wake any waiters, even if
            // requestNewShm or domainSocketWatcher.add threw.
            lock.lock();
            loading = false;
            finishedLoading.signalAll();
          }
          if (shm.isDisconnected()) {
            // If the peer closed immediately after the shared memory segment
            // was created, the DomainSocketWatcher callback might already have
            // fired and marked the shm as disconnected.  In this case, we
            // obviously don't want to add the SharedMemorySegment to our list
            // of valid not-full segments.
            if (LOG.isDebugEnabled()) {
              LOG.debug(this + ": the UNIX domain socket associated with " +
                  "this short-circuit memory closed before we could make " +
                  "use of the shm.");
            }
          } else {
            // Loop back around and allocate a slot from the new segment.
            notFull.put(shm.getShmId(), shm);
          }
        }
      }
    }
291
    /**
     * Stop tracking a slot.
     *
     * Must be called with the manager lock held (the outer
     * DfsClientShmManager#freeSlot takes it before delegating here).
     *
     * @param slot The slot to release.
     */
    void freeSlot(Slot slot) {
      DfsClientShm shm = (DfsClientShm)slot.getShm();
      shm.unregisterSlot(slot.getSlotIdx());
      if (shm.isDisconnected()) {
        // Stale shared memory segments should not be tracked here.
        Preconditions.checkState(!full.containsKey(shm.getShmId()));
        Preconditions.checkState(!notFull.containsKey(shm.getShmId()));
        if (shm.isEmpty()) {
          // A stale segment with no slots left can be released immediately.
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": freeing empty stale " + shm);
          }
          shm.free();
        }
      } else {
        ShmId shmId = shm.getShmId();
        full.remove(shmId); // The shm can't be full if we just freed a slot.
        if (shm.isEmpty()) {
          notFull.remove(shmId);

          // If the shared memory segment is now empty, we call shutdown(2) on
          // the UNIX domain socket associated with it.  The DomainSocketWatcher,
          // which is watching this socket, will call DfsClientShm#handle,
          // cleaning up this shared memory segment.
          //
          // See #{DfsClientShmManager#domainSocketWatcher} for details about why
          // we don't want to call DomainSocketWatcher#remove directly here.
          //
          // Note that we could experience 'fragmentation' here, where the
          // DFSClient allocates a bunch of slots in different shared memory
          // segments, and then frees most of them, but never fully empties out
          // any segment.  We make some attempt to avoid this fragmentation by
          // always allocating new slots out of the shared memory segment with the
          // lowest ID, but it could still occur.  In most workloads,
          // fragmentation should not be a major concern, since it doesn't impact
          // peak file descriptor usage or the speed of allocation.
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": shutting down UNIX domain socket for " +
                "empty " + shm);
          }
          shutdown(shm);
        } else {
          notFull.put(shmId, shm);
        }
      }
    }
344
345 /**
346 * Unregister a shared memory segment.
347 *
348 * Once a segment is unregistered, we will not allocate any more slots
349 * inside that segment.
350 *
351 * The DomainSocketWatcher calls this while holding the DomainSocketWatcher
352 * lock.
353 *
354 * @param shmId The ID of the shared memory segment to unregister.
355 */
356 void unregisterShm(ShmId shmId) {
357 lock.lock();
358 try {
359 full.remove(shmId);
360 notFull.remove(shmId);
361 } finally {
362 lock.unlock();
363 }
364 }
365
366 @Override
367 public String toString() {
368 return String.format("EndpointShmManager(%s, parent=%s)",
369 datanode, DfsClientShmManager.this);
370 }
371
372 PerDatanodeVisitorInfo getVisitorInfo() {
373 return new PerDatanodeVisitorInfo(full, notFull, disabled);
374 }
375
    /**
     * Shut down the UNIX domain socket associated with a shared memory
     * segment.  The DomainSocketWatcher watching the socket will then invoke
     * DfsClientShm#handle, which cleans up the segment (see the comment in
     * freeSlot).  Failures are logged rather than propagated, since this is
     * best-effort cleanup.
     *
     * @param shm The shared memory segment whose socket should be shut down.
     */
    final void shutdown(DfsClientShm shm) {
      try {
        shm.getPeer().getDomainSocket().shutdown();
      } catch (IOException e) {
        LOG.warn(this + ": error shutting down shm: got IOException calling " +
            "shutdown(SHUT_RDWR)", e);
      }
    }
384 }
385
  /**
   * True if this manager has been closed.
   *
   * Protected by the manager lock.
   */
  private boolean closed = false;

  /**
   * The manager lock, protecting the mutable state of this class and of the
   * per-datanode EndpointShmManagers.
   */
  private final ReentrantLock lock = new ReentrantLock();

  /**
   * A condition variable which is signalled when we finish loading a segment
   * from the Datanode.
   */
  private final Condition finishedLoading = lock.newCondition();

  /**
   * Information about each Datanode.
   *
   * Protected by the manager lock.
   */
  private final HashMap<DatanodeInfo, EndpointShmManager> datanodes =
      new HashMap<DatanodeInfo, EndpointShmManager>(1);

  /**
   * The DomainSocketWatcher which keeps track of the UNIX domain socket
   * associated with each shared memory segment.
   *
   * Note: because the DomainSocketWatcher makes callbacks into this
   * DfsClientShmManager object, you MUST NOT attempt to take the
   * DomainSocketWatcher lock while holding the DfsClientShmManager lock,
   * or else deadlock might result.  This means that most DomainSocketWatcher
   * methods are off-limits unless you release the manager lock first.
   */
  private final DomainSocketWatcher domainSocketWatcher;
413
  /**
   * Create a DfsClientShmManager.
   *
   * @param interruptCheckPeriodMs Passed through to the
   *          {@link DomainSocketWatcher} constructor.
   * @throws IOException If the DomainSocketWatcher could not be created.
   */
  DfsClientShmManager(int interruptCheckPeriodMs) throws IOException {
    this.domainSocketWatcher = new DomainSocketWatcher(interruptCheckPeriodMs);
  }
417
418 public Slot allocSlot(DatanodeInfo datanode, DomainPeer peer,
419 MutableBoolean usedPeer, ExtendedBlockId blockId,
420 String clientName) throws IOException {
421 lock.lock();
422 try {
423 if (closed) {
424 LOG.trace(this + ": the DfsClientShmManager isclosed.");
425 return null;
426 }
427 EndpointShmManager shmManager = datanodes.get(datanode);
428 if (shmManager == null) {
429 shmManager = new EndpointShmManager(datanode);
430 datanodes.put(datanode, shmManager);
431 }
432 return shmManager.allocSlot(peer, usedPeer, clientName, blockId);
433 } finally {
434 lock.unlock();
435 }
436 }
437
438 public void freeSlot(Slot slot) {
439 lock.lock();
440 try {
441 DfsClientShm shm = (DfsClientShm)slot.getShm();
442 shm.getEndpointShmManager().freeSlot(slot);
443 } finally {
444 lock.unlock();
445 }
446 }
447
448 @VisibleForTesting
449 public static class PerDatanodeVisitorInfo {
450 public final TreeMap<ShmId, DfsClientShm> full;
451 public final TreeMap<ShmId, DfsClientShm> notFull;
452 public final boolean disabled;
453
454 PerDatanodeVisitorInfo(TreeMap<ShmId, DfsClientShm> full,
455 TreeMap<ShmId, DfsClientShm> notFull, boolean disabled) {
456 this.full = full;
457 this.notFull = notFull;
458 this.disabled = disabled;
459 }
460 }
461
  /**
   * A callback passed to {@link DfsClientShmManager#visit}, which receives a
   * snapshot of the manager's per-datanode state.  Used for testing.
   */
  @VisibleForTesting
  public interface Visitor {
    void visit(HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info)
        throws IOException;
  }
467
468 @VisibleForTesting
469 public void visit(Visitor visitor) throws IOException {
470 lock.lock();
471 try {
472 HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info =
473 new HashMap<DatanodeInfo, PerDatanodeVisitorInfo>();
474 for (Entry<DatanodeInfo, EndpointShmManager> entry :
475 datanodes.entrySet()) {
476 info.put(entry.getKey(), entry.getValue().getVisitorInfo());
477 }
478 visitor.visit(info);
479 } finally {
480 lock.unlock();
481 }
482 }
483
484 /**
485 * Close the DfsClientShmManager.
486 */
487 @Override
488 public void close() throws IOException {
489 lock.lock();
490 try {
491 if (closed) return;
492 closed = true;
493 } finally {
494 lock.unlock();
495 }
496 // When closed, the domainSocketWatcher will issue callbacks that mark
497 // all the outstanding DfsClientShm segments as stale.
498 IOUtils.cleanup(LOG, domainSocketWatcher);
499 }
500
501
502 @Override
503 public String toString() {
504 return String.format("ShortCircuitShmManager(%08x)",
505 System.identityHashCode(this));
506 }
507
  /** Expose the DomainSocketWatcher, for testing. */
  @VisibleForTesting
  public DomainSocketWatcher getDomainSocketWatcher() {
    return domainSocketWatcher;
  }
512 }