001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.shortcircuit;
019    
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.reflect.Field;
import java.security.SecureRandom;
import java.util.BitSet;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Random;
027    
028    import org.apache.commons.lang.builder.EqualsBuilder;
029    import org.apache.commons.lang.builder.HashCodeBuilder;
030    import org.apache.commons.logging.Log;
031    import org.apache.commons.logging.LogFactory;
032    import org.apache.hadoop.fs.InvalidRequestException;
033    import org.apache.hadoop.hdfs.ExtendedBlockId;
034    import org.apache.hadoop.io.nativeio.NativeIO;
035    import org.apache.hadoop.io.nativeio.NativeIO.POSIX;
036    import org.apache.hadoop.util.Shell;
037    import org.apache.hadoop.util.StringUtils;
038    
039    import sun.misc.Unsafe;
040    
041    import com.google.common.base.Preconditions;
042    import com.google.common.collect.ComparisonChain;
043    import com.google.common.primitives.Ints;
044    
045    /**
046     * A shared memory segment used to implement short-circuit reads.
047     */
048    public class ShortCircuitShm {
049      private static final Log LOG = LogFactory.getLog(ShortCircuitShm.class);
050    
051      protected static final int BYTES_PER_SLOT = 64;
052    
053      private static final Unsafe unsafe = safetyDance();
054    
055      private static Unsafe safetyDance() {
056        try {
057          Field f = Unsafe.class.getDeclaredField("theUnsafe");
058          f.setAccessible(true);
059          return (Unsafe)f.get(null);
060        } catch (Throwable e) {
061          LOG.error("failed to load misc.Unsafe", e);
062        }
063        return null;
064      }
065    
066      /**
067       * Calculate the usable size of a shared memory segment.
068       * We round down to a multiple of the slot size and do some validation.
069       *
070       * @param stream The stream we're using.
071       * @return       The usable size of the shared memory segment.
072       */
073      private static int getUsableLength(FileInputStream stream)
074          throws IOException {
075        int intSize = Ints.checkedCast(stream.getChannel().size());
076        int slots = intSize / BYTES_PER_SLOT;
077        if (slots == 0) {
078          throw new IOException("size of shared memory segment was " +
079              intSize + ", but that is not enough to hold even one slot.");
080        }
081        return slots * BYTES_PER_SLOT;
082      }
083    
084      /**
085       * Identifies a DfsClientShm.
086       */
087      public static class ShmId implements Comparable<ShmId> {
088        private static final Random random = new Random();
089        private final long hi;
090        private final long lo;
091    
092        /**
093         * Generate a random ShmId.
094         * 
095         * We generate ShmIds randomly to prevent a malicious client from
096         * successfully guessing one and using that to interfere with another
097         * client.
098         */
099        public static ShmId createRandom() {
100          return new ShmId(random.nextLong(), random.nextLong());
101        }
102    
103        public ShmId(long hi, long lo) {
104          this.hi = hi;
105          this.lo = lo;
106        }
107        
108        public long getHi() {
109          return hi;
110        }
111        
112        public long getLo() {
113          return lo;
114        }
115    
116        @Override
117        public boolean equals(Object o) {
118          if ((o == null) || (o.getClass() != this.getClass())) {
119            return false;
120          }
121          ShmId other = (ShmId)o;
122          return new EqualsBuilder().
123              append(hi, other.hi).
124              append(lo, other.lo).
125              isEquals();
126        }
127    
128        @Override
129        public int hashCode() {
130          return new HashCodeBuilder().
131              append(this.hi).
132              append(this.lo).
133              toHashCode();
134        }
135    
136        @Override
137        public String toString() {
138          return String.format("%016x%016x", hi, lo);
139        }
140    
141        @Override
142        public int compareTo(ShmId other) {
143          return ComparisonChain.start().
144              compare(hi, other.hi).
145              compare(lo, other.lo).
146              result();
147        }
148      };
149    
150      /**
151       * Uniquely identifies a slot.
152       */
153      public static class SlotId {
154        private final ShmId shmId;
155        private final int slotIdx;
156        
157        public SlotId(ShmId shmId, int slotIdx) {
158          this.shmId = shmId;
159          this.slotIdx = slotIdx;
160        }
161    
162        public ShmId getShmId() {
163          return shmId;
164        }
165    
166        public int getSlotIdx() {
167          return slotIdx;
168        }
169    
170        @Override
171        public boolean equals(Object o) {
172          if ((o == null) || (o.getClass() != this.getClass())) {
173            return false;
174          }
175          SlotId other = (SlotId)o;
176          return new EqualsBuilder().
177              append(shmId, other.shmId).
178              append(slotIdx, other.slotIdx).
179              isEquals();
180        }
181    
182        @Override
183        public int hashCode() {
184          return new HashCodeBuilder().
185              append(this.shmId).
186              append(this.slotIdx).
187              toHashCode();
188        }
189    
190        @Override
191        public String toString() {
192          return String.format("SlotId(%s:%d)", shmId.toString(), slotIdx);
193        }
194      }
195    
196      public class SlotIterator implements Iterator<Slot> {
197        int slotIdx = -1;
198    
199        @Override
200        public boolean hasNext() {
201          synchronized (ShortCircuitShm.this) {
202            return allocatedSlots.nextSetBit(slotIdx + 1) != -1;
203          }
204        }
205    
206        @Override
207        public Slot next() {
208          synchronized (ShortCircuitShm.this) {
209            int nextSlotIdx = allocatedSlots.nextSetBit(slotIdx + 1);
210            if (nextSlotIdx == -1) {
211              throw new NoSuchElementException();
212            }
213            slotIdx = nextSlotIdx;
214            return slots[nextSlotIdx];
215          }
216        }
217    
218        @Override
219        public void remove() {
220          throw new UnsupportedOperationException("SlotIterator " +
221              "doesn't support removal");
222        }
223      }
224      
225      /**
226       * A slot containing information about a replica.
227       *
228       * The format is:
229       * word 0
230       *   bit 0:32   Slot flags (see below).
231       *   bit 33:63  Anchor count.
232       * word 1:7
233       *   Reserved for future use, such as statistics.
234       *   Padding is also useful for avoiding false sharing.
235       *
236       * Little-endian versus big-endian is not relevant here since both the client
237       * and the server reside on the same computer and use the same orientation.
238       */
239      public class Slot {
240        /**
241         * Flag indicating that the slot is valid.  
242         * 
243         * The DFSClient sets this flag when it allocates a new slot within one of
244         * its shared memory regions.
245         * 
246         * The DataNode clears this flag when the replica associated with this slot
247         * is no longer valid.  The client itself also clears this flag when it
248         * believes that the DataNode is no longer using this slot to communicate.
249         */
250        private static final long VALID_FLAG =          1L<<63;
251    
252        /**
253         * Flag indicating that the slot can be anchored.
254         */
255        private static final long ANCHORABLE_FLAG =     1L<<62;
256    
257        /**
258         * The slot address in memory.
259         */
260        private final long slotAddress;
261    
262        /**
263         * BlockId of the block this slot is used for.
264         */
265        private final ExtendedBlockId blockId;
266    
267        Slot(long slotAddress, ExtendedBlockId blockId) {
268          this.slotAddress = slotAddress;
269          this.blockId = blockId;
270        }
271    
272        /**
273         * Get the short-circuit memory segment associated with this Slot.
274         *
275         * @return      The enclosing short-circuit memory segment.
276         */
277        public ShortCircuitShm getShm() {
278          return ShortCircuitShm.this;
279        }
280    
281        /**
282         * Get the ExtendedBlockId associated with this slot.
283         *
284         * @return      The ExtendedBlockId of this slot.
285         */
286        public ExtendedBlockId getBlockId() {
287          return blockId;
288        }
289    
290        /**
291         * Get the SlotId of this slot, containing both shmId and slotIdx.
292         *
293         * @return      The SlotId of this slot.
294         */
295        public SlotId getSlotId() {
296          return new SlotId(getShmId(), getSlotIdx());
297        }
298    
299        /**
300         * Get the Slot index.
301         *
302         * @return      The index of this slot.
303         */
304        public int getSlotIdx() {
305          return Ints.checkedCast(
306              (slotAddress - baseAddress) / BYTES_PER_SLOT);
307        }
308    
309        /**
310         * Clear the slot.
311         */
312        void clear() {
313          unsafe.putLongVolatile(null, this.slotAddress, 0);
314        }
315    
316        private boolean isSet(long flag) {
317          long prev = unsafe.getLongVolatile(null, this.slotAddress);
318          return (prev & flag) != 0;
319        }
320    
321        private void setFlag(long flag) {
322          long prev;
323          do {
324            prev = unsafe.getLongVolatile(null, this.slotAddress);
325            if ((prev & flag) != 0) {
326              return;
327            }
328          } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
329                      prev, prev | flag));
330        }
331    
332        private void clearFlag(long flag) {
333          long prev;
334          do {
335            prev = unsafe.getLongVolatile(null, this.slotAddress);
336            if ((prev & flag) == 0) {
337              return;
338            }
339          } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
340                      prev, prev & (~flag)));
341        }
342        
343        public boolean isValid() {
344          return isSet(VALID_FLAG);
345        }
346    
347        public void makeValid() {
348          setFlag(VALID_FLAG);
349        }
350    
351        public void makeInvalid() {
352          clearFlag(VALID_FLAG);
353        }
354    
355        public boolean isAnchorable() {
356          return isSet(ANCHORABLE_FLAG);
357        }
358    
359        public void makeAnchorable() {
360          setFlag(ANCHORABLE_FLAG);
361        }
362    
363        public void makeUnanchorable() {
364          clearFlag(ANCHORABLE_FLAG);
365        }
366    
367        public boolean isAnchored() {
368          long prev = unsafe.getLongVolatile(null, this.slotAddress);
369          if ((prev & VALID_FLAG) == 0) {
370            // Slot is no longer valid.
371            return false;
372          }
373          return ((prev & 0x7fffffff) != 0);
374        }
375    
376        /**
377         * Try to add an anchor for a given slot.
378         *
379         * When a slot is anchored, we know that the block it refers to is resident
380         * in memory.
381         *
382         * @return          True if the slot is anchored.
383         */
384        public boolean addAnchor() {
385          long prev;
386          do {
387            prev = unsafe.getLongVolatile(null, this.slotAddress);
388            if ((prev & VALID_FLAG) == 0) {
389              // Slot is no longer valid.
390              return false;
391            }
392            if ((prev & ANCHORABLE_FLAG) == 0) {
393              // Slot can't be anchored right now.
394              return false;
395            }
396            if ((prev & 0x7fffffff) == 0x7fffffff) {
397              // Too many other threads have anchored the slot (2 billion?)
398              return false;
399            }
400          } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
401                      prev, prev + 1));
402          return true;
403        }
404    
405        /**
406         * Remove an anchor for a given slot.
407         */
408        public void removeAnchor() {
409          long prev;
410          do {
411            prev = unsafe.getLongVolatile(null, this.slotAddress);
412            Preconditions.checkState((prev & 0x7fffffff) != 0,
413                "Tried to remove anchor for slot " + slotAddress +", which was " +
414                "not anchored.");
415          } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
416                      prev, prev - 1));
417        }
418    
419        @Override
420        public String toString() {
421          return "Slot(slotIdx=" + getSlotIdx() + ", shm=" + getShm() + ")";
422        }
423      }
424    
425      /**
426       * ID for this SharedMemorySegment.
427       */
428      private final ShmId shmId;
429    
430      /**
431       * The base address of the memory-mapped file.
432       */
433      private final long baseAddress;
434    
435      /**
436       * The mmapped length of the shared memory segment
437       */
438      private final int mmappedLength;
439    
440      /**
441       * The slots associated with this shared memory segment.
442       * slot[i] contains the slot at offset i * BYTES_PER_SLOT,
443       * or null if that slot is not allocated.
444       */
445      private final Slot slots[];
446    
447      /**
448       * A bitset where each bit represents a slot which is in use.
449       */
450      private final BitSet allocatedSlots;
451    
452      /**
453       * Create the ShortCircuitShm.
454       * 
455       * @param shmId       The ID to use.
456       * @param stream      The stream that we're going to use to create this 
457       *                    shared memory segment.
458       *                    
459       *                    Although this is a FileInputStream, we are going to
460       *                    assume that the underlying file descriptor is writable
461       *                    as well as readable. It would be more appropriate to use
462       *                    a RandomAccessFile here, but that class does not have
463       *                    any public accessor which returns a FileDescriptor,
464       *                    unlike FileInputStream.
465       */
466      public ShortCircuitShm(ShmId shmId, FileInputStream stream)
467            throws IOException {
468        if (!NativeIO.isAvailable()) {
469          throw new UnsupportedOperationException("NativeIO is not available.");
470        }
471        if (Shell.WINDOWS) {
472          throw new UnsupportedOperationException(
473              "DfsClientShm is not yet implemented for Windows.");
474        }
475        if (unsafe == null) {
476          throw new UnsupportedOperationException(
477              "can't use DfsClientShm because we failed to " +
478              "load misc.Unsafe.");
479        }
480        this.shmId = shmId;
481        this.mmappedLength = getUsableLength(stream);
482        this.baseAddress = POSIX.mmap(stream.getFD(), 
483            POSIX.MMAP_PROT_READ | POSIX.MMAP_PROT_WRITE, true, mmappedLength);
484        this.slots = new Slot[mmappedLength / BYTES_PER_SLOT];
485        this.allocatedSlots = new BitSet(slots.length);
486        if (LOG.isTraceEnabled()) {
487          LOG.trace("creating " + this.getClass().getSimpleName() +
488              "(shmId=" + shmId +
489              ", mmappedLength=" + mmappedLength +
490              ", baseAddress=" + String.format("%x", baseAddress) +
491              ", slots.length=" + slots.length + ")");
492        }
493      }
494    
495      public final ShmId getShmId() {
496        return shmId;
497      }
498      
499      /**
500       * Determine if this shared memory object is empty.
501       *
502       * @return    True if the shared memory object is empty.
503       */
504      synchronized final public boolean isEmpty() {
505        return allocatedSlots.nextSetBit(0) == -1;
506      }
507    
508      /**
509       * Determine if this shared memory object is full.
510       *
511       * @return    True if the shared memory object is full.
512       */
513      synchronized final public boolean isFull() {
514        return allocatedSlots.nextClearBit(0) >= slots.length;
515      }
516    
517      /**
518       * Calculate the base address of a slot.
519       *
520       * @param slotIdx   Index of the slot.
521       * @return          The base address of the slot.
522       */
523      private final long calculateSlotAddress(int slotIdx) {
524        long offset = slotIdx;
525        offset *= BYTES_PER_SLOT;
526        return this.baseAddress + offset;
527      }
528    
529      /**
530       * Allocate a new slot and register it.
531       *
532       * This function chooses an empty slot, initializes it, and then returns
533       * the relevant Slot object.
534       *
535       * @return    The new slot.
536       */
537      synchronized public final Slot allocAndRegisterSlot(
538          ExtendedBlockId blockId) {
539        int idx = allocatedSlots.nextClearBit(0);
540        if (idx >= slots.length) {
541          throw new RuntimeException(this + ": no more slots are available.");
542        }
543        allocatedSlots.set(idx, true);
544        Slot slot = new Slot(calculateSlotAddress(idx), blockId);
545        slot.clear();
546        slot.makeValid();
547        slots[idx] = slot;
548        if (LOG.isTraceEnabled()) {
549          LOG.trace(this + ": allocAndRegisterSlot " + idx + ": allocatedSlots=" + allocatedSlots +
550                      StringUtils.getStackTrace(Thread.currentThread()));
551        }
552        return slot;
553      }
554    
555      synchronized public final Slot getSlot(int slotIdx)
556          throws InvalidRequestException {
557        if (!allocatedSlots.get(slotIdx)) {
558          throw new InvalidRequestException(this + ": slot " + slotIdx +
559              " does not exist.");
560        }
561        return slots[slotIdx];
562      }
563    
564      /**
565       * Register a slot.
566       *
567       * This function looks at a slot which has already been initialized (by
568       * another process), and registers it with us.  Then, it returns the 
569       * relevant Slot object.
570       *
571       * @return    The slot.
572       *
573       * @throws InvalidRequestException
574       *            If the slot index we're trying to allocate has not been
575       *            initialized, or is already in use.
576       */
577      synchronized public final Slot registerSlot(int slotIdx,
578          ExtendedBlockId blockId) throws InvalidRequestException {
579        if (slotIdx < 0) {
580          throw new InvalidRequestException(this + ": invalid negative slot " +
581              "index " + slotIdx);
582        }
583        if (slotIdx >= slots.length) {
584          throw new InvalidRequestException(this + ": invalid slot " +
585              "index " + slotIdx);
586        }
587        if (allocatedSlots.get(slotIdx)) {
588          throw new InvalidRequestException(this + ": slot " + slotIdx +
589              " is already in use.");
590        }
591        Slot slot = new Slot(calculateSlotAddress(slotIdx), blockId);
592        if (!slot.isValid()) {
593          throw new InvalidRequestException(this + ": slot " + slotIdx +
594              " is not marked as valid.");
595        }
596        slots[slotIdx] = slot;
597        allocatedSlots.set(slotIdx, true);
598        if (LOG.isTraceEnabled()) {
599          LOG.trace(this + ": registerSlot " + slotIdx + ": allocatedSlots=" + allocatedSlots +
600                      StringUtils.getStackTrace(Thread.currentThread()));
601        }
602        return slot;
603      }
604    
605      /**
606       * Unregisters a slot.
607       * 
608       * This doesn't alter the contents of the slot.  It just means
609       *
610       * @param slotIdx  Index of the slot to unregister.
611       */
612      synchronized public final void unregisterSlot(int slotIdx) {
613        Preconditions.checkState(allocatedSlots.get(slotIdx),
614            "tried to unregister slot " + slotIdx + ", which was not registered.");
615        allocatedSlots.set(slotIdx, false);
616        slots[slotIdx] = null;
617        if (LOG.isTraceEnabled()) {
618          LOG.trace(this + ": unregisterSlot " + slotIdx);
619        }
620      }
621      
622      /**
623       * Iterate over all allocated slots.
624       * 
625       * Note that this method isn't safe if 
626       *
627       * @return        The slot iterator.
628       */
629      public SlotIterator slotIterator() {
630        return new SlotIterator();
631      }
632    
633      public void free() {
634        try {
635          POSIX.munmap(baseAddress, mmappedLength);
636        } catch (IOException e) {
637          LOG.warn(this + ": failed to munmap", e);
638        }
639        LOG.trace(this + ": freed");
640      }
641      
642      @Override
643      public String toString() {
644        return this.getClass().getSimpleName() + "(" + shmId + ")";
645      }
646    }