001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.server.datanode;
019    
020    import java.io.File;
021    import java.io.FileOutputStream;
022    import java.io.IOException;
023    import java.io.RandomAccessFile;
024    
025    import org.apache.hadoop.hdfs.protocol.Block;
026    import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
027    import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
028    import org.apache.hadoop.hdfs.server.datanode.fsdataset.ReplicaOutputStreams;
029    import org.apache.hadoop.io.IOUtils;
030    import org.apache.hadoop.util.DataChecksum;
031    import org.apache.hadoop.util.StringUtils;
032    
033    /** 
034     * This class defines a replica in a pipeline, which
035     * includes a persistent replica being written to by a dfs client or
036     * a temporary replica being replicated by a source datanode or
037     * being copied for the balancing purpose.
038     * 
039     * The base class implements a temporary replica
040     */
041    public class ReplicaInPipeline extends ReplicaInfo
042                            implements ReplicaInPipelineInterface {
043      private long bytesAcked;
044      private long bytesOnDisk;
045      private byte[] lastChecksum;  
046      private Thread writer;
047    
048      /**
049       * Bytes reserved for this replica on the containing volume.
050       * Based off difference between the estimated maximum block length and
051       * the bytes already written to this block.
052       */
053      private long bytesReserved;
054      
055      /**
056       * Constructor for a zero length replica
057       * @param blockId block id
058       * @param genStamp replica generation stamp
059       * @param vol volume where replica is located
060       * @param dir directory path where block and meta files are located
061       * @param bytesToReserve disk space to reserve for this replica, based on
062       *                       the estimated maximum block length.
063       */
064      public ReplicaInPipeline(long blockId, long genStamp, 
065            FsVolumeSpi vol, File dir, long bytesToReserve) {
066        this(blockId, 0L, genStamp, vol, dir, Thread.currentThread(), bytesToReserve);
067      }
068    
069      /**
070       * Constructor
071       * @param block a block
072       * @param vol volume where replica is located
073       * @param dir directory path where block and meta files are located
074       * @param writer a thread that is writing to this replica
075       */
076      ReplicaInPipeline(Block block, 
077          FsVolumeSpi vol, File dir, Thread writer) {
078        this( block.getBlockId(), block.getNumBytes(), block.getGenerationStamp(),
079            vol, dir, writer, 0L);
080      }
081    
082      /**
083       * Constructor
084       * @param blockId block id
085       * @param len replica length
086       * @param genStamp replica generation stamp
087       * @param vol volume where replica is located
088       * @param dir directory path where block and meta files are located
089       * @param writer a thread that is writing to this replica
090       * @param bytesToReserve disk space to reserve for this replica, based on
091       *                       the estimated maximum block length.
092       */
093      ReplicaInPipeline(long blockId, long len, long genStamp,
094          FsVolumeSpi vol, File dir, Thread writer, long bytesToReserve) {
095        super( blockId, len, genStamp, vol, dir);
096        this.bytesAcked = len;
097        this.bytesOnDisk = len;
098        this.writer = writer;
099        this.bytesReserved = bytesToReserve;
100      }
101    
102      /**
103       * Copy constructor.
104       * @param from where to copy from
105       */
106      public ReplicaInPipeline(ReplicaInPipeline from) {
107        super(from);
108        this.bytesAcked = from.getBytesAcked();
109        this.bytesOnDisk = from.getBytesOnDisk();
110        this.writer = from.writer;
111        this.bytesReserved = from.bytesReserved;
112      }
113    
114      @Override
115      public long getVisibleLength() {
116        return -1;
117      }
118      
119      @Override  //ReplicaInfo
120      public ReplicaState getState() {
121        return ReplicaState.TEMPORARY;
122      }
123      
124      @Override // ReplicaInPipelineInterface
125      public long getBytesAcked() {
126        return bytesAcked;
127      }
128      
129      @Override // ReplicaInPipelineInterface
130      public void setBytesAcked(long bytesAcked) {
131        long newBytesAcked = bytesAcked - this.bytesAcked;
132        this.bytesAcked = bytesAcked;
133    
134        // Once bytes are ACK'ed we can release equivalent space from the
135        // volume's reservedForRbw count. We could have released it as soon
136        // as the write-to-disk completed but that would be inefficient.
137        getVolume().releaseReservedSpace(newBytesAcked);
138        bytesReserved -= newBytesAcked;
139      }
140      
141      @Override // ReplicaInPipelineInterface
142      public long getBytesOnDisk() {
143        return bytesOnDisk;
144      }
145    
146      @Override
147      public long getBytesReserved() {
148        return bytesReserved;
149      }
150      
151      @Override // ReplicaInPipelineInterface
152      public synchronized void setLastChecksumAndDataLen(long dataLength, byte[] lastChecksum) {
153        this.bytesOnDisk = dataLength;
154        this.lastChecksum = lastChecksum;
155      }
156      
157      @Override // ReplicaInPipelineInterface
158      public synchronized ChunkChecksum getLastChecksumAndDataLen() {
159        return new ChunkChecksum(getBytesOnDisk(), lastChecksum);
160      }
161    
162      /**
163       * Set the thread that is writing to this replica
164       * @param writer a thread writing to this replica
165       */
166      public void setWriter(Thread writer) {
167        this.writer = writer;
168      }
169      
170      @Override  // Object
171      public boolean equals(Object o) {
172        return super.equals(o);
173      }
174      
175      /**
176       * Interrupt the writing thread and wait until it dies
177       * @throws IOException the waiting is interrupted
178       */
179      public void stopWriter(long xceiverStopTimeout) throws IOException {
180        if (writer != null && writer != Thread.currentThread() && writer.isAlive()) {
181          writer.interrupt();
182          try {
183            writer.join(xceiverStopTimeout);
184            if (writer.isAlive()) {
185              final String msg = "Join on writer thread " + writer + " timed out";
186              DataNode.LOG.warn(msg + "\n" + StringUtils.getStackTrace(writer));
187              throw new IOException(msg);
188            }
189          } catch (InterruptedException e) {
190            throw new IOException("Waiting for writer thread is interrupted.");
191          }
192        }
193      }
194      
195      @Override  // Object
196      public int hashCode() {
197        return super.hashCode();
198      }
199      
200      @Override // ReplicaInPipelineInterface
201      public ReplicaOutputStreams createStreams(boolean isCreate, 
202          DataChecksum requestedChecksum) throws IOException {
203        File blockFile = getBlockFile();
204        File metaFile = getMetaFile();
205        if (DataNode.LOG.isDebugEnabled()) {
206          DataNode.LOG.debug("writeTo blockfile is " + blockFile +
207                             " of size " + blockFile.length());
208          DataNode.LOG.debug("writeTo metafile is " + metaFile +
209                             " of size " + metaFile.length());
210        }
211        long blockDiskSize = 0L;
212        long crcDiskSize = 0L;
213        
214        // the checksum that should actually be used -- this
215        // may differ from requestedChecksum for appends.
216        final DataChecksum checksum;
217        
218        RandomAccessFile metaRAF = new RandomAccessFile(metaFile, "rw");
219        
220        if (!isCreate) {
221          // For append or recovery, we must enforce the existing checksum.
222          // Also, verify that the file has correct lengths, etc.
223          boolean checkedMeta = false;
224          try {
225            BlockMetadataHeader header = BlockMetadataHeader.readHeader(metaRAF);
226            checksum = header.getChecksum();
227            
228            if (checksum.getBytesPerChecksum() !=
229                requestedChecksum.getBytesPerChecksum()) {
230              throw new IOException("Client requested checksum " +
231                  requestedChecksum + " when appending to an existing block " +
232                  "with different chunk size: " + checksum);
233            }
234            
235            int bytesPerChunk = checksum.getBytesPerChecksum();
236            int checksumSize = checksum.getChecksumSize();
237            
238            blockDiskSize = bytesOnDisk;
239            crcDiskSize = BlockMetadataHeader.getHeaderSize() +
240              (blockDiskSize+bytesPerChunk-1)/bytesPerChunk*checksumSize;
241            if (blockDiskSize>0 && 
242                (blockDiskSize>blockFile.length() || crcDiskSize>metaFile.length())) {
243              throw new IOException("Corrupted block: " + this);
244            }
245            checkedMeta = true;
246          } finally {
247            if (!checkedMeta) {
248              // clean up in case of exceptions.
249              IOUtils.closeStream(metaRAF);
250            }
251          }
252        } else {
253          // for create, we can use the requested checksum
254          checksum = requestedChecksum;
255        }
256        
257        FileOutputStream blockOut = null;
258        FileOutputStream crcOut = null;
259        try {
260          blockOut = new FileOutputStream(
261              new RandomAccessFile( blockFile, "rw" ).getFD() );
262          crcOut = new FileOutputStream(metaRAF.getFD() );
263          if (!isCreate) {
264            blockOut.getChannel().position(blockDiskSize);
265            crcOut.getChannel().position(crcDiskSize);
266          }
267          return new ReplicaOutputStreams(blockOut, crcOut, checksum,
268              getVolume().isTransientStorage());
269        } catch (IOException e) {
270          IOUtils.closeStream(blockOut);
271          IOUtils.closeStream(metaRAF);
272          throw e;
273        }
274      }
275      
276      @Override
277      public String toString() {
278        return super.toString()
279            + "\n  bytesAcked=" + bytesAcked
280            + "\n  bytesOnDisk=" + bytesOnDisk;
281      }
282    }