001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.server.namenode;
019    
020    import static org.apache.hadoop.util.Time.now;
021    
022    import java.io.DataInput;
023    import java.io.DataInputStream;
024    import java.io.DataOutputStream;
025    import java.io.File;
026    import java.io.FileInputStream;
027    import java.io.FileNotFoundException;
028    import java.io.FileOutputStream;
029    import java.io.IOException;
030    import java.security.DigestInputStream;
031    import java.security.DigestOutputStream;
032    import java.security.MessageDigest;
033    import java.util.ArrayList;
034    import java.util.Arrays;
035    import java.util.Collection;
036    import java.util.HashMap;
037    import java.util.List;
038    import java.util.Map;
039    import java.util.TreeMap;
040    
041    import org.apache.commons.logging.Log;
042    import org.apache.hadoop.classification.InterfaceAudience;
043    import org.apache.hadoop.classification.InterfaceStability;
044    import org.apache.hadoop.conf.Configuration;
045    import org.apache.hadoop.fs.FileSystem;
046    import org.apache.hadoop.fs.Path;
047    import org.apache.hadoop.fs.PathIsNotDirectoryException;
048    import org.apache.hadoop.fs.UnresolvedLinkException;
049    import org.apache.hadoop.fs.permission.PermissionStatus;
050    import org.apache.hadoop.hdfs.DFSUtil;
051    import org.apache.hadoop.hdfs.protocol.HdfsConstants;
052    import org.apache.hadoop.hdfs.protocol.LayoutFlags;
053    import org.apache.hadoop.hdfs.protocol.LayoutVersion;
054    import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
055    import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
056    import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction;
057    import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
058    import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
059    import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
060    import org.apache.hadoop.hdfs.server.namenode.snapshot.DirectoryWithSnapshotFeature;
061    import org.apache.hadoop.hdfs.server.namenode.snapshot.FileDiffList;
062    import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
063    import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat;
064    import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat.ReferenceMap;
065    import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
066    import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
067    import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
068    import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
069    import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
070    import org.apache.hadoop.hdfs.util.ReadOnlyList;
071    import org.apache.hadoop.io.IOUtils;
072    import org.apache.hadoop.io.MD5Hash;
073    import org.apache.hadoop.io.Text;
074    import org.apache.hadoop.util.StringUtils;
075    
076    import com.google.common.annotations.VisibleForTesting;
077    import com.google.common.base.Preconditions;
078    
079    /**
080     * Contains inner classes for reading or writing the on-disk format for
081     * FSImages.
082     *
083     * In particular, the format of the FSImage looks like:
084     * <pre>
085     * FSImage {
086     *   layoutVersion: int, namespaceID: int, numberItemsInFSDirectoryTree: long,
087     *   namesystemGenerationStampV1: long, namesystemGenerationStampV2: long,
088     *   generationStampAtBlockIdSwitch: long, lastAllocatedBlockId: long,
089     *   transactionID: long, snapshotCounter: int, numberOfSnapshots: int,
090     *   numOfSnapshottableDirs: int,
091     *   {FSDirectoryTree, FilesUnderConstruction, SecretManagerState} (can be compressed)
092     * }
093     *
094     * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported) {
095     *   INodeInfo of root, numberOfChildren of root: int
096     *   [list of INodeInfo of root's children],
097     *   [list of INodeDirectoryInfo of root's directory children]
098     * }
099     *
100     * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} not supported){
101     *   [list of INodeInfo of INodes in topological order]
102     * }
103     *
104     * INodeInfo {
105     *   {
106     *     localName: short + byte[]
107     *   } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported
108     *   or
109     *   {
110     *     fullPath: byte[]
111     *   } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is not supported
112     *   replicationFactor: short, modificationTime: long,
113     *   accessTime: long, preferredBlockSize: long,
114     *   numberOfBlocks: int (-1 for INodeDirectory, -2 for INodeSymLink),
115     *   {
116     *     nsQuota: long, dsQuota: long,
117     *     {
118     *       isINodeSnapshottable: byte,
119     *       isINodeWithSnapshot: byte (if isINodeSnapshottable is false)
120     *     } (when {@link Feature#SNAPSHOT} is supported),
121     *     fsPermission: short, PermissionStatus
122     *   } for INodeDirectory
123     *   or
124     *   {
125     *     symlinkString, fsPermission: short, PermissionStatus
126     *   } for INodeSymlink
127     *   or
128     *   {
129     *     [list of BlockInfo]
130     *     [list of FileDiff]
131     *     {
132     *       isINodeFileUnderConstructionSnapshot: byte,
133     *       {clientName: short + byte[], clientMachine: short + byte[]} (when
134     *       isINodeFileUnderConstructionSnapshot is true),
135     *     } (when {@link Feature#SNAPSHOT} is supported and writing snapshotINode),
136     *     fsPermission: short, PermissionStatus
137     *   } for INodeFile
138     * }
139     *
140     * INodeDirectoryInfo {
141     *   fullPath of the directory: short + byte[],
142     *   numberOfChildren: int, [list of INodeInfo of children INode],
143     *   {
144     *     numberOfSnapshots: int,
145     *     [list of Snapshot] (when NumberOfSnapshots is positive),
146     *     numberOfDirectoryDiffs: int,
147     *     [list of DirectoryDiff] (NumberOfDirectoryDiffs is positive),
148     *     number of children that are directories,
149     *     [list of INodeDirectoryInfo of the directory children] (includes
150     *     snapshot copies of deleted sub-directories)
151     *   } (when {@link Feature#SNAPSHOT} is supported),
152     * }
153     *
154     * Snapshot {
155     *   snapshotID: int, root of Snapshot: INodeDirectoryInfo (its local name is
156     *   the name of the snapshot)
157     * }
158     *
159     * DirectoryDiff {
160     *   full path of the root of the associated Snapshot: short + byte[],
161     *   childrenSize: int,
162     *   isSnapshotRoot: byte,
163     *   snapshotINodeIsNotNull: byte (when isSnapshotRoot is false),
164     *   snapshotINode: INodeDirectory (when SnapshotINodeIsNotNull is true), Diff
165     * }
166     *
167     * Diff {
168     *   createdListSize: int, [Local name of INode in created list],
169     *   deletedListSize: int, [INode in deleted list: INodeInfo]
170     * }
171     *
172     * FileDiff {
173     *   full path of the root of the associated Snapshot: short + byte[],
174     *   fileSize: long,
175     *   snapshotINodeIsNotNull: byte,
176     *   snapshotINode: INodeFile (when SnapshotINodeIsNotNull is true), Diff
177     * }
178     * </pre>
179     */
180    @InterfaceAudience.Private
181    @InterfaceStability.Evolving
182    public class FSImageFormat {
183      private static final Log LOG = FSImage.LOG;
184    
185      // Static-only class
186      private FSImageFormat() {}
187    
188      interface AbstractLoader {
189        MD5Hash getLoadedImageMd5();
190        long getLoadedImageTxId();
191      }
192    
193      static class LoaderDelegator implements AbstractLoader {
194        private AbstractLoader impl;
195        private final Configuration conf;
196        private final FSNamesystem fsn;
197    
198        LoaderDelegator(Configuration conf, FSNamesystem fsn) {
199          this.conf = conf;
200          this.fsn = fsn;
201        }
202    
203        @Override
204        public MD5Hash getLoadedImageMd5() {
205          return impl.getLoadedImageMd5();
206        }
207    
208        @Override
209        public long getLoadedImageTxId() {
210          return impl.getLoadedImageTxId();
211        }
212    
213        public void load(File file, boolean requireSameLayoutVersion)
214            throws IOException {
215          Preconditions.checkState(impl == null, "Image already loaded!");
216    
217          FileInputStream is = null;
218          try {
219            is = new FileInputStream(file);
220            byte[] magic = new byte[FSImageUtil.MAGIC_HEADER.length];
221            IOUtils.readFully(is, magic, 0, magic.length);
222            if (Arrays.equals(magic, FSImageUtil.MAGIC_HEADER)) {
223              FSImageFormatProtobuf.Loader loader = new FSImageFormatProtobuf.Loader(
224                  conf, fsn, requireSameLayoutVersion);
225              impl = loader;
226              loader.load(file);
227            } else {
228              Loader loader = new Loader(conf, fsn);
229              impl = loader;
230              loader.load(file);
231            }
232          } finally {
233            IOUtils.cleanup(LOG, is);
234          }
235        }
236      }
237    
238      /**
239       * Construct a loader class to load the image. It chooses the loader based on
240       * the layout version.
241       */
242      public static LoaderDelegator newLoader(Configuration conf, FSNamesystem fsn) {
243        return new LoaderDelegator(conf, fsn);
244      }
245    
246      /**
247       * A one-shot class responsible for loading an image. The load() function
248       * should be called once, after which the getter methods may be used to retrieve
249       * information about the image that was loaded, if loading was successful.
250       */
251      public static class Loader implements AbstractLoader {
/** Configuration passed to the compression-header reader during load. */
private final Configuration conf;
/** The namesystem that this loader populates. */
private final FSNamesystem namesystem;

/** Flipped to true at the end of a successful load(). */
private boolean loaded;

/** Transaction ID of the last edit represented by the loaded file. */
private long imgTxId;
/** MD5 sum of the loaded file. */
private MD5Hash imgDigest;

/** Assigned during load() when the image layout supports snapshots. */
private Map<Integer, Snapshot> snapshotMap;
/** Consulted while loading directories to decide whether to read a subtree. */
private final ReferenceMap referenceMap = new ReferenceMap();

/**
 * @param conf configuration to use while loading
 * @param namesystem the namesystem to load the image into
 */
Loader(Configuration conf, FSNamesystem namesystem) {
  this.namesystem = namesystem;
  this.conf = conf;
}
271    
272        /**
273         * Return the MD5 checksum of the image that has been loaded.
274         * @throws IllegalStateException if load() has not yet been called.
275         */
276        @Override
277        public MD5Hash getLoadedImageMd5() {
278          checkLoaded();
279          return imgDigest;
280        }
281    
282        @Override
283        public long getLoadedImageTxId() {
284          checkLoaded();
285          return imgTxId;
286        }
287    
288        /**
289         * Throw IllegalStateException if load() has not yet been called.
290         */
291        private void checkLoaded() {
292          if (!loaded) {
293            throw new IllegalStateException("Image not yet loaded!");
294          }
295        }
296    
297        /**
298         * Throw IllegalStateException if load() has already been called.
299         */
300        private void checkNotLoaded() {
301          if (loaded) {
302            throw new IllegalStateException("Image already loaded!");
303          }
304        }
305    
306        public void load(File curFile) throws IOException {
307          checkNotLoaded();
308          assert curFile != null : "curFile is null";
309    
310          StartupProgress prog = NameNode.getStartupProgress();
311          Step step = new Step(StepType.INODES);
312          prog.beginStep(Phase.LOADING_FSIMAGE, step);
313          long startTime = now();
314    
315          //
316          // Load in bits
317          //
318          MessageDigest digester = MD5Hash.getDigester();
319          DigestInputStream fin = new DigestInputStream(
320               new FileInputStream(curFile), digester);
321    
322          DataInputStream in = new DataInputStream(fin);
323          try {
324            // read image version: first appeared in version -1
325            int imgVersion = in.readInt();
326            if (getLayoutVersion() != imgVersion) {
327              throw new InconsistentFSStateException(curFile, 
328                  "imgVersion " + imgVersion +
329                  " expected to be " + getLayoutVersion());
330            }
331            boolean supportSnapshot = NameNodeLayoutVersion.supports(
332                LayoutVersion.Feature.SNAPSHOT, imgVersion);
333            if (NameNodeLayoutVersion.supports(
334                LayoutVersion.Feature.ADD_LAYOUT_FLAGS, imgVersion)) {
335              LayoutFlags.read(in);
336            }
337    
338            // read namespaceID: first appeared in version -2
339            in.readInt();
340    
341            long numFiles = in.readLong();
342    
343            // read in the last generation stamp for legacy blocks.
344            long genstamp = in.readLong();
345            namesystem.setGenerationStampV1(genstamp);
346            
347            if (NameNodeLayoutVersion.supports(
348                LayoutVersion.Feature.SEQUENTIAL_BLOCK_ID, imgVersion)) {
349              // read the starting generation stamp for sequential block IDs
350              genstamp = in.readLong();
351              namesystem.setGenerationStampV2(genstamp);
352    
353              // read the last generation stamp for blocks created after
354              // the switch to sequential block IDs.
355              long stampAtIdSwitch = in.readLong();
356              namesystem.setGenerationStampV1Limit(stampAtIdSwitch);
357    
358              // read the max sequential block ID.
359              long maxSequentialBlockId = in.readLong();
360              namesystem.setLastAllocatedBlockId(maxSequentialBlockId);
361            } else {
362              long startingGenStamp = namesystem.upgradeGenerationStampToV2();
363              // This is an upgrade.
364              LOG.info("Upgrading to sequential block IDs. Generation stamp " +
365                       "for new blocks set to " + startingGenStamp);
366            }
367    
368            // read the transaction ID of the last edit represented by
369            // this image
370            if (NameNodeLayoutVersion.supports(
371                LayoutVersion.Feature.STORED_TXIDS, imgVersion)) {
372              imgTxId = in.readLong();
373            } else {
374              imgTxId = 0;
375            }
376    
377            // read the last allocated inode id in the fsimage
378            if (NameNodeLayoutVersion.supports(
379                LayoutVersion.Feature.ADD_INODE_ID, imgVersion)) {
380              long lastInodeId = in.readLong();
381              namesystem.resetLastInodeId(lastInodeId);
382              if (LOG.isDebugEnabled()) {
383                LOG.debug("load last allocated InodeId from fsimage:" + lastInodeId);
384              }
385            } else {
386              if (LOG.isDebugEnabled()) {
387                LOG.debug("Old layout version doesn't have inode id."
388                    + " Will assign new id for each inode.");
389              }
390            }
391            
392            if (supportSnapshot) {
393              snapshotMap = namesystem.getSnapshotManager().read(in, this);
394            }
395    
396            // read compression related info
397            FSImageCompression compression;
398            if (NameNodeLayoutVersion.supports(
399                LayoutVersion.Feature.FSIMAGE_COMPRESSION, imgVersion)) {
400              compression = FSImageCompression.readCompressionHeader(conf, in);
401            } else {
402              compression = FSImageCompression.createNoopCompression();
403            }
404            in = compression.unwrapInputStream(fin);
405    
406            LOG.info("Loading image file " + curFile + " using " + compression);
407            
408            // load all inodes
409            LOG.info("Number of files = " + numFiles);
410            prog.setTotal(Phase.LOADING_FSIMAGE, step, numFiles);
411            Counter counter = prog.getCounter(Phase.LOADING_FSIMAGE, step);
412            if (NameNodeLayoutVersion.supports(
413                LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, imgVersion)) {
414              if (supportSnapshot) {
415                loadLocalNameINodesWithSnapshot(numFiles, in, counter);
416              } else {
417                loadLocalNameINodes(numFiles, in, counter);
418              }
419            } else {
420              loadFullNameINodes(numFiles, in, counter);
421            }
422    
423            loadFilesUnderConstruction(in, supportSnapshot, counter);
424            prog.endStep(Phase.LOADING_FSIMAGE, step);
425            // Now that the step is finished, set counter equal to total to adjust
426            // for possible under-counting due to reference inodes.
427            prog.setCount(Phase.LOADING_FSIMAGE, step, numFiles);
428    
429            loadSecretManagerState(in);
430    
431            loadCacheManagerState(in);
432    
433            // make sure to read to the end of file
434            boolean eof = (in.read() == -1);
435            assert eof : "Should have reached the end of image file " + curFile;
436          } finally {
437            in.close();
438          }
439    
440          imgDigest = new MD5Hash(digester.digest());
441          loaded = true;
442          
443          LOG.info("Image file " + curFile + " of size " + curFile.length() +
444              " bytes loaded in " + (now() - startTime)/1000 + " seconds.");
445        }
446    
447      /** Update the root node's attributes */
448      private void updateRootAttr(INodeWithAdditionalFields root) {                                                           
449        final Quota.Counts q = root.getQuotaCounts();
450        final long nsQuota = q.get(Quota.NAMESPACE);
451        final long dsQuota = q.get(Quota.DISKSPACE);
452        FSDirectory fsDir = namesystem.dir;
453        if (nsQuota != -1 || dsQuota != -1) {
454          fsDir.rootDir.getDirectoryWithQuotaFeature().setQuota(nsQuota, dsQuota);
455        }
456        fsDir.rootDir.cloneModificationTime(root);
457        fsDir.rootDir.clonePermissionStatus(root);    
458      }
459      
460        /**
461         * Load fsimage files when 1) only local names are stored, 
462         * and 2) snapshot is supported.
463         * 
464         * @param numFiles number of files expected to be read
465         * @param in Image input stream
466         * @param counter Counter to increment for namenode startup progress
467         */
468        private void loadLocalNameINodesWithSnapshot(long numFiles, DataInput in,
469            Counter counter) throws IOException {
470          assert NameNodeLayoutVersion.supports(
471              LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, getLayoutVersion());
472          assert NameNodeLayoutVersion.supports(
473              LayoutVersion.Feature.SNAPSHOT, getLayoutVersion());
474          
475          // load root
476          loadRoot(in, counter);
477          // load rest of the nodes recursively
478          loadDirectoryWithSnapshot(in, counter);
479        }
480        
481      /** 
482       * load fsimage files assuming only local names are stored. Used when
483       * snapshots are not supported by the layout version.
484       *   
485       * @param numFiles number of files expected to be read
486       * @param in image input stream
487       * @param counter Counter to increment for namenode startup progress
488       * @throws IOException
489       */  
490       private void loadLocalNameINodes(long numFiles, DataInput in, Counter counter)
491           throws IOException {
492         assert NameNodeLayoutVersion.supports(
493             LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, getLayoutVersion());
494         assert numFiles > 0;
495    
496         // load root
497         loadRoot(in, counter);
498         // have loaded the first file (the root)
499         numFiles--; 
500    
501         // load rest of the nodes directory by directory
502         while (numFiles > 0) {
503           numFiles -= loadDirectory(in, counter);
504         }
505         if (numFiles != 0) {
506           throw new IOException("Read unexpect number of files: " + -numFiles);
507         }
508       }
509       
510        /**
511         * Load information about root, and use the information to update the root
512         * directory of NameSystem.
513         * @param in The {@link DataInput} instance to read.
514         * @param counter Counter to increment for namenode startup progress
515         */
516        private void loadRoot(DataInput in, Counter counter)
517            throws IOException {
518          // load root
519          if (in.readShort() != 0) {
520            throw new IOException("First node is not root");
521          }
522          final INodeDirectory root = loadINode(null, false, in, counter)
523            .asDirectory();
524          // update the root's attributes
525          updateRootAttr(root);
526        }
527       
528        /** Load children nodes for the parent directory. */
529        private int loadChildren(INodeDirectory parent, DataInput in,
530            Counter counter) throws IOException {
531          int numChildren = in.readInt();
532          for (int i = 0; i < numChildren; i++) {
533            // load single inode
534            INode newNode = loadINodeWithLocalName(false, in, true, counter);
535            addToParent(parent, newNode);
536          }
537          return numChildren;
538        }
539        
540        /**
541         * Load a directory when snapshot is supported.
542         * @param in The {@link DataInput} instance to read.
543         * @param counter Counter to increment for namenode startup progress
544         */
545        private void loadDirectoryWithSnapshot(DataInput in, Counter counter)
546            throws IOException {
547          // Step 1. Identify the parent INode
548          long inodeId = in.readLong();
549          final INodeDirectory parent = this.namesystem.dir.getInode(inodeId)
550              .asDirectory();
551          
552          // Check if the whole subtree has been saved (for reference nodes)
553          boolean toLoadSubtree = referenceMap.toProcessSubtree(parent.getId());
554          if (!toLoadSubtree) {
555            return;
556          }
557    
558          // Step 2. Load snapshots if parent is snapshottable
559          int numSnapshots = in.readInt();
560          if (numSnapshots >= 0) {
561            // load snapshots and snapshotQuota
562            SnapshotFSImageFormat.loadSnapshotList(parent, numSnapshots, in, this);
563            if (parent.getDirectorySnapshottableFeature().getSnapshotQuota() > 0) {
564              // add the directory to the snapshottable directory list in 
565              // SnapshotManager. Note that we only add root when its snapshot quota
566              // is positive.
567              this.namesystem.getSnapshotManager().addSnapshottable(parent);
568            }
569          }
570    
571          // Step 3. Load children nodes under parent
572          loadChildren(parent, in, counter);
573          
574          // Step 4. load Directory Diff List
575          SnapshotFSImageFormat.loadDirectoryDiffList(parent, in, this);
576          
577          // Recursively load sub-directories, including snapshot copies of deleted
578          // directories
579          int numSubTree = in.readInt();
580          for (int i = 0; i < numSubTree; i++) {
581            loadDirectoryWithSnapshot(in, counter);
582          }
583        }
584        
585       /**
586        * Load all children of a directory
587        * 
588        * @param in input to load from
589        * @param counter Counter to increment for namenode startup progress
590        * @return number of child inodes read
591        * @throws IOException
592        */
593       private int loadDirectory(DataInput in, Counter counter) throws IOException {
594         String parentPath = FSImageSerialization.readString(in);
595         // Rename .snapshot paths if we're doing an upgrade
596         parentPath = renameReservedPathsOnUpgrade(parentPath, getLayoutVersion());
597         final INodeDirectory parent = INodeDirectory.valueOf(
598             namesystem.dir.getNode(parentPath, true), parentPath);
599         return loadChildren(parent, in, counter);
600       }
601    
602      /**
603       * load fsimage files assuming full path names are stored
604       * 
605       * @param numFiles total number of files to load
606       * @param in data input stream
607       * @param counter Counter to increment for namenode startup progress
608       * @throws IOException if any error occurs
609       */
610      private void loadFullNameINodes(long numFiles, DataInput in, Counter counter)
611          throws IOException {
612        byte[][] pathComponents;
613        byte[][] parentPath = {{}};      
614        FSDirectory fsDir = namesystem.dir;
615        INodeDirectory parentINode = fsDir.rootDir;
616        for (long i = 0; i < numFiles; i++) {
617          pathComponents = FSImageSerialization.readPathComponents(in);
618          for (int j=0; j < pathComponents.length; j++) {
619            byte[] newComponent = renameReservedComponentOnUpgrade
620                (pathComponents[j], getLayoutVersion());
621            if (!Arrays.equals(newComponent, pathComponents[j])) {
622              String oldPath = DFSUtil.byteArray2PathString(pathComponents);
623              pathComponents[j] = newComponent;
624              String newPath = DFSUtil.byteArray2PathString(pathComponents);
625              LOG.info("Renaming reserved path " + oldPath + " to " + newPath);
626            }
627          }
628          final INode newNode = loadINode(
629              pathComponents[pathComponents.length-1], false, in, counter);
630    
631          if (isRoot(pathComponents)) { // it is the root
632            // update the root's attributes
633            updateRootAttr(newNode.asDirectory());
634            continue;
635          }
636    
637          namesystem.dir.addToInodeMap(newNode);
638          // check if the new inode belongs to the same parent
639          if(!isParent(pathComponents, parentPath)) {
640            parentINode = getParentINodeDirectory(pathComponents);
641            parentPath = getParent(pathComponents);
642          }
643    
644          // add new inode
645          addToParent(parentINode, newNode);
646        }
647      }
648    
649      private INodeDirectory getParentINodeDirectory(byte[][] pathComponents
650          ) throws FileNotFoundException, PathIsNotDirectoryException,
651          UnresolvedLinkException {
652        if (pathComponents.length < 2) { // root
653          return null;
654        }
655        // Gets the parent INode
656        final INodesInPath inodes = namesystem.dir.getExistingPathINodes(
657            pathComponents);
658        return INodeDirectory.valueOf(inodes.getINode(-2), pathComponents);
659      }
660    
661      /**
662       * Add the child node to parent and, if child is a file, update block map.
663       * This method is only used for image loading so that synchronization,
664       * modification time update and space count update are not needed.
665       */
666      private void addToParent(INodeDirectory parent, INode child) {
667        FSDirectory fsDir = namesystem.dir;
668        if (parent == fsDir.rootDir) {
669            child.setLocalName(renameReservedRootComponentOnUpgrade(
670                child.getLocalNameBytes(), getLayoutVersion()));
671        }
672        // NOTE: This does not update space counts for parents
673        if (!parent.addChild(child)) {
674          return;
675        }
676        namesystem.dir.cacheName(child);
677    
678        if (child.isFile()) {
679          updateBlocksMap(child.asFile());
680        }
681      }
682    
683        public void updateBlocksMap(INodeFile file) {
684          // Add file->block mapping
685          final BlockInfo[] blocks = file.getBlocks();
686          if (blocks != null) {
687            final BlockManager bm = namesystem.getBlockManager();
688            for (int i = 0; i < blocks.length; i++) {
689              file.setBlock(i, bm.addBlockCollection(blocks[i], file));
690            } 
691          }
692        }
693    
694        /** @return The FSDirectory of the namesystem where the fsimage is loaded */
695        public FSDirectory getFSDirectoryInLoading() {
696          return namesystem.dir;
697        }
698    
699        public INode loadINodeWithLocalName(boolean isSnapshotINode, DataInput in,
700            boolean updateINodeMap) throws IOException {
701          return loadINodeWithLocalName(isSnapshotINode, in, updateINodeMap, null);
702        }
703    
704        public INode loadINodeWithLocalName(boolean isSnapshotINode,
705            DataInput in, boolean updateINodeMap, Counter counter)
706            throws IOException {
707          byte[] localName = FSImageSerialization.readLocalName(in);
708          localName =
709              renameReservedComponentOnUpgrade(localName, getLayoutVersion());
710          INode inode = loadINode(localName, isSnapshotINode, in, counter);
711          if (updateINodeMap) {
712            namesystem.dir.addToInodeMap(inode);
713          }
714          return inode;
715        }
716      
717      /**
718       * load an inode from fsimage except for its name
719       * 
720       * @param in data input stream from which image is read
721       * @param counter Counter to increment for namenode startup progress
722       * @return an inode
723       */
724      @SuppressWarnings("deprecation")
725      INode loadINode(final byte[] localName, boolean isSnapshotINode,
726          DataInput in, Counter counter) throws IOException {
727        final int imgVersion = getLayoutVersion();
728        if (NameNodeLayoutVersion.supports(
729            LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
730          namesystem.getFSDirectory().verifyINodeName(localName);
731        }
732    
733        long inodeId = NameNodeLayoutVersion.supports(
734            LayoutVersion.Feature.ADD_INODE_ID, imgVersion) ? in.readLong()
735            : namesystem.allocateNewInodeId();
736        
737        final short replication = namesystem.getBlockManager().adjustReplication(
738            in.readShort());
739        final long modificationTime = in.readLong();
740        long atime = 0;
741        if (NameNodeLayoutVersion.supports(
742            LayoutVersion.Feature.FILE_ACCESS_TIME, imgVersion)) {
743          atime = in.readLong();
744        }
745        final long blockSize = in.readLong();
746        final int numBlocks = in.readInt();
747    
748        if (numBlocks >= 0) {
749          // file
750          
751          // read blocks
752          BlockInfo[] blocks = new BlockInfo[numBlocks];
753          for (int j = 0; j < numBlocks; j++) {
754            blocks[j] = new BlockInfo(replication);
755            blocks[j].readFields(in);
756          }
757    
758          String clientName = "";
759          String clientMachine = "";
760          boolean underConstruction = false;
761          FileDiffList fileDiffs = null;
762          if (NameNodeLayoutVersion.supports(
763              LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
764            // read diffs
765            fileDiffs = SnapshotFSImageFormat.loadFileDiffList(in, this);
766    
767            if (isSnapshotINode) {
768              underConstruction = in.readBoolean();
769              if (underConstruction) {
770                clientName = FSImageSerialization.readString(in);
771                clientMachine = FSImageSerialization.readString(in);
772                // convert the last block to BlockUC
773                if (blocks.length > 0) {
774                  BlockInfo lastBlk = blocks[blocks.length - 1]; 
775                  blocks[blocks.length - 1] = new BlockInfoUnderConstruction(
776                      lastBlk, replication);
777                }
778              }
779            }
780          }
781    
782          final PermissionStatus permissions = PermissionStatus.read(in);
783    
784          // return
785          if (counter != null) {
786            counter.increment();
787          }
788    
789          final INodeFile file = new INodeFile(inodeId, localName, permissions,
790              modificationTime, atime, blocks, replication, blockSize, (byte)0);
791          if (underConstruction) {
792            file.toUnderConstruction(clientName, clientMachine);
793          }
794            return fileDiffs == null ? file : new INodeFile(file, fileDiffs);
795          } else if (numBlocks == -1) {
796            //directory
797          
798          //read quotas
799          final long nsQuota = in.readLong();
800          long dsQuota = -1L;
801          if (NameNodeLayoutVersion.supports(
802              LayoutVersion.Feature.DISKSPACE_QUOTA, imgVersion)) {
803            dsQuota = in.readLong();
804          }
805    
806          //read snapshot info
807          boolean snapshottable = false;
808          boolean withSnapshot = false;
809          if (NameNodeLayoutVersion.supports(
810              LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
811            snapshottable = in.readBoolean();
812            if (!snapshottable) {
813              withSnapshot = in.readBoolean();
814            }
815          }
816    
817          final PermissionStatus permissions = PermissionStatus.read(in);
818    
819          //return
820          if (counter != null) {
821            counter.increment();
822          }
823          final INodeDirectory dir = new INodeDirectory(inodeId, localName,
824              permissions, modificationTime);
825          if (nsQuota >= 0 || dsQuota >= 0) {
826            dir.addDirectoryWithQuotaFeature(nsQuota, dsQuota);
827          }
828          if (withSnapshot) {
829            dir.addSnapshotFeature(null);
830          }
831          if (snapshottable) {
832            dir.addSnapshottableFeature();
833          }
834          return dir;
835        } else if (numBlocks == -2) {
836          //symlink
837          if (!FileSystem.areSymlinksEnabled()) {
838            throw new IOException("Symlinks not supported - please remove symlink before upgrading to this version of HDFS");
839          }
840    
841          final String symlink = Text.readString(in);
842          final PermissionStatus permissions = PermissionStatus.read(in);
843          if (counter != null) {
844            counter.increment();
845          }
846          return new INodeSymlink(inodeId, localName, permissions,
847              modificationTime, atime, symlink);
848        } else if (numBlocks == -3) {
849          //reference
850          // Intentionally do not increment counter, because it is too difficult at
851          // this point to assess whether or not this is a reference that counts
852          // toward quota.
853          
854          final boolean isWithName = in.readBoolean();
855          // lastSnapshotId for WithName node, dstSnapshotId for DstReference node
856          int snapshotId = in.readInt();
857          
858          final INodeReference.WithCount withCount
859              = referenceMap.loadINodeReferenceWithCount(isSnapshotINode, in, this);
860    
861          if (isWithName) {
862              return new INodeReference.WithName(null, withCount, localName,
863                  snapshotId);
864          } else {
865            final INodeReference ref = new INodeReference.DstReference(null,
866                withCount, snapshotId);
867            return ref;
868          }
869        }
870        
871        throw new IOException("Unknown inode type: numBlocks=" + numBlocks);
872      }
873    
874        /** Load {@link INodeFileAttributes}. */
875        public INodeFileAttributes loadINodeFileAttributes(DataInput in)
876            throws IOException {
877          final int layoutVersion = getLayoutVersion();
878          
879          if (!NameNodeLayoutVersion.supports(
880              LayoutVersion.Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
881            return loadINodeWithLocalName(true, in, false).asFile();
882          }
883      
884          final byte[] name = FSImageSerialization.readLocalName(in);
885          final PermissionStatus permissions = PermissionStatus.read(in);
886          final long modificationTime = in.readLong();
887          final long accessTime = in.readLong();
888      
889          final short replication = namesystem.getBlockManager().adjustReplication(
890              in.readShort());
891          final long preferredBlockSize = in.readLong();
892    
893          return new INodeFileAttributes.SnapshotCopy(name, permissions, null, modificationTime,
894              accessTime, replication, preferredBlockSize, (byte) 0, null);
895        }
896    
897        public INodeDirectoryAttributes loadINodeDirectoryAttributes(DataInput in)
898            throws IOException {
899          final int layoutVersion = getLayoutVersion();
900          
901          if (!NameNodeLayoutVersion.supports(
902              LayoutVersion.Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
903            return loadINodeWithLocalName(true, in, false).asDirectory();
904          }
905      
906          final byte[] name = FSImageSerialization.readLocalName(in);
907          final PermissionStatus permissions = PermissionStatus.read(in);
908          final long modificationTime = in.readLong();
909          
910          //read quotas
911          final long nsQuota = in.readLong();
912          final long dsQuota = in.readLong();
913      
914          return nsQuota == -1L && dsQuota == -1L ? new INodeDirectoryAttributes.SnapshotCopy(
915              name, permissions, null, modificationTime, null)
916            : new INodeDirectoryAttributes.CopyWithQuota(name, permissions,
917                null, modificationTime, nsQuota, dsQuota, null);
918        }
919      
920        private void loadFilesUnderConstruction(DataInput in,
921            boolean supportSnapshot, Counter counter) throws IOException {
922          FSDirectory fsDir = namesystem.dir;
923          int size = in.readInt();
924    
925          LOG.info("Number of files under construction = " + size);
926    
927          for (int i = 0; i < size; i++) {
928            INodeFile cons = FSImageSerialization.readINodeUnderConstruction(in,
929                namesystem, getLayoutVersion());
930            counter.increment();
931    
932            // verify that file exists in namespace
933            String path = cons.getLocalName();
934            INodeFile oldnode = null;
935            boolean inSnapshot = false;
936            if (path != null && FSDirectory.isReservedName(path) && 
937                NameNodeLayoutVersion.supports(
938                    LayoutVersion.Feature.ADD_INODE_ID, getLayoutVersion())) {
939              // TODO: for HDFS-5428, we use reserved path for those INodeFileUC in
940              // snapshot. If we support INode ID in the layout version, we can use
941              // the inode id to find the oldnode.
942              oldnode = namesystem.dir.getInode(cons.getId()).asFile();
943              inSnapshot = true;
944            } else {
945              path = renameReservedPathsOnUpgrade(path, getLayoutVersion());
946              final INodesInPath iip = fsDir.getLastINodeInPath(path);
947              oldnode = INodeFile.valueOf(iip.getINode(0), path);
948            }
949    
950            FileUnderConstructionFeature uc = cons.getFileUnderConstructionFeature();
951            oldnode.toUnderConstruction(uc.getClientName(), uc.getClientMachine());
952            if (oldnode.numBlocks() > 0) {
953              BlockInfo ucBlock = cons.getLastBlock();
954              // we do not replace the inode, just replace the last block of oldnode
955              BlockInfo info = namesystem.getBlockManager().addBlockCollection(
956                  ucBlock, oldnode);
957              oldnode.setBlock(oldnode.numBlocks() - 1, info);
958            }
959    
960            if (!inSnapshot) {
961              namesystem.leaseManager.addLease(cons
962                  .getFileUnderConstructionFeature().getClientName(), path);
963            }
964          }
965        }
966    
967        private void loadSecretManagerState(DataInput in)
968            throws IOException {
969          int imgVersion = getLayoutVersion();
970    
971          if (!NameNodeLayoutVersion.supports(
972              LayoutVersion.Feature.DELEGATION_TOKEN, imgVersion)) {
973            //SecretManagerState is not available.
974            //This must not happen if security is turned on.
975            return; 
976          }
977          namesystem.loadSecretManagerStateCompat(in);
978        }
979    
980        private void loadCacheManagerState(DataInput in) throws IOException {
981          int imgVersion = getLayoutVersion();
982          if (!NameNodeLayoutVersion.supports(
983              LayoutVersion.Feature.CACHING, imgVersion)) {
984            return;
985          }
986          namesystem.getCacheManager().loadStateCompat(in);
987        }
988    
989        private int getLayoutVersion() {
990          return namesystem.getFSImage().getStorage().getLayoutVersion();
991        }
992    
993        private boolean isRoot(byte[][] path) {
994          return path.length == 1 &&
995            path[0] == null;    
996        }
997    
998        private boolean isParent(byte[][] path, byte[][] parent) {
999          if (path == null || parent == null)
1000            return false;
1001          if (parent.length == 0 || path.length != parent.length + 1)
1002            return false;
1003          boolean isParent = true;
1004          for (int i = 0; i < parent.length; i++) {
1005            isParent = isParent && Arrays.equals(path[i], parent[i]); 
1006          }
1007          return isParent;
1008        }
1009    
1010        /**
1011         * Return string representing the parent of the given path.
1012         */
1013        String getParent(String path) {
1014          return path.substring(0, path.lastIndexOf(Path.SEPARATOR));
1015        }
1016        
1017        byte[][] getParent(byte[][] path) {
1018          byte[][] result = new byte[path.length - 1][];
1019          for (int i = 0; i < result.length; i++) {
1020            result[i] = new byte[path[i].length];
1021            System.arraycopy(path[i], 0, result[i], 0, path[i].length);
1022          }
1023          return result;
1024        }
1025        
1026        public Snapshot getSnapshot(DataInput in) throws IOException {
1027          return snapshotMap.get(in.readInt());
1028        }
1029      }
1030    
  /**
   * Maps a reserved path component to the name it is renamed to on upgrade.
   * Reset to defaults by {@link #useDefaultRenameReservedPairs()}, extended by
   * {@link #setRenameReservedPairs(String)}, and consulted when reserved
   * components are renamed during upgrade.
   */
  @VisibleForTesting
  public static final TreeMap<String, String> renameReservedMap =
      new TreeMap<String, String>();
1034    
1035      /**
1036       * Use the default key-value pairs that will be used to determine how to
1037       * rename reserved paths on upgrade.
1038       */
1039      @VisibleForTesting
1040      public static void useDefaultRenameReservedPairs() {
1041        renameReservedMap.clear();
1042        for (String key: HdfsConstants.RESERVED_PATH_COMPONENTS) {
1043          renameReservedMap.put(
1044              key,
1045              key + "." + HdfsConstants.NAMENODE_LAYOUT_VERSION + "."
1046                  + "UPGRADE_RENAMED");
1047        }
1048      }
1049    
1050      /**
1051       * Set the key-value pairs that will be used to determine how to rename
1052       * reserved paths on upgrade.
1053       */
1054      @VisibleForTesting
1055      public static void setRenameReservedPairs(String renameReserved) {
1056        // Clear and set the default values
1057        useDefaultRenameReservedPairs();
1058        // Overwrite with provided values
1059        setRenameReservedMapInternal(renameReserved);
1060      }
1061    
1062      private static void setRenameReservedMapInternal(String renameReserved) {
1063        Collection<String> pairs =
1064            StringUtils.getTrimmedStringCollection(renameReserved);
1065        for (String p : pairs) {
1066          String[] pair = StringUtils.split(p, '/', '=');
1067          Preconditions.checkArgument(pair.length == 2,
1068              "Could not parse key-value pair " + p);
1069          String key = pair[0];
1070          String value = pair[1];
1071          Preconditions.checkArgument(DFSUtil.isReservedPathComponent(key),
1072              "Unknown reserved path " + key);
1073          Preconditions.checkArgument(DFSUtil.isValidNameForComponent(value),
1074              "Invalid rename path for " + key + ": " + value);
1075          LOG.info("Will rename reserved path " + key + " to " + value);
1076          renameReservedMap.put(key, value);
1077        }
1078      }
1079    
1080      /**
1081       * When upgrading from an old version, the filesystem could contain paths
1082       * that are now reserved in the new version (e.g. .snapshot). This renames
1083       * these new reserved paths to a user-specified value to avoid collisions
1084       * with the reserved name.
1085       * 
1086       * @param path Old path potentially containing a reserved path
1087       * @return New path with reserved path components renamed to user value
1088       */
1089      static String renameReservedPathsOnUpgrade(String path,
1090          final int layoutVersion) {
1091        final String oldPath = path;
1092        // If any known LVs aren't supported, we're doing an upgrade
1093        if (!NameNodeLayoutVersion.supports(Feature.ADD_INODE_ID, layoutVersion)) {
1094          String[] components = INode.getPathNames(path);
1095          // Only need to worry about the root directory
1096          if (components.length > 1) {
1097            components[1] = DFSUtil.bytes2String(
1098                renameReservedRootComponentOnUpgrade(
1099                    DFSUtil.string2Bytes(components[1]),
1100                    layoutVersion));
1101            path = DFSUtil.strings2PathString(components);
1102          }
1103        }
1104        if (!NameNodeLayoutVersion.supports(Feature.SNAPSHOT, layoutVersion)) {
1105          String[] components = INode.getPathNames(path);
1106          // Special case the root path
1107          if (components.length == 0) {
1108            return path;
1109          }
1110          for (int i=0; i<components.length; i++) {
1111            components[i] = DFSUtil.bytes2String(
1112                renameReservedComponentOnUpgrade(
1113                    DFSUtil.string2Bytes(components[i]),
1114                    layoutVersion));
1115          }
1116          path = DFSUtil.strings2PathString(components);
1117        }
1118    
1119        if (!path.equals(oldPath)) {
1120          LOG.info("Upgrade process renamed reserved path " + oldPath + " to "
1121              + path);
1122        }
1123        return path;
1124      }
1125    
  /**
   * Error message passed to {@code Preconditions.checkArgument} when a
   * reserved name is encountered during upgrade but {@link #renameReservedMap}
   * contains no replacement for it.
   */
  private final static String RESERVED_ERROR_MSG = 
      FSDirectory.DOT_RESERVED_PATH_PREFIX + " is a reserved path and "
      + HdfsConstants.DOT_SNAPSHOT_DIR + " is a reserved path component in"
      + " this version of HDFS. Please rollback and delete or rename"
      + " this path, or upgrade with the "
      + StartupOption.RENAMERESERVED.getName()
      + " [key-value pairs]"
      + " option to automatically rename these paths during upgrade.";
1134    
1135      /**
1136       * Same as {@link #renameReservedPathsOnUpgrade(String)}, but for a single
1137       * byte array path component.
1138       */
1139      private static byte[] renameReservedComponentOnUpgrade(byte[] component,
1140          final int layoutVersion) {
1141        // If the LV doesn't support snapshots, we're doing an upgrade
1142        if (!NameNodeLayoutVersion.supports(Feature.SNAPSHOT, layoutVersion)) {
1143          if (Arrays.equals(component, HdfsConstants.DOT_SNAPSHOT_DIR_BYTES)) {
1144            Preconditions.checkArgument(
1145                renameReservedMap.containsKey(HdfsConstants.DOT_SNAPSHOT_DIR),
1146                RESERVED_ERROR_MSG);
1147            component =
1148                DFSUtil.string2Bytes(renameReservedMap
1149                    .get(HdfsConstants.DOT_SNAPSHOT_DIR));
1150          }
1151        }
1152        return component;
1153      }
1154    
1155      /**
1156       * Same as {@link #renameReservedPathsOnUpgrade(String)}, but for a single
1157       * byte array path component.
1158       */
1159      private static byte[] renameReservedRootComponentOnUpgrade(byte[] component,
1160          final int layoutVersion) {
1161        // If the LV doesn't support inode IDs, we're doing an upgrade
1162        if (!NameNodeLayoutVersion.supports(Feature.ADD_INODE_ID, layoutVersion)) {
1163          if (Arrays.equals(component, FSDirectory.DOT_RESERVED)) {
1164            Preconditions.checkArgument(
1165                renameReservedMap.containsKey(FSDirectory.DOT_RESERVED_STRING),
1166                RESERVED_ERROR_MSG);
1167            final String renameString = renameReservedMap
1168                .get(FSDirectory.DOT_RESERVED_STRING);
1169            component =
1170                DFSUtil.string2Bytes(renameString);
1171            LOG.info("Renamed root path " + FSDirectory.DOT_RESERVED_STRING
1172                + " to " + renameString);
1173          }
1174        }
1175        return component;
1176      }
1177    
1178      /**
1179       * A one-shot class responsible for writing an image file.
1180       * The write() function should be called once, after which the getter
1181       * functions may be used to retrieve information about the file that was written.
1182       *
1183       * This is replaced by the PB-based FSImage. The class is to maintain
1184       * compatibility for the external fsimage tool.
1185       */
1186      @Deprecated
1187      static class Saver {
1188        private static final int LAYOUT_VERSION = -51;
1189        private final SaveNamespaceContext context;
1190        /** Set to true once an image has been written */
1191        private boolean saved = false;
1192    
1193        /** The MD5 checksum of the file that was written */
1194        private MD5Hash savedDigest;
1195        private final ReferenceMap referenceMap = new ReferenceMap();
1196    
1197        private final Map<Long, INodeFile> snapshotUCMap =
1198            new HashMap<Long, INodeFile>();
1199    
1200        /** @throws IllegalStateException if the instance has not yet saved an image */
1201        private void checkSaved() {
1202          if (!saved) {
1203            throw new IllegalStateException("FSImageSaver has not saved an image");
1204          }
1205        }
1206    
1207        /** @throws IllegalStateException if the instance has already saved an image */
1208        private void checkNotSaved() {
1209          if (saved) {
1210            throw new IllegalStateException("FSImageSaver has already saved an image");
1211          }
1212        }
1213    
1214    
1215        Saver(SaveNamespaceContext context) {
1216          this.context = context;
1217        }
1218    
1219        /**
1220         * Return the MD5 checksum of the image file that was saved.
1221         */
1222        MD5Hash getSavedDigest() {
1223          checkSaved();
1224          return savedDigest;
1225        }
1226    
1227        void save(File newFile, FSImageCompression compression) throws IOException {
1228          checkNotSaved();
1229    
1230          final FSNamesystem sourceNamesystem = context.getSourceNamesystem();
1231          final INodeDirectory rootDir = sourceNamesystem.dir.rootDir;
1232          final long numINodes = rootDir.getDirectoryWithQuotaFeature()
1233              .getSpaceConsumed().get(Quota.NAMESPACE);
1234          String sdPath = newFile.getParentFile().getParentFile().getAbsolutePath();
1235          Step step = new Step(StepType.INODES, sdPath);
1236          StartupProgress prog = NameNode.getStartupProgress();
1237          prog.beginStep(Phase.SAVING_CHECKPOINT, step);
1238          prog.setTotal(Phase.SAVING_CHECKPOINT, step, numINodes);
1239          Counter counter = prog.getCounter(Phase.SAVING_CHECKPOINT, step);
1240          long startTime = now();
1241          //
1242          // Write out data
1243          //
1244          MessageDigest digester = MD5Hash.getDigester();
1245          FileOutputStream fout = new FileOutputStream(newFile);
1246          DigestOutputStream fos = new DigestOutputStream(fout, digester);
1247          DataOutputStream out = new DataOutputStream(fos);
1248          try {
1249            out.writeInt(LAYOUT_VERSION);
1250            LayoutFlags.write(out);
1251            // We use the non-locked version of getNamespaceInfo here since
1252            // the coordinating thread of saveNamespace already has read-locked
1253            // the namespace for us. If we attempt to take another readlock
1254            // from the actual saver thread, there's a potential of a
1255            // fairness-related deadlock. See the comments on HDFS-2223.
1256            out.writeInt(sourceNamesystem.unprotectedGetNamespaceInfo()
1257                .getNamespaceID());
1258            out.writeLong(numINodes);
1259            out.writeLong(sourceNamesystem.getGenerationStampV1());
1260            out.writeLong(sourceNamesystem.getGenerationStampV2());
1261            out.writeLong(sourceNamesystem.getGenerationStampAtblockIdSwitch());
1262            out.writeLong(sourceNamesystem.getLastAllocatedBlockId());
1263            out.writeLong(context.getTxId());
1264            out.writeLong(sourceNamesystem.getLastInodeId());
1265    
1266    
1267            sourceNamesystem.getSnapshotManager().write(out);
1268    
1269            // write compression info and set up compressed stream
1270            out = compression.writeHeaderAndWrapStream(fos);
1271            LOG.info("Saving image file " + newFile +
1272                     " using " + compression);
1273    
1274            // save the root
1275            saveINode2Image(rootDir, out, false, referenceMap, counter);
1276            // save the rest of the nodes
1277            saveImage(rootDir, out, true, false, counter);
1278            prog.endStep(Phase.SAVING_CHECKPOINT, step);
1279            // Now that the step is finished, set counter equal to total to adjust
1280            // for possible under-counting due to reference inodes.
1281            prog.setCount(Phase.SAVING_CHECKPOINT, step, numINodes);
1282            // save files under construction
1283            // TODO: for HDFS-5428, since we cannot break the compatibility of
1284            // fsimage, we store part of the under-construction files that are only
1285            // in snapshots in this "under-construction-file" section. As a
1286            // temporary solution, we use "/.reserved/.inodes/<inodeid>" as their
1287            // paths, so that when loading fsimage we do not put them into the lease
1288            // map. In the future, we can remove this hack when we can bump the
1289            // layout version.
1290            sourceNamesystem.saveFilesUnderConstruction(out, snapshotUCMap);
1291    
1292            context.checkCancelled();
1293            sourceNamesystem.saveSecretManagerStateCompat(out, sdPath);
1294            context.checkCancelled();
1295            sourceNamesystem.getCacheManager().saveStateCompat(out, sdPath);
1296            context.checkCancelled();
1297            out.flush();
1298            context.checkCancelled();
1299            fout.getChannel().force(true);
1300          } finally {
1301            out.close();
1302          }
1303    
1304          saved = true;
1305          // set md5 of the saved image
1306          savedDigest = new MD5Hash(digester.digest());
1307    
1308          LOG.info("Image file " + newFile + " of size " + newFile.length() +
1309              " bytes saved in " + (now() - startTime)/1000 + " seconds.");
1310        }
1311    
1312        /**
1313         * Save children INodes.
1314         * @param children The list of children INodes
1315         * @param out The DataOutputStream to write
1316         * @param inSnapshot Whether the parent directory or its ancestor is in
1317         *                   the deleted list of some snapshot (caused by rename or
1318         *                   deletion)
1319         * @param counter Counter to increment for namenode startup progress
1320         * @return Number of children that are directory
1321         */
1322        private int saveChildren(ReadOnlyList<INode> children,
1323            DataOutputStream out, boolean inSnapshot, Counter counter)
1324            throws IOException {
1325          // Write normal children INode.
1326          out.writeInt(children.size());
1327          int dirNum = 0;
1328          int i = 0;
1329          for(INode child : children) {
1330            // print all children first
1331            // TODO: for HDFS-5428, we cannot change the format/content of fsimage
1332            // here, thus even if the parent directory is in snapshot, we still
1333            // do not handle INodeUC as those stored in deleted list
1334            saveINode2Image(child, out, false, referenceMap, counter);
1335            if (child.isDirectory()) {
1336              dirNum++;
1337            } else if (inSnapshot && child.isFile()
1338                && child.asFile().isUnderConstruction()) {
1339              this.snapshotUCMap.put(child.getId(), child.asFile());
1340            }
1341            if (i++ % 50 == 0) {
1342              context.checkCancelled();
1343            }
1344          }
1345          return dirNum;
1346        }
1347    
1348        /**
1349         * Save file tree image starting from the given root.
1350         * This is a recursive procedure, which first saves all children and
1351         * snapshot diffs of a current directory and then moves inside the
1352         * sub-directories.
1353         *
1354         * @param current The current node
1355         * @param out The DataoutputStream to write the image
1356         * @param toSaveSubtree Whether or not to save the subtree to fsimage. For
1357         *                      reference node, its subtree may already have been
1358         *                      saved before.
1359         * @param inSnapshot Whether the current directory is in snapshot
1360         * @param counter Counter to increment for namenode startup progress
1361         */
1362        private void saveImage(INodeDirectory current, DataOutputStream out,
1363            boolean toSaveSubtree, boolean inSnapshot, Counter counter)
1364            throws IOException {
1365          // write the inode id of the directory
1366          out.writeLong(current.getId());
1367    
1368          if (!toSaveSubtree) {
1369            return;
1370          }
1371    
1372          final ReadOnlyList<INode> children = current
1373              .getChildrenList(Snapshot.CURRENT_STATE_ID);
1374          int dirNum = 0;
1375          List<INodeDirectory> snapshotDirs = null;
1376          DirectoryWithSnapshotFeature sf = current.getDirectoryWithSnapshotFeature();
1377          if (sf != null) {
1378            snapshotDirs = new ArrayList<INodeDirectory>();
1379            sf.getSnapshotDirectory(snapshotDirs);
1380            dirNum += snapshotDirs.size();
1381          }
1382    
1383          // 2. Write INodeDirectorySnapshottable#snapshotsByNames to record all
1384          // Snapshots
1385          if (current.isDirectory() && current.asDirectory().isSnapshottable()) {
1386            SnapshotFSImageFormat.saveSnapshots(current.asDirectory(), out);
1387          } else {
1388            out.writeInt(-1); // # of snapshots
1389          }
1390    
1391          // 3. Write children INode
1392          dirNum += saveChildren(children, out, inSnapshot, counter);
1393    
1394          // 4. Write DirectoryDiff lists, if there is any.
1395          SnapshotFSImageFormat.saveDirectoryDiffList(current, out, referenceMap);
1396    
1397          // Write sub-tree of sub-directories, including possible snapshots of
1398          // deleted sub-directories
1399          out.writeInt(dirNum); // the number of sub-directories
1400          for(INode child : children) {
1401            if(!child.isDirectory()) {
1402              continue;
1403            }
1404            // make sure we only save the subtree under a reference node once
1405            boolean toSave = child.isReference() ?
1406                referenceMap.toProcessSubtree(child.getId()) : true;
1407            saveImage(child.asDirectory(), out, toSave, inSnapshot, counter);
1408          }
1409          if (snapshotDirs != null) {
1410            for (INodeDirectory subDir : snapshotDirs) {
1411              // make sure we only save the subtree under a reference node once
1412              boolean toSave = subDir.getParentReference() != null ?
1413                  referenceMap.toProcessSubtree(subDir.getId()) : true;
1414              saveImage(subDir, out, toSave, true, counter);
1415            }
1416          }
1417        }
1418    
1419        /**
1420         * Saves inode and increments progress counter.
1421         *
1422         * @param inode INode to save
1423         * @param out DataOutputStream to receive inode
1424         * @param writeUnderConstruction boolean true if this is under construction
1425         * @param referenceMap ReferenceMap containing reference inodes
1426         * @param counter Counter to increment for namenode startup progress
1427         * @throws IOException thrown if there is an I/O error
1428         */
1429        private void saveINode2Image(INode inode, DataOutputStream out,
1430            boolean writeUnderConstruction, ReferenceMap referenceMap,
1431            Counter counter) throws IOException {
1432          FSImageSerialization.saveINode2Image(inode, out, writeUnderConstruction,
1433            referenceMap);
1434          // Intentionally do not increment counter for reference inodes, because it
1435          // is too difficult at this point to assess whether or not this is a
1436          // reference that counts toward quota.
1437          if (!(inode instanceof INodeReference)) {
1438            counter.increment();
1439          }
1440        }
1441      }
1442    }