001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.server.namenode;
019    
020    import com.google.common.annotations.VisibleForTesting;
021    import com.google.common.base.Joiner;
022    import com.google.common.base.Preconditions;
023    import com.google.common.collect.Lists;
024    
025    import org.apache.commons.logging.Log;
026    import org.apache.commons.logging.LogFactory;
027    import org.apache.hadoop.HadoopIllegalArgumentException;
028    import org.apache.hadoop.classification.InterfaceAudience;
029    import org.apache.hadoop.conf.Configuration;
030    import org.apache.hadoop.fs.FileSystem;
031    import org.apache.hadoop.fs.Trash;
032    import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
033    import org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo;
034    import org.apache.hadoop.ha.HAServiceStatus;
035    import org.apache.hadoop.ha.HealthCheckFailedException;
036    import org.apache.hadoop.ha.ServiceFailedException;
037    import org.apache.hadoop.hdfs.DFSConfigKeys;
038    import org.apache.hadoop.hdfs.DFSUtil;
039    import org.apache.hadoop.hdfs.HAUtil;
040    import org.apache.hadoop.hdfs.HdfsConfiguration;
041    import org.apache.hadoop.hdfs.protocol.ClientProtocol;
042    import org.apache.hadoop.hdfs.protocol.HdfsConstants;
043    import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
044    import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.RollingUpgradeStartupOption;
045    import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
046    import org.apache.hadoop.hdfs.server.namenode.ha.*;
047    import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
048    import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
049    import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgressMetrics;
050    import org.apache.hadoop.hdfs.server.protocol.*;
051    import org.apache.hadoop.ipc.Server;
052    import org.apache.hadoop.ipc.StandbyException;
053    import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
054    import org.apache.hadoop.metrics2.util.MBeans;
055    import org.apache.hadoop.net.NetUtils;
056    import org.apache.hadoop.security.AccessControlException;
057    import org.apache.hadoop.security.RefreshUserMappingsProtocol;
058    import org.apache.hadoop.security.SecurityUtil;
059    import org.apache.hadoop.security.UserGroupInformation;
060    import org.apache.hadoop.security.authorize.RefreshAuthorizationPolicyProtocol;
061    import org.apache.hadoop.ipc.RefreshCallQueueProtocol;
062    import org.apache.hadoop.tools.GetUserMappingsProtocol;
063    import org.apache.hadoop.tracing.SpanReceiverHost;
064    import org.apache.hadoop.tracing.TraceAdminProtocol;
065    import org.apache.hadoop.util.ExitUtil.ExitException;
066    import org.apache.hadoop.util.JvmPauseMonitor;
067    import org.apache.hadoop.util.ServicePlugin;
068    import org.apache.hadoop.util.StringUtils;
069    
070    import javax.management.ObjectName;
071    
072    import java.io.IOException;
073    import java.io.PrintStream;
074    import java.net.InetSocketAddress;
075    import java.net.URI;
076    import java.security.PrivilegedExceptionAction;
077    import java.util.ArrayList;
078    import java.util.Arrays;
079    import java.util.Collection;
080    import java.util.List;
081    
082    import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY;
083    import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_DEFAULT;
084    import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_KEY;
085    import static org.apache.hadoop.hdfs.DFSConfigKeys.*;
086    import static org.apache.hadoop.util.ExitUtil.terminate;
087    import static org.apache.hadoop.util.ToolRunner.confirmPrompt;
088    
089    /**********************************************************
090     * NameNode serves as both directory namespace manager and
091     * "inode table" for the Hadoop DFS.  There is a single NameNode
092     * running in any DFS deployment.  (Well, except when there
093     * is a second backup/failover NameNode, or when using federated NameNodes.)
094     *
095     * The NameNode controls two critical tables:
096     *   1)  filename->blocksequence (namespace)
097     *   2)  block->machinelist ("inodes")
098     *
099     * The first table is stored on disk and is very precious.
100     * The second table is rebuilt every time the NameNode comes up.
101     *
102     * 'NameNode' refers to both this class as well as the 'NameNode server'.
103     * The 'FSNamesystem' class actually performs most of the filesystem
104     * management.  The majority of the 'NameNode' class itself is concerned
105     * with exposing the IPC interface and the HTTP server to the outside world,
106     * plus some configuration management.
107     *
108     * NameNode implements the
109     * {@link org.apache.hadoop.hdfs.protocol.ClientProtocol} interface, which
110     * allows clients to ask for DFS services.
111     * {@link org.apache.hadoop.hdfs.protocol.ClientProtocol} is not designed for
112     * direct use by authors of DFS client code.  End-users should instead use the
113     * {@link org.apache.hadoop.fs.FileSystem} class.
114     *
115     * NameNode also implements the
116     * {@link org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol} interface,
117     * used by DataNodes that actually store DFS data blocks.  These
118     * methods are invoked repeatedly and automatically by all the
119     * DataNodes in a DFS deployment.
120     *
121     * NameNode also implements the
122     * {@link org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol} interface,
123     * used by secondary namenodes or rebalancing processes to get partial
124     * NameNode state, for example partial blocksMap etc.
125     **********************************************************/
126    @InterfaceAudience.Private
127    public class NameNode implements NameNodeStatusMXBean {
  // Static initializer: runs once at class-load time, before any NameNode is
  // constructed, so that HDFS configuration resources are registered first.
  static{
    HdfsConfiguration.init();
  }
131    
  /**
   * Categories of operations supported by the namenode.
   * NOTE(review): consumers elsewhere appear to gate operations on these
   * categories depending on HA state — confirm against HAState callers.
   */
  public static enum OperationCategory {
    /** Operations that are state agnostic */
    UNCHECKED,
    /** Read operation that does not change the namespace state */
    READ,
    /** Write operation that changes the namespace state */
    WRITE,
    /** Operations related to checkpointing */
    CHECKPOINT,
    /** Operations related to {@link JournalProtocol} */
    JOURNAL
  }
147      
148      /**
149       * HDFS configuration can have three types of parameters:
150       * <ol>
151       * <li>Parameters that are common for all the name services in the cluster.</li>
152       * <li>Parameters that are specific to a name service. These keys are suffixed
153       * with nameserviceId in the configuration. For example,
154       * "dfs.namenode.rpc-address.nameservice1".</li>
155       * <li>Parameters that are specific to a single name node. These keys are suffixed
156       * with nameserviceId and namenodeId in the configuration. for example,
157       * "dfs.namenode.rpc-address.nameservice1.namenode1"</li>
158       * </ol>
159       * 
160       * In the latter cases, operators may specify the configuration without
161       * any suffix, with a nameservice suffix, or with a nameservice and namenode
162       * suffix. The more specific suffix will take precedence.
163       * 
164       * These keys are specific to a given namenode, and thus may be configured
165       * globally, for a nameservice, or for a specific namenode within a nameservice.
166       */
167      public static final String[] NAMENODE_SPECIFIC_KEYS = {
168        DFS_NAMENODE_RPC_ADDRESS_KEY,
169        DFS_NAMENODE_RPC_BIND_HOST_KEY,
170        DFS_NAMENODE_NAME_DIR_KEY,
171        DFS_NAMENODE_EDITS_DIR_KEY,
172        DFS_NAMENODE_SHARED_EDITS_DIR_KEY,
173        DFS_NAMENODE_CHECKPOINT_DIR_KEY,
174        DFS_NAMENODE_CHECKPOINT_EDITS_DIR_KEY,
175        DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY,
176        DFS_NAMENODE_SERVICE_RPC_BIND_HOST_KEY,
177        DFS_NAMENODE_HTTP_ADDRESS_KEY,
178        DFS_NAMENODE_HTTPS_ADDRESS_KEY,
179        DFS_NAMENODE_HTTP_BIND_HOST_KEY,
180        DFS_NAMENODE_HTTPS_BIND_HOST_KEY,
181        DFS_NAMENODE_KEYTAB_FILE_KEY,
182        DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY,
183        DFS_NAMENODE_SECONDARY_HTTPS_ADDRESS_KEY,
184        DFS_SECONDARY_NAMENODE_KEYTAB_FILE_KEY,
185        DFS_NAMENODE_BACKUP_ADDRESS_KEY,
186        DFS_NAMENODE_BACKUP_HTTP_ADDRESS_KEY,
187        DFS_NAMENODE_BACKUP_SERVICE_RPC_ADDRESS_KEY,
188        DFS_NAMENODE_KERBEROS_PRINCIPAL_KEY,
189        DFS_NAMENODE_KERBEROS_INTERNAL_SPNEGO_PRINCIPAL_KEY,
190        DFS_HA_FENCE_METHODS_KEY,
191        DFS_HA_ZKFC_PORT_KEY,
192        DFS_HA_FENCE_METHODS_KEY
193      };
194      
  /**
   * @see #NAMENODE_SPECIFIC_KEYS
   * These keys are specific to a nameservice, but may not be overridden
   * for a specific namenode. For example, automatic-failover enablement is
   * a property of the whole nameservice, not of one namenode within it.
   */
  public static final String[] NAMESERVICE_SPECIFIC_KEYS = {
    DFS_HA_AUTO_FAILOVER_ENABLED_KEY
  };
203      
  /**
   * Command-line usage message for the NameNode entry point, enumerating the
   * supported {@link StartupOption} values and their sub-arguments.
   */
  private static final String USAGE = "Usage: java NameNode ["
      + StartupOption.BACKUP.getName() + "] | \n\t["
      + StartupOption.CHECKPOINT.getName() + "] | \n\t["
      + StartupOption.FORMAT.getName() + " ["
      + StartupOption.CLUSTERID.getName() + " cid ] ["
      + StartupOption.FORCE.getName() + "] ["
      + StartupOption.NONINTERACTIVE.getName() + "] ] | \n\t["
      + StartupOption.UPGRADE.getName() + 
        " [" + StartupOption.CLUSTERID.getName() + " cid]" +
        " [" + StartupOption.RENAMERESERVED.getName() + "<k-v pairs>] ] | \n\t["
      + StartupOption.UPGRADEONLY.getName() + 
        " [" + StartupOption.CLUSTERID.getName() + " cid]" +
        " [" + StartupOption.RENAMERESERVED.getName() + "<k-v pairs>] ] | \n\t["
      + StartupOption.ROLLBACK.getName() + "] | \n\t["
      + StartupOption.ROLLINGUPGRADE.getName() + " "
      + RollingUpgradeStartupOption.getAllOptionString() + " ] | \n\t["
      + StartupOption.FINALIZE.getName() + "] | \n\t["
      + StartupOption.IMPORT.getName() + "] | \n\t["
      + StartupOption.INITIALIZESHAREDEDITS.getName() + "] | \n\t["
      + StartupOption.BOOTSTRAPSTANDBY.getName() + "] | \n\t["
      + StartupOption.RECOVER.getName() + " [ "
      + StartupOption.FORCE.getName() + "] ] | \n\t["
      + StartupOption.METADATAVERSION.getName() + " ] "
      + " ]";
228    
229      
230      public long getProtocolVersion(String protocol, 
231                                     long clientVersion) throws IOException {
232        if (protocol.equals(ClientProtocol.class.getName())) {
233          return ClientProtocol.versionID; 
234        } else if (protocol.equals(DatanodeProtocol.class.getName())){
235          return DatanodeProtocol.versionID;
236        } else if (protocol.equals(NamenodeProtocol.class.getName())){
237          return NamenodeProtocol.versionID;
238        } else if (protocol.equals(RefreshAuthorizationPolicyProtocol.class.getName())){
239          return RefreshAuthorizationPolicyProtocol.versionID;
240        } else if (protocol.equals(RefreshUserMappingsProtocol.class.getName())){
241          return RefreshUserMappingsProtocol.versionID;
242        } else if (protocol.equals(RefreshCallQueueProtocol.class.getName())) {
243          return RefreshCallQueueProtocol.versionID;
244        } else if (protocol.equals(GetUserMappingsProtocol.class.getName())){
245          return GetUserMappingsProtocol.versionID;
246        } else if (protocol.equals(TraceAdminProtocol.class.getName())){
247          return TraceAdminProtocol.versionID;
248        } else {
249          throw new IOException("Unknown protocol to name node: " + protocol);
250        }
251      }
252        
  /** Default NameNode RPC port, used when an address omits the port. */
  public static final int DEFAULT_PORT = 8020;
  public static final Log LOG = LogFactory.getLog(NameNode.class.getName());
  /** Dedicated logger for namespace state-change events. */
  public static final Log stateChangeLog = LogFactory.getLog("org.apache.hadoop.hdfs.StateChange");
  /** Dedicated logger for block state-change events. */
  public static final Log blockStateChangeLog = LogFactory.getLog("BlockStateChange");
  /** Shared, stateless HA state instances used as the namenode's HA state. */
  public static final HAState ACTIVE_STATE = new ActiveState();
  public static final HAState STANDBY_STATE = new StandbyState();
259      
  /** The filesystem state; loaded from disk in {@link #loadNamesystem}. */
  protected FSNamesystem namesystem; 
  protected final Configuration conf;
  /** Role this node plays (e.g. NAMENODE, or a backup/checkpoint role). */
  protected final NamenodeRole role;
  /** Current HA state; volatile since it is read outside state transitions. */
  private volatile HAState state;
  private final boolean haEnabled;
  private final HAContext haContext;
  protected final boolean allowStaleStandbyReads;

  
  /** httpServer */
  protected NameNodeHttpServer httpServer;
  /** Daemon thread running the trash emptier; see startTrashEmptier(). */
  private Thread emptier;
  /** only used for testing purposes  */
  protected boolean stopRequested = false;
  /** Registration information of this name-node  */
  protected NamenodeRegistration nodeRegistration;
  /** Activated plug-ins. */
  private List<ServicePlugin> plugins;
  
  /** RPC server; created in initialize(), started in startCommonServices(). */
  private NameNodeRpcServer rpcServer;

  /** Monitors JVM pauses (e.g. GC) and reports them via metrics. */
  private JvmPauseMonitor pauseMonitor;
  // NOTE(review): presumably holds the registered NameNodeStatus MXBean name
  // for later unregistration — registration happens outside this view.
  private ObjectName nameNodeStatusBeanName;
  SpanReceiverHost spanReceiverHost;
  /**
   * The namenode address that clients will use to access this namenode
   * or the name service. For HA configurations using logical URI, it
   * will be the logical address.
   */
  private String clientNamenodeAddress;
290      
  /** Format a new filesystem.  Destroys any filesystem that may already
   * exist at this location.  **/
  public static void format(Configuration conf) throws IOException {
    // Delegates to the multi-arg overload (not visible in this chunk);
    // the boolean flags presumably mean force/interactive — confirm there.
    format(conf, true, true);
  }
296    
  /** Process-wide metrics; created in {@link #initMetrics}. */
  static NameNodeMetrics metrics;
  /** Startup progress, registered with metrics and shown by the HTTP server. */
  private static final StartupProgress startupProgress = new StartupProgress();
  /** Return the {@link FSNamesystem} object.
   * May be null before {@link #loadNamesystem} has run.
   * @return {@link FSNamesystem} object.
   */
  public FSNamesystem getNamesystem() {
    return namesystem;
  }
305    
  /** @return the RPC server, exposed through its protocol-facade interface. */
  public NamenodeProtocols getRpcServer() {
    return rpcServer;
  }
309      
  /** Create the process-wide {@link NameNodeMetrics} for the given role. */
  static void initMetrics(Configuration conf, NamenodeRole role) {
    metrics = NameNodeMetrics.create(conf, role);
  }
313    
  /** @return the metrics singleton, or null before {@link #initMetrics}. */
  public static NameNodeMetrics getNameNodeMetrics() {
    return metrics;
  }
317    
  /**
   * Returns object used for reporting namenode startup progress.
   * The instance is static, shared by all phases of startup.
   * 
   * @return StartupProgress for reporting namenode startup progress
   */
  public static StartupProgress getStartupProgress() {
    return startupProgress;
  }
326    
  /**
   * Return the service name of the issued delegation token.
   *
   * Simply delegates to {@link #getClientNamenodeAddress()}.
   * @return The name service id in HA-mode, or the rpc address in non-HA mode
   */
  public String getTokenServiceName() {
    return getClientNamenodeAddress();
  }
335    
336      /**
337       * Set the namenode address that will be used by clients to access this
338       * namenode or name service. This needs to be called before the config
339       * is overriden.
340       */
341      public void setClientNamenodeAddress(Configuration conf) {
342        String nnAddr = conf.get(FS_DEFAULT_NAME_KEY);
343        if (nnAddr == null) {
344          // default fs is not set.
345          clientNamenodeAddress = null;
346          return;
347        }
348    
349        LOG.info(FS_DEFAULT_NAME_KEY + " is " + nnAddr);
350        URI nnUri = URI.create(nnAddr);
351    
352        String nnHost = nnUri.getHost();
353        if (nnHost == null) {
354          clientNamenodeAddress = null;
355          return;
356        }
357    
358        if (DFSUtil.getNameServiceIds(conf).contains(nnHost)) {
359          // host name is logical
360          clientNamenodeAddress = nnHost;
361        } else if (nnUri.getPort() > 0) {
362          // physical address with a valid port
363          clientNamenodeAddress = nnUri.getAuthority();
364        } else {
365          // the port is missing or 0. Figure out real bind address later.
366          clientNamenodeAddress = null;
367          return;
368        }
369        LOG.info("Clients are to use " + clientNamenodeAddress + " to access"
370            + " this namenode/service.");
371      }
372    
  /**
   * Get the namenode address to be used by clients.
   * May be null until initialize() fills it from the RPC bind address.
   * @return nn address
   */
  public String getClientNamenodeAddress() {
    return clientNamenodeAddress;
  }
380    
  /** Parse a host[:port] string, defaulting the port to {@link #DEFAULT_PORT}. */
  public static InetSocketAddress getAddress(String address) {
    return NetUtils.createSocketAddr(address, DEFAULT_PORT);
  }
384      
  /**
   * Set the configuration property for the service rpc address
   * to address
   * @param conf configuration to modify in place
   * @param address host:port string for the service RPC endpoint
   */
  public static void setServiceAddress(Configuration conf,
                                           String address) {
    LOG.info("Setting ADDRESS " + address);
    conf.set(DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY, address);
  }
394      
395      /**
396       * Fetches the address for services to use when connecting to namenode
397       * based on the value of fallback returns null if the special
398       * address is not specified or returns the default namenode address
399       * to be used by both clients and services.
400       * Services here are datanodes, backup node, any non client connection
401       */
402      public static InetSocketAddress getServiceAddress(Configuration conf,
403                                                            boolean fallback) {
404        String addr = conf.get(DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY);
405        if (addr == null || addr.isEmpty()) {
406          return fallback ? getAddress(conf) : null;
407        }
408        return getAddress(addr);
409      }
410    
  /** Derive the namenode address from the default filesystem URI in conf. */
  public static InetSocketAddress getAddress(Configuration conf) {
    URI filesystemURI = FileSystem.getDefaultUri(conf);
    return getAddress(filesystemURI);
  }
415    
416    
417      /**
418       * @return address of file system
419       */
420      public static InetSocketAddress getAddress(URI filesystemURI) {
421        String authority = filesystemURI.getAuthority();
422        if (authority == null) {
423          throw new IllegalArgumentException(String.format(
424              "Invalid URI for NameNode address (check %s): %s has no authority.",
425              FileSystem.FS_DEFAULT_NAME_KEY, filesystemURI.toString()));
426        }
427        if (!HdfsConstants.HDFS_URI_SCHEME.equalsIgnoreCase(
428            filesystemURI.getScheme())) {
429          throw new IllegalArgumentException(String.format(
430              "Invalid URI for NameNode address (check %s): %s is not of scheme '%s'.",
431              FileSystem.FS_DEFAULT_NAME_KEY, filesystemURI.toString(),
432              HdfsConstants.HDFS_URI_SCHEME));
433        }
434        return getAddress(authority);
435      }
436    
437      public static URI getUri(InetSocketAddress namenode) {
438        int port = namenode.getPort();
439        String portString = port == DEFAULT_PORT ? "" : (":"+port);
440        return URI.create(HdfsConstants.HDFS_URI_SCHEME + "://" 
441            + namenode.getHostName()+portString);
442      }
443    
444      //
445      // Common NameNode methods implementation for the active name-node role.
446      //
  /** @return the role this namenode was constructed with. */
  public NamenodeRole getRole() {
    return role;
  }
450    
  /** @return true iff this node's role equals {@code that}. */
  boolean isRole(NamenodeRole that) {
    return role.equals(that);
  }
454    
  /**
   * Given a configuration get the address of the service rpc server
   * If the service rpc is not configured returns null
   * (no fallback to the client RPC address).
   */
  protected InetSocketAddress getServiceRpcServerAddress(Configuration conf) {
    return NameNode.getServiceAddress(conf, false);
  }
462    
  /** @return the client RPC address derived from the default FS URI. */
  protected InetSocketAddress getRpcServerAddress(Configuration conf) {
    return getAddress(conf);
  }
466      
467      /** Given a configuration get the bind host of the service rpc server
468       *  If the bind host is not configured returns null.
469       */
470      protected String getServiceRpcServerBindHost(Configuration conf) {
471        String addr = conf.getTrimmed(DFS_NAMENODE_SERVICE_RPC_BIND_HOST_KEY);
472        if (addr == null || addr.isEmpty()) {
473          return null;
474        }
475        return addr;
476      }
477    
478      /** Given a configuration get the bind host of the client rpc server
479       *  If the bind host is not configured returns null.
480       */
481      protected String getRpcServerBindHost(Configuration conf) {
482        String addr = conf.getTrimmed(DFS_NAMENODE_RPC_BIND_HOST_KEY);
483        if (addr == null || addr.isEmpty()) {
484          return null;
485        }
486        return addr;
487      }
488       
  /**
   * Modifies the configuration passed to contain the service rpc address setting
   * (formats the socket address as host:port before storing it).
   */
  protected void setRpcServiceServerAddress(Configuration conf,
      InetSocketAddress serviceRPCAddress) {
    setServiceAddress(conf, NetUtils.getHostPortString(serviceRPCAddress));
  }
496    
  /** Record the actual RPC address back into conf as the default FS URI. */
  protected void setRpcServerAddress(Configuration conf,
      InetSocketAddress rpcAddress) {
    FileSystem.setDefaultUri(conf, getUri(rpcAddress));
  }
501    
  /** @return the configured HTTP address; overridable by subclasses. */
  protected InetSocketAddress getHttpServerAddress(Configuration conf) {
    return getHttpAddress(conf);
  }
505    
506      /**
507       * HTTP server address for binding the endpoint. This method is
508       * for use by the NameNode and its derivatives. It may return
509       * a different address than the one that should be used by clients to
510       * connect to the NameNode. See
511       * {@link DFSConfigKeys#DFS_NAMENODE_HTTP_BIND_HOST_KEY}
512       *
513       * @param conf
514       * @return
515       */
516      protected InetSocketAddress getHttpServerBindAddress(Configuration conf) {
517        InetSocketAddress bindAddress = getHttpServerAddress(conf);
518    
519        // If DFS_NAMENODE_HTTP_BIND_HOST_KEY exists then it overrides the
520        // host name portion of DFS_NAMENODE_HTTP_ADDRESS_KEY.
521        final String bindHost = conf.getTrimmed(DFS_NAMENODE_HTTP_BIND_HOST_KEY);
522        if (bindHost != null && !bindHost.isEmpty()) {
523          bindAddress = new InetSocketAddress(bindHost, bindAddress.getPort());
524        }
525    
526        return bindAddress;
527      }
528    
  /** @return the NameNode HTTP address, falling back to the default value. */
  public static InetSocketAddress getHttpAddress(Configuration conf) {
    return  NetUtils.createSocketAddr(
        conf.get(DFS_NAMENODE_HTTP_ADDRESS_KEY, DFS_NAMENODE_HTTP_ADDRESS_DEFAULT));
  }
534    
  /** Load the namesystem from persistent storage; overridable by subclasses. */
  protected void loadNamesystem(Configuration conf) throws IOException {
    this.namesystem = FSNamesystem.loadFromDisk(conf);
  }
538    
  /** @return this node's registration info; null until setRegistration(). */
  NamenodeRegistration getRegistration() {
    return nodeRegistration;
  }
542    
  /**
   * Build and cache this node's registration record from the live RPC/HTTP
   * addresses, the storage info, and the node's role.
   * @return the freshly built registration
   */
  NamenodeRegistration setRegistration() {
    nodeRegistration = new NamenodeRegistration(
        NetUtils.getHostPortString(rpcServer.getRpcAddress()),
        NetUtils.getHostPortString(getHttpAddress()),
        getFSImage().getStorage(), getRole());
    return nodeRegistration;
  }
550    
551      /* optimize ugi lookup for RPC operations to avoid a trip through
552       * UGI.getCurrentUser which is synch'ed
553       */
554      public static UserGroupInformation getRemoteUser() throws IOException {
555        UserGroupInformation ugi = Server.getRemoteUser();
556        return (ugi != null) ? ugi : UserGroupInformation.getCurrentUser();
557      }
558    
559    
  /**
   * Login as the configured user for the NameNode, using the keytab and
   * principal keys; the RPC server host name is substituted into the
   * principal pattern.
   * @throws IOException if the login fails
   */
  void loginAsNameNodeUser(Configuration conf) throws IOException {
    InetSocketAddress socAddr = getRpcServerAddress(conf);
    SecurityUtil.login(conf, DFS_NAMENODE_KEYTAB_FILE_KEY,
        DFS_NAMENODE_KERBEROS_PRINCIPAL_KEY, socAddr.getHostName());
  }
568      
  /**
   * Initialize name-node.
   * The sequence is order-sensitive: security login, metrics, HTTP server
   * (active/standby roles only), namesystem load, RPC server, then the
   * common services.
   * 
   * @param conf the configuration
   * @throws IOException on any failure during startup
   */
  protected void initialize(Configuration conf) throws IOException {
    // Mirror the DFS percentile-metrics intervals into the UGI metrics key,
    // unless the operator set the latter explicitly.
    if (conf.get(HADOOP_USER_GROUP_METRICS_PERCENTILES_INTERVALS) == null) {
      String intervals = conf.get(DFS_METRICS_PERCENTILES_INTERVALS_KEY);
      if (intervals != null) {
        conf.set(HADOOP_USER_GROUP_METRICS_PERCENTILES_INTERVALS,
          intervals);
      }
    }

    UserGroupInformation.setConfiguration(conf);
    loginAsNameNodeUser(conf);

    NameNode.initMetrics(conf, this.getRole());
    StartupProgressMetrics.register(startupProgress);

    // Regular namenodes start HTTP before loading the namesystem; other
    // roles start it later, in startCommonServices().
    if (NamenodeRole.NAMENODE == role) {
      startHttpServer(conf);
    }

    this.spanReceiverHost = SpanReceiverHost.getInstance(conf);

    loadNamesystem(conf);

    rpcServer = createRpcServer(conf);
    if (clientNamenodeAddress == null) {
      // This is expected for MiniDFSCluster. Set it now using 
      // the RPC server's bind address.
      clientNamenodeAddress = 
          NetUtils.getHostPortString(rpcServer.getRpcAddress());
      LOG.info("Clients are to use " + clientNamenodeAddress + " to access"
          + " this namenode/service.");
    }
    // Wire the already-started HTTP server to the now-known RPC address.
    if (NamenodeRole.NAMENODE == role) {
      httpServer.setNameNodeAddress(getNameNodeAddress());
      httpServer.setFSImage(getFSImage());
    }
    
    pauseMonitor = new JvmPauseMonitor(conf);
    pauseMonitor.start();
    metrics.getJvmMetrics().setPauseMonitor(pauseMonitor);
    
    startCommonServices(conf);
  }
617      
  /**
   * Create the RPC server implementation. Used as an extension point for the
   * BackupNode.
   * @throws IOException if the server cannot be created
   */
  protected NameNodeRpcServer createRpcServer(Configuration conf)
      throws IOException {
    return new NameNodeRpcServer(conf, this);
  }
626    
  /** Start the services common to active and standby states.
   * Order matters: namesystem services come up before the RPC server starts
   * accepting calls, and plugins start last. */
  private void startCommonServices(Configuration conf) throws IOException {
    namesystem.startCommonServices(conf, haContext);
    registerNNSMXBean();
    // Non-NAMENODE roles start their HTTP server here (the regular role
    // already did so in initialize()).
    if (NamenodeRole.NAMENODE != role) {
      startHttpServer(conf);
      httpServer.setNameNodeAddress(getNameNodeAddress());
      httpServer.setFSImage(getFSImage());
    }
    rpcServer.start();
    plugins = conf.getInstances(DFS_NAMENODE_PLUGINS_KEY,
        ServicePlugin.class);
    for (ServicePlugin p: plugins) {
      try {
        // A failing plugin must not abort namenode startup; log and continue.
        p.start(this);
      } catch (Throwable t) {
        LOG.warn("ServicePlugin " + p + " could not be started", t);
      }
    }
    LOG.info(getRole() + " RPC up at: " + rpcServer.getRpcAddress());
    if (rpcServer.getServiceRpcAddress() != null) {
      LOG.info(getRole() + " service RPC up at: "
          + rpcServer.getServiceRpcAddress());
    }
  }
652      
  /** Stop the common services, tolerating nulls so this is safe to call
   * from any stage of a partially completed startup. RPC stops first so no
   * new requests arrive while the rest shuts down. */
  private void stopCommonServices() {
    if(rpcServer != null) rpcServer.stop();
    if(namesystem != null) namesystem.close();
    if (pauseMonitor != null) pauseMonitor.stop();
    if (plugins != null) {
      for (ServicePlugin p : plugins) {
        try {
          // Mirror startup behavior: a failing plugin must not abort shutdown.
          p.stop();
        } catch (Throwable t) {
          LOG.warn("ServicePlugin " + p + " could not be stopped", t);
        }
      }
    }   
    stopHttpServer();
  }
668      
  /** Start the trash-emptier daemon thread if a positive trash interval is
   * configured. A zero interval disables trash; a negative one is rejected.
   * @throws IOException if the interval is negative or the FS lookup fails */
  private void startTrashEmptier(final Configuration conf) throws IOException {
    long trashInterval =
        conf.getLong(FS_TRASH_INTERVAL_KEY, FS_TRASH_INTERVAL_DEFAULT);
    if (trashInterval == 0) {
      return;
    } else if (trashInterval < 0) {
      throw new IOException("Cannot start trash emptier with negative interval."
          + " Set " + FS_TRASH_INTERVAL_KEY + " to a positive value.");
    }
    
    // This may be called from the transitionToActive code path, in which
    // case the current user is the administrator, not the NN. The trash
    // emptier needs to run as the NN. See HDFS-3972.
    FileSystem fs = SecurityUtil.doAsLoginUser(
        new PrivilegedExceptionAction<FileSystem>() {
          @Override
          public FileSystem run() throws IOException {
            return FileSystem.get(conf);
          }
        });
    // Daemon thread so the emptier never blocks process exit.
    this.emptier = new Thread(new Trash(fs, conf).getEmptier(), "Trash Emptier");
    this.emptier.setDaemon(true);
    this.emptier.start();
  }
693      
694      private void stopTrashEmptier() {
695        if (this.emptier != null) {
696          emptier.interrupt();
697          emptier = null;
698        }
699      }
700      
  /** Create and start the HTTP server, wiring in the startup-progress view. */
  private void startHttpServer(final Configuration conf) throws IOException {
    httpServer = new NameNodeHttpServer(conf, this, getHttpServerBindAddress(conf));
    httpServer.start();
    httpServer.setStartupProgress(startupProgress);
  }
706      
  /** Stop the HTTP server if it was started; failures are logged, not
   * rethrown, so shutdown can proceed. */
  private void stopHttpServer() {
    try {
      if (httpServer != null) httpServer.stop();
    } catch (Exception e) {
      LOG.error("Exception while stopping httpserver", e);
    }
  }
714    
  /**
   * Start NameNode.
   * <p>
   * The name-node can be started with one of the following startup options:
   * <ul> 
   * <li>{@link StartupOption#REGULAR REGULAR} - normal name node startup</li>
   * <li>{@link StartupOption#FORMAT FORMAT} - format name node</li>
   * <li>{@link StartupOption#BACKUP BACKUP} - start backup node</li>
   * <li>{@link StartupOption#CHECKPOINT CHECKPOINT} - start checkpoint node</li>
   * <li>{@link StartupOption#UPGRADE UPGRADE} - start the cluster
   * upgrade and create a snapshot of the current file system state</li>
   * <li>{@link StartupOption#UPGRADEONLY UPGRADEONLY} - perform the cluster
   * upgrade only</li>
   * <li>{@link StartupOption#RECOVER RECOVER} - recover name node
   * metadata</li>
   * <li>{@link StartupOption#ROLLBACK ROLLBACK} - roll the  
   *            cluster back to the previous state</li>
   * <li>{@link StartupOption#FINALIZE FINALIZE} - finalize 
   *            previous upgrade</li>
   * <li>{@link StartupOption#IMPORT IMPORT} - import checkpoint</li>
   * </ul>
   * The option is passed via configuration field: 
   * <tt>dfs.namenode.startup</tt>
   * 
   * The conf will be modified to reflect the actual ports on which 
   * the NameNode is up and running if the user passes the port as
   * <code>zero</code> in the conf.
   * 
   * @param conf  configuration
   * @throws IOException
   */
  public NameNode(Configuration conf) throws IOException {
    this(conf, NamenodeRole.NAMENODE);
  }
748    
749      protected NameNode(Configuration conf, NamenodeRole role) 
750          throws IOException { 
751        this.conf = conf;
752        this.role = role;
753        setClientNamenodeAddress(conf);
754        String nsId = getNameServiceId(conf);
755        String namenodeId = HAUtil.getNameNodeId(conf, nsId);
756        this.haEnabled = HAUtil.isHAEnabled(conf, nsId);
757        state = createHAState(getStartupOption(conf));
758        this.allowStaleStandbyReads = HAUtil.shouldAllowStandbyReads(conf);
759        this.haContext = createHAContext();
760        try {
761          initializeGenericKeys(conf, nsId, namenodeId);
762          initialize(conf);
763          try {
764            haContext.writeLock();
765            state.prepareToEnterState(haContext);
766            state.enterState(haContext);
767          } finally {
768            haContext.writeUnlock();
769          }
770        } catch (IOException e) {
771          this.stop();
772          throw e;
773        } catch (HadoopIllegalArgumentException e) {
774          this.stop();
775          throw e;
776        }
777      }
778    
779      protected HAState createHAState(StartupOption startOpt) {
780        if (!haEnabled || startOpt == StartupOption.UPGRADE 
781            || startOpt == StartupOption.UPGRADEONLY) {
782          return ACTIVE_STATE;
783        } else {
784          return STANDBY_STATE;
785        }
786      }
787    
788      protected HAContext createHAContext() {
789        return new NameNodeHAContext();
790      }
791    
792      /**
793       * Wait for service to finish.
794       * (Normally, it runs forever.)
795       */
796      public void join() {
797        try {
798          rpcServer.join();
799        } catch (InterruptedException ie) {
800          LOG.info("Caught interrupted exception ", ie);
801        }
802      }
803    
804      /**
805       * Stop all NameNode threads and wait for all to finish.
806       */
807      public void stop() {
808        synchronized(this) {
809          if (stopRequested)
810            return;
811          stopRequested = true;
812        }
813        try {
814          if (state != null) {
815            state.exitState(haContext);
816          }
817        } catch (ServiceFailedException e) {
818          LOG.warn("Encountered exception while exiting state ", e);
819        } finally {
820          stopCommonServices();
821          if (metrics != null) {
822            metrics.shutdown();
823          }
824          if (namesystem != null) {
825            namesystem.shutdown();
826          }
827          if (nameNodeStatusBeanName != null) {
828            MBeans.unregister(nameNodeStatusBeanName);
829            nameNodeStatusBeanName = null;
830          }
831          if (this.spanReceiverHost != null) {
832            this.spanReceiverHost.closeReceivers();
833          }
834        }
835      }
836    
837      synchronized boolean isStopRequested() {
838        return stopRequested;
839      }
840    
841      /**
842       * Is the cluster currently in safe mode?
843       */
844      public boolean isInSafeMode() {
845        return namesystem.isInSafeMode();
846      }
847        
848      /** get FSImage */
849      @VisibleForTesting
850      public FSImage getFSImage() {
851        return namesystem.getFSImage();
852      }
853    
854      /**
855       * @return NameNode RPC address
856       */
857      public InetSocketAddress getNameNodeAddress() {
858        return rpcServer.getRpcAddress();
859      }
860    
861      /**
862       * @return NameNode RPC address in "host:port" string form
863       */
864      public String getNameNodeAddressHostPortString() {
865        return NetUtils.getHostPortString(rpcServer.getRpcAddress());
866      }
867    
868      /**
869       * @return NameNode service RPC address if configured, the
870       *    NameNode RPC address otherwise
871       */
872      public InetSocketAddress getServiceRpcAddress() {
873        final InetSocketAddress serviceAddr = rpcServer.getServiceRpcAddress();
874        return serviceAddr == null ? rpcServer.getRpcAddress() : serviceAddr;
875      }
876    
877      /**
878       * @return NameNode HTTP address, used by the Web UI, image transfer,
879       *    and HTTP-based file system clients like Hftp and WebHDFS
880       */
881      public InetSocketAddress getHttpAddress() {
882        return httpServer.getHttpAddress();
883      }
884    
885      /**
886       * @return NameNode HTTPS address, used by the Web UI, image transfer,
887       *    and HTTP-based file system clients like Hftp and WebHDFS
888       */
889      public InetSocketAddress getHttpsAddress() {
890        return httpServer.getHttpsAddress();
891      }
892    
893      /**
894       * Verify that configured directories exist, then
895       * Interactively confirm that formatting is desired 
896       * for each existing directory and format them.
897       * 
898       * @param conf configuration to use
899       * @param force if true, format regardless of whether dirs exist
900       * @return true if formatting was aborted, false otherwise
901       * @throws IOException
902       */
903      private static boolean format(Configuration conf, boolean force,
904          boolean isInteractive) throws IOException {
905        String nsId = DFSUtil.getNamenodeNameServiceId(conf);
906        String namenodeId = HAUtil.getNameNodeId(conf, nsId);
907        initializeGenericKeys(conf, nsId, namenodeId);
908        checkAllowFormat(conf);
909    
910        if (UserGroupInformation.isSecurityEnabled()) {
911          InetSocketAddress socAddr = getAddress(conf);
912          SecurityUtil.login(conf, DFS_NAMENODE_KEYTAB_FILE_KEY,
913              DFS_NAMENODE_KERBEROS_PRINCIPAL_KEY, socAddr.getHostName());
914        }
915        
916        Collection<URI> nameDirsToFormat = FSNamesystem.getNamespaceDirs(conf);
917        List<URI> sharedDirs = FSNamesystem.getSharedEditsDirs(conf);
918        List<URI> dirsToPrompt = new ArrayList<URI>();
919        dirsToPrompt.addAll(nameDirsToFormat);
920        dirsToPrompt.addAll(sharedDirs);
921        List<URI> editDirsToFormat = 
922                     FSNamesystem.getNamespaceEditsDirs(conf);
923    
924        // if clusterID is not provided - see if you can find the current one
925        String clusterId = StartupOption.FORMAT.getClusterId();
926        if(clusterId == null || clusterId.equals("")) {
927          //Generate a new cluster id
928          clusterId = NNStorage.newClusterID();
929        }
930        System.out.println("Formatting using clusterid: " + clusterId);
931        
932        FSImage fsImage = new FSImage(conf, nameDirsToFormat, editDirsToFormat);
933        try {
934          FSNamesystem fsn = new FSNamesystem(conf, fsImage);
935          fsImage.getEditLog().initJournalsForWrite();
936    
937          if (!fsImage.confirmFormat(force, isInteractive)) {
938            return true; // aborted
939          }
940    
941          fsImage.format(fsn, clusterId);
942        } catch (IOException ioe) {
943          LOG.warn("Encountered exception during format: ", ioe);
944          fsImage.close();
945          throw ioe;
946        }
947        return false;
948      }
949    
950      public static void checkAllowFormat(Configuration conf) throws IOException {
951        if (!conf.getBoolean(DFS_NAMENODE_SUPPORT_ALLOW_FORMAT_KEY, 
952            DFS_NAMENODE_SUPPORT_ALLOW_FORMAT_DEFAULT)) {
953          throw new IOException("The option " + DFS_NAMENODE_SUPPORT_ALLOW_FORMAT_KEY
954                    + " is set to false for this filesystem, so it "
955                    + "cannot be formatted. You will need to set "
956                    + DFS_NAMENODE_SUPPORT_ALLOW_FORMAT_KEY +" parameter "
957                    + "to true in order to format this filesystem");
958        }
959      }
960      
961      @VisibleForTesting
962      public static boolean initializeSharedEdits(Configuration conf) throws IOException {
963        return initializeSharedEdits(conf, true);
964      }
965      
966      @VisibleForTesting
967      public static boolean initializeSharedEdits(Configuration conf,
968          boolean force) throws IOException {
969        return initializeSharedEdits(conf, force, false);
970      }
971    
972      /**
973       * Clone the supplied configuration but remove the shared edits dirs.
974       *
975       * @param conf Supplies the original configuration.
976       * @return Cloned configuration without the shared edit dirs.
977       * @throws IOException on failure to generate the configuration.
978       */
979      private static Configuration getConfigurationWithoutSharedEdits(
980          Configuration conf)
981          throws IOException {
982        List<URI> editsDirs = FSNamesystem.getNamespaceEditsDirs(conf, false);
983        String editsDirsString = Joiner.on(",").join(editsDirs);
984    
985        Configuration confWithoutShared = new Configuration(conf);
986        confWithoutShared.unset(DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY);
987        confWithoutShared.setStrings(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY,
988            editsDirsString);
989        return confWithoutShared;
990      }
991    
992      /**
993       * Format a new shared edits dir and copy in enough edit log segments so that
994       * the standby NN can start up.
995       * 
996       * @param conf configuration
997       * @param force format regardless of whether or not the shared edits dir exists
998       * @param interactive prompt the user when a dir exists
999       * @return true if the command aborts, false otherwise
1000       */
1001      private static boolean initializeSharedEdits(Configuration conf,
1002          boolean force, boolean interactive) throws IOException {
1003        String nsId = DFSUtil.getNamenodeNameServiceId(conf);
1004        String namenodeId = HAUtil.getNameNodeId(conf, nsId);
1005        initializeGenericKeys(conf, nsId, namenodeId);
1006        
1007        if (conf.get(DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY) == null) {
1008          LOG.fatal("No shared edits directory configured for namespace " +
1009              nsId + " namenode " + namenodeId);
1010          return false;
1011        }
1012    
1013        if (UserGroupInformation.isSecurityEnabled()) {
1014          InetSocketAddress socAddr = getAddress(conf);
1015          SecurityUtil.login(conf, DFS_NAMENODE_KEYTAB_FILE_KEY,
1016              DFS_NAMENODE_KERBEROS_PRINCIPAL_KEY, socAddr.getHostName());
1017        }
1018    
1019        NNStorage existingStorage = null;
1020        FSImage sharedEditsImage = null;
1021        try {
1022          FSNamesystem fsns =
1023              FSNamesystem.loadFromDisk(getConfigurationWithoutSharedEdits(conf));
1024          
1025          existingStorage = fsns.getFSImage().getStorage();
1026          NamespaceInfo nsInfo = existingStorage.getNamespaceInfo();
1027          
1028          List<URI> sharedEditsDirs = FSNamesystem.getSharedEditsDirs(conf);
1029          
1030          sharedEditsImage = new FSImage(conf,
1031              Lists.<URI>newArrayList(),
1032              sharedEditsDirs);
1033          sharedEditsImage.getEditLog().initJournalsForWrite();
1034          
1035          if (!sharedEditsImage.confirmFormat(force, interactive)) {
1036            return true; // abort
1037          }
1038          
1039          NNStorage newSharedStorage = sharedEditsImage.getStorage();
1040          // Call Storage.format instead of FSImage.format here, since we don't
1041          // actually want to save a checkpoint - just prime the dirs with
1042          // the existing namespace info
1043          newSharedStorage.format(nsInfo);
1044          sharedEditsImage.getEditLog().formatNonFileJournals(nsInfo);
1045    
1046          // Need to make sure the edit log segments are in good shape to initialize
1047          // the shared edits dir.
1048          fsns.getFSImage().getEditLog().close();
1049          fsns.getFSImage().getEditLog().initJournalsForWrite();
1050          fsns.getFSImage().getEditLog().recoverUnclosedStreams();
1051    
1052          copyEditLogSegmentsToSharedDir(fsns, sharedEditsDirs, newSharedStorage,
1053              conf);
1054        } catch (IOException ioe) {
1055          LOG.error("Could not initialize shared edits dir", ioe);
1056          return true; // aborted
1057        } finally {
1058          if (sharedEditsImage != null) {
1059            try {
1060              sharedEditsImage.close();
1061            }  catch (IOException ioe) {
1062              LOG.warn("Could not close sharedEditsImage", ioe);
1063            }
1064          }
1065          // Have to unlock storage explicitly for the case when we're running in a
1066          // unit test, which runs in the same JVM as NNs.
1067          if (existingStorage != null) {
1068            try {
1069              existingStorage.unlockAll();
1070            } catch (IOException ioe) {
1071              LOG.warn("Could not unlock storage directories", ioe);
1072              return true; // aborted
1073            }
1074          }
1075        }
1076        return false; // did not abort
1077      }
1078    
  /**
   * Copies every edit log segment after the most recent checkpoint from the
   * local edit log into the freshly-formatted shared edits dir, re-creating
   * segment boundaries as it goes so the standby can tail them.
   *
   * @param fsns namesystem whose local edit log is the copy source
   * @param sharedEditsDirs target shared edits dirs (must be non-empty)
   * @param newSharedStorage storage backing the shared dirs (already formatted)
   * @param conf configuration used to construct the destination edit log
   * @throws IOException if reading or writing any segment fails
   */
  private static void copyEditLogSegmentsToSharedDir(FSNamesystem fsns,
      Collection<URI> sharedEditsDirs, NNStorage newSharedStorage,
      Configuration conf) throws IOException {
    Preconditions.checkArgument(!sharedEditsDirs.isEmpty(),
        "No shared edits specified");
    // Copy edit log segments into the new shared edits dir.
    List<URI> sharedEditsUris = new ArrayList<URI>(sharedEditsDirs);
    FSEditLog newSharedEditLog = new FSEditLog(conf, newSharedStorage,
        sharedEditsUris);
    newSharedEditLog.initJournalsForWrite();
    newSharedEditLog.recoverUnclosedStreams();
    
    FSEditLog sourceEditLog = fsns.getFSImage().editLog;
    
    long fromTxId = fsns.getFSImage().getMostRecentCheckpointTxId();
    
    Collection<EditLogInputStream> streams = null;
    try {
      streams = sourceEditLog.selectInputStreams(fromTxId + 1, 0);

      // Set the nextTxid to the CheckpointTxId+1
      newSharedEditLog.setNextTxId(fromTxId + 1);

      // Copy all edits after last CheckpointTxId to shared edits dir
      for (EditLogInputStream stream : streams) {
        LOG.debug("Beginning to copy stream " + stream + " to shared edits");
        FSEditLogOp op;
        // Tracks whether a destination segment is currently open; a new one
        // is started lazily at the first op of each source segment.
        boolean segmentOpen = false;
        while ((op = stream.readOp()) != null) {
          if (LOG.isTraceEnabled()) {
            LOG.trace("copying op: " + op);
          }
          if (!segmentOpen) {
            newSharedEditLog.startLogSegment(op.txid, false);
            segmentOpen = true;
          }

          newSharedEditLog.logEdit(op);

          // Mirror the source's segment boundaries: close the destination
          // segment whenever the source segment ends.
          if (op.opCode == FSEditLogOpCodes.OP_END_LOG_SEGMENT) {
            newSharedEditLog.logSync();
            newSharedEditLog.endCurrentLogSegment(false);
            LOG.debug("ending log segment because of END_LOG_SEGMENT op in "
                + stream);
            segmentOpen = false;
          }
        }

        // The source stream may end without an END_LOG_SEGMENT op (in-progress
        // segment); close the destination segment explicitly in that case.
        if (segmentOpen) {
          LOG.debug("ending log segment because of end of stream in " + stream);
          newSharedEditLog.logSync();
          newSharedEditLog.endCurrentLogSegment(false);
          segmentOpen = false;
        }
      }
    } finally {
      if (streams != null) {
        FSEditLog.closeAllStreams(streams);
      }
    }
  }
1140      
1141      @VisibleForTesting
1142      public static boolean doRollback(Configuration conf,
1143          boolean isConfirmationNeeded) throws IOException {
1144        String nsId = DFSUtil.getNamenodeNameServiceId(conf);
1145        String namenodeId = HAUtil.getNameNodeId(conf, nsId);
1146        initializeGenericKeys(conf, nsId, namenodeId);
1147    
1148        FSNamesystem nsys = new FSNamesystem(conf, new FSImage(conf));
1149        System.err.print(
1150            "\"rollBack\" will remove the current state of the file system,\n"
1151            + "returning you to the state prior to initiating your recent.\n"
1152            + "upgrade. This action is permanent and cannot be undone. If you\n"
1153            + "are performing a rollback in an HA environment, you should be\n"
1154            + "certain that no NameNode process is running on any host.");
1155        if (isConfirmationNeeded) {
1156          if (!confirmPrompt("Roll back file system state?")) {
1157            System.err.println("Rollback aborted.");
1158            return true;
1159          }
1160        }
1161        nsys.getFSImage().doRollback(nsys);
1162        return false;
1163      }
1164    
1165      private static void printUsage(PrintStream out) {
1166        out.println(USAGE + "\n");
1167      }
1168    
  /**
   * Parses the NameNode command-line arguments into a single StartupOption,
   * attaching sub-flags (cluster id, force, non-interactive, rolling-upgrade
   * action, recovery force level) to the chosen option.
   *
   * @param args command-line arguments; may be null
   * @return the parsed option, or null when the arguments are invalid
   *         (the caller then prints usage and exits)
   */
  @VisibleForTesting
  static StartupOption parseArguments(String args[]) {
    int argsLen = (args == null) ? 0 : args.length;
    StartupOption startOpt = StartupOption.REGULAR;
    for(int i=0; i < argsLen; i++) {
      String cmd = args[i];
      if (StartupOption.FORMAT.getName().equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.FORMAT;
        // Consume all remaining args as FORMAT sub-flags:
        // [-clusterid <id>] [-force] [-nonInteractive]
        for (i = i + 1; i < argsLen; i++) {
          if (args[i].equalsIgnoreCase(StartupOption.CLUSTERID.getName())) {
            i++;
            if (i >= argsLen) {
              // if no cluster id specified, return null
              LOG.fatal("Must specify a valid cluster ID after the "
                  + StartupOption.CLUSTERID.getName() + " flag");
              return null;
            }
            String clusterId = args[i];
            // Make sure an id is specified and not another flag
            if (clusterId.isEmpty() ||
                clusterId.equalsIgnoreCase(StartupOption.FORCE.getName()) ||
                clusterId.equalsIgnoreCase(
                    StartupOption.NONINTERACTIVE.getName())) {
              LOG.fatal("Must specify a valid cluster ID after the "
                  + StartupOption.CLUSTERID.getName() + " flag");
              return null;
            }
            startOpt.setClusterId(clusterId);
          }

          if (args[i].equalsIgnoreCase(StartupOption.FORCE.getName())) {
            startOpt.setForceFormat(true);
          }

          if (args[i].equalsIgnoreCase(StartupOption.NONINTERACTIVE.getName())) {
            startOpt.setInteractiveFormat(false);
          }
        }
      } else if (StartupOption.GENCLUSTERID.getName().equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.GENCLUSTERID;
      } else if (StartupOption.REGULAR.getName().equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.REGULAR;
      } else if (StartupOption.BACKUP.getName().equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.BACKUP;
      } else if (StartupOption.CHECKPOINT.getName().equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.CHECKPOINT;
      } else if (StartupOption.UPGRADE.getName().equalsIgnoreCase(cmd)
          || StartupOption.UPGRADEONLY.getName().equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.UPGRADE.getName().equalsIgnoreCase(cmd) ? 
            StartupOption.UPGRADE : StartupOption.UPGRADEONLY;
        /* Can be followed by CLUSTERID with a required parameter or
         * RENAMERESERVED with an optional parameter
         */
        while (i + 1 < argsLen) {
          String flag = args[i + 1];
          if (flag.equalsIgnoreCase(StartupOption.CLUSTERID.getName())) {
            if (i + 2 < argsLen) {
              i += 2;
              startOpt.setClusterId(args[i]);
            } else {
              LOG.fatal("Must specify a valid cluster ID after the "
                  + StartupOption.CLUSTERID.getName() + " flag");
              return null;
            }
          } else if (flag.equalsIgnoreCase(StartupOption.RENAMERESERVED
              .getName())) {
            // RENAMERESERVED's parameter is optional: with one, parse the
            // pairs; without, fall back to the default renames.
            if (i + 2 < argsLen) {
              FSImageFormat.setRenameReservedPairs(args[i + 2]);
              i += 2;
            } else {
              FSImageFormat.useDefaultRenameReservedPairs();
              i += 1;
            }
          } else {
            LOG.fatal("Unknown upgrade flag " + flag);
            return null;
          }
        }
      } else if (StartupOption.ROLLINGUPGRADE.getName().equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.ROLLINGUPGRADE;
        ++i;
        if (i >= argsLen) {
          LOG.fatal("Must specify a rolling upgrade startup option "
              + RollingUpgradeStartupOption.getAllOptionString());
          return null;
        }
        startOpt.setRollingUpgradeStartupOption(args[i]);
      } else if (StartupOption.ROLLBACK.getName().equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.ROLLBACK;
      } else if (StartupOption.FINALIZE.getName().equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.FINALIZE;
      } else if (StartupOption.IMPORT.getName().equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.IMPORT;
      } else if (StartupOption.BOOTSTRAPSTANDBY.getName().equalsIgnoreCase(cmd)) {
        // BOOTSTRAPSTANDBY takes over the rest of the command line
        // (see createNameNode), so stop parsing here.
        startOpt = StartupOption.BOOTSTRAPSTANDBY;
        return startOpt;
      } else if (StartupOption.INITIALIZESHAREDEDITS.getName().equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.INITIALIZESHAREDEDITS;
        for (i = i + 1 ; i < argsLen; i++) {
          if (StartupOption.NONINTERACTIVE.getName().equals(args[i])) {
            startOpt.setInteractiveFormat(false);
          } else if (StartupOption.FORCE.getName().equals(args[i])) {
            startOpt.setForceFormat(true);
          } else {
            LOG.fatal("Invalid argument: " + args[i]);
            return null;
          }
        }
        return startOpt;
      } else if (StartupOption.RECOVER.getName().equalsIgnoreCase(cmd)) {
        // -recover must be the only startup option on the command line.
        if (startOpt != StartupOption.REGULAR) {
          throw new RuntimeException("Can't combine -recover with " +
              "other startup options.");
        }
        startOpt = StartupOption.RECOVER;
        while (++i < argsLen) {
          if (args[i].equalsIgnoreCase(
                StartupOption.FORCE.getName())) {
            startOpt.setForce(MetaRecoveryContext.FORCE_FIRST_CHOICE);
          } else {
            throw new RuntimeException("Error parsing recovery options: " + 
              "can't understand option \"" + args[i] + "\"");
          }
        }
      } else if (StartupOption.METADATAVERSION.getName().equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.METADATAVERSION;
      } else {
        return null;
      }
    }
    return startOpt;
  }
1301    
1302      private static void setStartupOption(Configuration conf, StartupOption opt) {
1303        conf.set(DFS_NAMENODE_STARTUP_KEY, opt.name());
1304      }
1305    
1306      static StartupOption getStartupOption(Configuration conf) {
1307        return StartupOption.valueOf(conf.get(DFS_NAMENODE_STARTUP_KEY,
1308                                              StartupOption.REGULAR.toString()));
1309      }
1310    
1311      private static void doRecovery(StartupOption startOpt, Configuration conf)
1312          throws IOException {
1313        String nsId = DFSUtil.getNamenodeNameServiceId(conf);
1314        String namenodeId = HAUtil.getNameNodeId(conf, nsId);
1315        initializeGenericKeys(conf, nsId, namenodeId);
1316        if (startOpt.getForce() < MetaRecoveryContext.FORCE_ALL) {
1317          if (!confirmPrompt("You have selected Metadata Recovery mode.  " +
1318              "This mode is intended to recover lost metadata on a corrupt " +
1319              "filesystem.  Metadata recovery mode often permanently deletes " +
1320              "data from your HDFS filesystem.  Please back up your edit log " +
1321              "and fsimage before trying this!\n\n" +
1322              "Are you ready to proceed? (Y/N)\n")) {
1323            System.err.println("Recovery aborted at user request.\n");
1324            return;
1325          }
1326        }
1327        MetaRecoveryContext.LOG.info("starting recovery...");
1328        UserGroupInformation.setConfiguration(conf);
1329        NameNode.initMetrics(conf, startOpt.toNodeRole());
1330        FSNamesystem fsn = null;
1331        try {
1332          fsn = FSNamesystem.loadFromDisk(conf);
1333          fsn.getFSImage().saveNamespace(fsn);
1334          MetaRecoveryContext.LOG.info("RECOVERY COMPLETE");
1335        } catch (IOException e) {
1336          MetaRecoveryContext.LOG.info("RECOVERY FAILED: caught exception", e);
1337          throw e;
1338        } catch (RuntimeException e) {
1339          MetaRecoveryContext.LOG.info("RECOVERY FAILED: caught exception", e);
1340          throw e;
1341        } finally {
1342          if (fsn != null)
1343            fsn.close();
1344        }
1345      }
1346    
1347      /**
1348       * Verify that configured directories exist, then print the metadata versions
1349       * of the software and the image.
1350       *
1351       * @param conf configuration to use
1352       * @throws IOException
1353       */
1354      private static boolean printMetadataVersion(Configuration conf)
1355        throws IOException {
1356        final String nsId = DFSUtil.getNamenodeNameServiceId(conf);
1357        final String namenodeId = HAUtil.getNameNodeId(conf, nsId);
1358        NameNode.initializeGenericKeys(conf, nsId, namenodeId);
1359        final FSImage fsImage = new FSImage(conf);
1360        final FSNamesystem fs = new FSNamesystem(conf, fsImage, false);
1361        return fsImage.recoverTransitionRead(
1362          StartupOption.METADATAVERSION, fs, null);
1363      }
1364    
  /**
   * Entry point used by main(): parses the startup option and either runs a
   * one-shot administrative action (format, rollback, recovery, ...) — which
   * calls terminate() and never returns normally — or constructs and returns
   * a running NameNode/BackupNode.
   *
   * @param argv command-line arguments
   * @param conf configuration; a fresh HdfsConfiguration is created if null
   * @return the constructed node, or null for one-shot actions
   * @throws IOException if node construction or the chosen action fails
   */
  public static NameNode createNameNode(String argv[], Configuration conf)
      throws IOException {
    LOG.info("createNameNode " + Arrays.asList(argv));
    if (conf == null)
      conf = new HdfsConfiguration();
    StartupOption startOpt = parseArguments(argv);
    if (startOpt == null) {
      printUsage(System.err);
      return null;
    }
    // Persist the option in the conf so the constructor can read it back.
    setStartupOption(conf, startOpt);

    switch (startOpt) {
      case FORMAT: {
        boolean aborted = format(conf, startOpt.getForceFormat(),
            startOpt.getInteractiveFormat());
        terminate(aborted ? 1 : 0);
        return null; // avoid javac warning
      }
      case GENCLUSTERID: {
        System.err.println("Generating new cluster id:");
        System.out.println(NNStorage.newClusterID());
        terminate(0);
        return null;
      }
      case FINALIZE: {
        System.err.println("Use of the argument '" + StartupOption.FINALIZE +
            "' is no longer supported. To finalize an upgrade, start the NN " +
            " and then run `hdfs dfsadmin -finalizeUpgrade'");
        terminate(1);
        return null; // avoid javac warning
      }
      case ROLLBACK: {
        boolean aborted = doRollback(conf, true);
        terminate(aborted ? 1 : 0);
        return null; // avoid warning
      }
      case BOOTSTRAPSTANDBY: {
        // Everything after the option itself is forwarded to the tool.
        String toolArgs[] = Arrays.copyOfRange(argv, 1, argv.length);
        int rc = BootstrapStandby.run(toolArgs, conf);
        terminate(rc);
        return null; // avoid warning
      }
      case INITIALIZESHAREDEDITS: {
        boolean aborted = initializeSharedEdits(conf,
            startOpt.getForceFormat(),
            startOpt.getInteractiveFormat());
        terminate(aborted ? 1 : 0);
        return null; // avoid warning
      }
      case BACKUP:
      case CHECKPOINT: {
        NamenodeRole role = startOpt.toNodeRole();
        DefaultMetricsSystem.initialize(role.toString().replace(" ", ""));
        return new BackupNode(conf, role);
      }
      case RECOVER: {
        NameNode.doRecovery(startOpt, conf);
        return null;
      }
      case METADATAVERSION: {
        printMetadataVersion(conf);
        terminate(0);
        return null; // avoid javac warning
      }
      case UPGRADEONLY: {
        // Performs the upgrade and exits instead of staying up.
        DefaultMetricsSystem.initialize("NameNode");
        new NameNode(conf);
        terminate(0);
        return null;
      }
      default: {
        DefaultMetricsSystem.initialize("NameNode");
        return new NameNode(conf);
      }
    }
  }
1442    
1443      /**
1444       * In federation configuration is set for a set of
1445       * namenode and secondary namenode/backup/checkpointer, which are
1446       * grouped under a logical nameservice ID. The configuration keys specific 
1447       * to them have suffix set to configured nameserviceId.
1448       * 
1449       * This method copies the value from specific key of format key.nameserviceId
1450       * to key, to set up the generic configuration. Once this is done, only
1451       * generic version of the configuration is read in rest of the code, for
1452       * backward compatibility and simpler code changes.
1453       * 
1454       * @param conf
1455       *          Configuration object to lookup specific key and to set the value
1456       *          to the key passed. Note the conf object is modified
1457       * @param nameserviceId name service Id (to distinguish federated NNs)
1458       * @param namenodeId the namenode ID (to distinguish HA NNs)
1459       * @see DFSUtil#setGenericConf(Configuration, String, String, String...)
1460       */
1461      public static void initializeGenericKeys(Configuration conf,
1462          String nameserviceId, String namenodeId) {
1463        if ((nameserviceId != null && !nameserviceId.isEmpty()) || 
1464            (namenodeId != null && !namenodeId.isEmpty())) {
1465          if (nameserviceId != null) {
1466            conf.set(DFS_NAMESERVICE_ID, nameserviceId);
1467          }
1468          if (namenodeId != null) {
1469            conf.set(DFS_HA_NAMENODE_ID_KEY, namenodeId);
1470          }
1471          
1472          DFSUtil.setGenericConf(conf, nameserviceId, namenodeId,
1473              NAMENODE_SPECIFIC_KEYS);
1474          DFSUtil.setGenericConf(conf, nameserviceId, null,
1475              NAMESERVICE_SPECIFIC_KEYS);
1476        }
1477        
1478        // If the RPC address is set use it to (re-)configure the default FS
1479        if (conf.get(DFS_NAMENODE_RPC_ADDRESS_KEY) != null) {
1480          URI defaultUri = URI.create(HdfsConstants.HDFS_URI_SCHEME + "://"
1481              + conf.get(DFS_NAMENODE_RPC_ADDRESS_KEY));
1482          conf.set(FS_DEFAULT_NAME_KEY, defaultUri.toString());
1483          LOG.debug("Setting " + FS_DEFAULT_NAME_KEY + " to " + defaultUri.toString());
1484        }
1485      }
1486        
1487      /** 
1488       * Get the name service Id for the node
1489       * @return name service Id or null if federation is not configured
1490       */
1491      protected String getNameServiceId(Configuration conf) {
1492        return DFSUtil.getNamenodeNameServiceId(conf);
1493      }
1494      
1495      /**
1496       */
1497      public static void main(String argv[]) throws Exception {
1498        if (DFSUtil.parseHelpArgument(argv, NameNode.USAGE, System.out, true)) {
1499          System.exit(0);
1500        }
1501    
1502        try {
1503          StringUtils.startupShutdownMessage(NameNode.class, argv, LOG);
1504          NameNode namenode = createNameNode(argv, null);
1505          if (namenode != null) {
1506            namenode.join();
1507          }
1508        } catch (Throwable e) {
1509          LOG.fatal("Failed to start namenode.", e);
1510          terminate(1, e);
1511        }
1512      }
1513    
1514      synchronized void monitorHealth() 
1515          throws HealthCheckFailedException, AccessControlException {
1516        namesystem.checkSuperuserPrivilege();
1517        if (!haEnabled) {
1518          return; // no-op, if HA is not enabled
1519        }
1520        getNamesystem().checkAvailableResources();
1521        if (!getNamesystem().nameNodeHasResourcesAvailable()) {
1522          throw new HealthCheckFailedException(
1523              "The NameNode has no resources available");
1524        }
1525      }
1526      
1527      synchronized void transitionToActive() 
1528          throws ServiceFailedException, AccessControlException {
1529        namesystem.checkSuperuserPrivilege();
1530        if (!haEnabled) {
1531          throw new ServiceFailedException("HA for namenode is not enabled");
1532        }
1533        state.setState(haContext, ACTIVE_STATE);
1534      }
1535      
1536      synchronized void transitionToStandby() 
1537          throws ServiceFailedException, AccessControlException {
1538        namesystem.checkSuperuserPrivilege();
1539        if (!haEnabled) {
1540          throw new ServiceFailedException("HA for namenode is not enabled");
1541        }
1542        state.setState(haContext, STANDBY_STATE);
1543      }
1544    
1545      synchronized HAServiceStatus getServiceStatus()
1546          throws ServiceFailedException, AccessControlException {
1547        namesystem.checkSuperuserPrivilege();
1548        if (!haEnabled) {
1549          throw new ServiceFailedException("HA for namenode is not enabled");
1550        }
1551        if (state == null) {
1552          return new HAServiceStatus(HAServiceState.INITIALIZING);
1553        }
1554        HAServiceState retState = state.getServiceState();
1555        HAServiceStatus ret = new HAServiceStatus(retState);
1556        if (retState == HAServiceState.STANDBY) {
1557          String safemodeTip = namesystem.getSafeModeTip();
1558          if (!safemodeTip.isEmpty()) {
1559            ret.setNotReadyToBecomeActive(
1560                "The NameNode is in safemode. " +
1561                safemodeTip);
1562          } else {
1563            ret.setReadyToBecomeActive();
1564          }
1565        } else if (retState == HAServiceState.ACTIVE) {
1566          ret.setReadyToBecomeActive();
1567        } else {
1568          ret.setNotReadyToBecomeActive("State is " + state);
1569        }
1570        return ret;
1571      }
1572    
1573      synchronized HAServiceState getServiceState() {
1574        if (state == null) {
1575          return HAServiceState.INITIALIZING;
1576        }
1577        return state.getServiceState();
1578      }
1579    
1580      /**
1581       * Register NameNodeStatusMXBean
1582       */
1583      private void registerNNSMXBean() {
1584        nameNodeStatusBeanName = MBeans.register("NameNode", "NameNodeStatus", this);
1585      }
1586    
1587      @Override // NameNodeStatusMXBean
1588      public String getNNRole() {
1589        String roleStr = "";
1590        NamenodeRole role = getRole();
1591        if (null != role) {
1592          roleStr = role.toString();
1593        }
1594        return roleStr;
1595      }
1596    
1597      @Override // NameNodeStatusMXBean
1598      public String getState() {
1599        String servStateStr = "";
1600        HAServiceState servState = getServiceState();
1601        if (null != servState) {
1602          servStateStr = servState.toString();
1603        }
1604        return servStateStr;
1605      }
1606    
  /** JMX attribute: RPC host:port string of this NameNode. */
  @Override // NameNodeStatusMXBean
  public String getHostAndPort() {
    return getNameNodeAddressHostPortString();
  }
1611    
  /** JMX attribute: whether Hadoop security (Kerberos) is enabled. */
  @Override // NameNodeStatusMXBean
  public boolean isSecurityEnabled() {
    return UserGroupInformation.isSecurityEnabled();
  }
1616    
1617      /**
1618       * Shutdown the NN immediately in an ungraceful way. Used when it would be
1619       * unsafe for the NN to continue operating, e.g. during a failed HA state
1620       * transition.
1621       * 
1622       * @param t exception which warrants the shutdown. Printed to the NN log
1623       *          before exit.
1624       * @throws ExitException thrown only for testing.
1625       */
1626      protected synchronized void doImmediateShutdown(Throwable t)
1627          throws ExitException {
1628        String message = "Error encountered requiring NN shutdown. " +
1629            "Shutting down immediately.";
1630        try {
1631          LOG.fatal(message, t);
1632        } catch (Throwable ignored) {
1633          // This is unlikely to happen, but there's nothing we can do if it does.
1634        }
1635        terminate(1, t);
1636      }
1637      
1638      /**
1639       * Class used to expose {@link NameNode} as context to {@link HAState}
1640       */
1641      protected class NameNodeHAContext implements HAContext {
1642        @Override
1643        public void setState(HAState s) {
1644          state = s;
1645        }
1646    
1647        @Override
1648        public HAState getState() {
1649          return state;
1650        }
1651    
1652        @Override
1653        public void startActiveServices() throws IOException {
1654          try {
1655            namesystem.startActiveServices();
1656            startTrashEmptier(conf);
1657          } catch (Throwable t) {
1658            doImmediateShutdown(t);
1659          }
1660        }
1661    
1662        @Override
1663        public void stopActiveServices() throws IOException {
1664          try {
1665            if (namesystem != null) {
1666              namesystem.stopActiveServices();
1667            }
1668            stopTrashEmptier();
1669          } catch (Throwable t) {
1670            doImmediateShutdown(t);
1671          }
1672        }
1673    
1674        @Override
1675        public void startStandbyServices() throws IOException {
1676          try {
1677            namesystem.startStandbyServices(conf);
1678          } catch (Throwable t) {
1679            doImmediateShutdown(t);
1680          }
1681        }
1682    
1683        @Override
1684        public void prepareToStopStandbyServices() throws ServiceFailedException {
1685          try {
1686            namesystem.prepareToStopStandbyServices();
1687          } catch (Throwable t) {
1688            doImmediateShutdown(t);
1689          }
1690        }
1691        
1692        @Override
1693        public void stopStandbyServices() throws IOException {
1694          try {
1695            if (namesystem != null) {
1696              namesystem.stopStandbyServices();
1697            }
1698          } catch (Throwable t) {
1699            doImmediateShutdown(t);
1700          }
1701        }
1702        
1703        @Override
1704        public void writeLock() {
1705          namesystem.writeLock();
1706          namesystem.lockRetryCache();
1707        }
1708        
1709        @Override
1710        public void writeUnlock() {
1711          namesystem.unlockRetryCache();
1712          namesystem.writeUnlock();
1713        }
1714        
1715        /** Check if an operation of given category is allowed */
1716        @Override
1717        public void checkOperation(final OperationCategory op)
1718            throws StandbyException {
1719          state.checkOperation(haContext, op);
1720        }
1721        
1722        @Override
1723        public boolean allowStaleReads() {
1724          return allowStaleStandbyReads;
1725        }
1726    
1727      }
1728      
1729      public boolean isStandbyState() {
1730        return (state.equals(STANDBY_STATE));
1731      }
1732      
1733      public boolean isActiveState() {
1734        return (state.equals(ACTIVE_STATE));
1735      }
1736      
1737      /**
1738       * Check that a request to change this node's HA state is valid.
1739       * In particular, verifies that, if auto failover is enabled, non-forced
1740       * requests from the HAAdmin CLI are rejected, and vice versa.
1741       *
1742       * @param req the request to check
1743       * @throws AccessControlException if the request is disallowed
1744       */
1745      void checkHaStateChange(StateChangeRequestInfo req)
1746          throws AccessControlException {
1747        boolean autoHaEnabled = conf.getBoolean(DFS_HA_AUTO_FAILOVER_ENABLED_KEY,
1748            DFS_HA_AUTO_FAILOVER_ENABLED_DEFAULT);
1749        switch (req.getSource()) {
1750        case REQUEST_BY_USER:
1751          if (autoHaEnabled) {
1752            throw new AccessControlException(
1753                "Manual HA control for this NameNode is disallowed, because " +
1754                "automatic HA is enabled.");
1755          }
1756          break;
1757        case REQUEST_BY_USER_FORCED:
1758          if (autoHaEnabled) {
1759            LOG.warn("Allowing manual HA control from " +
1760                Server.getRemoteAddress() +
1761                " even though automatic HA is enabled, because the user " +
1762                "specified the force flag");
1763          }
1764          break;
1765        case REQUEST_BY_ZKFC:
1766          if (!autoHaEnabled) {
1767            throw new AccessControlException(
1768                "Request from ZK failover controller at " +
1769                Server.getRemoteAddress() + " denied since automatic HA " +
1770                "is not enabled"); 
1771          }
1772          break;
1773        }
1774      }
1775    }