001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.server.blockmanagement;
019    
020    import java.util.*;
021    
022    import org.apache.hadoop.conf.Configuration;
023    import org.apache.hadoop.hdfs.DFSUtil;
024    import org.apache.hadoop.hdfs.StorageType;
025    import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
026    import org.apache.hadoop.hdfs.server.namenode.FSClusterStats;
027    import org.apache.hadoop.net.NetworkTopology;
028    import org.apache.hadoop.net.NetworkTopologyWithNodeGroup;
029    import org.apache.hadoop.net.Node;
030    import org.apache.hadoop.net.NodeBase;
031    
032    /** The class is responsible for choosing the desired number of targets
033     * for placing block replicas on environment with node-group layer.
034     * The replica placement strategy is adjusted to:
035     * If the writer is on a datanode, the 1st replica is placed on the local 
036     *     node (or local node-group), otherwise a random datanode. 
037     * The 2nd replica is placed on a datanode that is on a different rack with 1st
038     *     replica node. 
039     * The 3rd replica is placed on a datanode which is on a different node-group
040     *     but the same rack as the second replica node.
041     */
042    public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefault {
043    
044      protected BlockPlacementPolicyWithNodeGroup(Configuration conf,  FSClusterStats stats,
045          NetworkTopology clusterMap, DatanodeManager datanodeManager) {
046        initialize(conf, stats, clusterMap, host2datanodeMap);
047      }
048    
049      protected BlockPlacementPolicyWithNodeGroup() {
050      }
051    
052      public void initialize(Configuration conf,  FSClusterStats stats,
053              NetworkTopology clusterMap, 
054              Host2NodesMap host2datanodeMap) {
055        super.initialize(conf, stats, clusterMap, host2datanodeMap);
056      }
057    
058      /** choose local node of localMachine as the target.
059       * if localMachine is not available, choose a node on the same nodegroup or 
060       * rack instead.
061       * @return the chosen node
062       */
063      @Override
064      protected DatanodeStorageInfo chooseLocalStorage(Node localMachine,
065          Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
066          List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
067          EnumMap<StorageType, Integer> storageTypes, boolean fallbackToLocalRack)
068          throws NotEnoughReplicasException {
069        // if no local machine, randomly choose one node
070        if (localMachine == null)
071          return chooseRandom(NodeBase.ROOT, excludedNodes, 
072              blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes);
073    
074        // otherwise try local machine first
075        if (localMachine instanceof DatanodeDescriptor) {
076          DatanodeDescriptor localDataNode = (DatanodeDescriptor)localMachine;
077          if (excludedNodes.add(localMachine)) { // was not in the excluded list
078            for (Iterator<Map.Entry<StorageType, Integer>> iter = storageTypes
079                .entrySet().iterator(); iter.hasNext(); ) {
080              Map.Entry<StorageType, Integer> entry = iter.next();
081              for (DatanodeStorageInfo localStorage : DFSUtil.shuffle(
082                  localDataNode.getStorageInfos())) {
083                StorageType type = entry.getKey();
084                if (addIfIsGoodTarget(localStorage, excludedNodes, blocksize,
085                    maxNodesPerRack, false, results, avoidStaleNodes, type) >= 0) {
086                  int num = entry.getValue();
087                  if (num == 1) {
088                    iter.remove();
089                  } else {
090                    entry.setValue(num - 1);
091                  }
092                  return localStorage;
093                }
094              }
095            }
096          }
097        }
098    
099        // try a node on local node group
100        DatanodeStorageInfo chosenStorage = chooseLocalNodeGroup(
101            (NetworkTopologyWithNodeGroup)clusterMap, localMachine, excludedNodes, 
102            blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes);
103        if (chosenStorage != null) {
104          return chosenStorage;
105        }
106    
107        if (!fallbackToLocalRack) {
108          return null;
109        }
110        // try a node on local rack
111        return chooseLocalRack(localMachine, excludedNodes, 
112            blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes);
113      }
114    
115      /** @return the node of the second replica */
116      private static DatanodeDescriptor secondNode(Node localMachine,
117          List<DatanodeStorageInfo> results) {
118        // find the second replica
119        for(DatanodeStorageInfo nextStorage : results) {
120          DatanodeDescriptor nextNode = nextStorage.getDatanodeDescriptor();
121          if (nextNode != localMachine) {
122            return nextNode;
123          }
124        }
125        return null;
126      }
127    
128      @Override
129      protected DatanodeStorageInfo chooseLocalRack(Node localMachine,
130          Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
131          List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
132          EnumMap<StorageType, Integer> storageTypes) throws
133          NotEnoughReplicasException {
134        // no local machine, so choose a random machine
135        if (localMachine == null) {
136          return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
137              maxNodesPerRack, results, avoidStaleNodes, storageTypes);
138        }
139    
140        // choose one from the local rack, but off-nodegroup
141        try {
142          final String scope = NetworkTopology.getFirstHalf(localMachine.getNetworkLocation());
143          return chooseRandom(scope, excludedNodes, blocksize, maxNodesPerRack,
144              results, avoidStaleNodes, storageTypes);
145        } catch (NotEnoughReplicasException e1) {
146          // find the second replica
147          final DatanodeDescriptor newLocal = secondNode(localMachine, results);
148          if (newLocal != null) {
149            try {
150              return chooseRandom(
151                  clusterMap.getRack(newLocal.getNetworkLocation()), excludedNodes,
152                  blocksize, maxNodesPerRack, results, avoidStaleNodes,
153                  storageTypes);
154            } catch(NotEnoughReplicasException e2) {
155              //otherwise randomly choose one from the network
156              return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
157                  maxNodesPerRack, results, avoidStaleNodes, storageTypes);
158            }
159          } else {
160            //otherwise randomly choose one from the network
161            return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
162                maxNodesPerRack, results, avoidStaleNodes, storageTypes);
163          }
164        }
165      }
166    
167      /**
168       * {@inheritDoc}
169       */
170      @Override
171      protected void chooseRemoteRack(int numOfReplicas,
172          DatanodeDescriptor localMachine, Set<Node> excludedNodes,
173          long blocksize, int maxReplicasPerRack, List<DatanodeStorageInfo> results,
174          boolean avoidStaleNodes, EnumMap<StorageType, Integer> storageTypes)
175          throws NotEnoughReplicasException {
176        int oldNumOfReplicas = results.size();
177    
178        final String rackLocation = NetworkTopology.getFirstHalf(
179            localMachine.getNetworkLocation());
180        try {
181          // randomly choose from remote racks
182          chooseRandom(numOfReplicas, "~" + rackLocation, excludedNodes, blocksize,
183              maxReplicasPerRack, results, avoidStaleNodes, storageTypes);
184        } catch (NotEnoughReplicasException e) {
185          // fall back to the local rack
186          chooseRandom(numOfReplicas - (results.size() - oldNumOfReplicas),
187              rackLocation, excludedNodes, blocksize,
188              maxReplicasPerRack, results, avoidStaleNodes, storageTypes);
189        }
190      }
191    
192      /* choose one node from the nodegroup that <i>localMachine</i> is on.
193       * if no such node is available, choose one node from the nodegroup where
194       * a second replica is on.
195       * if still no such node is available, choose a random node in the cluster.
196       * @return the chosen node
197       */
198      private DatanodeStorageInfo chooseLocalNodeGroup(
199          NetworkTopologyWithNodeGroup clusterMap, Node localMachine,
200          Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
201          List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
202          EnumMap<StorageType, Integer> storageTypes) throws
203          NotEnoughReplicasException {
204        // no local machine, so choose a random machine
205        if (localMachine == null) {
206          return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
207              maxNodesPerRack, results, avoidStaleNodes, storageTypes);
208        }
209    
210        // choose one from the local node group
211        try {
212          return chooseRandom(
213              clusterMap.getNodeGroup(localMachine.getNetworkLocation()),
214              excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes,
215              storageTypes);
216        } catch (NotEnoughReplicasException e1) {
217          final DatanodeDescriptor newLocal = secondNode(localMachine, results);
218          if (newLocal != null) {
219            try {
220              return chooseRandom(
221                  clusterMap.getNodeGroup(newLocal.getNetworkLocation()),
222                  excludedNodes, blocksize, maxNodesPerRack, results,
223                  avoidStaleNodes, storageTypes);
224            } catch(NotEnoughReplicasException e2) {
225              //otherwise randomly choose one from the network
226              return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
227                  maxNodesPerRack, results, avoidStaleNodes, storageTypes);
228            }
229          } else {
230            //otherwise randomly choose one from the network
231            return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
232                maxNodesPerRack, results, avoidStaleNodes, storageTypes);
233          }
234        }
235      }
236    
237      @Override
238      protected String getRack(final DatanodeInfo cur) {
239        String nodeGroupString = cur.getNetworkLocation();
240        return NetworkTopology.getFirstHalf(nodeGroupString);
241      }
242      
243      /**
244       * Find other nodes in the same nodegroup of <i>localMachine</i> and add them
245       * into <i>excludeNodes</i> as replica should not be duplicated for nodes 
246       * within the same nodegroup
247       * @return number of new excluded nodes
248       */
249      @Override
250      protected int addToExcludedNodes(DatanodeDescriptor chosenNode,
251          Set<Node> excludedNodes) {
252        int countOfExcludedNodes = 0;
253        String nodeGroupScope = chosenNode.getNetworkLocation();
254        List<Node> leafNodes = clusterMap.getLeaves(nodeGroupScope);
255        for (Node leafNode : leafNodes) {
256          if (excludedNodes.add(leafNode)) {
257            // not a existing node in excludedNodes
258            countOfExcludedNodes++;
259          }
260        }
261        
262        countOfExcludedNodes += addDependentNodesToExcludedNodes(
263            chosenNode, excludedNodes);
264        return countOfExcludedNodes;
265      }
266      
267      /**
268       * Add all nodes from a dependent nodes list to excludedNodes.
269       * @return number of new excluded nodes
270       */
271      private int addDependentNodesToExcludedNodes(DatanodeDescriptor chosenNode,
272          Set<Node> excludedNodes) {
273        if (this.host2datanodeMap == null) {
274          return 0;
275        }
276        int countOfExcludedNodes = 0;
277        for(String hostname : chosenNode.getDependentHostNames()) {
278          DatanodeDescriptor node =
279              this.host2datanodeMap.getDataNodeByHostName(hostname);
280          if(node!=null) {
281            if (excludedNodes.add(node)) {
282              countOfExcludedNodes++;
283            }
284          } else {
285            LOG.warn("Not able to find datanode " + hostname
286                + " which has dependency with datanode "
287                + chosenNode.getHostName());
288          }
289        }
290        
291        return countOfExcludedNodes;
292      }
293    
294      /**
295       * Pick up replica node set for deleting replica as over-replicated. 
296       * First set contains replica nodes on rack with more than one
297       * replica while second set contains remaining replica nodes.
298       * If first is not empty, divide first set into two subsets:
299       *   moreThanOne contains nodes on nodegroup with more than one replica
300       *   exactlyOne contains the remaining nodes in first set
301       * then pickup priSet if not empty.
302       * If first is empty, then pick second.
303       */
304      @Override
305      public Collection<DatanodeStorageInfo> pickupReplicaSet(
306          Collection<DatanodeStorageInfo> first,
307          Collection<DatanodeStorageInfo> second) {
308        // If no replica within same rack, return directly.
309        if (first.isEmpty()) {
310          return second;
311        }
312        // Split data nodes in the first set into two sets, 
313        // moreThanOne contains nodes on nodegroup with more than one replica
314        // exactlyOne contains the remaining nodes
315        Map<String, List<DatanodeStorageInfo>> nodeGroupMap = 
316            new HashMap<String, List<DatanodeStorageInfo>>();
317        
318        for(DatanodeStorageInfo storage : first) {
319          final String nodeGroupName = NetworkTopology.getLastHalf(
320              storage.getDatanodeDescriptor().getNetworkLocation());
321          List<DatanodeStorageInfo> storageList = nodeGroupMap.get(nodeGroupName);
322          if (storageList == null) {
323            storageList = new ArrayList<DatanodeStorageInfo>();
324            nodeGroupMap.put(nodeGroupName, storageList);
325          }
326          storageList.add(storage);
327        }
328        
329        final List<DatanodeStorageInfo> moreThanOne = new ArrayList<DatanodeStorageInfo>();
330        final List<DatanodeStorageInfo> exactlyOne = new ArrayList<DatanodeStorageInfo>();
331        // split nodes into two sets
332        for(List<DatanodeStorageInfo> datanodeList : nodeGroupMap.values()) {
333          if (datanodeList.size() == 1 ) {
334            // exactlyOne contains nodes on nodegroup with exactly one replica
335            exactlyOne.add(datanodeList.get(0));
336          } else {
337            // moreThanOne contains nodes on nodegroup with more than one replica
338            moreThanOne.addAll(datanodeList);
339          }
340        }
341        
342        return moreThanOne.isEmpty()? exactlyOne : moreThanOne;
343      }
344      
345    }