001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.hdfs.server.blockmanagement;
019
020 import java.util.*;
021
022 import org.apache.hadoop.conf.Configuration;
023 import org.apache.hadoop.hdfs.DFSUtil;
024 import org.apache.hadoop.hdfs.StorageType;
025 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
026 import org.apache.hadoop.hdfs.server.namenode.FSClusterStats;
027 import org.apache.hadoop.net.NetworkTopology;
028 import org.apache.hadoop.net.NetworkTopologyWithNodeGroup;
029 import org.apache.hadoop.net.Node;
030 import org.apache.hadoop.net.NodeBase;
031
/** The class is responsible for choosing the desired number of targets
 * for placing block replicas in an environment with a node-group layer.
 * The replica placement strategy is adjusted to:
 * If the writer is on a datanode, the 1st replica is placed on the local
 * node (or local node-group), otherwise on a random datanode.
 * The 2nd replica is placed on a datanode that is on a different rack from the
 * 1st replica node.
 * The 3rd replica is placed on a datanode which is on a different node-group
 * but the same rack as the second replica node.
 */
public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefault {

  /**
   * Create and initialize the policy in one step.
   * <p>
   * NOTE(review): the {@code datanodeManager} parameter is never used; the
   * inherited {@code host2datanodeMap} field (still {@code null} before
   * {@link #initialize} has run) is passed to {@code initialize} instead.
   * Confirm this is intended rather than
   * {@code datanodeManager}'s host-to-datanode map.
   */
  protected BlockPlacementPolicyWithNodeGroup(Configuration conf, FSClusterStats stats,
      NetworkTopology clusterMap, DatanodeManager datanodeManager) {
    initialize(conf, stats, clusterMap, host2datanodeMap);
  }

  /** No-arg constructor; {@link #initialize} must be called before use. */
  protected BlockPlacementPolicyWithNodeGroup() {
  }

  /** Initialize the policy by delegating to the default implementation. */
  public void initialize(Configuration conf, FSClusterStats stats,
      NetworkTopology clusterMap,
      Host2NodesMap host2datanodeMap) {
    super.initialize(conf, stats, clusterMap, host2datanodeMap);
  }

  /**
   * Choose a storage on the local node of <i>localMachine</i> as the target.
   * If the local node is not available, try a node in the same node-group,
   * and then — only when {@code fallbackToLocalRack} is true — a node on the
   * same rack.
   *
   * @param localMachine the writer's node; null means choose randomly from
   *        the whole cluster
   * @param excludedNodes nodes that must not be chosen; the local node is
   *        added to this set as a side effect
   * @param fallbackToLocalRack whether to fall back to the local rack when
   *        neither the local node nor its node-group yields a target
   * @return the chosen storage, or null when fallback is disabled and no
   *         local or node-group target was found
   */
  @Override
  protected DatanodeStorageInfo chooseLocalStorage(Node localMachine,
      Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
      List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
      EnumMap<StorageType, Integer> storageTypes, boolean fallbackToLocalRack)
      throws NotEnoughReplicasException {
    // if no local machine, randomly choose one node
    if (localMachine == null)
      return chooseRandom(NodeBase.ROOT, excludedNodes,
          blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes);

    // otherwise try local machine first
    if (localMachine instanceof DatanodeDescriptor) {
      DatanodeDescriptor localDataNode = (DatanodeDescriptor)localMachine;
      if (excludedNodes.add(localMachine)) { // was not in the excluded list
        // Try each requested storage type against the node's storages
        // (shuffled so repeated calls spread load across storages); the
        // first acceptable match is consumed from storageTypes.
        for (Iterator<Map.Entry<StorageType, Integer>> iter = storageTypes
            .entrySet().iterator(); iter.hasNext(); ) {
          Map.Entry<StorageType, Integer> entry = iter.next();
          for (DatanodeStorageInfo localStorage : DFSUtil.shuffle(
              localDataNode.getStorageInfos())) {
            StorageType type = entry.getKey();
            if (addIfIsGoodTarget(localStorage, excludedNodes, blocksize,
                maxNodesPerRack, false, results, avoidStaleNodes, type) >= 0) {
              // One replica of this type has been placed: decrement the
              // remaining count, dropping the entry when it reaches zero.
              int num = entry.getValue();
              if (num == 1) {
                iter.remove();
              } else {
                entry.setValue(num - 1);
              }
              return localStorage;
            }
          }
        }
      }
    }

    // try a node on local node group
    DatanodeStorageInfo chosenStorage = chooseLocalNodeGroup(
        (NetworkTopologyWithNodeGroup)clusterMap, localMachine, excludedNodes,
        blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes);
    if (chosenStorage != null) {
      return chosenStorage;
    }

    if (!fallbackToLocalRack) {
      return null;
    }
    // try a node on local rack
    return chooseLocalRack(localMachine, excludedNodes,
        blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes);
  }

  /**
   * @return the node of the first replica in <i>results</i> that is not
   *         <i>localMachine</i> (i.e. the "second replica" node), or null
   *         if every chosen replica is on <i>localMachine</i>
   */
  private static DatanodeDescriptor secondNode(Node localMachine,
      List<DatanodeStorageInfo> results) {
    // find the second replica
    for(DatanodeStorageInfo nextStorage : results) {
      DatanodeDescriptor nextNode = nextStorage.getDatanodeDescriptor();
      if (nextNode != localMachine) {
        return nextNode;
      }
    }
    return null;
  }

  /**
   * Choose a node on the same rack as <i>localMachine</i>. Here the rack
   * scope is the first half of the network location, i.e. the location with
   * the node-group suffix stripped. On failure, retry on the rack of the
   * second replica; as a last resort choose randomly from the whole cluster.
   */
  @Override
  protected DatanodeStorageInfo chooseLocalRack(Node localMachine,
      Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
      List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
      EnumMap<StorageType, Integer> storageTypes) throws
      NotEnoughReplicasException {
    // no local machine, so choose a random machine
    if (localMachine == null) {
      return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
          maxNodesPerRack, results, avoidStaleNodes, storageTypes);
    }

    // choose one from the local rack, but off-nodegroup
    // (nodes sharing the writer's node-group are expected to already be in
    // excludedNodes via addToExcludedNodes)
    try {
      final String scope = NetworkTopology.getFirstHalf(localMachine.getNetworkLocation());
      return chooseRandom(scope, excludedNodes, blocksize, maxNodesPerRack,
          results, avoidStaleNodes, storageTypes);
    } catch (NotEnoughReplicasException e1) {
      // find the second replica
      final DatanodeDescriptor newLocal = secondNode(localMachine, results);
      if (newLocal != null) {
        try {
          // retry on the rack of the second replica
          return chooseRandom(
              clusterMap.getRack(newLocal.getNetworkLocation()), excludedNodes,
              blocksize, maxNodesPerRack, results, avoidStaleNodes,
              storageTypes);
        } catch(NotEnoughReplicasException e2) {
          //otherwise randomly choose one from the network
          return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
              maxNodesPerRack, results, avoidStaleNodes, storageTypes);
        }
      } else {
        //otherwise randomly choose one from the network
        return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
            maxNodesPerRack, results, avoidStaleNodes, storageTypes);
      }
    }
  }

  /**
   * {@inheritDoc}
   * <p>
   * The "rack" here is the first half of the network location (node-group
   * suffix stripped); the {@code "~"} prefix asks chooseRandom to pick from
   * everywhere outside that scope.
   */
  @Override
  protected void chooseRemoteRack(int numOfReplicas,
      DatanodeDescriptor localMachine, Set<Node> excludedNodes,
      long blocksize, int maxReplicasPerRack, List<DatanodeStorageInfo> results,
      boolean avoidStaleNodes, EnumMap<StorageType, Integer> storageTypes)
      throws NotEnoughReplicasException {
    int oldNumOfReplicas = results.size();

    final String rackLocation = NetworkTopology.getFirstHalf(
        localMachine.getNetworkLocation());
    try {
      // randomly choose from remote racks
      chooseRandom(numOfReplicas, "~" + rackLocation, excludedNodes, blocksize,
          maxReplicasPerRack, results, avoidStaleNodes, storageTypes);
    } catch (NotEnoughReplicasException e) {
      // fall back to the local rack, asking only for the replicas still
      // missing after the partial success above
      chooseRandom(numOfReplicas - (results.size() - oldNumOfReplicas),
          rackLocation, excludedNodes, blocksize,
          maxReplicasPerRack, results, avoidStaleNodes, storageTypes);
    }
  }

  /**
   * Choose one node from the node-group that <i>localMachine</i> is on.
   * If no such node is available, choose one node from the node-group where
   * the second replica is. If still no such node is available, choose a
   * random node in the cluster.
   * @return the chosen node
   */
  private DatanodeStorageInfo chooseLocalNodeGroup(
      NetworkTopologyWithNodeGroup clusterMap, Node localMachine,
      Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
      List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
      EnumMap<StorageType, Integer> storageTypes) throws
      NotEnoughReplicasException {
    // no local machine, so choose a random machine
    if (localMachine == null) {
      return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
          maxNodesPerRack, results, avoidStaleNodes, storageTypes);
    }

    // choose one from the local node group
    try {
      return chooseRandom(
          clusterMap.getNodeGroup(localMachine.getNetworkLocation()),
          excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes,
          storageTypes);
    } catch (NotEnoughReplicasException e1) {
      // fall back to the node-group of the second replica
      final DatanodeDescriptor newLocal = secondNode(localMachine, results);
      if (newLocal != null) {
        try {
          return chooseRandom(
              clusterMap.getNodeGroup(newLocal.getNetworkLocation()),
              excludedNodes, blocksize, maxNodesPerRack, results,
              avoidStaleNodes, storageTypes);
        } catch(NotEnoughReplicasException e2) {
          //otherwise randomly choose one from the network
          return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
              maxNodesPerRack, results, avoidStaleNodes, storageTypes);
        }
      } else {
        //otherwise randomly choose one from the network
        return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
            maxNodesPerRack, results, avoidStaleNodes, storageTypes);
      }
    }
  }

  /**
   * @return the rack of <i>cur</i>: the first half of its network location,
   *         i.e. with the trailing node-group component stripped
   */
  @Override
  protected String getRack(final DatanodeInfo cur) {
    String nodeGroupString = cur.getNetworkLocation();
    return NetworkTopology.getFirstHalf(nodeGroupString);
  }

  /**
   * Find other nodes in the same node-group of <i>localMachine</i> and add
   * them into <i>excludedNodes</i>, as replicas should not be duplicated on
   * nodes within the same node-group. Dependent nodes (from the host
   * dependency list) are excluded as well.
   * @return number of newly excluded nodes
   */
  @Override
  protected int addToExcludedNodes(DatanodeDescriptor chosenNode,
      Set<Node> excludedNodes) {
    int countOfExcludedNodes = 0;
    // exclude every leaf node under the chosen node's node-group scope
    String nodeGroupScope = chosenNode.getNetworkLocation();
    List<Node> leafNodes = clusterMap.getLeaves(nodeGroupScope);
    for (Node leafNode : leafNodes) {
      if (excludedNodes.add(leafNode)) {
        // not an existing node in excludedNodes
        countOfExcludedNodes++;
      }
    }

    countOfExcludedNodes += addDependentNodesToExcludedNodes(
        chosenNode, excludedNodes);
    return countOfExcludedNodes;
  }

  /**
   * Add all nodes from the chosen node's dependent-hosts list to
   * excludedNodes. A no-op when the host-to-datanode map is unavailable.
   * @return number of newly excluded nodes
   */
  private int addDependentNodesToExcludedNodes(DatanodeDescriptor chosenNode,
      Set<Node> excludedNodes) {
    if (this.host2datanodeMap == null) {
      return 0;
    }
    int countOfExcludedNodes = 0;
    for(String hostname : chosenNode.getDependentHostNames()) {
      DatanodeDescriptor node =
          this.host2datanodeMap.getDataNodeByHostName(hostname);
      if(node!=null) {
        if (excludedNodes.add(node)) {
          countOfExcludedNodes++;
        }
      } else {
        // dependency declared for a host we cannot resolve; warn and skip
        LOG.warn("Not able to find datanode " + hostname
            + " which has dependency with datanode "
            + chosenNode.getHostName());
      }
    }

    return countOfExcludedNodes;
  }

  /**
   * Pick the replica node set for deleting a replica as over-replicated.
   * The first set contains replica nodes on a rack with more than one
   * replica, while the second set contains the remaining replica nodes.
   * If the first set is not empty, divide it into two subsets:
   * moreThanOne contains nodes on a node-group with more than one replica,
   * exactlyOne contains the remaining nodes of the first set;
   * then return moreThanOne if it is non-empty, otherwise exactlyOne.
   * If the first set is empty, return the second.
   */
  @Override
  public Collection<DatanodeStorageInfo> pickupReplicaSet(
      Collection<DatanodeStorageInfo> first,
      Collection<DatanodeStorageInfo> second) {
    // If no replica within same rack, return directly.
    if (first.isEmpty()) {
      return second;
    }
    // Split data nodes in the first set into two sets,
    // moreThanOne contains nodes on nodegroup with more than one replica
    // exactlyOne contains the remaining nodes
    // Group the storages by node-group name (the last half of the network
    // location).
    Map<String, List<DatanodeStorageInfo>> nodeGroupMap =
        new HashMap<String, List<DatanodeStorageInfo>>();

    for(DatanodeStorageInfo storage : first) {
      final String nodeGroupName = NetworkTopology.getLastHalf(
          storage.getDatanodeDescriptor().getNetworkLocation());
      List<DatanodeStorageInfo> storageList = nodeGroupMap.get(nodeGroupName);
      if (storageList == null) {
        storageList = new ArrayList<DatanodeStorageInfo>();
        nodeGroupMap.put(nodeGroupName, storageList);
      }
      storageList.add(storage);
    }

    final List<DatanodeStorageInfo> moreThanOne = new ArrayList<DatanodeStorageInfo>();
    final List<DatanodeStorageInfo> exactlyOne = new ArrayList<DatanodeStorageInfo>();
    // split nodes into two sets
    for(List<DatanodeStorageInfo> datanodeList : nodeGroupMap.values()) {
      if (datanodeList.size() == 1 ) {
        // exactlyOne contains nodes on nodegroup with exactly one replica
        exactlyOne.add(datanodeList.get(0));
      } else {
        // moreThanOne contains nodes on nodegroup with more than one replica
        moreThanOne.addAll(datanodeList);
      }
    }

    // Prefer deleting from a node-group that already holds several replicas.
    return moreThanOne.isEmpty()? exactlyOne : moreThanOne;
  }

}