001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 package org.apache.hadoop.hdfs.util;
020
021 import org.apache.hadoop.classification.InterfaceAudience;
022 import org.apache.hadoop.classification.InterfaceStability;
023 import org.xml.sax.ContentHandler;
024 import org.xml.sax.SAXException;
025 import org.xml.sax.helpers.AttributesImpl;
026
027 import java.util.LinkedList;
028 import java.util.List;
029 import java.util.Map;
030 import java.util.TreeMap;
031
032 /**
033 * General xml utilities.
034 *
035 */
036 @InterfaceAudience.Private
037 @InterfaceStability.Unstable
038 public class XMLUtils {
039 /**
040 * Exception that reflects an invalid XML document.
041 */
042 static public class InvalidXmlException extends RuntimeException {
043 private static final long serialVersionUID = 1L;
044 public InvalidXmlException(String s) {
045 super(s);
046 }
047 }
048
049 /**
050 * Exception that reflects a string that cannot be unmangled.
051 */
052 public static class UnmanglingError extends RuntimeException {
053 private static final long serialVersionUID = 1L;
054
055 public UnmanglingError(String str, Exception e) {
056 super(str, e);
057 }
058
059 public UnmanglingError(String str) {
060 super(str);
061 }
062 }
063
064
065 /**
066 * Given a code point, determine if it should be mangled before being
067 * represented in an XML document.
068 *
069 * Any code point that isn't valid in XML must be mangled.
070 * See http://en.wikipedia.org/wiki/Valid_characters_in_XML for a
071 * quick reference, or the w3 standard for the authoritative reference.
072 *
073 * @param cp The code point
074 * @return True if the code point should be mangled
075 */
076 private static boolean codePointMustBeMangled(int cp) {
077 if (cp < 0x20) {
078 return ((cp != 0x9) && (cp != 0xa) && (cp != 0xd));
079 } else if ((0xd7ff < cp) && (cp < 0xe000)) {
080 return true;
081 } else if ((cp == 0xfffe) || (cp == 0xffff)) {
082 return true;
083 } else if (cp == 0x5c) {
084 // we mangle backslash to simplify decoding... it's
085 // easier if backslashes always begin mangled sequences.
086 return true;
087 }
088 return false;
089 }
090
091 private static final int NUM_SLASH_POSITIONS = 4;
092
093 private static String mangleCodePoint(int cp) {
094 return String.format("\\%0" + NUM_SLASH_POSITIONS + "x;", cp);
095 }
096
097 private static String codePointToEntityRef(int cp) {
098 switch (cp) {
099 case '&':
100 return "&";
101 case '\"':
102 return """;
103 case '\'':
104 return "'";
105 case '<':
106 return "<";
107 case '>':
108 return ">";
109 default:
110 return null;
111 }
112 }
113
114 /**
115 * Mangle a string so that it can be represented in an XML document.
116 *
117 * There are three kinds of code points in XML:
118 * - Those that can be represented normally,
119 * - Those that have to be escaped (for example, & must be represented
120 * as &)
121 * - Those that cannot be represented at all in XML.
122 *
123 * The built-in SAX functions will handle the first two types for us just
124 * fine. However, sometimes we come across a code point of the third type.
125 * In this case, we have to mangle the string in order to represent it at
126 * all. We also mangle backslash to avoid confusing a backslash in the
127 * string with part our escape sequence.
128 *
129 * The encoding used here is as follows: an illegal code point is
130 * represented as '\ABCD;', where ABCD is the hexadecimal value of
131 * the code point.
132 *
133 * @param str The input string.
134 *
135 * @return The mangled string.
136 */
137 public static String mangleXmlString(String str, boolean createEntityRefs) {
138 final StringBuilder bld = new StringBuilder();
139 final int length = str.length();
140 for (int offset = 0; offset < length; ) {
141 final int cp = str.codePointAt(offset);
142 final int len = Character.charCount(cp);
143 if (codePointMustBeMangled(cp)) {
144 bld.append(mangleCodePoint(cp));
145 } else {
146 String entityRef = null;
147 if (createEntityRefs) {
148 entityRef = codePointToEntityRef(cp);
149 }
150 if (entityRef != null) {
151 bld.append(entityRef);
152 } else {
153 for (int i = 0; i < len; i++) {
154 bld.append(str.charAt(offset + i));
155 }
156 }
157 }
158 offset += len;
159 }
160 return bld.toString();
161 }
162
163 /**
164 * Demangle a string from an XML document.
165 * See {@link #mangleXmlString(String, boolean)} for a description of the
166 * mangling format.
167 *
168 * @param str The string to be demangled.
169 *
170 * @return The unmangled string
171 * @throws UnmanglingError if the input is malformed.
172 */
173 public static String unmangleXmlString(String str, boolean decodeEntityRefs)
174 throws UnmanglingError {
175 int slashPosition = -1;
176 String escapedCp = "";
177 StringBuilder bld = new StringBuilder();
178 StringBuilder entityRef = null;
179 for (int i = 0; i < str.length(); i++) {
180 char ch = str.charAt(i);
181 if (entityRef != null) {
182 entityRef.append(ch);
183 if (ch == ';') {
184 String e = entityRef.toString();
185 if (e.equals(""")) {
186 bld.append("\"");
187 } else if (e.equals("'")) {
188 bld.append("\'");
189 } else if (e.equals("&")) {
190 bld.append("&");
191 } else if (e.equals("<")) {
192 bld.append("<");
193 } else if (e.equals(">")) {
194 bld.append(">");
195 } else {
196 throw new UnmanglingError("Unknown entity ref " + e);
197 }
198 entityRef = null;
199 }
200 } else if ((slashPosition >= 0) && (slashPosition < NUM_SLASH_POSITIONS)) {
201 escapedCp += ch;
202 ++slashPosition;
203 } else if (slashPosition == NUM_SLASH_POSITIONS) {
204 if (ch != ';') {
205 throw new UnmanglingError("unterminated code point escape: " +
206 "expected semicolon at end.");
207 }
208 try {
209 bld.appendCodePoint(Integer.parseInt(escapedCp, 16));
210 } catch (NumberFormatException e) {
211 throw new UnmanglingError("error parsing unmangling escape code", e);
212 }
213 escapedCp = "";
214 slashPosition = -1;
215 } else if (ch == '\\') {
216 slashPosition = 0;
217 } else {
218 boolean startingEntityRef = false;
219 if (decodeEntityRefs) {
220 startingEntityRef = (ch == '&');
221 }
222 if (startingEntityRef) {
223 entityRef = new StringBuilder();
224 entityRef.append("&");
225 } else {
226 bld.append(ch);
227 }
228 }
229 }
230 if (entityRef != null) {
231 throw new UnmanglingError("unterminated entity ref starting with " +
232 entityRef.toString());
233 } else if (slashPosition != -1) {
234 throw new UnmanglingError("unterminated code point escape: string " +
235 "broke off in the middle");
236 }
237 return bld.toString();
238 }
239
240 /**
241 * Add a SAX tag with a string inside.
242 *
243 * @param contentHandler the SAX content handler
244 * @param tag the element tag to use
245 * @param val the string to put inside the tag
246 */
247 public static void addSaxString(ContentHandler contentHandler,
248 String tag, String val) throws SAXException {
249 contentHandler.startElement("", "", tag, new AttributesImpl());
250 char c[] = mangleXmlString(val, false).toCharArray();
251 contentHandler.characters(c, 0, c.length);
252 contentHandler.endElement("", "", tag);
253 }
254
255 /**
256 * Represents a bag of key-value pairs encountered during parsing an XML
257 * file.
258 */
259 static public class Stanza {
260 private final TreeMap<String, LinkedList <Stanza > > subtrees;
261
262 /** The unmangled value of this stanza. */
263 private String value;
264
265 public Stanza() {
266 subtrees = new TreeMap<String, LinkedList <Stanza > >();
267 value = "";
268 }
269
270 public void setValue(String value) {
271 this.value = value;
272 }
273
274 public String getValue() {
275 return this.value;
276 }
277
278 /**
279 * Discover if a stanza has a given entry.
280 *
281 * @param name entry to look for
282 *
283 * @return true if the entry was found
284 */
285 public boolean hasChildren(String name) {
286 return subtrees.containsKey(name);
287 }
288
289 /**
290 * Pull an entry from a stanza.
291 *
292 * @param name entry to look for
293 *
294 * @return the entry
295 */
296 public List<Stanza> getChildren(String name) throws InvalidXmlException {
297 LinkedList <Stanza> children = subtrees.get(name);
298 if (children == null) {
299 throw new InvalidXmlException("no entry found for " + name);
300 }
301 return children;
302 }
303
304 /**
305 * Pull a string entry from a stanza.
306 *
307 * @param name entry to look for
308 *
309 * @return the entry
310 */
311 public String getValue(String name) throws InvalidXmlException {
312 String ret = getValueOrNull(name);
313 if (ret == null) {
314 throw new InvalidXmlException("no entry found for " + name);
315 }
316 return ret;
317 }
318
319 /**
320 * Pull a string entry from a stanza, or null.
321 *
322 * @param name entry to look for
323 *
324 * @return the entry, or null if it was not found.
325 */
326 public String getValueOrNull(String name) throws InvalidXmlException {
327 if (!subtrees.containsKey(name)) {
328 return null;
329 }
330 LinkedList <Stanza> l = subtrees.get(name);
331 if (l.size() != 1) {
332 throw new InvalidXmlException("More than one value found for " + name);
333 }
334 return l.get(0).getValue();
335 }
336
337 /**
338 * Add an entry to a stanza.
339 *
340 * @param name name of the entry to add
341 * @param child the entry to add
342 */
343 public void addChild(String name, Stanza child) {
344 LinkedList<Stanza> l;
345 if (subtrees.containsKey(name)) {
346 l = subtrees.get(name);
347 } else {
348 l = new LinkedList<Stanza>();
349 subtrees.put(name, l);
350 }
351 l.add(child);
352 }
353
354 /**
355 * Convert a stanza to a human-readable string.
356 */
357 @Override
358 public String toString() {
359 StringBuilder bld = new StringBuilder();
360 bld.append("{");
361 if (!value.equals("")) {
362 bld.append("\"").append(value).append("\"");
363 }
364 String prefix = "";
365 for (Map.Entry<String, LinkedList <Stanza > > entry :
366 subtrees.entrySet()) {
367 String key = entry.getKey();
368 LinkedList <Stanza > ll = entry.getValue();
369 for (Stanza child : ll) {
370 bld.append(prefix);
371 bld.append("<").append(key).append(">");
372 bld.append(child.toString());
373 prefix = ", ";
374 }
375 }
376 bld.append("}");
377 return bld.toString();
378 }
379 }
380 }