Tokenizer.java
/*
* Copyright (C) 2003-2011 eXo Platform SAS.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.exoplatform.social.common.xmlprocessor;
import java.util.ArrayList;
import java.util.List;
/**
* XML scanner/tokenizer
*
* @author Ly Minh Phuong - http://phuonglm.net
*/
public class Tokenizer {
  /** Character sequence that opens a comment. */
  private static final String COMMENT_START = "<!--";

  /** Character sequence that closes a comment. */
  private static final String COMMENT_END = "-->";

  /** Character sequence that closes an ordinary tag. */
  private static final String TAG_END = ">";

  /**
   * Splits a markup string into a flat list of tokens, in document order.
   * <p>
   * Three kinds of token are produced:
   * <ul>
   * <li>a comment: everything from {@code <!--} up to and including the next {@code -->};</li>
   * <li>a tag: everything from {@code <} up to and including the next {@code >};</li>
   * <li>a text run: the characters found between two of the above.</li>
   * </ul>
   * A comment or tag that is never closed extends to the end of the input. Empty text runs
   * are not emitted, so concatenating the returned tokens yields the original input.
   * <p>
   * The scan is purely lexical: a tag ends at the first {@code >} that follows its
   * {@code <}, even if that character sits inside a quoted attribute value.
   *
   * @param html the markup to split; {@code null} is treated as empty input
   * @return a new list of tokens; empty when {@code html} is {@code null} or empty
   */
  public static List<String> tokenize(String html) {
    List<String> tokens = new ArrayList<String>();
    if (html == null) {
      return tokens;
    }

    int len = html.length();
    int pos = 0;
    // Index of the first character of the text run currently being scanned.
    int textStart = 0;

    while (pos < len) {
      if (html.charAt(pos) != '<') {
        // Plain text: just advance, the run is sliced out in one go later.
        pos++;
        continue;
      }

      // Markup begins here: flush the text accumulated in front of it.
      if (pos > textStart) {
        tokens.add(html.substring(textStart, pos));
      }

      // A comment may contain '>' characters, so it needs its own end marker.
      String endMarker = html.startsWith(COMMENT_START, pos) ? COMMENT_END : TAG_END;
      int end = moveToMarkerEnd(pos, endMarker, html);
      tokens.add(html.substring(pos, end));

      pos = end;
      textStart = end;
    }

    // Trailing text after the last tag or comment.
    if (len > textStart) {
      tokens.add(html.substring(textStart));
    }
    return tokens;
  }

  /**
   * Finds the index just past the next occurrence of a marker.
   *
   * @param pos index at which the search starts
   * @param marker the character sequence to look for
   * @param s the string to search in
   * @return the index immediately after the first occurrence of {@code marker} at or after
   *         {@code pos}, or {@code s.length()} when the marker does not occur
   */
  private static int moveToMarkerEnd(int pos, String marker, String s) {
    int found = s.indexOf(marker, pos);
    return found > -1 ? found + marker.length() : s.length();
  }
}