HTMLLinkExtractor.java
/***************************************************************************
* Copyright (C) 2003-2009 eXo Platform SAS.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Affero General Public License
* as published by the Free Software Foundation; either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see<http://www.gnu.org/licenses/>.
*
**************************************************************************/
package org.exoplatform.services.wcm.link;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Extract hyper links from HTML file
*/
public class HTMLLinkExtractor {
private Pattern patternTag, patternLink;
private Matcher matcherTag, matcherLink;
/*
*
( #start of group #1
?i # all checking are case insensitive
) #end of group #1
<a #start with "<a"
( # start of group #2
[^>]+ # anything except (">"), at least one character
) # end of group #2
> # follow by ">"
(.+?) # match anything
</a> # end with "</a>
*/
private static final String HTML_A_TAG_PATTERN = "(?i)<a([^>]+)>(.+?)</a>";
/**
*
\s* #can start with whitespace
(?i) # all checking are case insensitive
href # follow by "href" word
\s*=\s* # allows spaces on either side of the equal sign,
( # start of group #1
"([^"]*") # allow string with double quotes enclosed - "string"
| # ..or
'[^']*' # allow string with single quotes enclosed - 'string'
| # ..or
([^'">]+) # can't contains one single quotes, double quotes ">"
) # end of group #1
*/
private static final String HTML_A_HREF_TAG_PATTERN = "\\s*(?i)href\\s*=\\s*(\"([^\"]*\")|'[^']*'|([^'\">\\s]+))";
public HTMLLinkExtractor() {
patternTag = Pattern.compile(HTML_A_TAG_PATTERN);
patternLink = Pattern.compile(HTML_A_HREF_TAG_PATTERN);
}
/**
* Validate html with regular expression
* @param html html content for validation
* @return Vector links and link text
*/
public List<HtmlLink> grabHTMLLinks(String html){
List<HtmlLink> result = new ArrayList<HtmlLink>();
matcherTag = patternTag.matcher(html);
while(matcherTag.find()){
String href = matcherTag.group(1); //href
matcherLink = patternLink.matcher(href);
while(matcherLink.find()){
String link = matcherLink.group(1); //link
if(link.startsWith("\"") || link.startsWith("\'"))
link = link.substring(1, link.length() - 1);
result.add(new HtmlLink(link));
}
}
return result;
}
class HtmlLink {
String link;
HtmlLink(String link){
this.link = link;
}
@Override
public String toString() {
return this.link;
}
}
}