1 /***************************************************************************
2 * Copyright (C) 2003-2009 eXo Platform SAS.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Affero General Public License
6 * as published by the Free Software Foundation; either version 3
7 * of the License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, see<http://www.gnu.org/licenses/>.
16 *
17 **************************************************************************/
18 package org.exoplatform.services.wcm.link;
19
20 import java.util.ArrayList;
21 import java.util.List;
22 import java.util.regex.Matcher;
23 import java.util.regex.Pattern;
24
25 /**
26 * Extract hyper links from HTML file
27 */
28 public class HTMLLinkExtractor {
29 private Pattern patternTag, patternLink;
30 private Matcher matcherTag, matcherLink;
31
32 /*
33 *
34 ( #start of group #1
35 ?i # all checking are case insensitive
36 ) #end of group #1
37 <a #start with "<a"
38 ( # start of group #2
39 [^>]+ # anything except (">"), at least one character
40 ) # end of group #2
41 > # follow by ">"
42 (.+?) # match anything
43 </a> # end with "</a>
44 */
45 private static final String HTML_A_TAG_PATTERN = "(?i)<a([^>]+)>(.+?)</a>";
46
47 /**
48 *
49 \s* #can start with whitespace
50 (?i) # all checking are case insensitive
51 href # follow by "href" word
52 \s*=\s* # allows spaces on either side of the equal sign,
53 ( # start of group #1
54 "([^"]*") # allow string with double quotes enclosed - "string"
55 | # ..or
56 '[^']*' # allow string with single quotes enclosed - 'string'
57 | # ..or
58 ([^'">]+) # can't contains one single quotes, double quotes ">"
59 ) # end of group #1
60 */
61 private static final String HTML_A_HREF_TAG_PATTERN = "\\s*(?i)href\\s*=\\s*(\"([^\"]*\")|'[^']*'|([^'\">\\s]+))";
62
63 public HTMLLinkExtractor() {
64 patternTag = Pattern.compile(HTML_A_TAG_PATTERN);
65 patternLink = Pattern.compile(HTML_A_HREF_TAG_PATTERN);
66 }
67
68 /**
69 * Validate html with regular expression
70 * @param html html content for validation
71 * @return Vector links and link text
72 */
73 public List<HtmlLink> grabHTMLLinks(String html){
74 List<HtmlLink> result = new ArrayList<HtmlLink>();
75 matcherTag = patternTag.matcher(html);
76 while(matcherTag.find()){
77 String href = matcherTag.group(1); //href
78 matcherLink = patternLink.matcher(href);
79 while(matcherLink.find()){
80 String link = matcherLink.group(1); //link
81 if(link.startsWith("\"") || link.startsWith("\'"))
82 link = link.substring(1, link.length() - 1);
83 result.add(new HtmlLink(link));
84 }
85 }
86 return result;
87 }
88
89 class HtmlLink {
90 String link;
91 HtmlLink(String link){
92 this.link = link;
93 }
94
95 @Override
96 public String toString() {
97 return this.link;
98 }
99 }
100 }