View Javadoc
1   /***************************************************************************
2    * Copyright (C) 2003-2009 eXo Platform SAS.
3    *
4    * This program is free software; you can redistribute it and/or
5    * modify it under the terms of the GNU Affero General Public License
6    * as published by the Free Software Foundation; either version 3
7    * of the License, or (at your option) any later version.
8    *
9    * This program is distributed in the hope that it will be useful,
10   * but WITHOUT ANY WARRANTY; without even the implied warranty of
11   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   * GNU General Public License for more details.
13   *
14   * You should have received a copy of the GNU General Public License
15   * along with this program; if not, see<http://www.gnu.org/licenses/>.
16   *
17   **************************************************************************/
18  package org.exoplatform.services.wcm.link;
19  
20  import java.util.ArrayList;
21  import java.util.List;
22  import java.util.regex.Matcher;
23  import java.util.regex.Pattern;
24  
/**
 * Extracts hyperlink targets (the {@code href} attribute values of {@code <a>} tags)
 * from HTML text using regular expressions.
 *
 * <p>This is a lightweight textual scan, not an HTML parser: every {@code <a ...>}
 * opening tag found in the input string is inspected, wherever it occurs.
 *
 * <p>The compiled patterns are shared constants and all matcher state is local to
 * {@link #grabHTMLLinks(String)}, so an instance holds no mutable state.
 */
public class HTMLLinkExtractor {

  /*
   * Opening anchor tag (matched case-insensitively):
   *
   *   <a          literal "<a"
   *   \s+         at least one whitespace character, so that other tags whose name
   *               merely starts with "a" (<abbr>, <area>, <article>, ...) are not
   *               mistaken for anchors
   *   ([^>]*)     group #1: the attribute list, everything up to the next ">"
   *   >           end of the opening tag
   *
   * Only the opening tag is matched. The link text and the closing tag are not
   * needed to read the href, and requiring them made anchors with empty or
   * multi-line text invisible to the extractor.
   */
  private static final String HTML_A_TAG_PATTERN = "<a\\s+([^>]*)>";

  /*
   * href attribute inside an attribute list (matched case-insensitively):
   *
   *   (?<![\w-])    not preceded by a word character or "-", so attributes that
   *                 only end in "href" (data-href, ng-href, ...) are ignored
   *   href          the attribute name
   *   \s*=\s*       equal sign, optionally surrounded by whitespace
   *   (?:           one of:
   *     "([^"]*)"     group #1: value enclosed in double quotes
   *     |
   *     '([^']*)'     group #2: value enclosed in single quotes
   *     |
   *     ([^'">\s]+)   group #3: unquoted value (no quotes, ">" or whitespace)
   *   )
   */
  private static final String HTML_A_HREF_TAG_PATTERN =
      "(?<![\\w-])href\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^'\">\\s]+))";

  /** Number of alternative value groups in {@link #HTML_A_HREF_TAG_PATTERN}. */
  private static final int HREF_VALUE_GROUPS = 3;

  // Patterns are immutable, so they are compiled once and shared by all instances.
  private static final Pattern PATTERN_TAG =
      Pattern.compile(HTML_A_TAG_PATTERN, Pattern.CASE_INSENSITIVE);
  private static final Pattern PATTERN_LINK =
      Pattern.compile(HTML_A_HREF_TAG_PATTERN, Pattern.CASE_INSENSITIVE);

  /**
   * Creates an extractor. Instances carry no state of their own.
   */
  public HTMLLinkExtractor() {
  }

  /**
   * Collects the {@code href} values of all {@code <a>} tags found in the given HTML.
   *
   * <p>Values are returned in document order, with surrounding quotes removed and
   * otherwise unmodified (no trimming, decoding or URL resolution). An empty
   * {@code href=""} yields a link with an empty string.
   *
   * @param html HTML content to scan; may be {@code null}
   * @return the links found, never {@code null}; empty if {@code html} is
   *         {@code null} or contains no anchor with an {@code href}
   */
  public List<HtmlLink> grabHTMLLinks(String html) {
    List<HtmlLink> result = new ArrayList<HtmlLink>();
    if (html == null) {
      return result;
    }
    Matcher tagMatcher = PATTERN_TAG.matcher(html);
    while (tagMatcher.find()) {
      String attributes = tagMatcher.group(1);
      Matcher linkMatcher = PATTERN_LINK.matcher(attributes);
      while (linkMatcher.find()) {
        result.add(new HtmlLink(hrefValue(linkMatcher)));
      }
    }
    return result;
  }

  /**
   * Returns the href value captured by the current match, i.e. the content of
   * whichever of the three alternative value groups participated in the match.
   */
  private static String hrefValue(Matcher linkMatcher) {
    for (int group = 1; group <= HREF_VALUE_GROUPS; group++) {
      String value = linkMatcher.group(group);
      if (value != null) {
        return value;
      }
    }
    // Unreachable: a successful match always fills exactly one alternative.
    return "";
  }

  /**
   * A single extracted link; {@link #toString()} returns the raw {@code href} value.
   */
  class HtmlLink {
    /** The {@code href} value as written in the HTML, without enclosing quotes. */
    String link;

    HtmlLink(String link) {
      this.link = link;
    }

    @Override
    public String toString() {
      return this.link;
    }
  }
}