View Javadoc
1   /*
2    * Copyright (C) 2003-2009 eXo Platform SAS.
3    *
4    * This program is free software; you can redistribute it and/or
5    * modify it under the terms of the GNU Affero General Public License
6    * as published by the Free Software Foundation; either version 3
7    * of the License, or (at your option) any later version.
8    *
9    * This program is distributed in the hope that it will be useful,
10   * but WITHOUT ANY WARRANTY; without even the implied warranty of
11   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   * GNU General Public License for more details.
13   *
14   * You should have received a copy of the GNU General Public License
15   * along with this program; if not, see<http://www.gnu.org/licenses/>.
16   */
17  package org.exoplatform.social.service.rest;
18  
19  import java.awt.image.BufferedImage;
20  import java.io.IOException;
21  import java.net.MalformedURLException;
22  import java.net.URL;
23  import java.util.ArrayList;
24  import java.util.Collection;
25  import java.util.HashMap;
26  import java.util.List;
27  import java.util.regex.Matcher;
28  import java.util.regex.Pattern;
29  
30  import javax.imageio.ImageIO;
31  import javax.xml.bind.annotation.XmlRootElement;
32  
33  import org.apache.xerces.xni.Augmentations;
34  import org.apache.xerces.xni.QName;
35  import org.apache.xerces.xni.XMLAttributes;
36  import org.apache.xerces.xni.XMLString;
37  import org.apache.xerces.xni.parser.XMLDocumentFilter;
38  import org.apache.xerces.xni.parser.XMLInputSource;
39  import org.apache.xerces.xni.parser.XMLParserConfiguration;
40  import org.cyberneko.html.HTMLConfiguration;
41  import org.cyberneko.html.filters.DefaultFilter;
42  import org.cyberneko.html.filters.ElementRemover;
43  import org.exoplatform.commons.embedder.EmbedderFactory;
44  import org.exoplatform.commons.embedder.ExoMedia;
45  import org.exoplatform.services.log.ExoLogger;
46  import org.exoplatform.services.log.Log;
47  
48  /**
49   * LinkShare - gets preview information of a link including: 
50   * - link
51   * - title 
52   * - description
53   * - images 
54   * 
55   * - media (from popular sites: youtube, vimeo, flickr...) - low priority (NOT IMPLEMENTED YET)
56   *    This should be implemented from the client side to display preview and media player.
57   * <br>
58   * In order to provide this preview, always looks for
59   * the title of the page, a summary of the main content, and an image.
60   * Looks for preview information by the following priority:
61   * <br>
62   * 1.
63   * <pre>
64   * {@code
65   * <meta name="title" content="page_title" />
66   * <meta name="description" content="page_description" />
67   * <link rel="image_src" href="image_source" />
68   * }
69   * </pre>
70   * <br>
71   * 2.
72   * If title not found, then find in {@code <title>} tag.
73   * If description not found, then find first {@code <p>} tag. If no description, then return ""
74   * If img_src not found, then find all images in page with max, min specified width + height
75   * <br>
76   * 3. 
77   * To specify medium, use tag:
78   * <pre>
79   * {@code <meta name="medium" content="medium_type" />}
80   * </pre>
81   * In which: medium_type can be "audio", "image", "video", "news", "blog" and "mult".
82   * <br>
83   * Created by The eXo Platform SEA
84   * TODO: hoatle improvement:
85   * + scans description with MIN_CHARACTER
86   * + handles exception
87   * + make the parser faster, with and without scanning image tags: stop as soon as everything needed is found.
88   * 
89   * @author <a href="mailto:hoatlevan@gmail.com">hoatle</a>
90   * @since Oct 8, 2009
91   * @see "http://activitystrea.ms/"
92   * @see "http://www.facebook.com/share_partners.php"
93   */
94  @XmlRootElement
95  public class LinkShare extends DefaultFilter {
96  
97    private final String MEDIUM_TYPE_NEWS = "news";
98    private final String MEDIUM_TYPE_AUDIO = "audio";
99    private final String MEDIUM_TYPE_IMAGE = "image";
100   private final String MEDIUM_TYPE_VIDEO = "video";
101   private final String MEDIUM_TYPE_BLOG = "blog";
102   private final String MEDIUM_TYPE_MULT = "mult";
103   
104   private static final String IMAGE_MIME_TYPE = "image/";
105   private static final String HTML_MIME_TYPE = "text/html";
106   //default medium_type = "news"
107   private String mediumType = MEDIUM_TYPE_NEWS;
108   private String mediaSrc;
109   private String mediaType;
110   private String mediaTitle;
111   private String mediaArtist;
112   private String mediaAlbum;
113   private String mediaHeight;
114   private String mediaWidth;
115   
116   private static final Log LOG = ExoLogger.getLogger(LinkShare.class);
117   
118   private static final String HTTP_PROTOCOL = "http://";
119   private static final String HTTPS_PROTOCOL = "https://";
120   
121   //min with and height of images to get from img attributes in pixel.
122   // With <img src="img_src" width="55px" height="55px" /> ~ <img src="img_src" width="55" height="55" />
123   //if width="55pt" => with="55" ~ width="55px" (not correct but can be accepted) 
124   private static final int MIN_WIDTH = 55;
125   private static final int MIN_HEIGHT = 55;
126   //maxium description length = 250 characters
127   private static final int MAX_DESCRIPTION = 500;
128   public static final String ACTIVITY_LINK_PREVIEW_ENABLED_PROPERTY = "exo.activity.link.preview.enabled";
129   private static boolean previewEnabled = isPreviewEnabled();
130   //default lang
131   private static String lang = "en";
132   private String   link;
133   private String   title;
134   private String   description;
135   private String imageSrc;
136   private List<String> images;
137   private ExoMedia mediaObject;
138   //Collections of description with key as lang
139   private HashMap<String, String> descriptions;
140   //holds temporary string values from characters() method
141   private String temp;
142   //store all text from the first p tag
143   private StringBuffer pText;
144   //gets all the text from first p tag if no description meta and headEnded = true
145   private boolean firstPTagParsed = false;
146   //If on  p parsing, get all text from temp to pText
147   private boolean onPParsing = false;
148   
149   // to mark the end of the head tag part ~ no more meta tag.
150   // If no more meta tag (headEnded = true) and no description -> get description.
151   private boolean headEnded = false;
152   
153   /**
154    * Uses LinkShare.getInstance(String link) or 
155    * LinkShare.getInstance(String  link, String lang)
156    * for creating LinkShare object
157    */
158   private LinkShare() {
159 
160   }
161   
162   /**
163    * gets provided link
164    * @return provided link
165    */
166   public String getLink() {
167     return this.escapeSpecialCharacters(this.link);
168   }
169 
170   /**
171    * gets title
172    * @return title
173    */
174   public String getTitle() {
175     return this.escapeSpecialCharacters(this.title);
176   }
177   
178   /**
179    * Set new value for title.
180    * @param title
181    */
182   public void setTitle(String title) {
183     this.title = title;
184   }
185 
186   /**
187    * gets description
188    * @return description
189    */
190   public String getDescription() {
191     return this.escapeSpecialCharacters(this.description);
192   }
193 
194   /**
195    * Set new value for description.
196    * @param description
197    */
198   public void setDescription(String description) {
199     this.description = description;
200   }
201 
202   /**
203    * gets images list
204    * @return images
205    */
206   public List<String> getImages() {
207     return images;
208   }
209   
210   /**
211    * gets mediumType
212    * @return mediumType
213    */
214   public String getMediumType() {
215     return mediumType;
216   }
217   
218   
219   /**
220    * gets mediaSrc
221    * @return mediaSrc
222    */
223   public String getMediaSrc() {
224     return mediaSrc;
225   }
226   
227   /**
228    * gets mediaType if provided in:
229    * <pre>
230    *  &lt;meta name="audio_type" content="Content-Type header field" /&gt;
231    * </pre>
232    * or:
233    * <pre>
234    *  &lt;meta name="video_type" content="Content-Type header field" /&gt;
235    * </pre>
236    * @return mediaType
237    */
238   public String getMediaType() {
239     return mediaType;
240   }
241   
242   /**
243    * gets mediaTitle if provided in:
244    * <pre>
245    *  &lt;meta name="audio_title" content="audio_title_name" /&gt;
246    * </pre>
247    * @return mediaTitle
248    */
249   public String getMediaTitle() {
250     return mediaTitle;
251   }
252   
253   
254   /**
255    * gets mediaArtist if provided in:
256    * <pre>
257    *  &lt;meta name="audio_artist" content="audio_artist_name" /&gt;
258    * </pre>
259    * @return mediaArtist
260    */
261   public String getMediaArtist() {
262     return mediaArtist;
263   }
264   
265   /**
266    * gets mediaAlbum if provided in:
267    * <pre>
268    *  &lt;meta name="audio_album" content="audio_album_name" /&gt;
269    * </pre>
270    * @return mediaAlbum
271    */
272   public String getMediaAlbum() {
273     return mediaAlbum;
274   }
275   
276   /**
277    * gets mediaHeight if provided in:
278    * <pre>
279    *  &lt;meta name="video_height" content="video_height_value" /&gt;
280    * </pre>
281    * @return mediaHeight;
282    */
283   public String getMediaHeight() {
284     return mediaHeight;
285   }
286   
287   /**
288    * gets mediaWidth if provided in:
289    * <pre>
290    *  &lt;meta name="video_width" content="video_width_value" /&gt;
291    * </pre>
292    * @return mediaWidth
293    */
294   public String getMediaWidth() {
295     return mediaWidth;
296   }
297   /**
298    * get mediaObject
299    * @return
300    */
301   public ExoMedia getMediaObject() {
302     return mediaObject;
303   }
304   
305   /**
306    * Gets information of the provided link by using remover filter,
307    * using call back filter methods to get desired information.
308  * @param encoding 
309    */
310   private void get(String encoding) throws Exception {
311     //Creates element remover filter
312     ElementRemover remover = new ElementRemover();
313     remover.acceptElement("head", null);
314     remover.acceptElement("meta", new String[] {"name", "content", "lang"});
315     remover.acceptElement("link", new String[] {"rel", "href"});
316     remover.acceptElement("title", null);
317     remover.acceptElement("img", new String[] {"src", "width", "height"});
318     remover.acceptElement("p", null);
319     //accepts more tags to get text from a <p> tag
320     remover.acceptElement("a", null);
321     remover.acceptElement("b", null);
322     remover.acceptElement("i", null);
323     remover.acceptElement("strong", null);
324     
325     remover.removeElement("script");
326     //Sets up filter chain
327     XMLDocumentFilter[] filter = {
328         remover
329     };
330     XMLParserConfiguration parser = new HTMLConfiguration();
331     parser.setProperty("http://cyberneko.org/html/properties/default-encoding", "UTF-8");
332     parser.setProperty("http://cyberneko.org/html/properties/filters", filter);
333     parser.setDocumentHandler(this);
334     XMLInputSource source = new XMLInputSource(null, Util.getDecodeQueryURL(link), null);
335     source.setEncoding(encoding);
336     try {
337       parser.parse(source);
338     } catch (NullPointerException ne) {
339       ExoLogger.getLogger(LinkShare.class)
340         .warn("Problem when parsing the link in LinkShare.getInstance(String) method");
341     } catch (IOException e) {
342       // Process as normal behavior in case the link is in the valid form
343       // but have been blocked or some other same reasons.
344       this.title = this.link;
345     } catch (Exception e) {
346       this.title = this.link;
347     }
348   }
349   
350   /**
351    * Gets LinkShare instance with specified link. The default lang = "en"
352    * @param link
353    * @return LinkShare instance
354    * @throws Exception 
355    */
356   public static LinkShare getInstance(String link) throws Exception {
357     return getInstance(link, lang);
358   }
359   
360   /**
361    * Gets LinkShare instance with link and lang specified.
362    * @param link
363    * @param lang
364    * @return LinkShare instance
365    * @throws Exception 
366    */
367   public static LinkShare getInstance(String link, String lang) throws Exception {
368     if (link == null) {
369       return null;
370     }
371     if (!Util.isValidURL(link)) {
372       return null;
373     }
374     
375     if (!(link.toLowerCase().startsWith(HTTP_PROTOCOL) || link.toLowerCase().startsWith(HTTPS_PROTOCOL))) {
376         link = HTTP_PROTOCOL + link;
377     }
378     
379     LinkShare linkShare = new LinkShare();
380     linkShare.link = link;
381     LinkShare.lang = lang;
382 
383     if(previewEnabled) {
384       linkShare.mediaObject = EmbedderFactory.getInstance(link).getExoMedia();
385 
386       // if there is no media object, processes link to get page metadata
387       if (linkShare.mediaObject == null) {
388         String mimeType = org.exoplatform.social.service.rest.Util.getMimeTypeOfURL(link);
389         if (mimeType.toLowerCase().startsWith(IMAGE_MIME_TYPE)) {
390           linkShare.images = new ArrayList<>(0);
391           linkShare.images.add(link);
392           linkShare.description = "";
393         } else if (mimeType.toLowerCase().startsWith(HTML_MIME_TYPE)) {
394           String encoding = (mimeType.contains("charset=")) ? mimeType.split("charset=")[1] : "UTF-8";
395           linkShare.get(encoding);
396         } else {
397           linkShare.images = new ArrayList<>(0);
398           linkShare.description = "";
399         }
400 
401         if ((linkShare.title == null) || (linkShare.title.trim().length() == 0)) linkShare.title = link;
402 
403         //If image_src detected from meta tag, sets this image_src to images
404         if (linkShare.imageSrc != null) {
405           List<String> images = new ArrayList<>();
406           images.add(linkShare.imageSrc);
407           linkShare.images = images;
408         }
409         //gets desired description by lang when there are many description meta name with different lang
410         HashMap<String, String> descriptions = linkShare.descriptions;
411         if (descriptions != null) {
412           String description = descriptions.get(LinkShare.lang);
413           if (description == null) {
414             Collection<String> values = descriptions.values();
415             //get the first value in the collection
416             description = values.iterator().next();
417           }
418           linkShare.description = description;
419           //gets with maximum characters only
420           String tail = "";
421           if (description.length() > MAX_DESCRIPTION) {
422             tail = "...";
423             linkShare.description = description.substring(0, MAX_DESCRIPTION - 1) + tail;
424           }
425         }
426         if (linkShare.description == null) linkShare.description = "";
427         if (linkShare.images == null) {
428           linkShare.images = new ArrayList<>();
429         }
430       }
431     }
432     return linkShare;
433   }
434 
435   private static boolean isPreviewEnabled() {
436     String previewEnabledPropertyValue = System.getProperty(ACTIVITY_LINK_PREVIEW_ENABLED_PROPERTY);
437     return previewEnabledPropertyValue == null || Boolean.valueOf(previewEnabledPropertyValue);
438   }
439 
440   /**
441    * filter method is called back when scanning meets start element tag
442    */
443   public void startElement(QName element, XMLAttributes attrs, Augmentations augs) {
444     if (headEnded == true && descriptions == null) {
445       if (firstPTagParsed == false) {
446         if ("p".equalsIgnoreCase(element.rawname)) {
447           firstPTagParsed = true;
448           onPParsing = true;
449         }
450       }
451     } else if ("title".equalsIgnoreCase(element.rawname)) {
452       onPParsing = true;
453     }
454   }
455 
456   
457   /**
458    * filter method is called back when scanning meets end element tag
459    */
460   public void endElement(QName element, Augmentations augs) {
461     //System.out.println("( " + element.rawname);
462     //if end of title -> set temporary title;
463     //if detect <meta name="title" content="meta_title" />, reset title
464       if ("title".equalsIgnoreCase(element.rawname)) {
465         if (title == null) {
466           if (onPParsing) {
467             title = pText.toString();
468             onPParsing = false;
469             pText = null;
470           } else {
471             title = temp;
472           }
473         }
474       }
475     //set headEnded
476     if ("head".equalsIgnoreCase(element.rawname)) {
477       headEnded = true;
478     }
479     //Set end of p tag
480     if (onPParsing == true) {
481       if ("p".equalsIgnoreCase(element.rawname)) {
482         onPParsing = false;
483         description = pText.toString();
484       }
485     }
486   }
487   
488   /**
489    * this filter method is called back when scanning meets empty element tag
490    */
491   public void emptyElement(QName element, XMLAttributes attributes, Augmentations augs) {
492     if("link".equalsIgnoreCase(element.rawname)) { //process link tag
493       String relValue;
494       String hrefValue;
495       relValue = attributes.getValue("rel");
496       hrefValue = attributes.getValue("href");
497       if (hrefValue != null) hrefValue = getAbsLink(hrefValue);
498       if ("image_src".equalsIgnoreCase(relValue)) {
499         imageSrc = hrefValue;
500       } else if ("audio_src".equalsIgnoreCase(relValue)) {
501         mediaSrc = hrefValue;
502         mediumType = MEDIUM_TYPE_AUDIO;
503       } else if ("video_src".equalsIgnoreCase(relValue)) {
504         mediaSrc = hrefValue;
505         mediumType = MEDIUM_TYPE_VIDEO;
506       }
507     } else if ("meta".equalsIgnoreCase(element.rawname)) { //process meta tag
508       String nameValue;
509       String contentValue;
510       nameValue = attributes.getValue("name");
511       if (nameValue == null) return;
512       contentValue = attributes.getValue("content");
513       if (contentValue == null) return;
514       //Set mediumType
515       if ("medium".equalsIgnoreCase(nameValue)) {
516         if ("news".equalsIgnoreCase(contentValue)) {
517           mediumType = MEDIUM_TYPE_NEWS;
518         } else if ("audio".equalsIgnoreCase(contentValue)) {
519           mediumType = MEDIUM_TYPE_AUDIO;
520         } else if ("image".equalsIgnoreCase(contentValue)) {
521           mediumType = MEDIUM_TYPE_IMAGE;
522         } else if ("video".equalsIgnoreCase(contentValue)) {
523           mediumType = MEDIUM_TYPE_VIDEO;
524         } else if ("blog".equalsIgnoreCase(contentValue)) {
525           mediumType = MEDIUM_TYPE_BLOG;
526         } else if ("mult".equalsIgnoreCase(contentValue)) {
527          mediumType = MEDIUM_TYPE_MULT; 
528         }
529       } else if ("title".equalsIgnoreCase(nameValue)) {
530         title = contentValue;
531       } else if ("description".equalsIgnoreCase(nameValue)) {
532         String langValue = attributes.getValue("lang");
533         if (langValue != null) {
534           if (descriptions == null) descriptions = new HashMap<String, String>();
535           descriptions.put(langValue, contentValue);
536         } else {
537           description = contentValue;
538         }
539       }
540       
541       if (mediumType.equals(MEDIUM_TYPE_AUDIO) || mediumType.equals(MEDIUM_TYPE_MULT)) {
542         if ("audio_type".equalsIgnoreCase(nameValue)) {
543           mediaType = contentValue;
544         } else if ("audio_title".equalsIgnoreCase(nameValue)) {
545           mediaTitle = contentValue;
546         } else if ("audio_artist".equalsIgnoreCase(nameValue)) {
547           mediaArtist = contentValue;
548         } else if ("audio_album".equalsIgnoreCase(nameValue)) {
549           mediaAlbum = contentValue;
550         }
551       } else if (mediumType.equals(MEDIUM_TYPE_VIDEO) || mediumType.equals(MEDIUM_TYPE_MULT)) {
552         if ("video_type".equalsIgnoreCase(nameValue)) {
553           mediaType = contentValue;
554         } else if ("video_title".equalsIgnoreCase(nameValue)) {
555           mediaTitle = contentValue;
556         } else if ("video_height".equalsIgnoreCase(nameValue)) {
557           mediaHeight = contentValue;
558         } else if ("video_width".equalsIgnoreCase(nameValue)) {
559           mediaWidth = contentValue;
560         } else if ("video_artist".equalsIgnoreCase(nameValue)) {
561           mediaArtist = contentValue;
562         } else if ("video_album".equalsIgnoreCase(nameValue)) {
563           mediaAlbum = contentValue;
564         }
565       }
566     } else if ((imageSrc == null) && ("img".equalsIgnoreCase(element.rawname))) { //process img tag
567       String src = attributes.getValue("src");
568       if (src == null) return;
569       
570       if (isAcceptableImg(src)) {
571         src = getAbsLink(src);
572         if (images == null) images = new ArrayList<String>();
573         images.add(src);
574       }
575     }
576   }
577   
578   
579   /**
580    * filter method is called back when scanning meets the end of text in a tag
581    */
582   public void characters(XMLString text, Augmentations augs) {
583     temp = text.toString();
584     if (onPParsing == true) {
585       if (pText == null) pText = new StringBuffer();
586       pText.append(temp);
587     }
588   }
589   
590   /**
591    * Gets absolute link from the provided link
592    * @param link
593    * @return absolute link
594    */
595   private String getAbsLink(String link) {
596     if (link.startsWith("http://") || link.startsWith("https://")) return link;
597     URL url = null;
598     try {
599       url = new URL(this.link);
600     } catch (MalformedURLException e) {
601       LOG.debug("MalformedURLException : Could not initialize url from link.");
602     }
603     String protocol = url.getProtocol();
604     String host = url.getHost();
605     String base = protocol + "://" + host;
606     if (link.startsWith("/")) {
607       //Absolute
608       return base + link;
609     } else if (link.startsWith("./")) {
610       if (this.link.endsWith("/")) {
611         this.link = this.link.substring(0, this.link.length() - 1);
612       }
613       link = link.substring(1, link.length());
614       return this.link + link;        
615     } else if (link.startsWith("../")) {
616       String regex = "\\.\\./";
617       Pattern partern = Pattern.compile(regex);
618       Matcher matcher = partern.matcher(link);
619       int level = 0;
620       while (matcher.find()) {
621         level++;
622       }
623       String secondPath = link.replace("(\\.\\./)+", "");
624       String[] str = this.link.split("/");
625       StringBuffer sb = new StringBuffer();
626       level = (str.length - 1) - level;
627       for (int i = 0; i < level; i++) {
628         sb.append(str[i]);
629       }
630       sb.append(secondPath);
631       return sb.toString();
632     } else {
633       if (this.link.endsWith("/")) {
634         return this.link + link;
635       } else {
636         return this.link + "/" + link;
637       }
638     }
639   }
640   
641   /**
642    * Escapes the special characters.
643    * 
644    * @param str
645    * @return
646    * @since 1.2.7
647    */
648   private String escapeSpecialCharacters(String str) {
649     if (str != null) {
650       return str.replaceAll("\r\n|\n\r|\n|\r", "");
651     } else {
652       return "";
653     }
654   }
655   
656   private boolean isAcceptableImg(String src) {
657     BufferedImage img = null;
658     try {
659        img = ImageIO.read(new URL(src));
660        int width = img.getWidth();
661        int height = img.getHeight();
662        return (width > MIN_WIDTH && height > MIN_HEIGHT);
663     } catch (MalformedURLException e) {
664       return false;
665     } catch (IOException e) {
666       return false;
667     }
668   }
669 }