HTMLSanitizer.java

/*
 * Copyright (C) 2016 eXo Platform SAS.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */
package org.exoplatform.commons.utils;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.regex.Pattern;

import org.apache.commons.collections.CollectionUtils;
import org.owasp.html.*;

import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.base.Throwables;

/**
 * Prevent XSS/XEE attacks by encoding user HTML inputs. This class will be used
 * to encode data in in presentation layer.
 *
 * @author <a href="kmenzli@exoplatform.com">Khemais MENZLI</a>
 * @version $Revision$
 */

abstract public class HTMLSanitizer {

  // Some common regular expression definitions.

  // The 16 colors defined by the HTML Spec (also used by the CSS Spec)
  private static final Pattern                                                COLOR_NAME               = Pattern.compile("(?:aqua|black|blue|fuchsia|gray|grey|green|lime|maroon|navy|olive|purple"
                                                                                                           + "|red|silver|teal|white|yellow)");

  // HTML/CSS Spec allows 3 or 6 digit hex to specify color
  private static final Pattern                                                COLOR_CODE               = Pattern.compile("(?:#(?:[0-9a-fA-F]{3}(?:[0-9a-fA-F]{3})?))");

  private static final Pattern                                                NUMBER_OR_PERCENT        = Pattern.compile("[0-9]+%?");

  private static final Pattern                                                PARAGRAPH                = Pattern.compile("(?:[\\p{L}\\p{N},'\\.\\s\\-_\\(\\)]|&[0-9]{2};)*");

  private static final Pattern                                                HTML_ID                  = Pattern.compile("[a-zA-Z0-9\\:\\-_\\.]+");

  // force non-empty with a '+' at the end instead of '*'
  private static final Pattern                                                HTML_TITLE               = Pattern.compile("[\\p{L}\\p{N}\\s\\-_',:\\[\\]!\\./\\\\\\(\\)&]*");

  private static final Pattern                                                HTML_CLASS               = Pattern.compile("[a-zA-Z0-9\\s,\\-_]+");

  private static final Pattern                                                ONSITE_URL               = Pattern.compile("(?:[\\p{L}\\p{N} \\\\\\.\\#@\\$%\\+&;\\-_~,\\?=/!]+|\\#(\\w)+)");

  private static final Pattern                                                OFFSITE_URL              = Pattern.compile("\\s*(?:(?:ht|f)tps?://|mailto:)[\\p{L}\\p{N}]"
                                                                                                           + "[\\p{L}\\p{N} \\p{Zs}\\.\\#@\\$%\\+&;:\\-_~,\\?=/!\\(\\)\\*]*+\\s*");

  private static final Pattern                                                NUMBER                   = Pattern.compile("[+-]?(?:(?:[0-9]+(?:\\.[0-9]*)?)|\\.[0-9]+)");

  private static final Pattern                                                NAME                     = Pattern.compile("[a-zA-Z0-9\\-_\\$]+");

  private static final Pattern                                                ALIGN                    = Pattern.compile("(?i)center|left|right|justify|char");

  private static final Pattern                                                VALIGN                   = Pattern.compile("(?i)baseline|bottom|middle|top");

  private static final Predicate<String>                                      COLOR_NAME_OR_COLOR_CODE = matchesEither(COLOR_NAME,
                                                                                                                       COLOR_CODE);

  private static final Predicate<String>                                      ONSITE_OR_OFFSITE_URL    = matchesEither(ONSITE_URL,
                                                                                                                       OFFSITE_URL);

  private static final Pattern                                                HISTORY_BACK             = Pattern.compile("(?:javascript:)?\\Qhistory.go(-1)\\E");

  private static final Pattern                                                ONE_CHAR                 = Pattern.compile(".?",
                                                                                                                         Pattern.DOTALL);

  @SuppressWarnings("unchecked")
  private static final Collection<String>                                     CUSTOM_ALLOWED_STYLES    =
                                                                                                    (Collection<String>) CollectionUtils.union(CssSchema.DEFAULT.allowedProperties(),
                                                                                                                                               Arrays.asList(new String[]{"float", "display", "clear"}));

  /** A policy definition that matches the minimal HTML that eXo allows. */
  public static final Function<HtmlStreamEventReceiver, HtmlSanitizer.Policy> POLICY_DEFINITION        = new HtmlPolicyBuilder()
                                                                                                       // Allow
                                                                                                       // these
                                                                                                       // tags.
                                                                                                       .allowAttributes("id")
                                                                                                                                .matching(HTML_ID)
                                                                                                                                .globally()
                                                                                                                                .allowAttributes("class")
                                                                                                                                .matching(HTML_CLASS)
                                                                                                                                .globally()
                                                                                                                                .allowAttributes("lang")
                                                                                                                                .matching(Pattern.compile("[a-zA-Z]{2,20}"))
                                                                                                                                .globally()
                                                                                                                                .allowAttributes("title")
                                                                                                                                .matching(HTML_TITLE)
                                                                                                                                .globally()
                                                                                                                                .allowStyling(CssSchema.withProperties(CUSTOM_ALLOWED_STYLES))
                                                                                                                                .allowAttributes("align")
                                                                                                                                .matching(ALIGN)
                                                                                                                                .onElements("p")
                                                                                                                                .allowAttributes("for")
                                                                                                                                .matching(HTML_ID)
                                                                                                                                .onElements("label")
                                                                                                                                .allowAttributes("color")
                                                                                                                                .matching(COLOR_NAME_OR_COLOR_CODE)
                                                                                                                                .onElements("font")
                                                                                                                                .allowAttributes("face")
                                                                                                                                .matching(Pattern.compile("[\\w;, \\-]+"))
                                                                                                                                .onElements("font")
                                                                                                                                .allowAttributes("size")
                                                                                                                                .matching(NUMBER)
                                                                                                                                .onElements("font")
                                                                                                                                .allowAttributes("href")
                                                                                                                                .matching(ONSITE_OR_OFFSITE_URL)
                                                                                                                                .onElements("a")
                                                                                                                                .allowStandardUrlProtocols()
                                                                                                                                .allowAttributes("nohref")
                                                                                                                                .onElements("a")
                                                                                                                                .allowAttributes("name", "rel")
                                                                                                                                .matching(NAME)
                                                                                                                                .onElements("a")
                                                                                                                                .allowAttributes("onfocus",
                                                                                                                                        "onblur",
                                                                                                                                        "onclick",
                                                                                                                                        "onmousedown",
                                                                                                                                        "onmouseup")
                                                                                                                                .matching(HISTORY_BACK)
                                                                                                                                .onElements("a")
                                                                                                                                .requireRelNofollowOnLinks()
                                                                                                                                .allowAttributes("src")
                                                                                                                                .matching(ONSITE_OR_OFFSITE_URL)
                                                                                                                                .onElements("img")
                                                                                                                                .allowAttributes("src")
                                                                                                                                .matching(ONSITE_OR_OFFSITE_URL)
                                                                                                                                .onElements("img")
                                                                                                                                .allowAttributes("referrerpolicy", "data-plugin-name")
                                                                                                                                .onElements("img")
                                                                                                                                .allowAttributes("name")
                                                                                                                                .matching(NAME)
                                                                                                                                .onElements("img")
                                                                                                                                .allowAttributes("alt")
                                                                                                                                .matching(PARAGRAPH)
                                                                                                                                .onElements("img")
                                                                                                                                .allowAttributes("border",
                                                                                                                                        "hspace",
                                                                                                                                        "vspace")
                                                                                                                                .matching(NUMBER)
                                                                                                                                .onElements("img")
                                                                                                                                .allowAttributes("width", "height")
                                                                                                                                .matching(NUMBER_OR_PERCENT)
                                                                                                                                .onElements("img")
                                                                                                                                .allowAttributes("border",
                                                                                                                                        "cellpadding",
                                                                                                                                        "cellspacing")
                                                                                                                                .matching(NUMBER)
                                                                                                                                .onElements("table")
                                                                                                                                .allowAttributes("bgcolor")
                                                                                                                                .matching(COLOR_NAME_OR_COLOR_CODE)
                                                                                                                                .onElements("table")
                                                                                                                                .allowAttributes("background")
                                                                                                                                .matching(ONSITE_URL)
                                                                                                                                .onElements("table")
                                                                                                                                .allowAttributes("align")
                                                                                                                                .matching(ALIGN)
                                                                                                                                .onElements("table")
                                                                                                                                .allowAttributes("noresize")
                                                                                                                                .matching(Pattern.compile("(?i)noresize"))
                                                                                                                                .onElements("table")
                                                                                                                                .allowAttributes("background")
                                                                                                                                .matching(ONSITE_URL)
                                                                                                                                .onElements("td",
                                                                                                                                        "th",
                                                                                                                                        "tr")
                                                                                                                                .allowAttributes("bgcolor")
                                                                                                                                .matching(COLOR_NAME_OR_COLOR_CODE)
                                                                                                                                .onElements("td",
                                                                                                                                        "th")
                                                                                                                                .allowAttributes("abbr")
                                                                                                                                .matching(PARAGRAPH)
                                                                                                                                .onElements("td",
                                                                                                                                        "th")
                                                                                                                                .allowAttributes("axis",
                                                                                                                                        "headers")
                                                                                                                                .matching(NAME)
                                                                                                                                .onElements("td",
                                                                                                                                        "th")
                                                                                                                                .allowAttributes("scope")
                                                                                                                                .matching(Pattern.compile("(?i)(?:row|col)(?:group)?"))
                                                                                                                                .onElements("td",
                                                                                                                                        "th")
                                                                                                                                .allowAttributes("nowrap")
                                                                                                                                .onElements("td",
                                                                                                                                        "th")
                                                                                                                                .allowAttributes("height",
                                                                                                                                        "width")
                                                                                                                                .matching(NUMBER_OR_PERCENT)
                                                                                                                                .onElements("table",
                                                                                                                                        "td",
                                                                                                                                        "th",
                                                                                                                                        "tr",
                                                                                                                                        "img")
                                                                                                                                .allowAttributes("align")
                                                                                                                                .matching(ALIGN)
                                                                                                                                .onElements("thead",
                                                                                                                                        "tbody",
                                                                                                                                        "tfoot",
                                                                                                                                        "img",
                                                                                                                                        "td",
                                                                                                                                        "th",
                                                                                                                                        "tr",
                                                                                                                                        "colgroup",
                                                                                                                                        "col")
                                                                                                                                .allowAttributes("valign")
                                                                                                                                .matching(VALIGN)
                                                                                                                                .onElements("thead",
                                                                                                                                        "tbody",
                                                                                                                                        "tfoot",
                                                                                                                                        "td",
                                                                                                                                        "th",
                                                                                                                                        "tr",
                                                                                                                                        "colgroup",
                                                                                                                                        "col")
                                                                                                                                .allowAttributes("charoff")
                                                                                                                                .matching(NUMBER_OR_PERCENT)
                                                                                                                                .onElements("td",
                                                                                                                                        "th",
                                                                                                                                        "tr",
                                                                                                                                        "colgroup",
                                                                                                                                        "col",
                                                                                                                                        "thead",
                                                                                                                                        "tbody",
                                                                                                                                        "tfoot")
                                                                                                                                .allowAttributes("char")
                                                                                                                                .matching(ONE_CHAR)
                                                                                                                                .onElements("td",
                                                                                                                                        "th",
                                                                                                                                        "tr",
                                                                                                                                        "colgroup",
                                                                                                                                        "col",
                                                                                                                                        "thead",
                                                                                                                                        "tbody",
                                                                                                                                        "tfoot")
                                                                                                                                .allowAttributes("colspan",
                                                                                                                                        "rowspan")
                                                                                                                                .matching(NUMBER)
                                                                                                                                .onElements("td",
                                                                                                                                        "th")
                                                                                                                                .allowAttributes("span",
                                                                                                                                        "width")
                                                                                                                                .matching(NUMBER_OR_PERCENT)
                                                                                                                                .onElements("colgroup",
                                                                                                                                        "col")
                                                                                                                                .allowElements("a",
                                                                                                                                        "label",
                                                                                                                                        "noscript",
                                                                                                                                        "h1",
                                                                                                                                        "h2",
                                                                                                                                        "h3",
                                                                                                                                        "h4",
                                                                                                                                        "h5",
                                                                                                                                        "h6",
                                                                                                                                        "p",
                                                                                                                                        "i",
                                                                                                                                        "b",
                                                                                                                                        "u",
                                                                                                                                        "strong",
                                                                                                                                        "em",
                                                                                                                                        "small",
                                                                                                                                        "big",
                                                                                                                                        "pre",
                                                                                                                                        "code",
                                                                                                                                        "cite",
                                                                                                                                        "samp",
                                                                                                                                        "sub",
                                                                                                                                        "sup",
                                                                                                                                        "strike",
                                                                                                                                        "del",
                                                                                                                                        "tt",
                                                                                                                                        "center",
                                                                                                                                        "blockquote",
                                                                                                                                        "hr",
                                                                                                                                        "br",
                                                                                                                                        "col",
                                                                                                                                        "font",
                                                                                                                                        "map",
                                                                                                                                        "span",
                                                                                                                                        "div",
                                                                                                                                        "img",
                                                                                                                                        "ul",
                                                                                                                                        "ol",
                                                                                                                                        "li",
                                                                                                                                        "dd",
                                                                                                                                        "dt",
                                                                                                                                        "dl",
                                                                                                                                        "tbody",
                                                                                                                                        "thead",
                                                                                                                                        "tfoot",
                                                                                                                                        "table",
                                                                                                                                        "td",
                                                                                                                                        "th",
                                                                                                                                        "tr",
                                                                                                                                        "colgroup",
                                                                                                                                        "fieldset",
                                                                                                                                        "legend",
                                                                                                                                        "ins")

                                                                                                                                 //Allows the named elements for xwiki input
                                                                                                                                .allowElements("wikiimage","wikilink","wikimacro")
                                                                                                                                .allowAttributes("wikiparam")
                                                                                                                                .globally()
                                                                                                                                .toFactory();

  /**
   * This service reads HTML from input forms and writes sanitized content to a
   * StringBuffer
   * 
   * @param html The <code>String</code> object
   * @return The sanitized HTML to store in DB layer
   * @throws Exception
   */
  public static String sanitize(String html) throws Exception {
    StringBuilder sb = new StringBuilder();
    // Set up an output channel to receive the sanitized HTML.
    HtmlStreamRenderer renderer = HtmlStreamRenderer.create(sb,
    // Receives notifications on a failure to write to the output.
                                                            new Handler<IOException>() {
                                                              public void handle(IOException ex) {
                                                                Throwables.propagate(ex);
                                                              }
                                                            },
                                                            // Our HTML parser
                                                            // is very lenient,
                                                            // but this receives
                                                            // notifications on
                                                            // truly bizarre
                                                            // inputs.
                                                            new Handler<String>() {
                                                              public void handle(String x) {
                                                                throw new AssertionError(x);
                                                              }
                                                            });

    // Use the policy defined above to sanitize the HTML.
    HtmlSanitizer.sanitize(html, POLICY_DEFINITION.apply(renderer));
    return sb.toString();
  }

  private static Predicate<String> matchesEither(final Pattern a, final Pattern b) {
    return new Predicate<String>() {
      public boolean apply(String s) {
        return a.matcher(s).matches() || b.matcher(s).matches();
      }
    };
  }

}