HtmlTool

/*
 * Copyright 2012 Marek Romanowski
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package pl.matsuo.maven.skins.msb3;

import org.apache.velocity.tools.ToolContext;
import org.apache.velocity.tools.config.DefaultKey;
import org.apache.velocity.tools.generic.SafeConfig;
import org.apache.velocity.tools.generic.ValueParser;
import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Tag;

import java.text.Normalizer;
import java.text.Normalizer.Form;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Stack;
import java.util.regex.Pattern;

/**
 * An Apache Velocity tool that provides utility methods to manipulate HTML code using
 * <a href="http://jsoup.org/">jsoup</a> HTML5 parser.
 * <p>
 * The methods utilise <a href="http://jsoup.org/cookbook/extracting-data/selector-syntax">CSS
 * selectors</a> to refer to specific elements for manipulation.
 * </p>
 *
 * @author Andrius Velykis
 * @since 1.0
 * @see <a href="http://jsoup.org/">jsoup HTML parser</a>
 * @see <a href="http://jsoup.org/cookbook/extracting-data/selector-syntax">jsoup CSS selectors</a>
 */
@DefaultKey("htmlTool")
public class HtmlTool extends SafeConfig {

	/**
	 * Tag names of all HTML heading elements ({@code h1}-{@code h6}), in ascending level order.
	 * The reference is final and the list is unmodifiable, so the constant cannot be reassigned
	 * or altered after class initialisation.
	 */
	private static final List<String> HEADINGS = Collections.unmodifiableList(
			Arrays.asList("h1", "h2", "h3", "h4", "h5", "h6"));


	/** Enum indicating separator handling strategy for document partitioning. */
	public enum JoinSeparator {
		/**
		 * Keep separators at the start of partitions. The first partition will not have a
		 * separator.
		 */
		AFTER,
		/**
		 * Keep separators at the end of partitions. The last partition will not have a separator.
		 */
		BEFORE,
		/** Drop separators altogether. */
		NO
	}

	/**
	 * Character set name applied to the output settings of every parsed document. Starts out as
	 * {@code "UTF-8"} and may be replaced in {@link #configure(ValueParser)}.
	 */
	private String outputEncoding = "UTF-8";

	/**
	 * Picks up the output encoding from the Velocity tool context, when one is available.
	 * <p>
	 * The {@code velocityContext} value is looked up in the supplied configuration. If it is a
	 * {@link ToolContext} that holds a {@code String} under the {@code outputEncoding} key, that
	 * string replaces the current output encoding. In every other case the encoding is left
	 * untouched.
	 * </p>
	 *
	 * @param values
	 *            configuration values handed to the tool
	 * @see SafeConfig#configure(ValueParser)
	 */
	@Override
	protected void configure(ValueParser values) {

		Object context = values.get("velocityContext");

		if (context instanceof ToolContext) {
			Object encoding = ((ToolContext) context).get("outputEncoding");

			// only a String value is accepted as an encoding name
			if (encoding instanceof String) {
				this.outputEncoding = (String) encoding;
			}
		}
	}

	/**
	 * Partitions the given HTML content at every element matched by the separator selector and
	 * discards the separator elements themselves.
	 *
	 * @param content
	 *            HTML content to split
	 * @param separatorCssSelector
	 *            CSS selector identifying the separator elements
	 * @return the HTML partitions found between the separators; separators are not included
	 * @since 1.0
	 * @see #split(String, String, JoinSeparator)
	 */
	public List<String> split(String content, String separatorCssSelector) {
		// plain splitting never keeps the separators
		final JoinSeparator dropSeparators = JoinSeparator.NO;
		return split(content, separatorCssSelector, dropSeparators);
	}

	/**
	 * Partitions the given HTML content so that each partition starts with a separator element.
	 * <p>
	 * When the content was actually split (more than one partition), the leading partition is
	 * discarded: it holds whatever preceded the first separator and so does not start with one.
	 * </p>
	 *
	 * @param content
	 *            HTML content to split
	 * @param separatorCssSelector
	 *            CSS selector identifying the separator elements
	 * @return the HTML partitions, each beginning with its separator; if splitting produced at
	 *         most one partition, that result is returned unchanged
	 * @since 1.0
	 * @see #split(String, String, JoinSeparator)
	 */
	public List<String> splitOnStarts(String content, String separatorCssSelector) {

		List<String> parts = split(content, separatorCssSelector, JoinSeparator.AFTER);

		boolean wasSplit = parts != null && parts.size() > 1;
		if (wasSplit) {
			// skip the leading part, e.g. when splitting on headings it contains
			// everything that comes before the first heading
			return parts.subList(1, parts.size());
		}

		// nothing (or a single part) - hand back what we have
		return parts;
	}

	/**
	 * Partitions the given HTML content at every element matched by the separator selector,
	 * using a separator strategy given by name.
	 *
	 * @param content
	 *            HTML content to split
	 * @param separatorCssSelector
	 *            CSS selector identifying the separator elements
	 * @param separatorStrategy
	 *            {@code "before"} or {@code "after"} to keep the separators with the preceding or
	 *            following partition; any other value (including {@code null}) drops them
	 * @return the HTML partitions split at the separator locations
	 * @since 1.0
	 * @see #split(String, String, JoinSeparator)
	 */
	public List<String> split(String content, String separatorCssSelector,
			String separatorStrategy) {

		// anything that is not recognised falls back to dropping the separators
		JoinSeparator strategy = JoinSeparator.NO;
		if ("before".equals(separatorStrategy)) {
			strategy = JoinSeparator.BEFORE;
		} else if ("after".equals(separatorStrategy)) {
			strategy = JoinSeparator.AFTER;
		}

		return split(content, separatorCssSelector, strategy);
	}

	/**
	 * Partitions the given HTML content at every element matched by the separator selector. The
	 * separators are dropped or attached to the neighbouring partition, as dictated by the
	 * separator strategy.
	 * <p>
	 * The algorithm resolves nesting so that every returned partition is self-contained HTML.
	 * Nested content normally stays with the first partition it applies to.
	 * </p>
	 *
	 * @param content
	 *            HTML content to split
	 * @param separatorCssSelector
	 *            CSS selector identifying the separator elements
	 * @param separatorStrategy
	 *            how to treat the separator elements
	 * @return the HTML partitions split at the separator locations; when the selector matches
	 *         nothing, a single-element list holding the original content
	 * @since 1.0
	 */
	public List<String> split(String content, String separatorCssSelector,
			JoinSeparator separatorStrategy) {

		Element body = parseContent(content);

		List<Element> separators = body.select(separatorCssSelector);
		if (separators.isEmpty()) {
			// no separator matched - the content stays in one piece
			return Collections.singletonList(content);
		}

		List<String> partitionHtml = new ArrayList<String>();
		for (List<Element> partitionRoots : split(separators, separatorStrategy, body)) {
			partitionHtml.add(outerHtml(partitionRoots));
		}

		return partitionHtml;
	}

	/**
	 * Recursively splits the {@code parent} element based on the given {@code separators}. If a
	 * separator is encountered in the parent, it is split on that position. The outstanding nested
	 * elements go with the first of the partitions in each case.
	 * <p>
	 * Note that the element tree is modified while splitting: when a nested child is split into
	 * several partitions, the grandchildren that do not belong to the first partition are removed
	 * from that child.
	 * </p>
	 *
	 * @param separators
	 *            elements on which the content is split
	 * @param separatorStrategy
	 *            whether a separator is added to the partition before it, to the partition after
	 *            it, or to none
	 * @param parent
	 *            element whose children are partitioned
	 * @return list of partitions (as lists of root elements for each partition). Partition can be
	 *         an empty list, e.g. if the separator is at the start of the content.
	 */
	private static List<List<Element>> split(Collection<Element> separators,
			JoinSeparator separatorStrategy, Element parent) {

		List<List<Element>> partitions = new LinkedList<List<Element>>();

		for (Element child : parent.children()) {

			if (separators.contains(child)) {
				// split here and do not go deeper

				// first ensure there was a partition before
				// otherwise the split is not recognised on an outer level
				getLastPartition(partitions);

				if (separatorStrategy == JoinSeparator.BEFORE) {
					// add to the last partition
					getLastPartition(partitions).add(child);
				}

				// add an empty new partition
				List<Element> newPartition = new LinkedList<Element>();
				partitions.add(newPartition);

				if (separatorStrategy == JoinSeparator.AFTER) {
					// add to the new partition
					newPartition.add(child);
				}

			} else {
				// go deeper
				List<List<Element>> childPartitions = split(separators, separatorStrategy, child);

				// add the child to the last partition
				getLastPartition(partitions).add(child);

				if (childPartitions.size() > 1) {
					// more than one partition:
					// only keep the first partition elements in the child
					// so for all other partitions, remove them from their parents

					List<Element> allChildren = child.children();
					List<Element> firstPartition = childPartitions.get(0);

					// what is left in allChildren are the children outside the first partition
					allChildren.removeAll(firstPartition);
					for (Element removeChild : allChildren) {
						removeChild.remove();
					}

					// add the remaining partitions
					for (List<Element> nextPartition : childPartitions.subList(1, childPartitions.size())) {
						partitions.add(nextPartition);
					}
				}
			}
		}

		return partitions;
	}

	/**
	 * Returns the partition at the end of the given list. When the list is still empty, a new
	 * empty partition is created, appended to the list and returned.
	 *
	 * @param partitions
	 *            list of partitions collected so far; may be extended by this call
	 * @return the last partition of the list, never {@code null}
	 */
	private static List<Element> getLastPartition(List<List<Element>> partitions) {
		if (!partitions.isEmpty()) {
			return partitions.get(partitions.size() - 1);
		}

		// no partition yet - start the first one
		List<Element> created = new LinkedList<Element>();
		partitions.add(created);
		return created;
	}

	/**
	 * Renders the root elements of one partition as HTML.
	 *
	 * @param elements
	 *            root elements of the partition
	 * @return an empty string for no elements, the outer HTML of a single element, or the
	 *         combined HTML of several elements
	 */
	private static String outerHtml(List<Element> elements) {

		if (elements.isEmpty()) {
			return "";
		}

		if (elements.size() == 1) {
			return elements.get(0).outerHtml();
		}

		// several roots: collect them under a temporary <div> and
		// output only its inner HTML, so the <div> itself does not show up
		Element wrapper = new Element(Tag.valueOf("div"), "");
		for (Element root : elements) {
			wrapper.appendChild(root);
		}

		return wrapper.html();
	}


	/**
	 * Moves the elements matched by the selector to the top of the HTML content, without
	 * wrapping the remaining content. The number of moved elements can be capped, e.g. to move
	 * only the first match.
	 *
	 * @param content
	 *            HTML content to reorder
	 * @param selector
	 *            CSS selector for the elements to move to the top
	 * @param amount
	 *            maximum number of elements to reorder
	 * @return the reordered HTML content, or the original content if nothing matched
	 * @since 1.0
	 */
	public String reorderToTop(String content, String selector, int amount) {
		// no wrapper HTML for the remaining content
		final String noWrapping = null;
		return reorderToTop(content, selector, amount, noWrapping);
	}

	/**
	 * Moves the elements matched by the selector to the top of the HTML content. The number of
	 * moved elements can be capped, e.g. to move only the first match. Optionally the content
	 * that stays behind is wrapped into the given HTML.
	 *
	 * @param content
	 *            HTML content to reorder
	 * @param selector
	 *            CSS selector for the elements to move to the top
	 * @param amount
	 *            maximum number of elements to reorder
	 * @param wrapRemaining
	 *            HTML to wrap the remaining (non-reordered) part with, or {@code null} for no
	 *            wrapping
	 * @return the reordered HTML content, or the original content if nothing matched
	 * @since 1.0
	 */
	public String reorderToTop(String content, String selector, int amount,
			String wrapRemaining) {

		// index 0 holds the remaining body, the extracted elements follow
		List<Element> extracted = extractElements(content, selector, amount);

		if (extracted.size() <= 1) {
			// nothing was extracted, so there is nothing to reorder
			return content;
		}

		Element body = extracted.get(0);

		if (wrapRemaining != null) {
			wrapInner(body, wrapRemaining);
		}

		// prepend from the last extracted element to the first, so that
		// the elements end up at the top in their original order
		for (int index = extracted.size() - 1; index >= 1; index--) {
			body.prependChild(extracted.get(index));
		}

		return body.html();
	}

	/**
	 * Wraps all child elements of the given element into the given HTML.
	 * <p>
	 * The children are first gathered under a temporary {@code <div>}, which is then wrapped and
	 * finally unwrapped again, so that it does not remain in the hierarchy.
	 * </p>
	 *
	 * @param element
	 *            element whose child elements get wrapped
	 * @param html
	 *            HTML to wrap the child elements with
	 * @return the given {@code element}
	 */
	private static Element wrapInner(Element element, String html) {

		// a temporary holder is used for wrapping, because wrapping
		// directly may be problematic, e.g. with the <body> element
		Element holder = new Element(Tag.valueOf("div"), "");

		// move every child element into the holder
		for (Element child : element.children()) {
			child.remove();
			holder.appendChild(child);
		}

		// put the holder where the children used to be
		element.appendChild(holder);

		// wrap the holder, then drop the holder itself from the hierarchy
		holder.wrap(html);
		holder.unwrap();

		return element;
	}

	/**
	 * Parses the HTML content and detaches the elements matched by the selector from it.
	 *
	 * @param content
	 *            HTML content to extract elements from
	 * @param selector
	 *            CSS selector for the elements to extract
	 * @param amount
	 *            maximum number of elements to extract; a negative value means no limit
	 * @return a list whose first entry is always the main body (what remains after extraction),
	 *         followed by the extracted elements
	 */
	private List<Element> extractElements(String content, String selector, int amount) {

		Element body = parseContent(content);

		List<Element> selected = body.select(selector);
		if (!selected.isEmpty()) {

			// keep only outermost matches
			selected = filterParents(selected);

			if (amount >= 0) {
				// cap at the requested amount
				int limit = Math.min(amount, selected.size());
				selected = selected.subList(0, limit);
			}

			// detach the selected elements from the body
			for (Element element : selected) {
				element.remove();
			}
		}

		// body goes first, the extracted elements after it
		List<Element> results = new ArrayList<Element>(selected.size() + 1);
		results.add(body);
		results.addAll(selected);
		return results;
	}

	/**
	 * Filters the list of elements to only contain the outermost ones, i.e. elements none of
	 * whose ancestors is in the list as well. This avoids having both a parent and its
	 * descendant among the results.
	 * <p>
	 * The check is read-only: neither the given list nor the collection returned by
	 * {@code Element.parents()} is modified.
	 * NOTE(review): newer jsoup releases are believed to propagate mutating {@code Elements}
	 * calls such as {@code retainAll} to the DOM - confirm against the jsoup version in use.
	 * </p>
	 *
	 * @param elements
	 *            selected elements, possibly containing nested matches
	 * @return a new list with the elements that have no ancestor in {@code elements}, in the
	 *         original order
	 */
	private static List<Element> filterParents(List<Element> elements) {
		List<Element> filtered = new ArrayList<Element>();
		for (Element element : elements) {

			// look for an ancestor that was selected too; stop at the first hit
			boolean nestedInSelected = false;
			for (Element ancestor : element.parents()) {
				if (elements.contains(ancestor)) {
					nestedInSelected = true;
					break;
				}
			}

			if (!nestedInSelected) {
				// none of the element's ancestors is in the selected list
				filtered.add(element);
			}
		}

		return filtered;
	}

	/**
	 * Pulls the elements matched by the selector out of the HTML content. The result carries
	 * both the HTML of the extracted elements and the remaining content with those elements
	 * removed. The number of extracted elements can be capped, e.g. to extract only the first
	 * match.
	 *
	 * @param content
	 *            HTML content to extract elements from
	 * @param selector
	 *            CSS selector for the elements to extract
	 * @param amount
	 *            maximum number of elements to extract
	 * @return the extracted elements' HTML together with the remainder of the content. When
	 *         nothing is extracted, the remainder is the original content.
	 * @since 1.0
	 */
	public ExtractResult extract(String content, String selector, int amount) {

		List<Element> extracted = extractElements(content, selector, amount);

		if (extracted.size() <= 1) {
			// only the body came back - nothing was extracted
			return new DefaultExtractResult(Collections.<String> emptyList(), content);
		}

		// index 0 is the remaining body, everything after it was extracted
		Element remainingBody = extracted.get(0);

		List<String> extractedHtml = new ArrayList<String>();
		for (Element element : extracted.subList(1, extracted.size())) {
			extractedHtml.add(element.outerHtml());
		}

		return new DefaultExtractResult(extractedHtml, remainingBody.html());
	}

	/**
	 * A container to carry element extraction results. Contains the extracted element HTML
	 * code and the remainder of the body content with elements removed.
	 *
	 * @author Marek Romanowski
	 * @since 1.0
	 */
	public static interface ExtractResult {

		/**
		 * Retrieves the extracted HTML elements.
		 *
		 * @return List of HTML of extracted elements. Can be empty if no elements found.
		 */
		public List<String> getExtracted();

		/**
		 * Retrieves the content from which elements were extracted.
		 *
		 * @return The HTML content with extracted elements removed.
		 */
		public String getRemainder();
	}

	/**
	 * Plain value holder implementing {@link ExtractResult}. The extracted list is only handed
	 * out through an unmodifiable view.
	 */
	private static class DefaultExtractResult implements ExtractResult {

		/** HTML of the extracted elements. */
		private final List<String> extractedHtml;

		/** HTML content left after extraction. */
		private final String remainderHtml;

		public DefaultExtractResult(List<String> extracted, String remainder) {
			this.extractedHtml = extracted;
			this.remainderHtml = remainder;
		}

		@Override
		public List<String> getExtracted() {
			List<String> readOnlyView = Collections.unmodifiableList(extractedHtml);
			return readOnlyView;
		}

		@Override
		public String getRemainder() {
			return remainderHtml;
		}
	}


	/**
	 * Assigns the given attribute value to every element matched by the selector.
	 *
	 * @param content
	 *            HTML content to set attributes on
	 * @param selector
	 *            CSS selector for the elements to modify
	 * @param attributeKey
	 *            attribute name
	 * @param value
	 *            attribute value
	 * @return the HTML content with the attribute set on the matched elements, or the original
	 *         content if nothing matched
	 * @since 1.0
	 */
	public String setAttr(String content, String selector, String attributeKey, String value) {

		Element body = parseContent(content);

		List<Element> targets = body.select(selector);
		if (targets.isEmpty()) {
			// no match - leave the content as it was
			return content;
		}

		for (Element target : targets) {
			target.attr(attributeKey, value);
		}

		return body.html();
	}

	/**
	 * Parses the given HTML as a body fragment and applies the configured output encoding to
	 * the resulting document.
	 *
	 * @param content
	 *            HTML body fragment to parse
	 * @return the {@code <body>} element of the parsed document
	 */
	private Element parseContent(String content) {
		Document parsed = Jsoup.parseBodyFragment(content);

		// make the output use the configured character set
		parsed.outputSettings().charset(outputEncoding);

		return parsed.body();
	}

	/**
	 * Reads an attribute from every element matched by the selector. As the selector may match
	 * several elements, one value per matched element is returned.
	 *
	 * @param content
	 *            HTML content to read attributes from
	 * @param selector
	 *            CSS selector for the elements to inspect
	 * @param attributeKey
	 *            attribute name
	 * @return the attribute values of all matched elements; an empty list if nothing matched
	 * @since 1.0
	 */
	public List<String> getAttr(String content, String selector, String attributeKey) {

		List<Element> matches = parseContent(content).select(selector);

		List<String> values = new ArrayList<String>();
		for (Element match : matches) {
			values.add(match.attr(attributeKey));
		}

		return values;
	}

	/**
	 * Adds the given class names to elements matched by the selector, up to a maximum number of
	 * elements.
	 *
	 * @param content
	 *            HTML content to modify
	 * @param selector
	 *            CSS selector for the elements to add classes to
	 * @param classNames
	 *            names of the classes to add to the selected elements
	 * @param amount
	 *            maximum number of elements to modify; a negative value means no limit
	 * @return the HTML content with the classes added, or the original content if there was no
	 *         element to modify
	 * @since 1.0
	 */
	public String addClass(String content, String selector, List<String> classNames, int amount) {

		Element body = parseContent(content);

		List<Element> targets = body.select(selector);
		if (amount >= 0) {
			// cap at the requested amount
			int limit = Math.min(amount, targets.size());
			targets = targets.subList(0, limit);
		}

		if (targets.isEmpty()) {
			// no element to update
			return content;
		}

		for (Element target : targets) {
			for (String className : classNames) {
				target.addClass(className);
			}
		}

		return body.html();
	}

	/**
	 * Adds the given class names to all elements matched by the selector.
	 *
	 * @param content
	 *            HTML content to modify
	 * @param selector
	 *            CSS selector for the elements to add classes to
	 * @param classNames
	 *            names of the classes to add to the selected elements
	 * @return the HTML content with the classes added, or the original content if nothing
	 *         matched
	 * @since 1.0
	 */
	public String addClass(String content, String selector, List<String> classNames) {
		// a negative amount stands for "no limit"
		final int unlimited = -1;
		return addClass(content, selector, classNames, unlimited);
	}

	/**
	 * Adds a single class name to all elements matched by the selector.
	 *
	 * @param content
	 *            HTML content to modify
	 * @param selector
	 *            CSS selector for the elements to add the class to
	 * @param className
	 *            name of the class to add to the selected elements
	 * @return the HTML content with the class added, or the original content if nothing matched
	 * @since 1.0
	 */
	public String addClass(String content, String selector, String className) {
		List<String> singleClass = Collections.singletonList(className);
		return addClass(content, selector, singleClass);
	}

	/**
	 * Wraps elements matched by the selector with the given HTML, up to a maximum number of
	 * elements.
	 *
	 * @param content
	 *            HTML content to modify
	 * @param selector
	 *            CSS selector for the elements to wrap
	 * @param wrapHtml
	 *            HTML to wrap the selected elements with
	 * @param amount
	 *            maximum number of elements to modify; a negative value means no limit
	 * @return the HTML content with the elements wrapped, or the original content if there was
	 *         no element to modify
	 * @since 1.0
	 */
	public String wrap(String content, String selector, String wrapHtml, int amount) {

		Element body = parseContent(content);

		List<Element> targets = body.select(selector);
		if (amount >= 0) {
			// cap at the requested amount
			int limit = Math.min(amount, targets.size());
			targets = targets.subList(0, limit);
		}

		if (targets.isEmpty()) {
			// no element to update
			return content;
		}

		for (Element target : targets) {
			target.wrap(wrapHtml);
		}

		return body.html();
	}

	/**
	 * Deletes all elements matched by the selector from the HTML content.
	 *
	 * @param content
	 *            HTML content to modify
	 * @param selector
	 *            CSS selector for the elements to remove
	 * @return the HTML content without the matched elements, or the original content if nothing
	 *         matched
	 * @since 1.0
	 */
	public String remove(String content, String selector) {

		Element body = parseContent(content);

		List<Element> doomed = body.select(selector);
		if (doomed.isEmpty()) {
			// no match - content is unchanged
			return content;
		}

		for (Element element : doomed) {
			element.remove();
		}

		return body.html();
	}

	/**
	 * Substitutes all elements matched by the selector with the given replacement HTML.
	 *
	 * @param content
	 *            HTML content to modify
	 * @param selector
	 *            CSS selector for the elements to replace
	 * @param replacement
	 *            HTML replacement (must parse to a single element)
	 * @return the HTML content with the elements replaced, or the original content if nothing
	 *         matched
	 * @since 1.0
	 */
	public String replace(String content, String selector, String replacement) {
		Map<String, String> singleReplacement = Collections.singletonMap(selector, replacement);
		return replaceAll(content, singleReplacement);
	}

	/**
	 * Replaces elements in HTML.
	 * <p>
	 * Each map entry is processed in the map's iteration order: elements matched by the selector
	 * are replaced with copies of the first element parsed from the replacement HTML. An entry
	 * whose selector matches nothing, or whose replacement HTML does not contain any element
	 * (e.g. an empty string or plain text), is skipped.
	 * </p>
	 *
	 * @param content
	 *            HTML content to modify
	 * @param replacements
	 *            Map of CSS selectors to their replacement HTML texts. CSS selectors find elements
	 *            to be replaced with the HTML in the mapping. The HTML must parse to a single
	 *            element.
	 * @return HTML content with replaced elements. If no elements are replaced, the original
	 *         content is returned.
	 * @since 1.0
	 */
	public String replaceAll(String content, Map<String, String> replacements) {

		Element body = parseContent(content);

		boolean modified = false;
		for (Entry<String, String> replacementEntry : replacements.entrySet()) {
			String selector = replacementEntry.getKey();
			String replacement = replacementEntry.getValue();

			List<Element> elements = body.select(selector);
			if (elements.isEmpty()) {
				continue;
			}

			// Check the child list explicitly instead of calling child(0) directly:
			// this keeps a replacement without any element from failing the call.
			List<Element> replacementRoots = parseContent(replacement).children();
			if (replacementRoots.isEmpty()) {
				// no element to replace with - leave the matched elements alone
				continue;
			}

			// take the first child
			Element replacementElem = replacementRoots.get(0);

			for (Element element : elements) {
				// every target needs its own copy of the replacement
				element.replaceWith(replacementElem.clone());
			}

			modified = true;
		}

		if (modified) {
			return body.html();
		} else {
			// nothing changed
			return content;
		}
	}

	/**
	 * Collects the text of every element matched by the selector. The text is the element's
	 * content as it would be displayed on the web page, including the text of its children.
	 *
	 * @param content
	 *            HTML content with the elements
	 * @param selector
	 *            CSS selector for the elements whose text is wanted
	 * @return the displayed texts of the matched elements; an empty list if nothing matched
	 * @since 1.0
	 */
	public List<String> text(String content, String selector) {

		List<Element> matches = parseContent(content).select(selector);

		List<String> renderedTexts = new ArrayList<String>();
		for (Element match : matches) {
			renderedTexts.add(match.text());
		}

		return renderedTexts;
	}

	/**
	 * Transforms the given HTML content by turning the names of anchors
	 * ({@code <a name="myheading">}) into IDs of the adjacent heading elements.
	 * <p>
	 * Anchors mark positions within a HTML page. In HTML5, however, the {@code name} attribute
	 * is no longer supported on the {@code <a>} tag; positions are marked with the {@code id}
	 * attribute instead, e.g. {@code <h1 id="myheading">}.
	 * </p>
	 * <p>
	 * Three passes are made over headings that have no {@code id}: headings containing a named
	 * anchor, headings immediately preceded by one, and headings immediately followed by one.
	 * In each case the anchor name becomes the heading {@code id} and the anchor is removed.
	 * </p>
	 *
	 * @param content
	 *            HTML content to modify
	 * @return the HTML content with anchor names moved to the adjacent headings and the anchors
	 *         removed, or the original content if no such heading/anchor pair was found
	 * @since 1.0
	 */
	public String headingAnchorToId(String content) {

		// anchor that carries a name but does not link anywhere
		String namedAnchor = "a[name]:not([href])";

		// one selector per heading level, each limited to headings without IDs
		List<String> headingsWithoutId = concat(HEADINGS, ":not([id])", true);

		String headingsContainingAnchor = StringUtil.join(
				concat(headingsWithoutId, ":has(" + namedAnchor + ")", true), ", ");
		String headingsAfterAnchor = StringUtil.join(
				concat(headingsWithoutId, namedAnchor + " + ", false), ", ");
		String anchorsAfterHeading = StringUtil.join(
				concat(headingsWithoutId, " + " + namedAnchor, true), ", ");

		Element body = parseContent(content);
		boolean changed = false;

		// pass 1: the named anchor sits inside the heading - use the first one
		for (Element heading : body.select(headingsContainingAnchor)) {
			List<Element> innerAnchors = heading.select(namedAnchor);
			if (!innerAnchors.isEmpty()) {
				anchorToId(heading, innerAnchors.get(0));
				changed = true;
			}
		}

		// pass 2: the named anchor comes directly before the heading
		for (Element heading : body.select(headingsAfterAnchor)) {
			Element precedingAnchor = heading.previousElementSibling();
			if (precedingAnchor != null) {
				anchorToId(heading, precedingAnchor);
				changed = true;
			}
		}

		// pass 3: the named anchor comes directly after the heading; there is no
		// selector for such headings, so the anchors are selected instead and the
		// heading is looked up from each of them
		for (Element anchor : body.select(anchorsAfterHeading)) {
			Element precedingHeading = anchor.previousElementSibling();
			if (precedingHeading != null) {
				anchorToId(precedingHeading, anchor);
				changed = true;
			}
		}

		// without any change the content is handed back untouched
		return changed ? body.html() : content;
	}

	/**
	 * Uses the name of the anchor as the ID of the heading and removes the anchor. Nothing
	 * happens if the given anchor is not an {@code <a>} element, if the heading already has an
	 * ID, or if the anchor name is empty.
	 *
	 * @param heading
	 *            heading element to receive the ID
	 * @param anchor
	 *            anchor element providing the name
	 */
	private static void anchorToId(Element heading, Element anchor) {

		if (!"a".equals(anchor.tagName()) || !heading.id().isEmpty()) {
			// not an anchor, or the heading has an ID already
			return;
		}

		String anchorName = anchor.attr("name");
		if (anchorName.isEmpty()) {
			return;
		}

		// the anchor name becomes the heading ID, the anchor itself goes away
		heading.attr("id", anchorName);
		anchor.remove();
	}


	/**
	 * Combines the given text with each String of a list, putting the text either behind or in
	 * front of the list entries. The given list is not changed.
	 *
	 * @param elements
	 *            list of Strings to combine the text with
	 * @param text
	 *            the text to append/prepend
	 * @param append
	 *            {@code true} to put the text after each element, {@code false} to put it before
	 *            each element
	 * @return a new list with the combined Strings, in the order of {@code elements}
	 * @since 1.0
	 */
	public static List<String> concat(List<String> elements, String text, boolean append) {
		List<String> combined = new ArrayList<String>(elements.size());

		for (String element : elements) {
			if (append) {
				combined.add(element + text);
			} else {
				combined.add(text + element);
			}
		}

		return combined;
	}


	/**
	 * Transforms the given HTML content by adding IDs to all heading elements ({@code h1-6}) that
	 * do not have one.
	 * <p>
	 * IDs on heading elements are used to indicate positions within a HTML page in HTML5. If a
	 * heading tag without an {@code id} is found, its "slug" is generated automatically based on
	 * the heading contents (limited to 50 characters) and used as the ID. A number is appended
	 * when the slug is already taken by another ID.
	 * </p>
	 * <p>
	 * Note that the algorithm also passes existing IDs of all elements through
	 * {@code adaptSlug}, to get rid of symbols not allowed in CSS selectors, e.g. ":", ".", etc.
	 * An existing ID is only rewritten when the adapted value differs from it.
	 * </p>
	 *
	 * @param content
	 *            HTML content to modify
	 * @param idSeparator
	 *            separator handed to the slug generation for new heading IDs and to the
	 *            adaptation of existing IDs
	 * @return HTML content with all heading elements having {@code id} attributes. If no
	 *         existing ID had to be adapted and all headings were with IDs already, the original
	 *         content is returned.
	 * @since 1.0
	 */
	public String ensureHeadingIds(String content, String idSeparator) {

		Element body = parseContent(content);

		// first find all existing IDs (to avoid generating duplicates)
		List<Element> idElems = body.select("*[id]");
		Set<String> ids = new HashSet<String>();
		boolean modified = false;
		for (Element idElem : idElems) {

			// fix existing IDs - remove colon and other symbols which mess up jQuery
			String id = idElem.id();
			String adaptedId = adaptSlug(id, idSeparator);

			// only an ID that actually changes counts as a modification, otherwise
			// the original content could never be returned once any ID exists
			if (!id.equals(adaptedId)) {
				idElem.attr("id", adaptedId);
				modified = true;
			}

			ids.add(idElem.id());
		}

		List<String> headNoIds = concat(HEADINGS, ":not([id])", true);

		// select all headings that do not have an ID
		List<Element> headingsNoId = body.select(StringUtil.join(headNoIds, ", "));

		if (headingsNoId.isEmpty() && !modified) {
			// nothing to update
			return content;
		}

		for (Element heading : headingsNoId) {

			String headingSlug = slug(heading.text(), idSeparator);
			// also limit slug to 50 symbols
			if (headingSlug.length() > 50) {
				headingSlug = headingSlug.substring(0, 50);
			}

			// registers the new ID in the set, so later headings do not reuse it
			heading.attr("id", generateUniqueId(ids, headingSlug));
		}

		return body.html();
	}

 	/**
 	 * Produces an ID that is not yet present in the given set of IDs. If the base ID is taken,
 	 * the numbers 1, 2, 3, ... are appended to it in turn until a free ID is found. The
 	 * resulting ID is added to the set.
 	 *
 	 * @param ids
 	 *            IDs in use so far; receives the generated ID
 	 * @param idBase
 	 *            preferred ID
 	 * @return the base ID if it was free, otherwise the base ID with a number appended
 	 */
 	private static String generateUniqueId(Set<String> ids, String idBase) {
 		String candidate = idBase;
 		for (int suffix = 1; ids.contains(candidate); suffix++) {
 			candidate = idBase + suffix;
 		}

 		// remember the ID so that it is not handed out again
 		ids.add(candidate);
 		return candidate;
 	}

 	/**
 	 * Repairs table heads: every row with {@code <th>} (table heading) cells that is located
 	 * directly in a {@code <tbody>} is moved into a new {@code <thead>} element, which is put at
 	 * the beginning of the row's table.
 	 *
 	 * @param content
 	 *            HTML content to modify
 	 * @return the HTML content with the table heads fixed, or the original content if no such
 	 *         rows were found
 	 * @since 1.0
 	 */
 	public String fixTableHeads(String content) {

 		Element body = parseContent(content);

 		// rows with <th> cells that sit directly in a <tbody>
 		List<Element> headingRows = body.select("table > tbody > tr:has(th)");
 		if (headingRows.isEmpty()) {
 			// no misplaced heading rows
 			return content;
 		}

 		for (Element headingRow : headingRows) {

 			// row -> tbody -> table; must be resolved before the row is detached
 			Element table = headingRow.parent().parent();

 			// take the row out of the <tbody>
 			headingRow.remove();

 			// put it into a fresh <thead> at the start of the table
 			Element thead = new Element(Tag.valueOf("thead"), "");
 			thead.appendChild(headingRow);
 			table.prependChild(thead);
 		}

 		return body.html();
 	}


 	/** Matches every character that is neither a word character ({@code \w}) nor a hyphen. */
 	private static final Pattern NONLATIN = Pattern.compile("[^\\w-]");
 	/** Matches a single whitespace character. */
 	private static final Pattern WHITESPACE = Pattern.compile("[\\s]");

 	/**
 	 * Creates a lower-case slug (latin text with no whitespace or other symbols) for a longer
 	 * text (i.e. to use in URLs).
 	 * <p>
 	 * Every whitespace character is replaced with the separator, the text is decomposed
 	 * (Unicode NFD) and finally all characters other than word characters and "-" are dropped.
 	 * Note that this last step also drops separator characters that are not word characters or
 	 * "-" themselves.
 	 * </p>
 	 *
 	 * @param input
 	 *            text to generate the slug from
 	 * @param separator
 	 *            separator for whitespace replacement, inserted literally
 	 * @return the lower-cased slug of the given text
 	 * @since 1.0
 	 * @see <a href="http://www.codecodex.com/wiki/Generate_a_url_slug">http://www.codecodex.com/wiki/Generate_a_url_slug</a>
 	 */
 	public static String slug(String input, String separator) {
 		String slug = adaptSlug(input, separator);
 		return slug.toLowerCase(Locale.ENGLISH);
 	}

 	/**
 	 * Creates a lower-case slug (latin text with no whitespace or other symbols) for a longer
 	 * text (i.e. to use in URLs). Uses "-" as a whitespace separator.
 	 *
 	 * @param input
 	 *            text to generate the slug from
 	 * @return the slug of the given text that contains word characters and "-" only
 	 * @since 1.0
 	 */
 	public static String slug(String input) {
 		return slug(input, "-");
 	}

 	/**
 	 * Creates a slug but does not change capitalization.
 	 *
 	 * @param input
 	 *            text to generate the slug from
 	 * @param separator
 	 *            text that replaces each whitespace character; taken literally
 	 * @return the input with whitespace replaced by the separator and with every character that
 	 *         is not a word character or "-" removed (accents are dropped because the text is
 	 *         NFD-decomposed before filtering)
 	 */
 	private static String adaptSlug(String input, String separator) {
 		// The separator is plain text, not a replacement template: without quoting, a '$' or
 		// '\' in it would be interpreted as a group reference / escape and make replaceAll throw.
 		String literalSeparator = java.util.regex.Matcher.quoteReplacement(separator);
 		String nowhitespace = WHITESPACE.matcher(input).replaceAll(literalSeparator);
 		// decompose accented letters so that the combining marks can be filtered out below
 		String normalized = Normalizer.normalize(nowhitespace, Form.NFD);
 		return NONLATIN.matcher(normalized).replaceAll("");
 	}


 	/**
 	 * Builds a hierarchy from the headings found in the given HTML content. A heading becomes a
 	 * child of the closest preceding heading with a smaller heading number, e.g. an
 	 * {@code <h2>} is nested under the {@code <h1>} before it.
 	 * <p>
 	 * Headings without an {@code id} attribute are ignored. Each resulting item exposes the ID
 	 * and the text of its heading, which makes the hierarchy suitable for generating a Table of
 	 * Contents for a page.
 	 * </p>
 	 *
 	 * @param content
 	 *            HTML content to extract heading hierarchy from
 	 * @return a list of top-level heading items (with id and text). The remaining headings are
 	 *         nested within these top-level items. Empty list if no headings are in the content.
 	 * @since 1.0
 	 */
 	public List<? extends IdElement> headingTree(String content) {

 		Element body = parseContent(content);

 		// selector that matches only headings carrying an ID
 		List<String> headingSelectors = concat(HEADINGS, "[id]", true);
 		List<Element> headingElements = body.select(StringUtil.join(headingSelectors, ", "));

 		List<HeadingItem> roots = new ArrayList<HeadingItem>();
 		// chain of headings the current position is nested in; the innermost one is last
 		LinkedList<HeadingItem> openAncestors = new LinkedList<HeadingItem>();

 		for (Element element : headingElements) {
 			HeadingItem item =
 					new HeadingItem(element.id(), element.text(), headingIndex(element));

 			// drop every ancestor that is on the same or a deeper level than this heading
 			while (!openAncestors.isEmpty()
 					&& openAncestors.getLast().headingIndex >= item.headingIndex) {
 				openAncestors.removeLast();
 			}

 			if (openAncestors.isEmpty()) {
 				// no enclosing heading left - this is a top-level item
 				roots.add(item);
 			} else {
 				// nest under the innermost enclosing heading
 				openAncestors.getLast().children.add(item);
 			}

 			// following headings may be nested under this one
 			openAncestors.addLast(item);
 		}

 		return roots;
 	}

 	/**
 	 * Retrieves the numeric index of a heading, i.e. the number following the "h" in its tag
 	 * name ({@code 2} for {@code <h2>}).
 	 *
 	 * @param element
 	 *            heading element to inspect
 	 * @return the number parsed from the tag name after the leading "h"
 	 * @throws IllegalArgumentException
 	 *             if the tag name does not start with "h" or is not followed by a number
 	 */
 	private static int headingIndex(Element element) {
 		String tagName = element.tagName();
 		if (!tagName.startsWith("h")) {
 			throw new IllegalArgumentException("Must be a header tag: " + tagName);
 		}

 		try {
 			return Integer.parseInt(tagName.substring(1));
 		} catch (NumberFormatException ex) {
 			// tags such as "hr" or "header" start with "h" but are no headings
 			throw new IllegalArgumentException("Must be a header tag: " + tagName, ex);
 		}
 	}

 	/**
 	 * A heading in the heading hierarchy: stores the ID, the text and the numeric heading index
 	 * of a heading together with the headings nested under it.
 	 */
 	private static class HeadingItem implements IdElement {
 		private final String id;
 		private final String text;
 		private final int headingIndex;

 		/** Headings nested directly under this one; filled while the hierarchy is built. */
 		private final List<HeadingItem> children;

 		public HeadingItem(String elementId, String elementText, int index) {
 			this.id = elementId;
 			this.text = elementText;
 			this.headingIndex = index;
 			this.children = new ArrayList<HeadingItem>();
 		}

 		@Override
 		public String getId() {
 			return id;
 		}

 		@Override
 		public String getText() {
 			return text;
 		}

 		/**
 		 * @return a read-only view of the nested headings
 		 */
 		@Override
 		public List<HeadingItem> getItems() {
 			return Collections.unmodifiableList(children);
 		}
 	}

 	/**
 	 * Representation of a HTML element with ID and a text content. Other such elements can be
 	 * nested within.
 	 *
 	 * @author Marek Romanowski
 	 * @since 1.0
 	 */
 	public interface IdElement {

 		/**
 		 * Retrieves the ID of the HTML element (attribute {@code id})
 		 *
 		 * @return element {@code id} value
 		 */
 		public String getId();

 		/**
 		 * Retrieves the text contents of the HTML element (rendered for display)
 		 *
 		 * @return text contents of the element
 		 */
 		public String getText();

 		/**
 		 * Retrieves the children of the HTML element (nested within the element)
 		 *
 		 * @return nested items within the element
 		 */
 		public List<? extends IdElement> getItems();
 	}


 	/**
 	 * Runs the jsoup parser on an arbitrary HTML body fragment. Being generic, it lets
 	 * templates perform their own HTML manipulations without adding Java code to this class.
 	 *
 	 * @param content
 	 *            HTML content to parse
 	 * @return the wrapper element for the parsed content (i.e. the body element as if the content
 	 *         was body contents).
 	 * @since 1.0
 	 */
 	public static Element parseBodyFragment(String content) {
 		// jsoup wraps the fragment in a document; its body holds the parsed content
 		return Jsoup.parseBodyFragment(content).body();
 	}

 }