View Javadoc
1   /* 
2    * Copyright 2012 Marek Romanowski
3    * 
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * 
8    *   http://www.apache.org/licenses/LICENSE-2.0
9    * 
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  package pl.matsuo.maven.skins.msb3;
17  
18  import org.apache.velocity.tools.ToolContext;
19  import org.apache.velocity.tools.config.DefaultKey;
20  import org.apache.velocity.tools.generic.SafeConfig;
21  import org.apache.velocity.tools.generic.ValueParser;
22  import org.jsoup.Jsoup;
23  import org.jsoup.helper.StringUtil;
24  import org.jsoup.nodes.Document;
25  import org.jsoup.nodes.Element;
26  import org.jsoup.parser.Tag;
27  
28  import java.text.Normalizer;
29  import java.text.Normalizer.Form;
30  import java.util.ArrayList;
31  import java.util.Arrays;
32  import java.util.Collection;
33  import java.util.Collections;
34  import java.util.HashSet;
35  import java.util.LinkedList;
36  import java.util.List;
37  import java.util.Locale;
38  import java.util.Map;
39  import java.util.Map.Entry;
40  import java.util.Set;
41  import java.util.Stack;
42  import java.util.regex.Pattern;
43  
44  /**
45   * An Apache Velocity tool that provides utility methods to manipulate HTML code using
46   * <a href="http://jsoup.org/">jsoup</a> HTML5 parser.
47   * <p>
48   * The methods utilise <a href="http://jsoup.org/cookbook/extracting-data/selector-syntax">CSS
49   * selectors</a> to refer to specific elements for manipulation.
50   * </p>
51   * 
52   * @author Andrius Velykis
53   * @since 1.0
54   * @see <a href="http://jsoup.org/">jsoup HTML parser</a>
55   * @see <a href="http://jsoup.org/cookbook/extracting-data/selector-syntax">jsoup CSS selectors</a>
56   */
57  @DefaultKey("htmlTool")
58  public class HtmlTool extends SafeConfig {
59  	
60  	/** A list of all HTML heading classes (h1-6) */
61  	private static List<String> HEADINGS = Collections.unmodifiableList(
62  			Arrays.asList("h1", "h2", "h3", "h4", "h5", "h6"));
63  	
64  	
65  	
	/**
	 * Strategy for handling separator elements when a document is partitioned: a separator is
	 * either attached to one of its neighbouring partitions or dropped.
	 */
	public enum JoinSeparator {
		/**
		 * Keep each separator as the first element of the partition that follows it. The first
		 * partition will not have a separator.
		 */
		AFTER,
		/**
		 * Keep each separator as the last element of the partition that precedes it. The last
		 * partition will not have a separator.
		 */
		BEFORE,
		/** Drop separators altogether. */
		NO
	}
80  	
	/**
	 * Character set applied to the output settings of every parsed document. Starts as UTF-8 and
	 * is replaced in {@link #configure(ValueParser)} when the Velocity tool context supplies an
	 * {@code outputEncoding} string.
	 */
	private String outputEncoding = "UTF-8";
82  	
83  	/**
84  	 * {@inheritDoc}
85  	 * 
86  	 * @see SafeConfig#configure(ValueParser)
87  	 */
88  	@Override
89  	protected void configure(ValueParser values) {
90  
91  		// retrieve the Velocity context for output encoding
92  		Object velocityContext = values.get("velocityContext");
93  
94  		if (!(velocityContext instanceof ToolContext)) {
95  			return;
96  		}
97  
98  		ToolContext ctxt = (ToolContext) velocityContext;
99  		
100 		// get the output encoding
101 		Object outputEncodingObj = ctxt.get("outputEncoding");
102 		if (outputEncodingObj instanceof String) {
103 			this.outputEncoding = (String) outputEncodingObj;
104 		}
105 	}
106 
107 	/**
108 	 * Splits the given HTML content into partitions based on the given separator selector. The
109 	 * separators themselves are dropped from the results.
110 	 * 
111 	 * @param content
112 	 *            HTML content to split
113 	 * @param separatorCssSelector
114 	 *            CSS selector for separators.
115 	 * @return a list of HTML partitions split on separator locations, but without the separators.
116 	 * @since 1.0
117 	 * @see #split(String, String, JoinSeparator)
118 	 */
119 	public List<String> split(String content, String separatorCssSelector) {
120 		return split(content, separatorCssSelector, JoinSeparator.NO);
121 	}
122 
123 	/**
124 	 * Splits the given HTML content into partitions based on the given separator selector. The
125 	 * separators are kept as first elements of the partitions.
126 	 * <p>
127 	 * Note that the first part is removed if the split was successful. This is because the first
128 	 * part does not include the separator.
129 	 * </p>
130 	 * 
131 	 * @param content
132 	 *            HTML content to split
133 	 * @param separatorCssSelector
134 	 *            CSS selector for separators
135 	 * @return a list of HTML partitions split on separator locations (except the first one), with
136 	 *         separators at the beginning of each partition
137 	 * @since 1.0
138 	 * @see #split(String, String, JoinSeparator)
139 	 */
140 	public List<String> splitOnStarts(String content, String separatorCssSelector) {
141 
142 		List<String> result = split(content, separatorCssSelector, JoinSeparator.AFTER);
143 
144 		if (result == null || result.size() <= 1) {
145 			// no result or just one part - return what we have
146 			return result;
147 		}
148 
149 		// otherwise, drop the first part - the first split will be the first 'start'
150 		// e.g. if we split on headings, the first part will contain everything
151 		// before the first heading.
152 		return result.subList(1, result.size());
153 	}
154 
155 	/**
156 	 * Splits the given HTML content into partitions based on the given separator selector. The
157 	 * separators are either dropped or joined with before/after depending on the indicated
158 	 * separator strategy.
159 	 * 
160 	 * @param content
161 	 *            HTML content to split
162 	 * @param separatorCssSelector
163 	 *            CSS selector for separators
164 	 * @param separatorStrategy
165 	 *            strategy to drop or keep separators, one of "after", "before" or "no"
166 	 * @return a list of HTML partitions split on separator locations.
167 	 * @since 1.0
168 	 * @see #split(String, String, JoinSeparator)
169 	 */
170 	public List<String> split(String content, String separatorCssSelector,
171 			String separatorStrategy) {
172 
173 		JoinSeparator sepStrategy;
174 		if ("before".equals(separatorStrategy)) {
175 			sepStrategy = JoinSeparator.BEFORE;
176 		} else if ("after".equals(separatorStrategy)) {
177 			sepStrategy = JoinSeparator.AFTER;
178 		} else {
179 			sepStrategy = JoinSeparator.NO;
180 		}
181 
182 		return split(content, separatorCssSelector, sepStrategy);
183 	}
184 
185 	/**
186 	 * Splits the given HTML content into partitions based on the given separator selector.The
187 	 * separators are either dropped or joined with before/after depending on the indicated
188 	 * separator strategy.
189 	 * <p>
190 	 * Note that splitting algorithm tries to resolve nested elements so that returned partitions
191 	 * are self-contained HTML elements. The nesting is normally contained within the first
192 	 * applicable partition.
193 	 * </p>
194 	 * 
195 	 * @param content
196 	 *            HTML content to split
197 	 * @param separatorCssSelector
198 	 *            CSS selector for separators
199 	 * @param separatorStrategy
200 	 *            strategy to drop or keep separators
201 	 * @return a list of HTML partitions split on separator locations. If no splitting occurs,
202 	 *         returns the original content as the single element of the list
203 	 * @since 1.0
204 	 */
205 	public List<String> split(String content, String separatorCssSelector,
206 			JoinSeparator separatorStrategy) {
207 
208 		Element body = parseContent(content);
209 
210 		List<Element> separators = body.select(separatorCssSelector);
211 		if (separators.size() > 0) {
212 			List<List<Element>> partitions = split(separators, separatorStrategy, body);
213 
214 			List<String> sectionHtml = new ArrayList<String>();
215 
216 			for (List<Element> partition : partitions) {
217 				sectionHtml.add(outerHtml(partition));
218 			}
219 
220 			return sectionHtml;
221 		} else {
222 			// nothing to split
223 			return Collections.singletonList(content);
224 		}
225 	}
226 
227 	/**
228 	 * Recursively splits the {@code parent} element based on the given {@code separators}. If a
229 	 * separator is encountered in the parent, it is split on that position. The outstanding nested
230 	 * elements go with the first of the partitions in each case.
231 	 * 
232 	 * @param separators
233 	 * @param separatorStrategy
234 	 * @param parent
235 	 * @return list of partitions (as lists of root elements for each partition). Partition can be
236 	 *         an empty list, e.g. if the separator is at the start of the content.
237 	 */
238 	private static List<List<Element>> split(Collection<Element> separators,
239 			JoinSeparator separatorStrategy, Element parent) {
240 
241 		List<List<Element>> partitions = new LinkedList<List<Element>>();
242 
243 		for (Element child : parent.children()) {
244 
245 			if (separators.contains(child)) {
246 				// split here and do not go deeper
247 
248 				// first ensure there was a partition before
249 				// otherwise the split is not recognised on an outer level
250 				getLastPartition(partitions);
251 
252 				if (separatorStrategy == JoinSeparator.BEFORE) {
253 					// add to the last partition
254 					getLastPartition(partitions).add(child);
255 				}
256 
257 				// add an empty new partition
258 				List<Element> newPartition = new LinkedList<Element>();
259 				partitions.add(newPartition);
260 
261 				if (separatorStrategy == JoinSeparator.AFTER) {
262 					// add to the new partition
263 					newPartition.add(child);
264 				}
265 
266 			} else {
267 				// go deeper
268 				List<List<Element>> childPartitions = split(separators, separatorStrategy, child);
269 
270 				// add the child to the last partition
271 				getLastPartition(partitions).add(child);
272 
273 				if (childPartitions.size() > 1) {
274 					// more than one partition:
275 					// only keep the first partition elements in the child
276 					// so for all other partitions, remove them from their parents
277 
278 					List<Element> allChildren = child.children();
279 					List<Element> firstPartition = childPartitions.get(0);
280 
281 					allChildren.removeAll(firstPartition);
282 					for (Element removeChild : allChildren) {
283 						removeChild.remove();
284 					}
285 
286 					// add the remaining partitions
287 					for (List<Element> nextPartition : childPartitions.subList(1, childPartitions.size())) {
288 						partitions.add(nextPartition);
289 					}
290 				}
291 			}
292 		}
293 
294 		return partitions;
295 	}
296 
297 	/**
298 	 * Retrieves the last partition (as list of elements) or creates a new one if there was none
299 	 * before.
300 	 * 
301 	 * @param partitions
302 	 * @return
303 	 */
304 	private static List<Element> getLastPartition(List<List<Element>> partitions) {
305 		if (partitions.isEmpty()) {
306 			List<Element> newPartition = new LinkedList<Element>();
307 			partitions.add(newPartition);
308 			return newPartition;
309 		} else {
310 			return partitions.get(partitions.size() - 1);
311 		}
312 	}
313 
314 	/**
315 	 * Outputs the list of partition root elements to HTML.
316 	 * 
317 	 * @param elements
318 	 * @return
319 	 */
320 	private static String outerHtml(List<Element> elements) {
321 
322 		switch (elements.size()) {
323 		case 0:
324 			return "";
325 		case 1:
326 			return elements.get(0).outerHtml();
327 		default: {
328 			// more than one element
329 			// wrap into <div> which we will remove afterwards
330 			Element root = new Element(Tag.valueOf("div"), "");
331 			for (Element elem : elements) {
332 				root.appendChild(elem);
333 			}
334 
335 			return root.html();
336 		}
337 		}
338 	}
339 	
340 	
341 	
342 	/**
343 	 * Reorders elements in HTML content so that selected elements are found at the top of the
344 	 * content. Can be limited to a certain amount, e.g. to bring just the first of selected
345 	 * elements to the top.
346 	 * 
347 	 * @param content
348 	 *            HTML content to reorder
349 	 * @param selector
350 	 *            CSS selector for elements to bring to top of the content
351 	 * @param amount
352 	 *            Maximum number of elements to reorder
353 	 * @return HTML content with reordered elements, or the original content if no such elements
354 	 *         found.
355 	 * @since 1.0
356 	 */
357 	public String reorderToTop(String content, String selector, int amount) {
358 		return reorderToTop(content, selector, amount, null);
359 	}
360 	
361 	/**
362 	 * Reorders elements in HTML content so that selected elements are found at the top of the
363 	 * content. Can be limited to a certain amount, e.g. to bring just the first of selected
364 	 * elements to the top.
365 	 * 
366 	 * @param content
367 	 *            HTML content to reorder
368 	 * @param selector
369 	 *            CSS selector for elements to bring to top of the content
370 	 * @param amount
371 	 *            Maximum number of elements to reorder
372 	 * @param wrapRemaining
373 	 *            HTML to wrap the remaining (non-reordered) part
374 	 * @return HTML content with reordered elements, or the original content if no such elements
375 	 *         found.
376 	 * @since 1.0
377 	 */
378 	public String reorderToTop(String content, String selector, int amount,
379 			String wrapRemaining) {
380 
381 		// extract the elements and then prepend them to the remaining body
382 		List<Element> extracted = extractElements(content, selector, amount);
383 
384 		if (extracted.size() > 1) {
385 
386 			Element body = extracted.get(0);
387 			
388 			if (wrapRemaining != null) {
389 				wrapInner(body, wrapRemaining);
390 			}
391 			
392 			List<Element> elements = extracted.subList(1, extracted.size());
393 
394 			// now prepend extracted elements to the body (in backwards to preserve original order)
395 			for (int index = elements.size() - 1; index >= 0; index--) {
396 				body.prependChild(elements.get(index));
397 			}
398 
399 			return body.html();
400 		} else {
401 			// nothing to reorder
402 			return content;
403 		}
404 	}
405 	
406 	private static Element wrapInner(Element element, String html) {
407 
408 		// wrap everything into an additional <div> for wrapping
409 		// otherwise there may be problems, e.g. with <body> element
410 		Element topDiv = new Element(Tag.valueOf("div"), "");
411 		for (Element topElem : element.children()) {
412 			// add all elements in the body to the `topDiv`
413 			topElem.remove();
414 			topDiv.appendChild(topElem);
415 		}
416 
417 		// add topDiv to the body
418 		element.appendChild(topDiv);
419 
420 		// wrap topDiv
421 		topDiv.wrap(html);
422 		// now unwrap topDiv - will remove it from the hierarchy
423 		topDiv.unwrap();
424 		
425 		return element;
426 	}
427 	
428 	/**
429 	 * Extracts elements from the HTML content.
430 	 * 
431 	 * @param content
432 	 * @param selector
433 	 * @param amount
434 	 * @return the remainder and a list of extracted elements. The main body (remainder after
435 	 *         extraction) is always returned as the first element of the list.
436 	 */
437 	private List<Element> extractElements(String content, String selector, int amount) {
438 
439 		Element body = parseContent(content);
440 
441 		List<Element> elements = body.select(selector);
442 		if (elements.size() > 0) {
443 
444 			elements = filterParents(elements);
445 
446 			if (amount >= 0) {
447 				// limit to the indicated amount
448 				elements = elements.subList(0, Math.min(amount, elements.size()));
449 			}
450 
451 			// remove all from their parents
452 			for (Element element : elements) {
453 				element.remove();
454 			}
455 		}
456 
457 		List<Element> results = new ArrayList<Element>();
458 		// first element is the body
459 		results.add(body);
460 		results.addAll(elements);
461 		return results;
462 	}
463 	
464 	/**
465 	 * Filters the list of elements to only contain parent elements. This is to avoid both parent
466 	 * and child being in the list of elements.
467 	 * 
468 	 * @param elements
469 	 * @return
470 	 */
471 	private static List<Element> filterParents(List<Element> elements) {
472 		List<Element> filtered = new ArrayList<Element>();
473 		for (Element element : elements) {
474 			// get the intersection of parents and selected elements
475 			List<Element> parentsInter = element.parents();
476 			parentsInter.retainAll(elements);
477 			if (parentsInter.isEmpty()) {
478 				// no intersection - element's parents are not in the selected list
479 				filtered.add(element);
480 			}
481 		}
482 
483 		return filtered;
484 	}
485 
486 	/**
487 	 * Extracts HTML elements from the main HTML content. The result consists of the extracted HTML
488 	 * elements and the remainder of HTML content, with these elements removed. Can be limited to a
489 	 * certain amount, e.g. to extract just the first of selected elements.
490 	 * 
491 	 * @param content
492 	 *            HTML content to extract elements from
493 	 * @param selector
494 	 *            CSS selector for elements to extract
495 	 * @param amount
496 	 *            Maximum number of elements to extract
497 	 * @return HTML content of the extracted elements together with the remainder of the original
498 	 *         content. If no elements are found, the remainder contains the original content.
499 	 * @since 1.0
500 	 */
501 	public ExtractResult extract(String content, String selector, int amount) {
502 
503 		List<Element> extracted = extractElements(content, selector, amount);
504 
505 		if (extracted.size() > 1) {
506 
507 			// first element is the remaining body, the rest are extracted
508 			Element body = extracted.get(0);
509 			List<Element> elements = extracted.subList(1, extracted.size());
510 
511 			// convert to HTML
512 			List<String> elementStr = new ArrayList<String>();
513 			for (Element el : elements) {
514 				elementStr.add(el.outerHtml());
515 			}
516 
517 			return new DefaultExtractResult(elementStr, body.html());
518 		} else {
519 			// nothing to extract
520 			return new DefaultExtractResult(Collections.<String> emptyList(), content);
521 		}
522 	}
523 	
524 	/**
525 	 * A container to carry element extraction results. Contains the extracted element HTML
526 	 * code and the remainder of the body content with elements removed.
527 	 * 
528 	 * @author Marek Romanowski
529 	 * @since 1.0
530 	 */
531 	public static interface ExtractResult {
532 		
533 		/**
534 		 * Retrieves the extracted HTML elements.
535 		 * 
536 		 * @return List of HTML of extracted elements. Can be empty if no elements found.
537 		 */
538 		public List<String> getExtracted();
539 
540 		/**
541 		 * Retrieves the content from which elements were extracted.
542 		 * 
543 		 * @return The HTML content with extracted elements removed.
544 		 */
545 		public String getRemainder();
546 	}
547 	
548 	private static class DefaultExtractResult implements ExtractResult {
549 		private final List<String> extracted;
550 		private final String remainder;
551 		
552 		public DefaultExtractResult(List<String> extracted, String remainder) {
553 			this.extracted = extracted;
554 			this.remainder = remainder;
555 		}
556 		
557 		@Override
558 		public List<String> getExtracted() {
559 			return Collections.unmodifiableList(extracted);
560 		}
561 		
562 		@Override
563 		public String getRemainder() {
564 			return remainder;
565 		}
566 	}
567 	
568 	
569 	/**
570 	 * Sets attribute to the given value on elements in HTML.
571 	 * 
572 	 * @param content
573 	 *            HTML content to set attributes on
574 	 * @param selector
575 	 *            CSS selector for elements to modify
576 	 * @param attributeKey
577 	 *            Attribute name
578 	 * @param value
579 	 *            Attribute value
580 	 * @return HTML content with modified elements. If no elements are found, the original content
581 	 *         is returned.
582 	 * @since 1.0
583 	 */
584 	public String setAttr(String content, String selector, String attributeKey, String value) {
585 
586 		Element body = parseContent(content);
587 		
588 		List<Element> elements = body.select(selector);
589 		if (elements.size() > 0) {
590 			
591 			for (Element element : elements) {
592 				element.attr(attributeKey, value);
593 			} 
594 			
595 			return body.html();
596 		} else {
597 			// nothing to update
598 			return content;
599 		}
600 	}
601 
602 	/**
603 	 * Parses body fragment to the {@code <body>} element.
604 	 * 
605 	 * @param content
606 	 * @return the {@code body} element of the parsed content
607 	 */
608 	private Element parseContent(String content) {
609 		Document doc = Jsoup.parseBodyFragment(content);
610 		doc.outputSettings().charset(outputEncoding);
611 		return doc.body();
612 	}
613 	
614 	/**
615 	 * Retrieves attribute value on elements in HTML. Will return all attribute values for the
616 	 * selector, since there can be more than one element.
617 	 * 
618 	 * @param content
619 	 *            HTML content to read attributes from
620 	 * @param selector
621 	 *            CSS selector for elements to find
622 	 * @param attributeKey
623 	 *            Attribute name
624 	 * @return Attribute values for all matching elements. If no elements are found, empty list is
625 	 *         returned.
626 	 * @since 1.0
627 	 */
628 	public List<String> getAttr(String content, String selector, String attributeKey) {
629 
630 		Element body = parseContent(content);
631 		
632 		List<Element> elements = body.select(selector);
633 		List<String> attrs = new ArrayList<String>();
634 		
635 		for (Element element : elements) {
636 			String attrValue = element.attr(attributeKey);
637 			attrs.add(attrValue);
638 		}
639 		
640 		return attrs;
641 	}
642 	
643 	/**
644 	 * Adds given class names to the elements in HTML.
645 	 * 
646 	 * @param content
647 	 *            HTML content to modify
648 	 * @param selector
649 	 *            CSS selector for elements to add classes to
650 	 * @param classNames
651 	 *            Names of classes to add to the selected elements
652 	 * @param amount
653 	 *            Maximum number of elements to modify
654 	 * @return HTML content with modified elements. If no elements are found, the original content
655 	 *         is returned.
656 	 * @since 1.0
657 	 */
658 	public String addClass(String content, String selector, List<String> classNames, int amount) {
659 
660 		Element body = parseContent(content);
661 		
662 		List<Element> elements = body.select(selector);
663 		if (amount >= 0) {
664 			// limit to the indicated amount
665 			elements = elements.subList(0, Math.min(amount, elements.size()));
666 		}
667 		
668 		if (elements.size() > 0) {
669 			
670 			for (Element element : elements) {
671 				for (String className : classNames) {
672 					element.addClass(className);
673 				}
674 			} 
675 			
676 			return body.html();
677 		} else {
678 			// nothing to update
679 			return content;
680 		}
681 	}
682 	
683 	/**
684 	 * Adds given class names to the elements in HTML.
685 	 * 
686 	 * @param content
687 	 *            HTML content to modify
688 	 * @param selector
689 	 *            CSS selector for elements to add classes to
690 	 * @param classNames
691 	 *            Names of classes to add to the selected elements
692 	 * @return HTML content with modified elements. If no elements are found, the original content
693 	 *         is returned.
694 	 * @since 1.0
695 	 */
696 	public String addClass(String content, String selector, List<String> classNames) {
697 		return addClass(content, selector, classNames, -1);
698 	}
699 	
700 	/**
701 	 * Adds given class to the elements in HTML.
702 	 * 
703 	 * @param content
704 	 *            HTML content to modify
705 	 * @param selector
706 	 *            CSS selector for elements to add the class to
707 	 * @param className
708 	 *            Name of class to add to the selected elements
709 	 * @return HTML content with modified elements. If no elements are found, the original content
710 	 *         is returned.
711 	 * @since 1.0
712 	 */
713 	public String addClass(String content, String selector, String className) {
714 		return addClass(content, selector, Collections.singletonList(className));
715 	}
716 	
717 	/**
718 	 * Wraps elements in HTML with the given HTML.
719 	 * 
720 	 * @param content
721 	 *            HTML content to modify
722 	 * @param selector
723 	 *            CSS selector for elements to wrap
724 	 * @param wrapHtml
725 	 *            HTML to use for wrapping the selected elements
726 	 * @param amount
727 	 *            Maximum number of elements to modify
728 	 * @return HTML content with modified elements. If no elements are found, the original content
729 	 *         is returned.
730 	 * @since 1.0
731 	 */
732 	public String wrap(String content, String selector, String wrapHtml, int amount) {
733 
734 		Element body = parseContent(content);
735 		
736 		List<Element> elements = body.select(selector);
737 		if (amount >= 0) {
738 			// limit to the indicated amount
739 			elements = elements.subList(0, Math.min(amount, elements.size()));
740 		}
741 		
742 		if (elements.size() > 0) {
743 			
744 			for (Element element : elements) {
745 				element.wrap(wrapHtml);
746 			} 
747 			
748 			return body.html();
749 		} else {
750 			// nothing to update
751 			return content;
752 		}
753 	}
754 	
755 	/**
756 	 * Removes elements from HTML.
757 	 * 
758 	 * @param content
759 	 *            HTML content to modify
760 	 * @param selector
761 	 *            CSS selector for elements to remove
762 	 * @return HTML content with removed elements. If no elements are found, the original content is
763 	 *         returned.
764 	 * @since 1.0
765 	 */
766 	public String remove(String content, String selector) {
767 
768 		Element body = parseContent(content);
769 		
770 		List<Element> elements = body.select(selector);
771 		if (elements.size() > 0) {
772 			for (Element element : elements) {
773 				element.remove();
774 			}
775 			
776 			return body.html();
777 		} else {
778 			// nothing changed
779 			return content;
780 		}
781 	}
782 	
783 	/**
784 	 * Replaces elements in HTML.
785 	 * 
786 	 * @param content
787 	 *            HTML content to modify
788 	 * @param selector
789 	 *            CSS selector for elements to replace
790 	 * @param replacement
791 	 *            HTML replacement (must parse to a single element)
792 	 * @return HTML content with replaced elements. If no elements are found, the original content is
793 	 *         returned.
794 	 * @since 1.0
795 	 */
796 	public String replace(String content, String selector, String replacement) {
797 		return replaceAll(content, Collections.singletonMap(selector, replacement));
798 	}
799 	
800 	/**
801 	 * Replaces elements in HTML.
802 	 * 
803 	 * @param content
804 	 *            HTML content to modify
805 	 * @param replacements
806 	 *            Map of CSS selectors to their replacement HTML texts. CSS selectors find elements
807 	 *            to be replaced with the HTML in the mapping. The HTML must parse to a single
808 	 *            element.
809 	 * @return HTML content with replaced elements. If no elements are found, the original content
810 	 *         is returned.
811 	 * @since 1.0
812 	 */
813 	public String replaceAll(String content, Map<String, String> replacements) {
814 
815 		Element body = parseContent(content);
816 		
817 		boolean modified = false;
818 		for (Entry<String, String> replacementEntry : replacements.entrySet()) {
819 			String selector = replacementEntry.getKey();
820 			String replacement = replacementEntry.getValue();
821 			
822 			List<Element> elements = body.select(selector);
823 			if (elements.size() > 0) {
824 				
825 				// take the first child
826 				Element replacementElem = parseContent(replacement).child(0);
827 				
828 				if (replacementElem != null) {
829 					for (Element element : elements) {
830 						element.replaceWith(replacementElem.clone());
831 					}
832 					
833 					modified = true;
834 				}
835 			}
836 		}
837 		
838 		if (modified) {
839 			return body.html();
840 		} else {
841 			// nothing changed
842 			return content;
843 		}
844 	}
845 	
846 	/**
847 	 * Retrieves text content of the selected elements in HTML. Renders the element's text as it
848 	 * would be displayed on the web page (including its children).
849 	 * 
850 	 * @param content
851 	 *            HTML content with the elements
852 	 * @param selector
853 	 *            CSS selector for elements to extract contents
854 	 * @return A list of element texts as rendered to display. Empty list if no elements are found.
855 	 * @since 1.0
856 	 */
857 	public List<String> text(String content, String selector) {
858 
859 		Element body = parseContent(content);
860 		
861 		List<Element> elements = body.select(selector);
862 		List<String> texts = new ArrayList<String>();
863 		
864 		for (Element element : elements) {
865 			texts.add(element.text());
866 		}
867 		
868 		return texts;
869 	}
870 	
871 	/**
872 	 * Transforms the given HTML content by moving anchor ({@code <a name="myheading">}) names to
873 	 * IDs for heading elements.
874 	 * <p>
875 	 * The anchors are used to indicate positions within a HTML page. In HTML5, however, the
876 	 * {@code name} attribute is no longer supported on {@code <a>}) tag. The positions within pages
877 	 * are indicated using {@code id} attribute instead, e.g. {@code <h1 id="myheading">}.
878 	 * </p>
879 	 * <p>
880 	 * The method finds anchors inside, immediately before or after the heading tags and uses their
881 	 * name as heading {@code id} instead. The anchors themselves are removed.
882 	 * </p>
883 	 * 
884 	 * @param content
885 	 *            HTML content to modify
886 	 * @return HTML content with modified elements. Anchor names are used for adjacent headings, and
887 	 *         anchor tags are removed. If no elements are found, the original content is returned.
888 	 * @since 1.0
889 	 */
890 	public String headingAnchorToId(String content) {
891 
892 		Element body = parseContent(content);
893 		
894 		// selectors for headings without IDs
895 		List<String> headNoIds = concat(HEADINGS, ":not([id])", true);
896 		
897 		// selector for anchor with name attribute only
898 		String nameA = "a[name]:not([href])";
899 		
900 		// select all headings that have inner named anchor
901 		List<Element> headingsInnerA = body.select(StringUtil.join(
902 				concat(headNoIds, ":has(" + nameA + ")", true), ", "));
903 		
904 		boolean modified = false;
905 		for (Element heading : headingsInnerA) {
906 			List<Element> anchors = heading.select(nameA);
907 			// take first
908 			if (!anchors.isEmpty()) {
909 				anchorToId(heading, anchors.get(0));
910 				modified = true;
911 			}
912 		}
913 		
914 		// select all headings that have a preceding named anchor
915 		List<Element> headingsPreA = body.select(StringUtil.join(
916 				concat(headNoIds, nameA + " + ", false), ", "));
917 		
918 		for (Element heading : headingsPreA) {
919 			Element anchor = heading.previousElementSibling();
920 			if (anchor != null) {
921 				anchorToId(heading, anchor);
922 				modified = true;
923 			}
924 		}
925 		
926 		// select all headings that are followed by a named anchor
927 		// no selector available for that, so first select the anchors
928 		// then retrieve the headings
929 		List<Element> anchorsPreH = body.select(StringUtil.join(
930 				concat(headNoIds, " + " + nameA, true), ", "));
931 		
932 		for (Element anchor : anchorsPreH) {
933 			Element heading = anchor.previousElementSibling();
934 			if (heading != null) {
935 				anchorToId(heading, anchor);
936 				modified = true;
937 			}
938 		}
939 		
940 		if (modified) {
941 			return body.html();
942 		} else {
943 			// nothing to update
944 			return content;
945 		}
946 	}
947 	
948 	/**
949 	 * Moves anchor name to heading id, if one does not exist. Removes the anchor.
950 	 * 
951 	 * @param heading
952 	 * @param anchor
953 	 */
954 	private static void anchorToId(Element heading, Element anchor) {
955 		
956 		if ("a".equals(anchor.tagName()) && heading.id().isEmpty()) {
957 			String aName = anchor.attr("name");
958 			if (!aName.isEmpty()) {
959 				// set the anchor name as heading ID
960 				heading.attr("id", aName);
961 				
962 				// remove the anchor
963 				anchor.remove();
964 			}
965 		}
966 	}
967 	
968 	
969 	/**
970 	 * Utility method to concatenate a String to a list of Strings. The text can be either appended
971 	 * or prepended.
972 	 * 
973 	 * @param elements
974 	 *            list of elements to append/prepend the text to
975 	 * @param text
976 	 *            the given text to append/prepend
977 	 * @param append
978 	 *            if {@code true}, text will be appended to the elements. If {@code false}, it will
979 	 *            be prepended
980 	 * @return list of elements with the text appended/prepended
981 	 * @since 1.0
982 	 */
983 	public static List<String> concat(List<String> elements, String text, boolean append) {
984 		List<String> concats = new ArrayList<String>();
985 		
986 		for (String element : elements) {
987 			concats.add(append ? element + text : text + element);
988 		}
989 		
990 		return concats;
991 	}
992 	
993 	
994 	/**
995 	 * Transforms the given HTML content by adding IDs to all heading elements ({@code h1-6}) that
996 	 * do not have one.
997 	 * <p>
998 	 * IDs on heading elements are used to indicate positions within a HTML page in HTML5. If a
999 	 * heading tag without an {@code id} is found, its "slug" is generated automatically based on
1000 	 * the heading contents and used as the ID.
1001 	 * </p>
1002 	 * <p>
1003 	 * Note that the algorithm also modifies existing IDs that have symbols not allowed in CSS
1004 	 * selectors, e.g. ":", ".", etc. The symbols are removed.
1005 	 * </p>
1006 	 * 
1007 	 * @param content
1008 	 *            HTML content to modify
1009 	 * @return HTML content with all heading elements having {@code id} attributes. If all headings
1010 	 *         were with IDs already, the original content is returned.
1011 	 * @since 1.0
1012 	 */
1013 	public String ensureHeadingIds(String content, String idSeparator) {
1014 
1015 		Element body = parseContent(content);
1016 		
1017 		// first find all existing IDs (to avoid generating duplicates)
1018 		List<Element> idElems = body.select("*[id]");
1019 		Set<String> ids = new HashSet<String>();
1020 		boolean modified = false;
1021 		for (Element idElem : idElems) {
1022 			
1023 			// fix all existing IDs - remove colon and other symbols which mess up jQuery
1024 			String id = idElem.id();
1025 			idElem.attr("id", adaptSlug(id, idSeparator));
1026 			modified = true;
1027 			
1028 			ids.add(idElem.id());
1029 		}
1030 		
1031 		List<String> headNoIds = concat(HEADINGS, ":not([id])", true);
1032 		
1033 		// select all headings that do not have an ID
1034 		List<Element> headingsNoId = body.select(StringUtil.join(headNoIds, ", "));
1035 		
1036 		if (!headingsNoId.isEmpty() || modified) {
1037 			for (Element heading : headingsNoId) {
1038 				
1039 				String headingText = heading.text();
1040 				String headingSlug = slug(headingText, idSeparator);
1041 				// also limit slug to 50 symbols
1042 				if (headingSlug.length() > 50) {
1043 					headingSlug = headingSlug.substring(0, 50);
1044 				}
1045 				String headingId = generateUniqueId(ids, headingSlug);
1046 				
1047 				heading.attr("id", headingId);
1048 			}
1049 			
1050 			return body.html();
1051 		} else {
1052 			// nothing to update
1053 			return content;
1054 		}
1055 	}
1056 	
1057 	/**
1058 	 * Generated a unique ID within the given set of IDs. Appends an incrementing number for
1059 	 * duplicates.
1060 	 * 
1061 	 * @param ids
1062 	 * @param idBase
1063 	 * @return
1064 	 */
1065 	private static String generateUniqueId(Set<String> ids, String idBase) {
1066 		String id = idBase;
1067 		int counter = 1;
1068 		while (ids.contains(id)) {
1069 			id = idBase + String.valueOf(counter++);
1070 		}
1071 		
1072 		// put the newly generated one into the set
1073 		ids.add(id);
1074 		return id;
1075 	}
1076 	
1077 	/**
1078 	 * Fixes table heads: wraps rows with {@code <th>} (table heading) elements into {@code <thead>}
1079 	 * element if they are currently in {@code <tbody>}.
1080 	 * 
1081 	 * @param content
1082 	 *            HTML content to modify
1083 	 * @return HTML content with all table heads fixed. If all heads were correct, the original
1084 	 *         content is returned.
1085 	 * @since 1.0
1086 	 */
1087 	public String fixTableHeads(String content) {
1088 
1089 		Element body = parseContent(content);
1090 		
1091 		// select rows with <th> tags within <tbody>
1092 		List<Element> tableHeadRows = body.select("table > tbody > tr:has(th)");
1093 		if (tableHeadRows.size() > 0) {
1094 			for (Element row : tableHeadRows) {
1095 				
1096 				// get the row's table
1097 				Element table = row.parent().parent();
1098 				
1099 				// remove row from its original position
1100 				row.remove();
1101 				
1102 				// create table header element with the row
1103 				Element thead = new Element(Tag.valueOf("thead"), "");
1104 				thead.appendChild(row);
1105 				// add at the beginning of the table
1106 				table.prependChild(thead);
1107 			}
1108 			
1109 			return body.html();
1110 		} else {
1111 			// nothing changed
1112 			return content;
1113 		}
1114 	}
1115 	
1116 	
1117 	private static final Pattern NONLATIN = Pattern.compile("[^\\w-]");
1118 	private static final Pattern WHITESPACE = Pattern.compile("[\\s]");
1119 	
1120 	/**
1121 	 * Creates a slug (latin text with no whitespace or other symbols) for a longer text (i.e. to
1122 	 * use in URLs).
1123 	 * 
1124 	 * @param input
1125 	 *            text to generate the slug from
1126 	 * @param separator
1127 	 *            separator for whitespace replacement
1128 	 * @return the slug of the given text that contains alphanumeric symbols and separator only
1129 	 * @since 1.0
1130 	 * @see <a href="http://www.codecodex.com/wiki/Generate_a_url_slug">http://www.codecodex.com/wiki/Generate_a_url_slug</a>
1131 	 */
1132 	public static String slug(String input, String separator) {
1133 		String slug = adaptSlug(input, separator);
1134 		return slug.toLowerCase(Locale.ENGLISH);
1135 	}
1136 	
1137 	/**
1138 	 * Creates a slug (latin text with no whitespace or other symbols) for a longer text (i.e. to
1139 	 * use in URLs). Uses "-" as a whitespace separator.
1140 	 * 
1141 	 * @param input
1142 	 *            text to generate the slug from
1143 	 * @return the slug of the given text that contains alphanumeric symbols and "-" only
1144 	 * @since 1.0
1145 	 */
1146 	public static String slug(String input) {
1147 		return slug(input, "-");
1148 	}
1149 	
1150 	/**
1151 	 * Creates a slug but does not change capitalization.
1152 	 * 
1153 	 * @param input
1154 	 * @param separator
1155 	 * @return
1156 	 */
1157 	private static String adaptSlug(String input, String separator) {
1158 		String nowhitespace = WHITESPACE.matcher(input).replaceAll(separator);
1159 		String normalized = Normalizer.normalize(nowhitespace, Form.NFD);
1160 		return NONLATIN.matcher(normalized).replaceAll("");
1161 	}
1162 	
1163 	
1164 	/**
1165 	 * Reads all headings in the given HTML content as a hierarchy. Subsequent smaller headings are
1166 	 * nested within bigger ones, e.g. {@code <h2>} is nested under preceding {@code <h1>}.
1167 	 * <p>
1168 	 * Only headings with IDs are included in the hierarchy. The result elements contain ID and
1169 	 * heading text for each heading. The hierarchy is useful to generate a Table of Contents for a
1170 	 * page.
1171 	 * </p>
1172 	 * 
1173 	 * @param content
1174 	 *            HTML content to extract heading hierarchy from
1175 	 * @return a list of top-level heading items (with id and text). The remaining headings are
1176 	 *         nested within these top-level items. Empty list if no headings are in the content.
1177 	 * @since 1.0
1178 	 */
1179 	public List<? extends IdElement> headingTree(String content) {
1180 
1181 		Element body = parseContent(content);
1182 
1183 		List<String> headIds = concat(HEADINGS, "[id]", true);
1184 
1185 		// select all headings that have an ID
1186 		List<Element> headings = body.select(StringUtil.join(headIds, ", "));
1187 
1188 		List<HeadingItem> headingItems = new ArrayList<HeadingItem>();
1189 		for (Element heading : headings) {
1190 			headingItems.add(new HeadingItem(heading.id(), heading.text(), headingIndex(heading)));
1191 		}
1192 
1193 		List<HeadingItem> topHeadings = new ArrayList<HeadingItem>();
1194 		Stack<HeadingItem> parentHeadings = new Stack<HeadingItem>();
1195 
1196 		for (HeadingItem heading : headingItems) {
1197 
1198 			while (!parentHeadings.isEmpty()
1199 					&& parentHeadings.peek().headingIndex >= heading.headingIndex) {
1200 				parentHeadings.pop();
1201 			}
1202 
1203 			if (parentHeadings.isEmpty()) {
1204 				// top level heading - no parents
1205 				topHeadings.add(heading);
1206 			} else {
1207 				// add to the children of topmost stack parent
1208 				parentHeadings.peek().children.add(heading);
1209 			}
1210 
1211 			// push the heading onto stack
1212 			parentHeadings.push(heading);
1213 		}
1214 
1215 		return topHeadings;
1216 	}
1217 
1218 	/**
1219 	 * Retrieves numeric index of a heading.
1220 	 * 
1221 	 * @param element
1222 	 * @return
1223 	 */
1224 	private static int headingIndex(Element element) {
1225 		String tagName = element.tagName();
1226 		if (tagName.startsWith("h")) {
1227 			try {
1228 				return Integer.parseInt(tagName.substring(1));
1229 			} catch (Exception ex) {
1230 				throw new IllegalArgumentException("Must be a header tag: " + tagName, ex);
1231 			}
1232 		} else {
1233 			throw new IllegalArgumentException("Must be a header tag: " + tagName);
1234 		}
1235 	}
1236 
1237 	private static class HeadingItem implements IdElement {
1238 		private final String id;
1239 		private final String text;
1240 		private final int headingIndex;
1241 
1242 		private final List<HeadingItem> children = new ArrayList<HeadingItem>();
1243 
1244 		public HeadingItem(String id, String text, int headingIndex) {
1245 			this.id = id;
1246 			this.text = text;
1247 			this.headingIndex = headingIndex;
1248 		}
1249 
1250 		@Override
1251 		public String getId() {
1252 			return id;
1253 		}
1254 
1255 		@Override
1256 		public String getText() {
1257 			return text;
1258 		}
1259 
1260 		@Override
1261 		public List<HeadingItem> getItems() {
1262 			return Collections.unmodifiableList(children);
1263 		}
1264 	}
1265 
1266 	/**
1267 	 * Representation of a HTML element with ID and a text content. Other such elements can be
1268 	 * nested within.
1269 	 * 
1270 	 * @author Marek Romanowski
1271 	 * @since 1.0
1272 	 */
1273 	public interface IdElement {
1274 
1275 		/**
1276 		 * Retrieves the ID of the HTML element (attribute {@code id})
1277 		 * 
1278 		 * @return element {@code id} value
1279 		 */
1280 		public String getId();
1281 
1282 		/**
1283 		 * Retrieves the text contents of the HTML element (rendered for display)
1284 		 * 
1285 		 * @return text contents of the element
1286 		 */
1287 		public String getText();
1288 
1289 		/**
1290 		 * Retrieves the children of the HTML element (nested within the element)
1291 		 * 
1292 		 * @return nested items within the element
1293 		 */
1294 		public List<? extends IdElement> getItems();
1295 	}
1296 	
1297 	
1298 	/**
1299 	 * A generic method to use jsoup parser on an arbitrary HTML body fragment. Allows writing
1300 	 * HTML manipulations in the template without adding Java code to the class.
1301 	 * 
1302 	 * @param content
1303 	 *            HTML content to parse
1304 	 * @return the wrapper element for the parsed content (i.e. the body element as if the content
1305 	 *         was body contents).
1306 	 * @since 1.0
1307 	 */
1308 	public static Element parseBodyFragment(String content) {
1309 
1310 		Document doc = Jsoup.parseBodyFragment(content);
1311 		return doc.body();
1312 	}
1313 	
1314 }