1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 package pl.matsuo.maven.skins.msb3;
17
18 import org.apache.velocity.tools.ToolContext;
19 import org.apache.velocity.tools.config.DefaultKey;
20 import org.apache.velocity.tools.generic.SafeConfig;
21 import org.apache.velocity.tools.generic.ValueParser;
22 import org.jsoup.Jsoup;
23 import org.jsoup.helper.StringUtil;
24 import org.jsoup.nodes.Document;
25 import org.jsoup.nodes.Element;
26 import org.jsoup.parser.Tag;
27
28 import java.text.Normalizer;
29 import java.text.Normalizer.Form;
30 import java.util.ArrayList;
31 import java.util.Arrays;
32 import java.util.Collection;
33 import java.util.Collections;
34 import java.util.HashSet;
35 import java.util.LinkedList;
36 import java.util.List;
37 import java.util.Locale;
38 import java.util.Map;
39 import java.util.Map.Entry;
40 import java.util.Set;
41 import java.util.Stack;
42 import java.util.regex.Pattern;
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57 @DefaultKey("htmlTool")
58 public class HtmlTool extends SafeConfig {
59
60
61 private static List<String> HEADINGS = Collections.unmodifiableList(
62 Arrays.asList("h1", "h2", "h3", "h4", "h5", "h6"));
63
64
65
66
/**
 * Decides what happens to a separator element when content is split into partitions.
 */
public enum JoinSeparator {

    /** The separator becomes the first element of the partition that follows it. */
    AFTER,

    /** The separator becomes the last element of the partition that precedes it. */
    BEFORE,

    /** The separator is left out of every partition. */
    NO
}
80
81 private String outputEncoding = "UTF-8";
82
83
84
85
86
87
88 @Override
89 protected void configure(ValueParser values) {
90
91
92 Object velocityContext = values.get("velocityContext");
93
94 if (!(velocityContext instanceof ToolContext)) {
95 return;
96 }
97
98 ToolContext ctxt = (ToolContext) velocityContext;
99
100
101 Object outputEncodingObj = ctxt.get("outputEncoding");
102 if (outputEncodingObj instanceof String) {
103 this.outputEncoding = (String) outputEncodingObj;
104 }
105 }
106
107
108
109
110
111
112
113
114
115
116
117
118
119 public List<String> split(String content, String separatorCssSelector) {
120 return split(content, separatorCssSelector, JoinSeparator.NO);
121 }
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140 public List<String> splitOnStarts(String content, String separatorCssSelector) {
141
142 List<String> result = split(content, separatorCssSelector, JoinSeparator.AFTER);
143
144 if (result == null || result.size() <= 1) {
145
146 return result;
147 }
148
149
150
151
152 return result.subList(1, result.size());
153 }
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170 public List<String> split(String content, String separatorCssSelector,
171 String separatorStrategy) {
172
173 JoinSeparator sepStrategy;
174 if ("before".equals(separatorStrategy)) {
175 sepStrategy = JoinSeparator.BEFORE;
176 } else if ("after".equals(separatorStrategy)) {
177 sepStrategy = JoinSeparator.AFTER;
178 } else {
179 sepStrategy = JoinSeparator.NO;
180 }
181
182 return split(content, separatorCssSelector, sepStrategy);
183 }
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205 public List<String> split(String content, String separatorCssSelector,
206 JoinSeparator separatorStrategy) {
207
208 Element body = parseContent(content);
209
210 List<Element> separators = body.select(separatorCssSelector);
211 if (separators.size() > 0) {
212 List<List<Element>> partitions = split(separators, separatorStrategy, body);
213
214 List<String> sectionHtml = new ArrayList<String>();
215
216 for (List<Element> partition : partitions) {
217 sectionHtml.add(outerHtml(partition));
218 }
219
220 return sectionHtml;
221 } else {
222
223 return Collections.singletonList(content);
224 }
225 }
226
227
228
229
230
231
232
233
234
235
236
237
238 private static List<List<Element>> split(Collection<Element> separators,
239 JoinSeparator separatorStrategy, Element parent) {
240
241 List<List<Element>> partitions = new LinkedList<List<Element>>();
242
243 for (Element child : parent.children()) {
244
245 if (separators.contains(child)) {
246
247
248
249
250 getLastPartition(partitions);
251
252 if (separatorStrategy == JoinSeparator.BEFORE) {
253
254 getLastPartition(partitions).add(child);
255 }
256
257
258 List<Element> newPartition = new LinkedList<Element>();
259 partitions.add(newPartition);
260
261 if (separatorStrategy == JoinSeparator.AFTER) {
262
263 newPartition.add(child);
264 }
265
266 } else {
267
268 List<List<Element>> childPartitions = split(separators, separatorStrategy, child);
269
270
271 getLastPartition(partitions).add(child);
272
273 if (childPartitions.size() > 1) {
274
275
276
277
278 List<Element> allChildren = child.children();
279 List<Element> firstPartition = childPartitions.get(0);
280
281 allChildren.removeAll(firstPartition);
282 for (Element removeChild : allChildren) {
283 removeChild.remove();
284 }
285
286
287 for (List<Element> nextPartition : childPartitions.subList(1, childPartitions.size())) {
288 partitions.add(nextPartition);
289 }
290 }
291 }
292 }
293
294 return partitions;
295 }
296
297
298
299
300
301
302
303
304 private static List<Element> getLastPartition(List<List<Element>> partitions) {
305 if (partitions.isEmpty()) {
306 List<Element> newPartition = new LinkedList<Element>();
307 partitions.add(newPartition);
308 return newPartition;
309 } else {
310 return partitions.get(partitions.size() - 1);
311 }
312 }
313
314
315
316
317
318
319
320 private static String outerHtml(List<Element> elements) {
321
322 switch (elements.size()) {
323 case 0:
324 return "";
325 case 1:
326 return elements.get(0).outerHtml();
327 default: {
328
329
330 Element root = new Element(Tag.valueOf("div"), "");
331 for (Element elem : elements) {
332 root.appendChild(elem);
333 }
334
335 return root.html();
336 }
337 }
338 }
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357 public String reorderToTop(String content, String selector, int amount) {
358 return reorderToTop(content, selector, amount, null);
359 }
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378 public String reorderToTop(String content, String selector, int amount,
379 String wrapRemaining) {
380
381
382 List<Element> extracted = extractElements(content, selector, amount);
383
384 if (extracted.size() > 1) {
385
386 Element body = extracted.get(0);
387
388 if (wrapRemaining != null) {
389 wrapInner(body, wrapRemaining);
390 }
391
392 List<Element> elements = extracted.subList(1, extracted.size());
393
394
395 for (int index = elements.size() - 1; index >= 0; index--) {
396 body.prependChild(elements.get(index));
397 }
398
399 return body.html();
400 } else {
401
402 return content;
403 }
404 }
405
406 private static Element wrapInner(Element element, String html) {
407
408
409
410 Element topDiv = new Element(Tag.valueOf("div"), "");
411 for (Element topElem : element.children()) {
412
413 topElem.remove();
414 topDiv.appendChild(topElem);
415 }
416
417
418 element.appendChild(topDiv);
419
420
421 topDiv.wrap(html);
422
423 topDiv.unwrap();
424
425 return element;
426 }
427
428
429
430
431
432
433
434
435
436
437 private List<Element> extractElements(String content, String selector, int amount) {
438
439 Element body = parseContent(content);
440
441 List<Element> elements = body.select(selector);
442 if (elements.size() > 0) {
443
444 elements = filterParents(elements);
445
446 if (amount >= 0) {
447
448 elements = elements.subList(0, Math.min(amount, elements.size()));
449 }
450
451
452 for (Element element : elements) {
453 element.remove();
454 }
455 }
456
457 List<Element> results = new ArrayList<Element>();
458
459 results.add(body);
460 results.addAll(elements);
461 return results;
462 }
463
464
465
466
467
468
469
470
471 private static List<Element> filterParents(List<Element> elements) {
472 List<Element> filtered = new ArrayList<Element>();
473 for (Element element : elements) {
474
475 List<Element> parentsInter = element.parents();
476 parentsInter.retainAll(elements);
477 if (parentsInter.isEmpty()) {
478
479 filtered.add(element);
480 }
481 }
482
483 return filtered;
484 }
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501 public ExtractResult extract(String content, String selector, int amount) {
502
503 List<Element> extracted = extractElements(content, selector, amount);
504
505 if (extracted.size() > 1) {
506
507
508 Element body = extracted.get(0);
509 List<Element> elements = extracted.subList(1, extracted.size());
510
511
512 List<String> elementStr = new ArrayList<String>();
513 for (Element el : elements) {
514 elementStr.add(el.outerHtml());
515 }
516
517 return new DefaultExtractResult(elementStr, body.html());
518 } else {
519
520 return new DefaultExtractResult(Collections.<String> emptyList(), content);
521 }
522 }
523
524
525
526
527
528
529
530
/**
 * Outcome of an extraction: the extracted pieces and whatever content is left over.
 */
public interface ExtractResult {

    /**
     * @return the HTML of the extracted elements
     */
    List<String> getExtracted();

    /**
     * @return the HTML that remains after the extraction
     */
    String getRemainder();
}
547
548 private static class DefaultExtractResult implements ExtractResult {
549 private final List<String> extracted;
550 private final String remainder;
551
552 public DefaultExtractResult(List<String> extracted, String remainder) {
553 this.extracted = extracted;
554 this.remainder = remainder;
555 }
556
557 @Override
558 public List<String> getExtracted() {
559 return Collections.unmodifiableList(extracted);
560 }
561
562 @Override
563 public String getRemainder() {
564 return remainder;
565 }
566 }
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584 public String setAttr(String content, String selector, String attributeKey, String value) {
585
586 Element body = parseContent(content);
587
588 List<Element> elements = body.select(selector);
589 if (elements.size() > 0) {
590
591 for (Element element : elements) {
592 element.attr(attributeKey, value);
593 }
594
595 return body.html();
596 } else {
597
598 return content;
599 }
600 }
601
602
603
604
605
606
607
608 private Element parseContent(String content) {
609 Document doc = Jsoup.parseBodyFragment(content);
610 doc.outputSettings().charset(outputEncoding);
611 return doc.body();
612 }
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628 public List<String> getAttr(String content, String selector, String attributeKey) {
629
630 Element body = parseContent(content);
631
632 List<Element> elements = body.select(selector);
633 List<String> attrs = new ArrayList<String>();
634
635 for (Element element : elements) {
636 String attrValue = element.attr(attributeKey);
637 attrs.add(attrValue);
638 }
639
640 return attrs;
641 }
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658 public String addClass(String content, String selector, List<String> classNames, int amount) {
659
660 Element body = parseContent(content);
661
662 List<Element> elements = body.select(selector);
663 if (amount >= 0) {
664
665 elements = elements.subList(0, Math.min(amount, elements.size()));
666 }
667
668 if (elements.size() > 0) {
669
670 for (Element element : elements) {
671 for (String className : classNames) {
672 element.addClass(className);
673 }
674 }
675
676 return body.html();
677 } else {
678
679 return content;
680 }
681 }
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696 public String addClass(String content, String selector, List<String> classNames) {
697 return addClass(content, selector, classNames, -1);
698 }
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713 public String addClass(String content, String selector, String className) {
714 return addClass(content, selector, Collections.singletonList(className));
715 }
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732 public String wrap(String content, String selector, String wrapHtml, int amount) {
733
734 Element body = parseContent(content);
735
736 List<Element> elements = body.select(selector);
737 if (amount >= 0) {
738
739 elements = elements.subList(0, Math.min(amount, elements.size()));
740 }
741
742 if (elements.size() > 0) {
743
744 for (Element element : elements) {
745 element.wrap(wrapHtml);
746 }
747
748 return body.html();
749 } else {
750
751 return content;
752 }
753 }
754
755
756
757
758
759
760
761
762
763
764
765
766 public String remove(String content, String selector) {
767
768 Element body = parseContent(content);
769
770 List<Element> elements = body.select(selector);
771 if (elements.size() > 0) {
772 for (Element element : elements) {
773 element.remove();
774 }
775
776 return body.html();
777 } else {
778
779 return content;
780 }
781 }
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796 public String replace(String content, String selector, String replacement) {
797 return replaceAll(content, Collections.singletonMap(selector, replacement));
798 }
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813 public String replaceAll(String content, Map<String, String> replacements) {
814
815 Element body = parseContent(content);
816
817 boolean modified = false;
818 for (Entry<String, String> replacementEntry : replacements.entrySet()) {
819 String selector = replacementEntry.getKey();
820 String replacement = replacementEntry.getValue();
821
822 List<Element> elements = body.select(selector);
823 if (elements.size() > 0) {
824
825
826 Element replacementElem = parseContent(replacement).child(0);
827
828 if (replacementElem != null) {
829 for (Element element : elements) {
830 element.replaceWith(replacementElem.clone());
831 }
832
833 modified = true;
834 }
835 }
836 }
837
838 if (modified) {
839 return body.html();
840 } else {
841
842 return content;
843 }
844 }
845
846
847
848
849
850
851
852
853
854
855
856
857 public List<String> text(String content, String selector) {
858
859 Element body = parseContent(content);
860
861 List<Element> elements = body.select(selector);
862 List<String> texts = new ArrayList<String>();
863
864 for (Element element : elements) {
865 texts.add(element.text());
866 }
867
868 return texts;
869 }
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890 public String headingAnchorToId(String content) {
891
892 Element body = parseContent(content);
893
894
895 List<String> headNoIds = concat(HEADINGS, ":not([id])", true);
896
897
898 String nameA = "a[name]:not([href])";
899
900
901 List<Element> headingsInnerA = body.select(StringUtil.join(
902 concat(headNoIds, ":has(" + nameA + ")", true), ", "));
903
904 boolean modified = false;
905 for (Element heading : headingsInnerA) {
906 List<Element> anchors = heading.select(nameA);
907
908 if (!anchors.isEmpty()) {
909 anchorToId(heading, anchors.get(0));
910 modified = true;
911 }
912 }
913
914
915 List<Element> headingsPreA = body.select(StringUtil.join(
916 concat(headNoIds, nameA + " + ", false), ", "));
917
918 for (Element heading : headingsPreA) {
919 Element anchor = heading.previousElementSibling();
920 if (anchor != null) {
921 anchorToId(heading, anchor);
922 modified = true;
923 }
924 }
925
926
927
928
929 List<Element> anchorsPreH = body.select(StringUtil.join(
930 concat(headNoIds, " + " + nameA, true), ", "));
931
932 for (Element anchor : anchorsPreH) {
933 Element heading = anchor.previousElementSibling();
934 if (heading != null) {
935 anchorToId(heading, anchor);
936 modified = true;
937 }
938 }
939
940 if (modified) {
941 return body.html();
942 } else {
943
944 return content;
945 }
946 }
947
948
949
950
951
952
953
954 private static void anchorToId(Element heading, Element anchor) {
955
956 if ("a".equals(anchor.tagName()) && heading.id().isEmpty()) {
957 String aName = anchor.attr("name");
958 if (!aName.isEmpty()) {
959
960 heading.attr("id", aName);
961
962
963 anchor.remove();
964 }
965 }
966 }
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983 public static List<String> concat(List<String> elements, String text, boolean append) {
984 List<String> concats = new ArrayList<String>();
985
986 for (String element : elements) {
987 concats.add(append ? element + text : text + element);
988 }
989
990 return concats;
991 }
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013 public String ensureHeadingIds(String content, String idSeparator) {
1014
1015 Element body = parseContent(content);
1016
1017
1018 List<Element> idElems = body.select("*[id]");
1019 Set<String> ids = new HashSet<String>();
1020 boolean modified = false;
1021 for (Element idElem : idElems) {
1022
1023
1024 String id = idElem.id();
1025 idElem.attr("id", adaptSlug(id, idSeparator));
1026 modified = true;
1027
1028 ids.add(idElem.id());
1029 }
1030
1031 List<String> headNoIds = concat(HEADINGS, ":not([id])", true);
1032
1033
1034 List<Element> headingsNoId = body.select(StringUtil.join(headNoIds, ", "));
1035
1036 if (!headingsNoId.isEmpty() || modified) {
1037 for (Element heading : headingsNoId) {
1038
1039 String headingText = heading.text();
1040 String headingSlug = slug(headingText, idSeparator);
1041
1042 if (headingSlug.length() > 50) {
1043 headingSlug = headingSlug.substring(0, 50);
1044 }
1045 String headingId = generateUniqueId(ids, headingSlug);
1046
1047 heading.attr("id", headingId);
1048 }
1049
1050 return body.html();
1051 } else {
1052
1053 return content;
1054 }
1055 }
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065 private static String generateUniqueId(Set<String> ids, String idBase) {
1066 String id = idBase;
1067 int counter = 1;
1068 while (ids.contains(id)) {
1069 id = idBase + String.valueOf(counter++);
1070 }
1071
1072
1073 ids.add(id);
1074 return id;
1075 }
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087 public String fixTableHeads(String content) {
1088
1089 Element body = parseContent(content);
1090
1091
1092 List<Element> tableHeadRows = body.select("table > tbody > tr:has(th)");
1093 if (tableHeadRows.size() > 0) {
1094 for (Element row : tableHeadRows) {
1095
1096
1097 Element table = row.parent().parent();
1098
1099
1100 row.remove();
1101
1102
1103 Element thead = new Element(Tag.valueOf("thead"), "");
1104 thead.appendChild(row);
1105
1106 table.prependChild(thead);
1107 }
1108
1109 return body.html();
1110 } else {
1111
1112 return content;
1113 }
1114 }
1115
1116
1117 private static final Pattern NONLATIN = Pattern.compile("[^\\w-]");
1118 private static final Pattern WHITESPACE = Pattern.compile("[\\s]");
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132 public static String slug(String input, String separator) {
1133 String slug = adaptSlug(input, separator);
1134 return slug.toLowerCase(Locale.ENGLISH);
1135 }
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146 public static String slug(String input) {
1147 return slug(input, "-");
1148 }
1149
1150
1151
1152
1153
1154
1155
1156
1157 private static String adaptSlug(String input, String separator) {
1158 String nowhitespace = WHITESPACE.matcher(input).replaceAll(separator);
1159 String normalized = Normalizer.normalize(nowhitespace, Form.NFD);
1160 return NONLATIN.matcher(normalized).replaceAll("");
1161 }
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179 public List<? extends IdElement> headingTree(String content) {
1180
1181 Element body = parseContent(content);
1182
1183 List<String> headIds = concat(HEADINGS, "[id]", true);
1184
1185
1186 List<Element> headings = body.select(StringUtil.join(headIds, ", "));
1187
1188 List<HeadingItem> headingItems = new ArrayList<HeadingItem>();
1189 for (Element heading : headings) {
1190 headingItems.add(new HeadingItem(heading.id(), heading.text(), headingIndex(heading)));
1191 }
1192
1193 List<HeadingItem> topHeadings = new ArrayList<HeadingItem>();
1194 Stack<HeadingItem> parentHeadings = new Stack<HeadingItem>();
1195
1196 for (HeadingItem heading : headingItems) {
1197
1198 while (!parentHeadings.isEmpty()
1199 && parentHeadings.peek().headingIndex >= heading.headingIndex) {
1200 parentHeadings.pop();
1201 }
1202
1203 if (parentHeadings.isEmpty()) {
1204
1205 topHeadings.add(heading);
1206 } else {
1207
1208 parentHeadings.peek().children.add(heading);
1209 }
1210
1211
1212 parentHeadings.push(heading);
1213 }
1214
1215 return topHeadings;
1216 }
1217
1218
1219
1220
1221
1222
1223
1224 private static int headingIndex(Element element) {
1225 String tagName = element.tagName();
1226 if (tagName.startsWith("h")) {
1227 try {
1228 return Integer.parseInt(tagName.substring(1));
1229 } catch (Exception ex) {
1230 throw new IllegalArgumentException("Must be a header tag: " + tagName, ex);
1231 }
1232 } else {
1233 throw new IllegalArgumentException("Must be a header tag: " + tagName);
1234 }
1235 }
1236
1237 private static class HeadingItem implements IdElement {
1238 private final String id;
1239 private final String text;
1240 private final int headingIndex;
1241
1242 private final List<HeadingItem> children = new ArrayList<HeadingItem>();
1243
1244 public HeadingItem(String id, String text, int headingIndex) {
1245 this.id = id;
1246 this.text = text;
1247 this.headingIndex = headingIndex;
1248 }
1249
1250 @Override
1251 public String getId() {
1252 return id;
1253 }
1254
1255 @Override
1256 public String getText() {
1257 return text;
1258 }
1259
1260 @Override
1261 public List<HeadingItem> getItems() {
1262 return Collections.unmodifiableList(children);
1263 }
1264 }
1265
1266
1267
1268
1269
1270
1271
1272
/**
 * An item identified by an id, with a text and nested items of the same kind.
 */
public interface IdElement {

    /**
     * @return the id of the item
     */
    String getId();

    /**
     * @return the text of the item
     */
    String getText();

    /**
     * @return the items nested under this one
     */
    List<? extends IdElement> getItems();
}
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308 public static Element parseBodyFragment(String content) {
1309
1310 Document doc = Jsoup.parseBodyFragment(content);
1311 return doc.body();
1312 }
1313
1314 }