1 package com.randomnoun.common;
2
3
4
5
6
7 import java.io.*;
8 import java.util.ArrayList;
9 import java.util.Arrays;
10 import java.util.HashSet;
11 import java.util.Iterator;
12 import java.util.List;
13 import java.util.Set;
14
15 import javax.xml.parsers.DocumentBuilder;
16 import javax.xml.parsers.DocumentBuilderFactory;
17 import javax.xml.parsers.ParserConfigurationException;
18 import javax.xml.parsers.SAXParser;
19 import javax.xml.parsers.SAXParserFactory;
20 import javax.xml.transform.OutputKeys;
21 import javax.xml.transform.Transformer;
22 import javax.xml.transform.TransformerConfigurationException;
23 import javax.xml.transform.TransformerException;
24 import javax.xml.transform.TransformerFactory;
25 import javax.xml.transform.dom.DOMSource;
26 import javax.xml.transform.stream.StreamResult;
27
28 import org.ccil.cowan.tagsoup.*;
29 import org.ccil.cowan.tagsoup.Parser;
30
31 import org.w3c.dom.*;
32 import org.w3c.dom.Element;
33 import org.xml.sax.*;
34
35 import org.apache.log4j.Logger;
36
37
38
39
40
41
42 public class XmlUtil {
43
44
45
46
47
48
49
50
51
52
53 public static String getCleanXml(String inputXml, boolean isHtml) throws SAXException {
54 return getCleanXml(new ByteArrayInputStream(inputXml.getBytes()), isHtml);
55 }
56
57
58
59
60
61
62
63
64
65
66 public static String getCleanXml(InputStream inputStream, boolean isHtml) throws SAXException {
67 try {
68 ByteArrayOutputStream baos = new ByteArrayOutputStream();
69 InputSource is = new InputSource();
70 is.setByteStream(inputStream);
71
72 XMLReader xmlReader = new Parser();
73 Writer w = new OutputStreamWriter(baos);
74 XMLWriter tagsoupXMLWriter = new XMLWriter(w);
75 tagsoupXMLWriter.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, "yes");
76 if (isHtml) {
77 HTMLSchema theSchema = new HTMLSchema();
78 xmlReader.setProperty(Parser.schemaProperty, theSchema);
79
80 tagsoupXMLWriter.setOutputProperty(XMLWriter.METHOD, "html");
81 tagsoupXMLWriter.setPrefix(theSchema.getURI(), "");
82 }
83
84 xmlReader.setContentHandler(tagsoupXMLWriter);
85 xmlReader.parse(is);
86 return baos.toString("UTF-8");
87 } catch (IOException ioe) {
88 throw (IllegalStateException) new IllegalStateException("IO Exception reading from string").initCause(ioe);
89 }
90 }
91
92
93
94
95
96
97
98
99
100
101
102 public static String getText(Element element)
103 {
104 if (element == null) { throw new NullPointerException("null element"); }
105 StringBuffer buf = new StringBuffer();
106 NodeList children = element.getChildNodes();
107 for (int i = 0; i < children.getLength(); ++i) {
108 org.w3c.dom.Node child = children.item(i);
109 short nodeType = child.getNodeType();
110 if (nodeType == org.w3c.dom.Node.CDATA_SECTION_NODE) {
111 buf.append(((org.w3c.dom.Text) child).getData());
112 } else if (nodeType == org.w3c.dom.Node.TEXT_NODE) {
113 buf.append(((org.w3c.dom.Text) child).getData());
114 } else if (nodeType == org.w3c.dom.Node.ELEMENT_NODE) {
115 buf.append(getText((Element) child));
116 }
117 }
118 return buf.toString();
119 }
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137 public static String getTextPreserveElements(Element element, String[] tagNames) {
138 if (element == null) { throw new NullPointerException("null element"); }
139 Set<String> tagNamesSet = new HashSet<String>(Arrays.asList(tagNames));
140 StringBuffer buf = new StringBuffer();
141 NodeList children = element.getChildNodes();
142 for (int i = 0; i < children.getLength(); ++i) {
143 org.w3c.dom.Node child = children.item(i);
144 short nodeType = child.getNodeType();
145 if (nodeType == org.w3c.dom.Node.CDATA_SECTION_NODE) {
146 buf.append(((org.w3c.dom.Text) child).getData());
147 } else if (nodeType == org.w3c.dom.Node.TEXT_NODE) {
148 buf.append(((org.w3c.dom.Text) child).getData());
149 } else if (nodeType == org.w3c.dom.Node.ELEMENT_NODE) {
150 String tagName = ((Element) child).getTagName();
151 boolean includeEl = tagNamesSet.contains(tagName);
152 if (includeEl) {
153 buf.append('<');
154 buf.append(tagName);
155 NamedNodeMap nnm = ((Element) child).getAttributes();
156 for (int j = 0; j < nnm.getLength(); j++) {
157 Attr attr = (Attr) nnm.item(j);
158 buf.append(" " + attr.getName());
159 if (attr.getValue()!=null) {
160 buf.append("=\"" + attr.getValue() + "\"");
161 }
162 }
163 buf.append('>');
164 }
165 buf.append(getTextPreserveElements((Element) child, tagNames));
166 if (includeEl) {
167 buf.append("</" + tagName + ">");
168 }
169 }
170 }
171 return buf.toString();
172 }
173
174
175
176
177
178
179
180
181
182
183
184
185 public static String getTextNonRecursive(Element element)
186 {
187 if (element == null) { throw new NullPointerException("null element"); }
188 StringBuffer buf = new StringBuffer();
189 NodeList children = element.getChildNodes();
190 for (int i = 0; i < children.getLength(); ++i) {
191 org.w3c.dom.Node child = children.item(i);
192 short nodeType = child.getNodeType();
193 if (nodeType == org.w3c.dom.Node.CDATA_SECTION_NODE) {
194 buf.append(((org.w3c.dom.Text) child).getData());
195 } else if (nodeType == org.w3c.dom.Node.TEXT_NODE) {
196 buf.append(((org.w3c.dom.Text) child).getData());
197 } else if (nodeType == org.w3c.dom.Node.ELEMENT_NODE) {
198
199 }
200 }
201 return buf.toString();
202 }
203
204
205
206
207
208 public static Document toDocument(String text) throws SAXException {
209 return toDocument(new ByteArrayInputStream(text.getBytes()));
210 }
211
212
213
214
215
216 public static Document toDocument(InputStream is) throws SAXException {
217 try {
218 DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
219 DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
220 Document doc = docBuilder.parse(is);
221 doc.getDocumentElement().normalize();
222 return doc;
223 } catch (ParserConfigurationException pce) {
224
225 throw (IllegalStateException) new IllegalStateException("Error creating DOM parser").initCause(pce);
226 } catch (IOException ioe) {
227
228 throw (IllegalStateException) new IllegalStateException("Error retrieving information").initCause(ioe);
229 }
230 }
231
232
233
234
235
236
237
238
239
240
241
242 public static String getXmlString(Node node, boolean omitXmlDeclaration)
243 throws TransformerException
244 {
245 try {
246 ByteArrayOutputStream baos = new ByteArrayOutputStream();
247 TransformerFactory transformerFactory = TransformerFactory.newInstance();
248 Transformer transformer = transformerFactory.newTransformer();
249 DOMSource source = new DOMSource(node);
250 StreamResult result = new StreamResult(baos);
251 transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, omitXmlDeclaration ? "yes": "no");
252 transformer.transform(source, result);
253 return baos.toString("UTF-8");
254 } catch (TransformerConfigurationException tce) {
255 throw (IllegalStateException) new IllegalStateException("Could not initialise transformer").initCause(tce);
256 } catch (UnsupportedEncodingException uee) {
257 throw (IllegalStateException) new IllegalStateException("Unknown charset UTF-8").initCause(uee);
258 }
259 }
260
261
262
263
264
265
266
267 public static void compact(Node node) {
268 if (node.getNodeType()==Node.TEXT_NODE) {
269 org.w3c.dom.Text el = (org.w3c.dom.Text) node;
270 if (el.getNodeValue()!=null) {
271 el.setNodeValue(el.getNodeValue().trim());
272 }
273 } else if (node.getNodeType()==Node.ELEMENT_NODE) {
274 NodeList childNodes = node.getChildNodes();
275 if (childNodes != null && childNodes.getLength() > 0) {
276 int len = childNodes.getLength();
277 for (int i=0; i<len; i++) {
278 Node childNode = childNodes.item(i);
279 compact(childNode);
280 }
281 }
282 }
283 }
284
285
286
287
288
289
290
291
292
293
294
295
296
297 public static void processContentHandler(ContentHandler contentHandler, String xmlText) throws SAXException, IllegalStateException {
298 SAXParserFactory factory = SAXParserFactory.newInstance();
299 try {
300
301 SAXParser saxParser = factory.newSAXParser();
302 XMLReader xmlReader = saxParser.getXMLReader();
303 xmlReader.setContentHandler(contentHandler);
304 xmlReader.parse(new InputSource(new ByteArrayInputStream(xmlText.getBytes())));
305 } catch (IOException ioe) {
306 throw (IllegalStateException) new IllegalStateException("IO Exception reading from string").initCause(ioe);
307 } catch (ParserConfigurationException pce) {
308 throw (IllegalStateException) new IllegalStateException("Could not initialise parser").initCause(pce);
309 }
310 }
311
312
313
314
315
316
317
318 public static class SimpleTableContentHandler
319 implements ContentHandler
320 {
321
322 public static final Logger logger = Logger.getLogger(SimpleTableContentHandler.class);
323
324
325 List<List<String>> thisTable = null;
326
327 List<String> thisRow = null;
328
329 String thisCell = "";
330
331
332 private enum State {
333
334 START,
335
336 IN_TABLE,
337
338 IN_TR,
339
340 IN_TD
341 }
342
343 State state = State.START;
344
345
346 public void setDocumentLocator(Locator locator) { }
347 public void startDocument() throws SAXException { }
348 public void endDocument() throws SAXException { }
349 public void startPrefixMapping(String prefix, String uri) throws SAXException { }
350 public void endPrefixMapping(String prefix) throws SAXException { }
351 public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { }
352 public void processingInstruction(String target, String data) throws SAXException { }
353 public void skippedEntity(String name) throws SAXException { }
354
355
356 public void startElement(String uri, String localName, String qName, Attributes atts)
357 throws SAXException
358 {
359 switch (state) {
360 case START:
361 if (qName.equals("table")) {
362 thisTable = new ArrayList<List<String>>();
363 state = State.IN_TABLE;
364 } else {
365 logger.warn("Warning: top-level element '" + qName + "' found (expected 'table')");
366 }
367 break;
368
369 case IN_TABLE:
370 if (qName.equals("tr")) {
371 thisRow = new ArrayList<String>();
372 thisTable.add(thisRow);
373 state = State.IN_TR;
374 }
375 break;
376
377 case IN_TR:
378 if (qName.equals("td")) {
379 thisCell = "";
380 state = State.IN_TD;
381 }
382 break;
383
384 case IN_TD:
385 break;
386
387 default:
388 throw new IllegalStateException("Illegal state " + state + " in SimpleTableContentHandler");
389
390 }
391 }
392
393 public void characters(char[] ch, int start, int length)
394 throws SAXException {
395 if (state==State.IN_TD) {
396 thisCell += new String(ch, start, length);
397 }
398 }
399
400 public void endElement(String uri, String localName, String qName)
401 throws SAXException
402 {
403 if (state == State.IN_TD && qName.equals("td")) {
404 thisRow.add(thisCell);
405 state = State.IN_TR;
406 } else if (state == State.IN_TR && qName.equals("tr")) {
407 state = State.IN_TABLE;
408 }
409 }
410
411 public List<List<String>> getTable() {
412 return thisTable;
413 }
414 }
415
416
417
418
419
420
421
422
423
424 public abstract static class AbstractStackContentHandler implements ContentHandler
425 {
426
427 public static final Logger logger = Logger.getLogger(AbstractStackContentHandler.class);
428
429
430 protected String stack = "";
431 protected String text = null;
432
433
434 public void setDocumentLocator(Locator locator) { }
435 public void startDocument() throws SAXException { }
436 public void endDocument() throws SAXException { }
437 public void startPrefixMapping(String prefix, String uri) throws SAXException { }
438 public void endPrefixMapping(String prefix) throws SAXException { }
439 public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { }
440 public void processingInstruction(String target, String data) throws SAXException { }
441 public void skippedEntity(String name) throws SAXException { }
442
443 public void startElement(String uri, String localName, String qName, Attributes atts)
444 throws SAXException
445 {
446 stack = stack.equals("") ? qName : stack + "/" + qName;
447 text = "";
448 element(stack);
449 }
450 public void characters(char[] ch, int start, int length) throws SAXException {
451 text += new String(ch, start, length);
452 }
453 public void endElement(String uri, String localName, String qName)
454 throws SAXException
455 {
456 elementText(stack, text);
457 text = "";
458 stack = stack.contains("/") ? stack.substring(0, stack.lastIndexOf("/")) : "";
459 }
460
461
462 public abstract void element(String path) throws SAXException;
463 public abstract void elementText(String path, String content) throws SAXException;
464 }
465
466
467
468
469
470
471
472
473
474 public static class NodeListIterator implements Iterable<org.w3c.dom.Node> {
475 private final NodeList nodeList;
476 public NodeListIterator(NodeList nodeList) {
477 this.nodeList = nodeList;
478 }
479 public Iterator<org.w3c.dom.Node> iterator() {
480 return new Iterator<org.w3c.dom.Node>() {
481 private int index = 0;
482 public boolean hasNext() {
483 return index < nodeList.getLength();
484 }
485 public org.w3c.dom.Node next() {
486 return nodeList.item(index++);
487 }
488 public void remove() {
489 throw new UnsupportedOperationException("remove() not allowed in NodeList");
490 }
491 };
492 }
493 }
494
495
496 public static class ContentHandlerTraverser {
497
498 public void traverse(Node node, ContentHandler handler) throws SAXException {
499 handler.startDocument();
500 traverseNode(node, handler);
501 handler.endDocument();
502 }
503
504 private void traverseNode(Node node, ContentHandler handler) throws SAXException {
505 switch (node.getNodeType()) {
506 case Node.DOCUMENT_NODE:
507 Document doc = (Document) node;
508 traverseChildren(doc.getDocumentElement(), handler);
509 break;
510
511 case Node.ELEMENT_NODE:
512 Element elem = (Element) node;
513 AttributesImpl attrs = new AttributesImpl();
514 NamedNodeMap map = elem.getAttributes();
515 for (int i = 0; i < map.getLength(); i++) {
516 Attr a = (Attr) map.item(i);
517 attrs.addAttribute(
518 a.getNamespaceURI() == null ? "" : a.getNamespaceURI(),
519 a.getLocalName() == null ? a.getName() : a.getLocalName(),
520 a.getName(),
521 "CDATA",
522 a.getValue()
523 );
524 }
525 handler.startElement(
526 elem.getNamespaceURI() == null ? "" : elem.getNamespaceURI(),
527 elem.getLocalName() == null ? elem.getNodeName() : elem.getLocalName(),
528 elem.getNodeName(),
529 attrs
530 );
531
532 traverseChildren(elem, handler);
533
534 handler.endElement(
535 elem.getNamespaceURI() == null ? "" : elem.getNamespaceURI(),
536 elem.getLocalName() == null ? elem.getNodeName() : elem.getLocalName(),
537 elem.getNodeName()
538 );
539 break;
540
541 case Node.TEXT_NODE:
542 String text = ((org.w3c.dom.Text) node).getData();
543 char[] chars = text.toCharArray();
544 handler.characters(chars, 0, chars.length);
545 break;
546
547 case Node.CDATA_SECTION_NODE:
548 String cdata = ((CDATASection) node).getData();
549 char[] cchars = cdata.toCharArray();
550 handler.characters(cchars, 0, cchars.length);
551 break;
552
553 case Node.COMMENT_NODE:
554
555 break;
556 }
557 }
558
559 private void traverseChildren(Node parent, ContentHandler handler) throws SAXException {
560 Node child = parent.getFirstChild();
561 while (child != null) {
562 traverseNode(child, handler);
563 child = child.getNextSibling();
564 }
565 }
566 }
567
568
569
570 }