View Javadoc
1   package com.randomnoun.common;
2   
3   /* (c) 2013 randomnoun. All Rights Reserved. This work is licensed under a
4    * BSD Simplified License. (http://www.randomnoun.com/bsd-simplified.html)
5    */
6   
7   import java.io.*;
8   import java.util.ArrayList;
9   import java.util.Arrays;
10  import java.util.HashSet;
11  import java.util.Iterator;
12  import java.util.List;
13  import java.util.Set;
14  
15  import javax.xml.parsers.DocumentBuilder;
16  import javax.xml.parsers.DocumentBuilderFactory;
17  import javax.xml.parsers.ParserConfigurationException;
18  import javax.xml.parsers.SAXParser;
19  import javax.xml.parsers.SAXParserFactory;
20  import javax.xml.transform.OutputKeys;
21  import javax.xml.transform.Transformer;
22  import javax.xml.transform.TransformerConfigurationException;
23  import javax.xml.transform.TransformerException;
24  import javax.xml.transform.TransformerFactory;
25  import javax.xml.transform.dom.DOMSource;
26  import javax.xml.transform.stream.StreamResult;
27  
28  import org.ccil.cowan.tagsoup.*;
29  import org.ccil.cowan.tagsoup.Parser;
30  
31  import org.w3c.dom.*;
32  import org.w3c.dom.Element;
33  import org.xml.sax.*;
34  
35  import org.apache.log4j.Logger;
36  
37  /** XML utility functions
38   *
39   * @author knoxg
40   * @see <a href="http://www.randomnoun.com/wp/2013/01/25/exciting-things-with-xml/">http://www.randomnoun.com/wp/2013/01/25/exciting-things-with-xml/</a>
41   */
42  public class XmlUtil {
43  	
44  	/** Clean some HTML text through the tagsoup filter. The returned string is guaranteed to be 
45  	 * well-formed XML (and can therefore be used by other tools that expect valid XML). 
46  	 * 
47  	 * @param inputXml input XML document
48  	 * @param isHtml if true, uses the HTML schema, omits the XML declaration, and uses the html method
49  	 * 
50  	 * @throws SAXException if the tagsoup library could not parse the input string
51  	 * @throws IllegalStateException if an error occurred reading from a string (should never occur)
52  	 */ 
53  	public static String getCleanXml(String inputXml, boolean isHtml) throws SAXException {
54  		return getCleanXml(new ByteArrayInputStream(inputXml.getBytes()), isHtml);
55  	}
56  	
57  	/** Clean a HTML inputStream through the tagsoup filter. The returned string is guaranteed to be 
58  	 * well-formed XML (and can therefore be used by other tools that expect valid XML). 
59  	 * 
60  	 * @param inputStream input XML stream
61  	 * @param isHtml if true, uses the HTML schema, omits the XML declaration, and uses the html method
62  	 * 
63  	 * @throws SAXException if the tagsoup library could not parse the input string
64  	 * @throws IllegalStateException if an error occurred reading from a string (should never occur)
65  	 */ 
66  	public static String getCleanXml(InputStream inputStream, boolean isHtml) throws SAXException {
67  		try {
68  			ByteArrayOutputStream baos = new ByteArrayOutputStream();
69  			InputSource is = new InputSource();
70  			is.setByteStream(inputStream); // could use raw inputstream here later
71  
72  			XMLReader xmlReader = new Parser();
73  			Writer w = new OutputStreamWriter(baos);
74  			XMLWriter tagsoupXMLWriter = new XMLWriter(w);
75  			tagsoupXMLWriter.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, "yes");
76  			if (isHtml) {
77  				HTMLSchema theSchema = new HTMLSchema();
78  				xmlReader.setProperty(Parser.schemaProperty, theSchema);
79  	
80  				tagsoupXMLWriter.setOutputProperty(XMLWriter.METHOD, "html");
81  				tagsoupXMLWriter.setPrefix(theSchema.getURI(), "");
82  			}
83  			
84  			xmlReader.setContentHandler(tagsoupXMLWriter);
85  			xmlReader.parse(is);
86  			return baos.toString("UTF-8");
87  		} catch (IOException ioe) {
88  			throw (IllegalStateException) new IllegalStateException("IO Exception reading from string").initCause(ioe);		
89  		}
90  	}
91  
92  
93  	/**
94  	 * Iterates through the child nodes of the specified element, and returns the contents
95  	 * of all Text and CDATA elements among those nodes, concatenated into a string.
96  	 *
97  	 * <p>Elements are recursed into.
98  	 *
99  	 * @param element the element that contains, as child nodes, the text to be returned.
100 	 * @return the contents of all the CDATA children of the specified element.
101 	 */
102 	public static String getText(Element element)
103 	{
104 		if (element == null) { throw new NullPointerException("null element"); }
105 		StringBuffer buf = new StringBuffer();
106 		NodeList children = element.getChildNodes();
107 		for (int i = 0; i < children.getLength(); ++i) {
108 			org.w3c.dom.Node child = children.item(i);
109 			short nodeType = child.getNodeType();
110 			if (nodeType == org.w3c.dom.Node.CDATA_SECTION_NODE) {
111 				buf.append(((org.w3c.dom.Text) child).getData());			
112 			} else if (nodeType == org.w3c.dom.Node.TEXT_NODE) {
113 				buf.append(((org.w3c.dom.Text) child).getData());
114 			} else if (nodeType == org.w3c.dom.Node.ELEMENT_NODE) {
115 				buf.append(getText((Element) child));
116 			}
117 		}
118 		return buf.toString();
119 	}
120 
121 	/**
122 	 * Iterates through the child nodes of the specified element, and returns the contents
123 	 * of all Text and CDATA elements among those nodes, concatenated into a string. 
124 	 * Any elements with tagNames that are included in the tagNames parameter of this
125 	 * method are also included. 
126 	 * 
127 	 * <p>Attributes of these tags are also included in the result, but may be reordered.
128 	 * 
129 	 * <p>Self-closing elements (e.g. <code>&lt;br/&gt;</code>)
130 	 * are expanded into opening and closing elements (e.g. <code>&lt;br&gt;&lt;/br&gt;</code>)
131 	 *
132 	 * <p>Elements are recursed into.
133 	 *
134 	 * @param element the element that contains, as child nodes, the text to be returned.
135 	 * @return the contents of all the CDATA children of the specified element.
136 	 */
137 	public static String getTextPreserveElements(Element element, String[] tagNames) {
138 		if (element == null) { throw new NullPointerException("null element"); }
139 		Set<String> tagNamesSet = new HashSet<String>(Arrays.asList(tagNames));
140 		StringBuffer buf = new StringBuffer();
141 		NodeList children = element.getChildNodes();
142 		for (int i = 0; i < children.getLength(); ++i) {
143 			org.w3c.dom.Node child = children.item(i);
144 			short nodeType = child.getNodeType();
145 			if (nodeType == org.w3c.dom.Node.CDATA_SECTION_NODE) {
146 				buf.append(((org.w3c.dom.Text) child).getData());			
147 			} else if (nodeType == org.w3c.dom.Node.TEXT_NODE) {
148 				buf.append(((org.w3c.dom.Text) child).getData());
149 			} else if (nodeType == org.w3c.dom.Node.ELEMENT_NODE) {
150 				String tagName = ((Element) child).getTagName();
151 				boolean includeEl = tagNamesSet.contains(tagName);
152 				if (includeEl) {
153 					buf.append('<');
154 					buf.append(tagName);
155 					NamedNodeMap nnm = ((Element) child).getAttributes();
156 					for (int j = 0; j < nnm.getLength(); j++) {
157 						Attr attr = (Attr) nnm.item(j);
158 						buf.append(" " + attr.getName());
159 						if (attr.getValue()!=null) {
160 							buf.append("=\"" + attr.getValue() + "\"");
161 						}
162 					}
163 					buf.append('>');
164 				}
165 				buf.append(getTextPreserveElements((Element) child, tagNames));
166 				if (includeEl) {
167 					buf.append("</" + tagName + ">");
168 				}
169 			}
170 		}
171 		return buf.toString();
172 	}	
173 
174 
175 	
176 	/**
177 	 * Iterates through the child nodes of the specified element, and returns the contents
178 	 * of all Text and CDATA elements among those nodes, concatenated into a string.
179 	 * 
180 	 * <p>Elements are not recursed into.
181 	 *
182 	 * @param element the element that contains, as child nodes, the text to be returned.
183 	 * @return the contents of all the CDATA children of the specified element.
184 	 */
185 	public static String getTextNonRecursive(Element element)
186 	{
187 		if (element == null) { throw new NullPointerException("null element"); }
188 		StringBuffer buf = new StringBuffer();
189 		NodeList children = element.getChildNodes();
190 		for (int i = 0; i < children.getLength(); ++i) {
191 			org.w3c.dom.Node child = children.item(i);
192 			short nodeType = child.getNodeType();
193 			if (nodeType == org.w3c.dom.Node.CDATA_SECTION_NODE) {
194 				buf.append(((org.w3c.dom.Text) child).getData());			
195 			} else if (nodeType == org.w3c.dom.Node.TEXT_NODE) {
196 				buf.append(((org.w3c.dom.Text) child).getData());
197 			} else if (nodeType == org.w3c.dom.Node.ELEMENT_NODE) {
198 				// ignore child elements
199 			}
200 		}
201 		return buf.toString();
202 	}
203 	
204 	/** Return a DOM document object from an XML string
205 	 * 
206 	 * @param text the string representation of the XML to parse 
207 	 */
208 	public static Document toDocument(String text) throws SAXException {
209 		return toDocument(new ByteArrayInputStream(text.getBytes()));
210 	}
211 	
212 	/** Return a DOM document object from an InputStream
213 	 * 
214 	 * @param is the InputStream containing the XML to parse 
215 	 */
216 	public static Document toDocument(InputStream is) throws SAXException {
217 		try {
218 			DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
219 			DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
220 			Document doc = docBuilder.parse(is);
221 			doc.getDocumentElement().normalize(); // Collapses adjacent text nodes into one node.
222 			return doc;
223 		} catch (ParserConfigurationException pce) {
224 			// this can never happen 
225 			throw (IllegalStateException) new IllegalStateException("Error creating DOM parser").initCause(pce);
226 		} catch (IOException ioe) {
227 			// this can also never happen
228 			throw (IllegalStateException) new IllegalStateException("Error retrieving information").initCause(ioe);
229 		} 
230 	}
231 	
232 	/** Converts a document node subtree back into an XML string 
233 	 * 
234 	 * @param node a DOM node 
235 	 * @param omitXmlDeclaration if true, omits the XML declaration from the returned result
236 	 * 
237 	 * @return the XML for this node
238 	 * 
239 	 * @throws TransformerException if the transformation to XML failed
240 	 * @throws IllegalStateException if the transformer could not be initialised 
241 	 */
242 	public static String getXmlString(Node node, boolean omitXmlDeclaration) 
243 		throws TransformerException 
244 	{
245 		try {
246 			ByteArrayOutputStream baos = new ByteArrayOutputStream();
247 			TransformerFactory transformerFactory = TransformerFactory.newInstance();
248 			Transformer transformer = transformerFactory.newTransformer();
249 			DOMSource source = new DOMSource(node);
250 			StreamResult result = new StreamResult(baos);
251 			transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, omitXmlDeclaration ? "yes": "no");
252 			transformer.transform(source, result);
253 			return baos.toString("UTF-8");
254 		} catch (TransformerConfigurationException tce) {
255 			throw (IllegalStateException) new IllegalStateException("Could not initialise transformer").initCause(tce);
256 		} catch (UnsupportedEncodingException uee) {
257 			throw (IllegalStateException) new IllegalStateException("Unknown charset UTF-8").initCause(uee);
258 		}
259 	}
260 	
261 
262 	/** Remove leading/trailing whitespace from all text nodes in this nodeList.
263 	 * Will iterate through subnodes recursively.
264 	 * 
265 	 * @param node
266 	 */
267 	public static void compact(Node node) {
268 		if (node.getNodeType()==Node.TEXT_NODE) {
269 			org.w3c.dom.Text el = (org.w3c.dom.Text) node;
270 			if (el.getNodeValue()!=null) {
271 				el.setNodeValue(el.getNodeValue().trim());
272 			}
273 		} else if (node.getNodeType()==Node.ELEMENT_NODE) {
274 			NodeList childNodes = node.getChildNodes();
275 			if (childNodes != null && childNodes.getLength() > 0) {
276 				int len = childNodes.getLength();
277 				for (int i=0; i<len; i++) {
278 					Node childNode = childNodes.item(i);
279 				    compact(childNode);
280 				}
281 			}
282 		}
283 	}
284 	
285 	
286 	/** Parse a string of XML text using a SAX contentHandler. Nothing is returned by this method - it 
287 	 * is assumed that the contentHandler supplied maintains it's own state as it parses the XML supplied,
288 	 * and that this state can be extracted from this object afterwards.
289 	 * 
290 	 * @param contentHandler a SAX content handler 
291 	 * @param xmlText an XML document (or part thereof)
292 	 * 
293 	 * @throws SAXException if the document could not be parsed
294 	 * @throws IllegalStateException if the parser could not be initialised, or an I/O error occurred 
295 	 *   (should not happen since we're just dealing with strings)
296 	 */
297 	public static void processContentHandler(ContentHandler contentHandler, String xmlText) throws SAXException, IllegalStateException {
298 		 SAXParserFactory factory = SAXParserFactory.newInstance();
299 		 try {
300 			 // Parse the input
301 			 SAXParser saxParser = factory.newSAXParser();
302 			 XMLReader xmlReader = saxParser.getXMLReader();
303 			 xmlReader.setContentHandler(contentHandler);
304 			 xmlReader.parse(new InputSource(new ByteArrayInputStream(xmlText.getBytes())));
305 		 } catch (IOException ioe) {
306 		 	throw (IllegalStateException) new IllegalStateException("IO Exception reading from string").initCause(ioe);
307 		 } catch (ParserConfigurationException pce) {
308 			throw (IllegalStateException) new IllegalStateException("Could not initialise parser").initCause(pce);		 		
309 		 }
310 	}
311 	
312 	/** Convert a table into a List of Lists (each top-level list represents a table row,
313 	 * each second-level list represents a table cell). Only contents are returned; attributes
314 	 * and formatting are ignored.
315 	 * 
316 	 * <p>This class will probably not work when tables are embedded within other tables
317 	 */
318 	public static class SimpleTableContentHandler
319 		implements ContentHandler 
320 	{
321 		/** Logger instance for this class */
322 		public static final Logger logger = Logger.getLogger(SimpleTableContentHandler.class);
323 
324 		/** Current table */
325 		List<List<String>> thisTable = null;
326 		/** Current row in table */
327 		List<String> thisRow = null;
328 		/** Current cell in row */
329 		String thisCell = "";
330 
331 		/** The state of this parser */
332 		private enum State {
333 			/** start of doc, expecting 'table' */
334 			START,
335 			/** in table element, expecting 'tr' */
336 			IN_TABLE,
337 			/** in tr element, expecting 'td' (or other ignored elements) */
338 			IN_TR,
339 			/** in td element, capturing to closing tag */
340 			IN_TD
341 		}
342 
343 		State state = State.START;
344 		
345 		// unused interface methods
346 		public void setDocumentLocator(Locator locator) { }
347 		public void startDocument() throws SAXException { }
348 		public void endDocument() throws SAXException { }
349 		public void startPrefixMapping(String prefix, String uri) throws SAXException { }
350 		public void endPrefixMapping(String prefix) throws SAXException { }
351 		public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { }
352 		public void processingInstruction(String target, String data) throws SAXException { }
353 		public void skippedEntity(String name) throws SAXException { }
354 
355 
356 		public void startElement(String uri, String localName, String qName, Attributes atts)
357 			throws SAXException 
358 		{
359 			switch (state) {
360 				case START: 
361 					if (qName.equals("table")) {
362 						thisTable = new ArrayList<List<String>>(); 
363 						state = State.IN_TABLE; 
364 					} else {
365 						logger.warn("Warning: top-level element '" + qName + "' found (expected 'table')");
366 					}
367 					break;
368 				
369 				case IN_TABLE:
370 					if (qName.equals("tr")) {
371 						thisRow = new ArrayList<String>();
372 						thisTable.add(thisRow);
373 						state = State.IN_TR;
374 					}
375 					break;
376 					
377 				case IN_TR: 
378 					if (qName.equals("td")) {
379 						thisCell = "";
380 						state = State.IN_TD;
381 					}
382 					break;
383 					
384 				case IN_TD:
385 					break;
386 					
387 				default:
388 					throw new IllegalStateException("Illegal state " + state + " in SimpleTableContentHandler");
389 				
390 			}
391 		}
392 
393 		public void characters(char[] ch, int start, int length)
394 			throws SAXException {
395 			if (state==State.IN_TD) {
396 				thisCell += new String(ch, start, length);
397 			}
398 		}
399 
400 		public void endElement(String uri, String localName, String qName)
401 			throws SAXException 
402 		{
403 			if (state == State.IN_TD && qName.equals("td")) {
404 				thisRow.add(thisCell);
405 				state = State.IN_TR;
406 			} else if (state == State.IN_TR && qName.equals("tr")) {
407 				state = State.IN_TABLE;
408 			}
409 		}
410 	
411 		public List<List<String>> getTable() {
412 			return thisTable;
413 		}
414 	}
415 	
416 	/** An abstract stack-based XML parser. Similar to the apache digester, but without
417 	 * the dozen or so dependent JARs.
418 	 * 
419 	 * <p>Only element text is captured 
420 	 * <p>Element attributes are not parsed by this class.
421 	 * <p>Mixed text/element nodes are not parsed by this class.
422 	 * 
423 	 */
424 	public abstract static class AbstractStackContentHandler implements ContentHandler 
425 	{
426 		/** Logger instance for this class */
427 		public static final Logger logger = Logger.getLogger(AbstractStackContentHandler.class);
428 
429 		/** Location in stack */
430 		protected String stack = "";
431 		protected String text = null;     // text captured so far
432 		
433 		// unused interface methods
434 		public void setDocumentLocator(Locator locator) { }
435 		public void startDocument() throws SAXException { }
436 		public void endDocument() throws SAXException { }
437 		public void startPrefixMapping(String prefix, String uri) throws SAXException { }
438 		public void endPrefixMapping(String prefix) throws SAXException { }
439 		public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { }
440 		public void processingInstruction(String target, String data) throws SAXException { }
441 		public void skippedEntity(String name) throws SAXException { }
442 
443 		public void startElement(String uri, String localName, String qName, Attributes atts)
444 			throws SAXException 
445 		{
446 			stack = stack.equals("") ? qName : stack + "/" + qName;
447 			text = "";
448 			element(stack);
449 		}
450 		public void characters(char[] ch, int start, int length) throws SAXException {
451 			text += new String(ch, start, length);
452 		}
453 		public void endElement(String uri, String localName, String qName)
454 			throws SAXException 
455 		{
456 			elementText(stack, text);
457 			text = ""; // probably not necessary
458 			stack = stack.contains("/") ? stack.substring(0, stack.lastIndexOf("/")) : "";
459 		}
460 
461 		// abstract methods to be implemented by subclasses
462 		public abstract void element(String path) throws SAXException;
463 		public abstract void elementText(String path, String content) throws SAXException;
464 	}
465 	
466 
467 	/** Convert a NodeList into something that Java1.5 can treat as Iterable,
468 	 * so that it can be used in <code>for (Node node : nodeList) { ... }</code> style
469 	 * constructs.
470 	 * 
471 	 * <p>(org.w3c.dom.traversal.NodeListIterator doesn't currently implement Iterable)
472 	 * 
473 	 */
474 	public static class NodeListIterator implements Iterable<org.w3c.dom.Node> {
475 		private final NodeList nodeList;
476 		public NodeListIterator(NodeList nodeList) {
477 			this.nodeList = nodeList;
478 		}
479 		public Iterator<org.w3c.dom.Node> iterator() {
480 			return new Iterator<org.w3c.dom.Node>() {
481 				private int index = 0;
482 				public boolean hasNext() {
483 					return index < nodeList.getLength();
484 				}
485 				public org.w3c.dom.Node next() {
486 					return nodeList.item(index++);
487 				}
488 				public void remove() {
489 					throw new UnsupportedOperationException("remove() not allowed in NodeList");
490 				}
491 			};
492 		}
493 	}
494 	
495 	/** Class to evaluate a SAX ContentHandler by traversing an XML DOM */
496 	public static class ContentHandlerTraverser {
497 
498         public void traverse(Node node, ContentHandler handler) throws SAXException {
499             handler.startDocument();
500             traverseNode(node, handler);
501             handler.endDocument();
502         }
503 
504         private void traverseNode(Node node, ContentHandler handler) throws SAXException {
505             switch (node.getNodeType()) {
506                 case Node.DOCUMENT_NODE:
507                     Document doc = (Document) node;
508                     traverseChildren(doc.getDocumentElement(), handler);
509                     break;
510 
511                 case Node.ELEMENT_NODE:
512                     Element elem = (Element) node;
513                     AttributesImpl attrs = new AttributesImpl();
514                     NamedNodeMap map = elem.getAttributes();
515                     for (int i = 0; i < map.getLength(); i++) {
516                         Attr a = (Attr) map.item(i);
517                         attrs.addAttribute(
518                             a.getNamespaceURI() == null ? "" : a.getNamespaceURI(),
519                             a.getLocalName() == null ? a.getName() : a.getLocalName(),
520                             a.getName(),
521                             "CDATA",
522                             a.getValue()
523                         );
524                     }
525                     handler.startElement(
526                         elem.getNamespaceURI() == null ? "" : elem.getNamespaceURI(),
527                         elem.getLocalName() == null ? elem.getNodeName() : elem.getLocalName(),
528                         elem.getNodeName(),
529                         attrs
530                     );
531 
532                     traverseChildren(elem, handler);
533 
534                     handler.endElement(
535                         elem.getNamespaceURI() == null ? "" : elem.getNamespaceURI(),
536                         elem.getLocalName() == null ? elem.getNodeName() : elem.getLocalName(),
537                         elem.getNodeName()
538                     );
539                     break;
540 
541                 case Node.TEXT_NODE:
542                     String text = ((org.w3c.dom.Text) node).getData();
543                     char[] chars = text.toCharArray();
544                     handler.characters(chars, 0, chars.length);
545                     break;
546 
547                 case Node.CDATA_SECTION_NODE:
548                     String cdata = ((CDATASection) node).getData();
549                     char[] cchars = cdata.toCharArray();
550                     handler.characters(cchars, 0, cchars.length);
551                     break;
552 
553                 case Node.COMMENT_NODE:
554                     // comments are ignored by ContentHandler
555                     break;
556             }
557         }
558 
559         private void traverseChildren(Node parent, ContentHandler handler) throws SAXException {
560             Node child = parent.getFirstChild();
561             while (child != null) {
562                 traverseNode(child, handler);
563                 child = child.getNextSibling();
564             }
565         }
566     }
567 
568 	
569 	
570 }