001package com.randomnoun.common;
002
003/* (c) 2013 randomnoun. All Rights Reserved. This work is licensed under a
004 * BSD Simplified License. (http://www.randomnoun.com/bsd-simplified.html)
005 */
006
007import java.io.*;
008import java.util.ArrayList;
009import java.util.Arrays;
010import java.util.HashSet;
011import java.util.Iterator;
012import java.util.List;
013import java.util.Set;
014
015import javax.xml.parsers.DocumentBuilder;
016import javax.xml.parsers.DocumentBuilderFactory;
017import javax.xml.parsers.ParserConfigurationException;
018import javax.xml.parsers.SAXParser;
019import javax.xml.parsers.SAXParserFactory;
020import javax.xml.transform.OutputKeys;
021import javax.xml.transform.Transformer;
022import javax.xml.transform.TransformerConfigurationException;
023import javax.xml.transform.TransformerException;
024import javax.xml.transform.TransformerFactory;
025import javax.xml.transform.dom.DOMSource;
026import javax.xml.transform.stream.StreamResult;
027
028import org.ccil.cowan.tagsoup.*;
029import org.ccil.cowan.tagsoup.Parser;
030
031import org.w3c.dom.*;
032import org.w3c.dom.Element;
033import org.xml.sax.*;
034
035import org.apache.log4j.Logger;
036
037/** XML utility functions
038 *
039 * @author knoxg
040 * @see <a href="http://www.randomnoun.com/wp/2013/01/25/exciting-things-with-xml/">http://www.randomnoun.com/wp/2013/01/25/exciting-things-with-xml/</a>
041 */
042public class XmlUtil {
043        
044        /** Clean some HTML text through the tagsoup filter. The returned string is guaranteed to be 
045         * well-formed XML (and can therefore be used by other tools that expect valid XML). 
046         * 
047         * @param inputXml input XML document
048         * @param isHtml if true, uses the HTML schema, omits the XML declaration, and uses the html method
049         * 
050         * @throws SAXException if the tagsoup library could not parse the input string
051         * @throws IllegalStateException if an error occurred reading from a string (should never occur)
052         */ 
053        public static String getCleanXml(String inputXml, boolean isHtml) throws SAXException {
054                return getCleanXml(new ByteArrayInputStream(inputXml.getBytes()), isHtml);
055        }
056        
057        /** Clean a HTML inputStream through the tagsoup filter. The returned string is guaranteed to be 
058         * well-formed XML (and can therefore be used by other tools that expect valid XML). 
059         * 
060         * @param inputStream input XML stream
061         * @param isHtml if true, uses the HTML schema, omits the XML declaration, and uses the html method
062         * 
063         * @throws SAXException if the tagsoup library could not parse the input string
064         * @throws IllegalStateException if an error occurred reading from a string (should never occur)
065         */ 
066        public static String getCleanXml(InputStream inputStream, boolean isHtml) throws SAXException {
067                try {
068                        ByteArrayOutputStream baos = new ByteArrayOutputStream();
069                        InputSource is = new InputSource();
070                        is.setByteStream(inputStream); // could use raw inputstream here later
071
072                        XMLReader xmlReader = new Parser();
073                        Writer w = new OutputStreamWriter(baos);
074                        XMLWriter tagsoupXMLWriter = new XMLWriter(w);
075                        tagsoupXMLWriter.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, "yes");
076                        if (isHtml) {
077                                HTMLSchema theSchema = new HTMLSchema();
078                                xmlReader.setProperty(Parser.schemaProperty, theSchema);
079        
080                                tagsoupXMLWriter.setOutputProperty(XMLWriter.METHOD, "html");
081                                tagsoupXMLWriter.setPrefix(theSchema.getURI(), "");
082                        }
083                        
084                        xmlReader.setContentHandler(tagsoupXMLWriter);
085                        xmlReader.parse(is);
086                        return baos.toString("UTF-8");
087                } catch (IOException ioe) {
088                        throw (IllegalStateException) new IllegalStateException("IO Exception reading from string").initCause(ioe);             
089                }
090        }
091
092
093        /**
094         * Iterates through the child nodes of the specified element, and returns the contents
095         * of all Text and CDATA elements among those nodes, concatenated into a string.
096         *
097         * <p>Elements are recursed into.
098         *
099         * @param element the element that contains, as child nodes, the text to be returned.
100         * @return the contents of all the CDATA children of the specified element.
101         */
102        public static String getText(Element element)
103        {
104                if (element == null) { throw new NullPointerException("null element"); }
105                StringBuffer buf = new StringBuffer();
106                NodeList children = element.getChildNodes();
107                for (int i = 0; i < children.getLength(); ++i) {
108                        org.w3c.dom.Node child = children.item(i);
109                        short nodeType = child.getNodeType();
110                        if (nodeType == org.w3c.dom.Node.CDATA_SECTION_NODE) {
111                                buf.append(((org.w3c.dom.Text) child).getData());                       
112                        } else if (nodeType == org.w3c.dom.Node.TEXT_NODE) {
113                                buf.append(((org.w3c.dom.Text) child).getData());
114                        } else if (nodeType == org.w3c.dom.Node.ELEMENT_NODE) {
115                                buf.append(getText((Element) child));
116                        }
117                }
118                return buf.toString();
119        }
120
121        /**
122         * Iterates through the child nodes of the specified element, and returns the contents
123         * of all Text and CDATA elements among those nodes, concatenated into a string. 
124         * Any elements with tagNames that are included in the tagNames parameter of this
125         * method are also included. 
126         * 
127         * <p>Attributes of these tags are also included in the result, but may be reordered.
128         * 
129         * <p>Self-closing elements (e.g. <code>&lt;br/&gt;</code>)
130         * are expanded into opening and closing elements (e.g. <code>&lt;br&gt;&lt;/br&gt;</code>)
131         *
132         * <p>Elements are recursed into.
133         *
134         * @param element the element that contains, as child nodes, the text to be returned.
135         * @return the contents of all the CDATA children of the specified element.
136         */
137        public static String getTextPreserveElements(Element element, String[] tagNames) {
138                if (element == null) { throw new NullPointerException("null element"); }
139                Set<String> tagNamesSet = new HashSet<String>(Arrays.asList(tagNames));
140                StringBuffer buf = new StringBuffer();
141                NodeList children = element.getChildNodes();
142                for (int i = 0; i < children.getLength(); ++i) {
143                        org.w3c.dom.Node child = children.item(i);
144                        short nodeType = child.getNodeType();
145                        if (nodeType == org.w3c.dom.Node.CDATA_SECTION_NODE) {
146                                buf.append(((org.w3c.dom.Text) child).getData());                       
147                        } else if (nodeType == org.w3c.dom.Node.TEXT_NODE) {
148                                buf.append(((org.w3c.dom.Text) child).getData());
149                        } else if (nodeType == org.w3c.dom.Node.ELEMENT_NODE) {
150                                String tagName = ((Element) child).getTagName();
151                                boolean includeEl = tagNamesSet.contains(tagName);
152                                if (includeEl) {
153                                        buf.append('<');
154                                        buf.append(tagName);
155                                        NamedNodeMap nnm = ((Element) child).getAttributes();
156                                        for (int j = 0; j < nnm.getLength(); j++) {
157                                                Attr attr = (Attr) nnm.item(j);
158                                                buf.append(" " + attr.getName());
159                                                if (attr.getValue()!=null) {
160                                                        buf.append("=\"" + attr.getValue() + "\"");
161                                                }
162                                        }
163                                        buf.append('>');
164                                }
165                                buf.append(getTextPreserveElements((Element) child, tagNames));
166                                if (includeEl) {
167                                        buf.append("</" + tagName + ">");
168                                }
169                        }
170                }
171                return buf.toString();
172        }       
173
174
175        
176        /**
177         * Iterates through the child nodes of the specified element, and returns the contents
178         * of all Text and CDATA elements among those nodes, concatenated into a string.
179         * 
180         * <p>Elements are not recursed into.
181         *
182         * @param element the element that contains, as child nodes, the text to be returned.
183         * @return the contents of all the CDATA children of the specified element.
184         */
185        public static String getTextNonRecursive(Element element)
186        {
187                if (element == null) { throw new NullPointerException("null element"); }
188                StringBuffer buf = new StringBuffer();
189                NodeList children = element.getChildNodes();
190                for (int i = 0; i < children.getLength(); ++i) {
191                        org.w3c.dom.Node child = children.item(i);
192                        short nodeType = child.getNodeType();
193                        if (nodeType == org.w3c.dom.Node.CDATA_SECTION_NODE) {
194                                buf.append(((org.w3c.dom.Text) child).getData());                       
195                        } else if (nodeType == org.w3c.dom.Node.TEXT_NODE) {
196                                buf.append(((org.w3c.dom.Text) child).getData());
197                        } else if (nodeType == org.w3c.dom.Node.ELEMENT_NODE) {
198                                // ignore child elements
199                        }
200                }
201                return buf.toString();
202        }
203        
204        /** Return a DOM document object from an XML string
205         * 
206         * @param text the string representation of the XML to parse 
207         */
208        public static Document toDocument(String text) throws SAXException {
209                return toDocument(new ByteArrayInputStream(text.getBytes()));
210        }
211        
212        /** Return a DOM document object from an InputStream
213         * 
214         * @param is the InputStream containing the XML to parse 
215         */
216        public static Document toDocument(InputStream is) throws SAXException {
217                try {
218                        DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
219                        DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
220                        Document doc = docBuilder.parse(is);
221                        doc.getDocumentElement().normalize(); // Collapses adjacent text nodes into one node.
222                        return doc;
223                } catch (ParserConfigurationException pce) {
224                        // this can never happen 
225                        throw (IllegalStateException) new IllegalStateException("Error creating DOM parser").initCause(pce);
226                } catch (IOException ioe) {
227                        // this can also never happen
228                        throw (IllegalStateException) new IllegalStateException("Error retrieving information").initCause(ioe);
229                } 
230        }
231        
232        /** Converts a document node subtree back into an XML string 
233         * 
234         * @param node a DOM node 
235         * @param omitXmlDeclaration if true, omits the XML declaration from the returned result
236         * 
237         * @return the XML for this node
238         * 
239         * @throws TransformerException if the transformation to XML failed
240         * @throws IllegalStateException if the transformer could not be initialised 
241         */
242        public static String getXmlString(Node node, boolean omitXmlDeclaration) 
243                throws TransformerException 
244        {
245                try {
246                        ByteArrayOutputStream baos = new ByteArrayOutputStream();
247                        TransformerFactory transformerFactory = TransformerFactory.newInstance();
248                        Transformer transformer = transformerFactory.newTransformer();
249                        DOMSource source = new DOMSource(node);
250                        StreamResult result = new StreamResult(baos);
251                        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, omitXmlDeclaration ? "yes": "no");
252                        transformer.transform(source, result);
253                        return baos.toString("UTF-8");
254                } catch (TransformerConfigurationException tce) {
255                        throw (IllegalStateException) new IllegalStateException("Could not initialise transformer").initCause(tce);
256                } catch (UnsupportedEncodingException uee) {
257                        throw (IllegalStateException) new IllegalStateException("Unknown charset UTF-8").initCause(uee);
258                }
259        }
260        
261
262        /** Remove leading/trailing whitespace from all text nodes in this nodeList.
263         * Will iterate through subnodes recursively.
264         * 
265         * @param node
266         */
267        public static void compact(Node node) {
268                if (node.getNodeType()==Node.TEXT_NODE) {
269                        org.w3c.dom.Text el = (org.w3c.dom.Text) node;
270                        if (el.getNodeValue()!=null) {
271                                el.setNodeValue(el.getNodeValue().trim());
272                        }
273                } else if (node.getNodeType()==Node.ELEMENT_NODE) {
274                        NodeList childNodes = node.getChildNodes();
275                        if (childNodes != null && childNodes.getLength() > 0) {
276                                int len = childNodes.getLength();
277                                for (int i=0; i<len; i++) {
278                                        Node childNode = childNodes.item(i);
279                                    compact(childNode);
280                                }
281                        }
282                }
283        }
284        
285        
286        /** Parse a string of XML text using a SAX contentHandler. Nothing is returned by this method - it 
287         * is assumed that the contentHandler supplied maintains it's own state as it parses the XML supplied,
288         * and that this state can be extracted from this object afterwards.
289         * 
290         * @param contentHandler a SAX content handler 
291         * @param xmlText an XML document (or part thereof)
292         * 
293         * @throws SAXException if the document could not be parsed
294         * @throws IllegalStateException if the parser could not be initialised, or an I/O error occurred 
295         *   (should not happen since we're just dealing with strings)
296         */
297        public static void processContentHandler(ContentHandler contentHandler, String xmlText) throws SAXException, IllegalStateException {
298                 SAXParserFactory factory = SAXParserFactory.newInstance();
299                 try {
300                         // Parse the input
301                         SAXParser saxParser = factory.newSAXParser();
302                         XMLReader xmlReader = saxParser.getXMLReader();
303                         xmlReader.setContentHandler(contentHandler);
304                         xmlReader.parse(new InputSource(new ByteArrayInputStream(xmlText.getBytes())));
305                 } catch (IOException ioe) {
306                        throw (IllegalStateException) new IllegalStateException("IO Exception reading from string").initCause(ioe);
307                 } catch (ParserConfigurationException pce) {
308                        throw (IllegalStateException) new IllegalStateException("Could not initialise parser").initCause(pce);                          
309                 }
310        }
311        
312        /** Convert a table into a List of Lists (each top-level list represents a table row,
313         * each second-level list represents a table cell). Only contents are returned; attributes
314         * and formatting are ignored.
315         * 
316         * <p>This class will probably not work when tables are embedded within other tables
317         */
318        public static class SimpleTableContentHandler
319                implements ContentHandler 
320        {
321                /** Logger instance for this class */
322                public static final Logger logger = Logger.getLogger(SimpleTableContentHandler.class);
323
324                /** Current table */
325                List<List<String>> thisTable = null;
326                /** Current row in table */
327                List<String> thisRow = null;
328                /** Current cell in row */
329                String thisCell = "";
330
331                /** The state of this parser */
332                private enum State {
333                        /** start of doc, expecting 'table' */
334                        START,
335                        /** in table element, expecting 'tr' */
336                        IN_TABLE,
337                        /** in tr element, expecting 'td' (or other ignored elements) */
338                        IN_TR,
339                        /** in td element, capturing to closing tag */
340                        IN_TD
341                }
342
343                State state = State.START;
344                
345                // unused interface methods
346                public void setDocumentLocator(Locator locator) { }
347                public void startDocument() throws SAXException { }
348                public void endDocument() throws SAXException { }
349                public void startPrefixMapping(String prefix, String uri) throws SAXException { }
350                public void endPrefixMapping(String prefix) throws SAXException { }
351                public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { }
352                public void processingInstruction(String target, String data) throws SAXException { }
353                public void skippedEntity(String name) throws SAXException { }
354
355
356                public void startElement(String uri, String localName, String qName, Attributes atts)
357                        throws SAXException 
358                {
359                        switch (state) {
360                                case START: 
361                                        if (qName.equals("table")) {
362                                                thisTable = new ArrayList<List<String>>(); 
363                                                state = State.IN_TABLE; 
364                                        } else {
365                                                logger.warn("Warning: top-level element '" + qName + "' found (expected 'table')");
366                                        }
367                                        break;
368                                
369                                case IN_TABLE:
370                                        if (qName.equals("tr")) {
371                                                thisRow = new ArrayList<String>();
372                                                thisTable.add(thisRow);
373                                                state = State.IN_TR;
374                                        }
375                                        break;
376                                        
377                                case IN_TR: 
378                                        if (qName.equals("td")) {
379                                                thisCell = "";
380                                                state = State.IN_TD;
381                                        }
382                                        break;
383                                        
384                                case IN_TD:
385                                        break;
386                                        
387                                default:
388                                        throw new IllegalStateException("Illegal state " + state + " in SimpleTableContentHandler");
389                                
390                        }
391                }
392
393                public void characters(char[] ch, int start, int length)
394                        throws SAXException {
395                        if (state==State.IN_TD) {
396                                thisCell += new String(ch, start, length);
397                        }
398                }
399
400                public void endElement(String uri, String localName, String qName)
401                        throws SAXException 
402                {
403                        if (state == State.IN_TD && qName.equals("td")) {
404                                thisRow.add(thisCell);
405                                state = State.IN_TR;
406                        } else if (state == State.IN_TR && qName.equals("tr")) {
407                                state = State.IN_TABLE;
408                        }
409                }
410        
411                public List<List<String>> getTable() {
412                        return thisTable;
413                }
414        }
415        
416        /** An abstract stack-based XML parser. Similar to the apache digester, but without
417         * the dozen or so dependent JARs.
418         * 
419         * <p>Only element text is captured 
420         * <p>Element attributes are not parsed by this class.
421         * <p>Mixed text/element nodes are not parsed by this class.
422         * 
423         */
424        public abstract static class AbstractStackContentHandler implements ContentHandler 
425        {
426                /** Logger instance for this class */
427                public static final Logger logger = Logger.getLogger(AbstractStackContentHandler.class);
428
429                /** Location in stack */
430                protected String stack = "";
431                protected String text = null;     // text captured so far
432                
433                // unused interface methods
434                public void setDocumentLocator(Locator locator) { }
435                public void startDocument() throws SAXException { }
436                public void endDocument() throws SAXException { }
437                public void startPrefixMapping(String prefix, String uri) throws SAXException { }
438                public void endPrefixMapping(String prefix) throws SAXException { }
439                public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { }
440                public void processingInstruction(String target, String data) throws SAXException { }
441                public void skippedEntity(String name) throws SAXException { }
442
443                public void startElement(String uri, String localName, String qName, Attributes atts)
444                        throws SAXException 
445                {
446                        stack = stack.equals("") ? qName : stack + "/" + qName;
447                        text = "";
448                        element(stack);
449                }
450                public void characters(char[] ch, int start, int length) throws SAXException {
451                        text += new String(ch, start, length);
452                }
453                public void endElement(String uri, String localName, String qName)
454                        throws SAXException 
455                {
456                        elementText(stack, text);
457                        text = ""; // probably not necessary
458                        stack = stack.contains("/") ? stack.substring(0, stack.lastIndexOf("/")) : "";
459                }
460
461                // abstract methods to be implemented by subclasses
462                public abstract void element(String path) throws SAXException;
463                public abstract void elementText(String path, String content) throws SAXException;
464        }
465        
466
467        /** Convert a NodeList into something that Java1.5 can treat as Iterable,
468         * so that it can be used in <code>for (Node node : nodeList) { ... }</code> style
469         * constructs.
470         * 
471         * <p>(org.w3c.dom.traversal.NodeListIterator doesn't currently implement Iterable)
472         * 
473         */
474        public static class NodeListIterator implements Iterable<org.w3c.dom.Node> {
475                private final NodeList nodeList;
476                public NodeListIterator(NodeList nodeList) {
477                        this.nodeList = nodeList;
478                }
479                public Iterator<org.w3c.dom.Node> iterator() {
480                        return new Iterator<org.w3c.dom.Node>() {
481                                private int index = 0;
482                                public boolean hasNext() {
483                                        return index < nodeList.getLength();
484                                }
485                                public org.w3c.dom.Node next() {
486                                        return nodeList.item(index++);
487                                }
488                                public void remove() {
489                                        throw new UnsupportedOperationException("remove() not allowed in NodeList");
490                                }
491                        };
492                }
493        }
494
495        
496        
497}