001package com.randomnoun.common; 002 003/* (c) 2013 randomnoun. All Rights Reserved. This work is licensed under a 004 * BSD Simplified License. (http://www.randomnoun.com/bsd-simplified.html) 005 */ 006 007import java.io.*; 008import java.util.ArrayList; 009import java.util.Arrays; 010import java.util.HashSet; 011import java.util.Iterator; 012import java.util.List; 013import java.util.Set; 014 015import javax.xml.parsers.DocumentBuilder; 016import javax.xml.parsers.DocumentBuilderFactory; 017import javax.xml.parsers.ParserConfigurationException; 018import javax.xml.parsers.SAXParser; 019import javax.xml.parsers.SAXParserFactory; 020import javax.xml.transform.OutputKeys; 021import javax.xml.transform.Transformer; 022import javax.xml.transform.TransformerConfigurationException; 023import javax.xml.transform.TransformerException; 024import javax.xml.transform.TransformerFactory; 025import javax.xml.transform.dom.DOMSource; 026import javax.xml.transform.stream.StreamResult; 027 028import org.ccil.cowan.tagsoup.*; 029import org.ccil.cowan.tagsoup.Parser; 030 031import org.w3c.dom.*; 032import org.w3c.dom.Element; 033import org.xml.sax.*; 034 035import org.apache.log4j.Logger; 036 037/** XML utility functions 038 * 039 * @author knoxg 040 * @see <a href="http://www.randomnoun.com/wp/2013/01/25/exciting-things-with-xml/">http://www.randomnoun.com/wp/2013/01/25/exciting-things-with-xml/</a> 041 */ 042public class XmlUtil { 043 044 /** Clean some HTML text through the tagsoup filter. The returned string is guaranteed to be 045 * well-formed XML (and can therefore be used by other tools that expect valid XML). 046 * 047 * @param inputXml input XML document 048 * @param isHtml if true, uses the HTML schema, omits the XML declaration, and uses the html method 049 * 050 * @throws SAXException if the tagsoup library could not parse the input string 051 * @throws IllegalStateException if an error occurred reading from a string (should never occur) 052 */ 053 public static String getCleanXml(String inputXml, boolean isHtml) throws SAXException { 054 return getCleanXml(new ByteArrayInputStream(inputXml.getBytes()), isHtml); 055 } 056 057 /** Clean a HTML inputStream through the tagsoup filter. The returned string is guaranteed to be 058 * well-formed XML (and can therefore be used by other tools that expect valid XML). 059 * 060 * @param inputStream input XML stream 061 * @param isHtml if true, uses the HTML schema, omits the XML declaration, and uses the html method 062 * 063 * @throws SAXException if the tagsoup library could not parse the input string 064 * @throws IllegalStateException if an error occurred reading from a string (should never occur) 065 */ 066 public static String getCleanXml(InputStream inputStream, boolean isHtml) throws SAXException { 067 try { 068 ByteArrayOutputStream baos = new ByteArrayOutputStream(); 069 InputSource is = new InputSource(); 070 is.setByteStream(inputStream); // could use raw inputstream here later 071 072 XMLReader xmlReader = new Parser(); 073 Writer w = new OutputStreamWriter(baos); 074 XMLWriter tagsoupXMLWriter = new XMLWriter(w); 075 tagsoupXMLWriter.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, "yes"); 076 if (isHtml) { 077 HTMLSchema theSchema = new HTMLSchema(); 078 xmlReader.setProperty(Parser.schemaProperty, theSchema); 079 080 tagsoupXMLWriter.setOutputProperty(XMLWriter.METHOD, "html"); 081 tagsoupXMLWriter.setPrefix(theSchema.getURI(), ""); 082 } 083 084 xmlReader.setContentHandler(tagsoupXMLWriter); 085 xmlReader.parse(is); 086 return baos.toString("UTF-8"); 087 } catch (IOException ioe) { 088 throw (IllegalStateException) new IllegalStateException("IO Exception reading from string").initCause(ioe); 089 } 090 } 091 092 093 /** 094 * Iterates through the child nodes of the specified element, and returns the contents 095 * of all Text and CDATA elements among those nodes, concatenated into a string. 096 * 097 * <p>Elements are recursed into. 098 * 099 * @param element the element that contains, as child nodes, the text to be returned. 100 * @return the contents of all the CDATA children of the specified element. 101 */ 102 public static String getText(Element element) 103 { 104 if (element == null) { throw new NullPointerException("null element"); } 105 StringBuffer buf = new StringBuffer(); 106 NodeList children = element.getChildNodes(); 107 for (int i = 0; i < children.getLength(); ++i) { 108 org.w3c.dom.Node child = children.item(i); 109 short nodeType = child.getNodeType(); 110 if (nodeType == org.w3c.dom.Node.CDATA_SECTION_NODE) { 111 buf.append(((org.w3c.dom.Text) child).getData()); 112 } else if (nodeType == org.w3c.dom.Node.TEXT_NODE) { 113 buf.append(((org.w3c.dom.Text) child).getData()); 114 } else if (nodeType == org.w3c.dom.Node.ELEMENT_NODE) { 115 buf.append(getText((Element) child)); 116 } 117 } 118 return buf.toString(); 119 } 120 121 /** 122 * Iterates through the child nodes of the specified element, and returns the contents 123 * of all Text and CDATA elements among those nodes, concatenated into a string. 124 * Any elements with tagNames that are included in the tagNames parameter of this 125 * method are also included. 126 * 127 * <p>Attributes of these tags are also included in the result, but may be reordered. 128 * 129 * <p>Self-closing elements (e.g. <code><br/></code>) 130 * are expanded into opening and closing elements (e.g. <code><br></br></code>) 131 * 132 * <p>Elements are recursed into. 133 * 134 * @param element the element that contains, as child nodes, the text to be returned. 135 * @return the contents of all the CDATA children of the specified element. 136 */ 137 public static String getTextPreserveElements(Element element, String[] tagNames) { 138 if (element == null) { throw new NullPointerException("null element"); } 139 Set<String> tagNamesSet = new HashSet<String>(Arrays.asList(tagNames)); 140 StringBuffer buf = new StringBuffer(); 141 NodeList children = element.getChildNodes(); 142 for (int i = 0; i < children.getLength(); ++i) { 143 org.w3c.dom.Node child = children.item(i); 144 short nodeType = child.getNodeType(); 145 if (nodeType == org.w3c.dom.Node.CDATA_SECTION_NODE) { 146 buf.append(((org.w3c.dom.Text) child).getData()); 147 } else if (nodeType == org.w3c.dom.Node.TEXT_NODE) { 148 buf.append(((org.w3c.dom.Text) child).getData()); 149 } else if (nodeType == org.w3c.dom.Node.ELEMENT_NODE) { 150 String tagName = ((Element) child).getTagName(); 151 boolean includeEl = tagNamesSet.contains(tagName); 152 if (includeEl) { 153 buf.append('<'); 154 buf.append(tagName); 155 NamedNodeMap nnm = ((Element) child).getAttributes(); 156 for (int j = 0; j < nnm.getLength(); j++) { 157 Attr attr = (Attr) nnm.item(j); 158 buf.append(" " + attr.getName()); 159 if (attr.getValue()!=null) { 160 buf.append("=\"" + attr.getValue() + "\""); 161 } 162 } 163 buf.append('>'); 164 } 165 buf.append(getTextPreserveElements((Element) child, tagNames)); 166 if (includeEl) { 167 buf.append("</" + tagName + ">"); 168 } 169 } 170 } 171 return buf.toString(); 172 } 173 174 175 176 /** 177 * Iterates through the child nodes of the specified element, and returns the contents 178 * of all Text and CDATA elements among those nodes, concatenated into a string. 179 * 180 * <p>Elements are not recursed into. 181 * 182 * @param element the element that contains, as child nodes, the text to be returned. 183 * @return the contents of all the CDATA children of the specified element. 184 */ 185 public static String getTextNonRecursive(Element element) 186 { 187 if (element == null) { throw new NullPointerException("null element"); } 188 StringBuffer buf = new StringBuffer(); 189 NodeList children = element.getChildNodes(); 190 for (int i = 0; i < children.getLength(); ++i) { 191 org.w3c.dom.Node child = children.item(i); 192 short nodeType = child.getNodeType(); 193 if (nodeType == org.w3c.dom.Node.CDATA_SECTION_NODE) { 194 buf.append(((org.w3c.dom.Text) child).getData()); 195 } else if (nodeType == org.w3c.dom.Node.TEXT_NODE) { 196 buf.append(((org.w3c.dom.Text) child).getData()); 197 } else if (nodeType == org.w3c.dom.Node.ELEMENT_NODE) { 198 // ignore child elements 199 } 200 } 201 return buf.toString(); 202 } 203 204 /** Return a DOM document object from an XML string 205 * 206 * @param text the string representation of the XML to parse 207 */ 208 public static Document toDocument(String text) throws SAXException { 209 return toDocument(new ByteArrayInputStream(text.getBytes())); 210 } 211 212 /** Return a DOM document object from an InputStream 213 * 214 * @param is the InputStream containing the XML to parse 215 */ 216 public static Document toDocument(InputStream is) throws SAXException { 217 try { 218 DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance(); 219 DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder(); 220 Document doc = docBuilder.parse(is); 221 doc.getDocumentElement().normalize(); // Collapses adjacent text nodes into one node. 222 return doc; 223 } catch (ParserConfigurationException pce) { 224 // this can never happen 225 throw (IllegalStateException) new IllegalStateException("Error creating DOM parser").initCause(pce); 226 } catch (IOException ioe) { 227 // this can also never happen 228 throw (IllegalStateException) new IllegalStateException("Error retrieving information").initCause(ioe); 229 } 230 } 231 232 /** Converts a document node subtree back into an XML string 233 * 234 * @param node a DOM node 235 * @param omitXmlDeclaration if true, omits the XML declaration from the returned result 236 * 237 * @return the XML for this node 238 * 239 * @throws TransformerException if the transformation to XML failed 240 * @throws IllegalStateException if the transformer could not be initialised 241 */ 242 public static String getXmlString(Node node, boolean omitXmlDeclaration) 243 throws TransformerException 244 { 245 try { 246 ByteArrayOutputStream baos = new ByteArrayOutputStream(); 247 TransformerFactory transformerFactory = TransformerFactory.newInstance(); 248 Transformer transformer = transformerFactory.newTransformer(); 249 DOMSource source = new DOMSource(node); 250 StreamResult result = new StreamResult(baos); 251 transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, omitXmlDeclaration ? "yes": "no"); 252 transformer.transform(source, result); 253 return baos.toString("UTF-8"); 254 } catch (TransformerConfigurationException tce) { 255 throw (IllegalStateException) new IllegalStateException("Could not initialise transformer").initCause(tce); 256 } catch (UnsupportedEncodingException uee) { 257 throw (IllegalStateException) new IllegalStateException("Unknown charset UTF-8").initCause(uee); 258 } 259 } 260 261 262 /** Remove leading/trailing whitespace from all text nodes in this nodeList. 263 * Will iterate through subnodes recursively. 264 * 265 * @param node 266 */ 267 public static void compact(Node node) { 268 if (node.getNodeType()==Node.TEXT_NODE) { 269 org.w3c.dom.Text el = (org.w3c.dom.Text) node; 270 if (el.getNodeValue()!=null) { 271 el.setNodeValue(el.getNodeValue().trim()); 272 } 273 } else if (node.getNodeType()==Node.ELEMENT_NODE) { 274 NodeList childNodes = node.getChildNodes(); 275 if (childNodes != null && childNodes.getLength() > 0) { 276 int len = childNodes.getLength(); 277 for (int i=0; i<len; i++) { 278 Node childNode = childNodes.item(i); 279 compact(childNode); 280 } 281 } 282 } 283 } 284 285 286 /** Parse a string of XML text using a SAX contentHandler. Nothing is returned by this method - it 287 * is assumed that the contentHandler supplied maintains it's own state as it parses the XML supplied, 288 * and that this state can be extracted from this object afterwards. 289 * 290 * @param contentHandler a SAX content handler 291 * @param xmlText an XML document (or part thereof) 292 * 293 * @throws SAXException if the document could not be parsed 294 * @throws IllegalStateException if the parser could not be initialised, or an I/O error occurred 295 * (should not happen since we're just dealing with strings) 296 */ 297 public static void processContentHandler(ContentHandler contentHandler, String xmlText) throws SAXException, IllegalStateException { 298 SAXParserFactory factory = SAXParserFactory.newInstance(); 299 try { 300 // Parse the input 301 SAXParser saxParser = factory.newSAXParser(); 302 XMLReader xmlReader = saxParser.getXMLReader(); 303 xmlReader.setContentHandler(contentHandler); 304 xmlReader.parse(new InputSource(new ByteArrayInputStream(xmlText.getBytes()))); 305 } catch (IOException ioe) { 306 throw (IllegalStateException) new IllegalStateException("IO Exception reading from string").initCause(ioe); 307 } catch (ParserConfigurationException pce) { 308 throw (IllegalStateException) new IllegalStateException("Could not initialise parser").initCause(pce); 309 } 310 } 311 312 /** Convert a table into a List of Lists (each top-level list represents a table row, 313 * each second-level list represents a table cell). Only contents are returned; attributes 314 * and formatting are ignored. 315 * 316 * <p>This class will probably not work when tables are embedded within other tables 317 */ 318 public static class SimpleTableContentHandler 319 implements ContentHandler 320 { 321 /** Logger instance for this class */ 322 public static final Logger logger = Logger.getLogger(SimpleTableContentHandler.class); 323 324 /** Current table */ 325 List<List<String>> thisTable = null; 326 /** Current row in table */ 327 List<String> thisRow = null; 328 /** Current cell in row */ 329 String thisCell = ""; 330 331 /** The state of this parser */ 332 private enum State { 333 /** start of doc, expecting 'table' */ 334 START, 335 /** in table element, expecting 'tr' */ 336 IN_TABLE, 337 /** in tr element, expecting 'td' (or other ignored elements) */ 338 IN_TR, 339 /** in td element, capturing to closing tag */ 340 IN_TD 341 } 342 343 State state = State.START; 344 345 // unused interface methods 346 public void setDocumentLocator(Locator locator) { } 347 public void startDocument() throws SAXException { } 348 public void endDocument() throws SAXException { } 349 public void startPrefixMapping(String prefix, String uri) throws SAXException { } 350 public void endPrefixMapping(String prefix) throws SAXException { } 351 public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { } 352 public void processingInstruction(String target, String data) throws SAXException { } 353 public void skippedEntity(String name) throws SAXException { } 354 355 356 public void startElement(String uri, String localName, String qName, Attributes atts) 357 throws SAXException 358 { 359 switch (state) { 360 case START: 361 if (qName.equals("table")) { 362 thisTable = new ArrayList<List<String>>(); 363 state = State.IN_TABLE; 364 } else { 365 logger.warn("Warning: top-level element '" + qName + "' found (expected 'table')"); 366 } 367 break; 368 369 case IN_TABLE: 370 if (qName.equals("tr")) { 371 thisRow = new ArrayList<String>(); 372 thisTable.add(thisRow); 373 state = State.IN_TR; 374 } 375 break; 376 377 case IN_TR: 378 if (qName.equals("td")) { 379 thisCell = ""; 380 state = State.IN_TD; 381 } 382 break; 383 384 case IN_TD: 385 break; 386 387 default: 388 throw new IllegalStateException("Illegal state " + state + " in SimpleTableContentHandler"); 389 390 } 391 } 392 393 public void characters(char[] ch, int start, int length) 394 throws SAXException { 395 if (state==State.IN_TD) { 396 thisCell += new String(ch, start, length); 397 } 398 } 399 400 public void endElement(String uri, String localName, String qName) 401 throws SAXException 402 { 403 if (state == State.IN_TD && qName.equals("td")) { 404 thisRow.add(thisCell); 405 state = State.IN_TR; 406 } else if (state == State.IN_TR && qName.equals("tr")) { 407 state = State.IN_TABLE; 408 } 409 } 410 411 public List<List<String>> getTable() { 412 return thisTable; 413 } 414 } 415 416 /** An abstract stack-based XML parser. Similar to the apache digester, but without 417 * the dozen or so dependent JARs. 418 * 419 * <p>Only element text is captured 420 * <p>Element attributes are not parsed by this class. 421 * <p>Mixed text/element nodes are not parsed by this class. 422 * 423 */ 424 public abstract static class AbstractStackContentHandler implements ContentHandler 425 { 426 /** Logger instance for this class */ 427 public static final Logger logger = Logger.getLogger(AbstractStackContentHandler.class); 428 429 /** Location in stack */ 430 protected String stack = ""; 431 protected String text = null; // text captured so far 432 433 // unused interface methods 434 public void setDocumentLocator(Locator locator) { } 435 public void startDocument() throws SAXException { } 436 public void endDocument() throws SAXException { } 437 public void startPrefixMapping(String prefix, String uri) throws SAXException { } 438 public void endPrefixMapping(String prefix) throws SAXException { } 439 public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { } 440 public void processingInstruction(String target, String data) throws SAXException { } 441 public void skippedEntity(String name) throws SAXException { } 442 443 public void startElement(String uri, String localName, String qName, Attributes atts) 444 throws SAXException 445 { 446 stack = stack.equals("") ? qName : stack + "/" + qName; 447 text = ""; 448 element(stack); 449 } 450 public void characters(char[] ch, int start, int length) throws SAXException { 451 text += new String(ch, start, length); 452 } 453 public void endElement(String uri, String localName, String qName) 454 throws SAXException 455 { 456 elementText(stack, text); 457 text = ""; // probably not necessary 458 stack = stack.contains("/") ? stack.substring(0, stack.lastIndexOf("/")) : ""; 459 } 460 461 // abstract methods to be implemented by subclasses 462 public abstract void element(String path) throws SAXException; 463 public abstract void elementText(String path, String content) throws SAXException; 464 } 465 466 467 /** Convert a NodeList into something that Java1.5 can treat as Iterable, 468 * so that it can be used in <code>for (Node node : nodeList) { ... }</code> style 469 * constructs. 470 * 471 * <p>(org.w3c.dom.traversal.NodeListIterator doesn't currently implement Iterable) 472 * 473 */ 474 public static class NodeListIterator implements Iterable<org.w3c.dom.Node> { 475 private final NodeList nodeList; 476 public NodeListIterator(NodeList nodeList) { 477 this.nodeList = nodeList; 478 } 479 public Iterator<org.w3c.dom.Node> iterator() { 480 return new Iterator<org.w3c.dom.Node>() { 481 private int index = 0; 482 public boolean hasNext() { 483 return index < nodeList.getLength(); 484 } 485 public org.w3c.dom.Node next() { 486 return nodeList.item(index++); 487 } 488 public void remove() { 489 throw new UnsupportedOperationException("remove() not allowed in NodeList"); 490 } 491 }; 492 } 493 } 494 495 496 497}