/*-- Copyright (C) 2000-2004 Jason Hunter & Brett McLaughlin. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the disclaimer that follows these conditions in the documentation and/or other materials provided with the distribution. 3. The name "JDOM" must not be used to endorse or promote products derived from this software without prior written permission. For written permission, please contact . 4. Products derived from this software may not be called "JDOM", nor may "JDOM" appear in their name, without prior written permission from the JDOM Project Management . In addition, we request (but do not require) that you include in the end-user documentation provided with the redistribution and/or in the software itself an acknowledgement equivalent to the following: "This product includes software developed by the JDOM Project (http://www.jdom.org/)." Alternatively, the acknowledgment may be graphical using the logos available at http://www.jdom.org/images/logos. THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE JDOM AUTHORS OR THE PROJECT CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. This software consists of voluntary contributions made by many individuals on behalf of the JDOM Project and was originally created by Jason Hunter and Brett McLaughlin . For more information on the JDOM Project, please see . */ package org.jdom.input; import java.util.HashMap; import org.jdom.*; import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; /** * Builds a JDOM {@link org.jdom.Document org.jdom.Document} using a * {@link javax.xml.stream.XMLStreamReader}. * * @version $Revision: 1.04 $, $Date: 2004/12/11 00:00:00 $ * @author Tatu Saloranta * @author Bradley S. Huffman */ public class StAXBuilder { /** * Map that contains conversion from textual attribute types StAX uses, * to int values JDOM uses. */ final static HashMap attrTypes = new HashMap(32); static { attrTypes.put("CDATA", new Integer(Attribute.CDATA_TYPE)); attrTypes.put("cdata", new Integer(Attribute.CDATA_TYPE)); attrTypes.put("ID", new Integer(Attribute.ID_TYPE)); attrTypes.put("id", new Integer(Attribute.ID_TYPE)); attrTypes.put("IDREF", new Integer(Attribute.IDREF_TYPE)); attrTypes.put("idref", new Integer(Attribute.IDREF_TYPE)); attrTypes.put("IDREFS", new Integer(Attribute.IDREFS_TYPE)); attrTypes.put("idrefs", new Integer(Attribute.IDREFS_TYPE)); attrTypes.put("ENTITY", new Integer(Attribute.ENTITY_TYPE)); attrTypes.put("entity", new Integer(Attribute.ENTITY_TYPE)); attrTypes.put("ENTITIES", new Integer(Attribute.ENTITIES_TYPE)); attrTypes.put("entities", new Integer(Attribute.ENTITIES_TYPE)); attrTypes.put("NMTOKEN", new Integer(Attribute.NMTOKEN_TYPE)); attrTypes.put("nmtoken", new Integer(Attribute.NMTOKEN_TYPE)); attrTypes.put("NMTOKENS", new Integer(Attribute.NMTOKENS_TYPE)); attrTypes.put("nmtokens", new Integer(Attribute.NMTOKENS_TYPE)); attrTypes.put("NOTATION", new Integer(Attribute.NOTATION_TYPE)); attrTypes.put("notation", new Integer(Attribute.NOTATION_TYPE)); attrTypes.put("ENUMERATED", new Integer(Attribute.ENUMERATED_TYPE)); attrTypes.put("enumerated", new Integer(Attribute.ENUMERATED_TYPE)); } // // // Configuration settings: /** The factory for creating new JDOM objects */ private JDOMFactory factory = null; /** * Whether ignorable white space should be ignored, ie not added * in the resulting JDOM tree. If true, it will be ignored; if false, * it will be added in the tree. Default value if false. */ protected boolean cfgIgnoreWS = false; /** * Object that will be used when trying to remove indentation white * space: if so, the object is consulted to figure out what consistutes * indentation white space, as well as about context in which such * white space is to be removed. *

* Note that only such text events (CHARACTERS) are considered that * are not known to be fully ignorable (ignorable white space would * be reported as SPACE) by this removal process. SPACE events can * be trimmed simply by setting {@link #cfgIgnoreWS} to true. */ protected StAXTextModifier textModifier = null; /** * Default constructor. */ public StAXBuilder() { } /* * This sets a custom JDOMFactory for the builder. Use this to build * the tree with your own subclasses of the JDOM classes. * * @param factory JDOMFactory to use */ public void setFactory(JDOMFactory f) { factory = f; } public void setTextModifier(StAXTextModifier mod) { textModifier = mod; } /** * Method used to set value of {@link #cfgIgnoreWS}; that is, to * make parser either remove ignorable white space (true), or * to include it (false). *

* Whether all-whitespace text segment is ignorable white space or * not is based on DTD read in, as per XML specifications (white space * is only significant in mixed content or pure text elements). */ public void setIgnoreWhitespace(boolean state) { cfgIgnoreWS = state; } /** * Method used to enable or disable automatic heuristic removal * of indentation white If set to true, the builder will * try to remove white space that seems to be used for * indentation purposes; otherwise it will not try to do any removal. *

* Note that this setting only applies to all-whitespace segments * that have NOT been determined to be ignorable white space (either * because DTD is not available, or because such white space is in * mixed or text-only element content). As such it is a heuristics * that should only be enabled when application knows that such * white space removal does not cause problems. *

* Also note that internally the method calls * {@link #setTextModifier} with either the default text modifier * (true), or with null (false). */ public void setRemoveIndentation(boolean state) { if (state) { setTextModifier(IndentRemover.getInstance()); } else { setTextModifier(null); } } /** * Returns the current {@link org.jdom.JDOMFactory} in use, if * one has been previously set with {@link #setFactory}, otherwise * null. * * @return the factory builder will use */ public JDOMFactory getFactory() { return factory; } /** * This will build a JDOM tree given a StAX stream reader. * * @param r Stream reader from which input is read. * @return Document - JDOM document object. * @throws XMLStreamException If the reader threw such exception (to * indicate a parsing or I/O problem) */ public Document build(XMLStreamReader r) throws XMLStreamException { /* Should we do sanity checking to see that r is positioned at * beginning? Not doing so will allow creating documents from * sub-trees, though? (not necessarily, depending on the * build loop: it may expect END_DOCUMENT?) */ JDOMFactory f = factory; if (f == null) { f = new UncheckedJDOMFactory(); } Document doc = f.document(null); buildTree(f, r, doc, textModifier); return doc; } /** * This takes a XMLStreamReader and builds up * a JDOM tree. Recursion has been eliminated by using nodes' * parent/child relationship; this improves performance somewhat * (classic recursion-by-iteration-and-explicit stack transformation) * * @param f Node factory to use for creating JDOM nodes * @param r Stream reader to use for reading the document from which * to build the tree * @param doc JDOM Document being built. * @param tmod Text modifier to use for modifying content of text * nodes (CHARACTERS, not CDATA), if any; null if no modifications * are needed (modifier is usually used for trimming unnecessary * but non-ignorable white space). */ protected void buildTree(JDOMFactory f, XMLStreamReader r, Document doc, StAXTextModifier tmod) throws XMLStreamException { Element current = null; // At top level /* Only relevant when trying to trim indentation. But if so, let's * just always allow modifications in prolog/epilog. */ boolean allowTextMods = (tmod != null); int evtType = XMLStreamConstants.START_DOCUMENT; main_loop: while (true) { int prevEvent = evtType; evtType = r.next(); /* 11-Dec-2004, TSa: We may want to trim (indentation) white * space... and it's easiest to do as a completely separate * piece of logic, before the main switch. */ if (allowTextMods) { // Ok; did we get CHARACTERS to potentially modify? if (evtType == XMLStreamConstants.CHARACTERS) { // Mayhaps we could be interested in modifying it? if (tmod.possiblyModifyText(r, prevEvent)) { /* Need to get text before iterating to see the * following event (as that'll lose it) */ String txt = r.getText(); evtType = r.next(); // So how should the text be modified if at all? txt = tmod.textToIncludeBetween(r, prevEvent, evtType, txt); // Need to output if it's non-empty text, then: if (txt != null && txt.length() > 0) { /* See discussion below for CHARACTERS case; basically * we apparently can't add anything in epilog/prolog, * not even white space. */ if (current != null) { f.addContent(current, f.text(txt)); } } prevEvent = XMLStreamConstants.CHARACTERS; // Ok, let's fall down to handle new current event } } // And then can just fall back to the regular handling } Content child; switch (evtType) { case XMLStreamConstants.CDATA: child = f.cdata(r.getText()); break; case XMLStreamConstants.SPACE: if (cfgIgnoreWS) { continue main_loop; } // fall through case XMLStreamConstants.CHARACTERS: /* Small complication: although (ignorable) white space * is allowed in prolog/epilog, and StAX may report such * event, JDOM barfs if trying to add it. Thus, let's just * ignore all textual stuff outside the tree: */ if (current == null) { continue main_loop; } child = f.text(r.getText()); break; case XMLStreamConstants.COMMENT: child = f.comment(r.getText()); break; case XMLStreamConstants.END_DOCUMENT: break main_loop; case XMLStreamConstants.END_ELEMENT: current = current.getParentElement(); if (tmod != null) { allowTextMods = tmod.allowModificationsAfter(r, evtType); } continue main_loop; case XMLStreamConstants.ENTITY_DECLARATION: case XMLStreamConstants.NOTATION_DECLARATION: /* Shouldn't really get these, but maybe some stream readers * do provide the info. If so, better ignore it -- DTD event * should have most/all we need. */ continue main_loop; case XMLStreamConstants.ENTITY_REFERENCE: child = f.entityRef(r.getLocalName()); break; case XMLStreamConstants.PROCESSING_INSTRUCTION: child = f.processingInstruction(r.getPITarget(), r.getPIData()); break; case XMLStreamConstants.START_ELEMENT: // Ok, need to add a new element... { Element newElem = null; String nsURI = r.getNamespaceURI(); String elemPrefix = r.getPrefix(); // needed for special handling of elem's namespace String ln = r.getLocalName(); if (nsURI == null || nsURI.length() == 0) { if (elemPrefix == null || elemPrefix.length() == 0) { newElem = f.element(ln); } else { /* Happens when a prefix is bound to the default * (empty) namespace... */ newElem = f.element(ln, elemPrefix, ""); } } else { newElem = f.element(ln, elemPrefix, nsURI); } /* Let's add element right away (probably have to do * it to bind attribute namespaces, too) */ if (current == null) { // at root doc.setRootElement(newElem); } else { f.addContent(current, newElem); } // Any declared namespaces? for (int i = 0, len = r.getNamespaceCount(); i < len; ++i) { String prefix = r.getNamespacePrefix(i); if (prefix == null) { prefix = ""; } Namespace ns = Namespace.getNamespace(prefix, r.getNamespaceURI(i)); // JDOM has special handling for element's "own" ns: if (prefix.equals(elemPrefix)) { ; // already set by when it was constructed... } else { f.addNamespaceDeclaration(newElem, ns); } } // And then the attributes: for (int i = 0, len = r.getAttributeCount(); i < len; ++i) { String prefix = r.getAttributePrefix(i); Namespace ns; if (prefix == null || prefix.length() == 0) { // Attribute not in any namespace ns = Namespace.NO_NAMESPACE; } else { ns = newElem.getNamespace(prefix); } Attribute attr = f.attribute(r.getAttributeLocalName(i), r.getAttributeValue(i), resolveAttrType(r.getAttributeType(i)), ns); f.setAttribute(newElem, attr); } // And then 'push' new element... current = newElem; } if (tmod != null) { allowTextMods = tmod.allowModificationsAfter(r, evtType); } // Already added the element, can continue continue main_loop; case XMLStreamConstants.START_DOCUMENT: /* This should only be received at the beginning of document... * so, should we indicate the problem or not? */ /* For now, let it pass: maybe some (broken) readers pass * that info as first event in beginning of doc? */ continue main_loop; case XMLStreamConstants.DTD: /* !!! Note: StAX does not expose enough information about * doctype declaration (specifically, public and system id!); * should (re-)parse information... not yet implemented */ // TBI continue main_loop; // Should never get these, from a stream reader: /* (commented out entries are just FYI; default catches * them all) */ //case XMLStreamConstants.ATTRIBUTE: //case XMLStreamConstants.NAMESPACE: default: throw new XMLStreamException("Unrecognized iterator event type: "+r.getEventType()+"; should not receive such types (broken stream reader?)"); } if (child != null) { if (current == null) { f.addContent(doc, child); } else { f.addContent(current, child); } } } } /** * Method called when option {@link #cfgRemoveIndent} is turned on; * to determine if current CHARACTERS event looks like it might * be used for indentation purposes (it is all white space, and * is either immediately after a start element, or could be * immediately before a start element). *

* The default implementation just checks whether the text segment * (known to be all white space) starts with a * linefeed character. */ protected boolean isIndentationWhitespace(XMLStreamReader r) throws XMLStreamException { String text = r.getText(); // Should never be empty... but let's be sure if (text.length() > 0) { char c = text.charAt(0); return (c == '\n' || c == '\r'); } return false; } // // // Private methods: private static int resolveAttrType(String typeStr) { if (typeStr != null && typeStr.length() > 0) { Integer I = (Integer) attrTypes.get(typeStr); if (I != null) { return I.intValue(); } } return Attribute.UNDECLARED_TYPE; } // // // Basic text modifier class(es) public static class IndentRemover extends StAXTextModifier { final static IndentRemover sInstance = new IndentRemover(); protected IndentRemover() { super(); } public static IndentRemover getInstance() { return sInstance; } /** * Always removes indentation after * all start and elements without any further checks; essentially * allowing (indentation) white space removal anywhere in the * document. */ public boolean allowModificationsAfter(XMLStreamReader r, int eventType) throws XMLStreamException { return true; } /** * Enables modifications for * so-called "indentation * white space", ie. all-whitespace (non-CDATA) text segment that * starts with * a linefeed character (\n or \r); provided it follows a non-text * event (anything other than CDATA, ENTITY_REFERENCE and CHARACTERS; * none of which usually should be adjacent to CHARACTERS event, * if text coalescing is enabled, and automatic entity expansion * is not disabled). */ public boolean possiblyModifyText(XMLStreamReader r, int prevEvent) throws XMLStreamException { if (r.getEventType() == XMLStreamConstants.CHARACTERS) { if (!(prevEvent == XMLStreamConstants.CHARACTERS || prevEvent == XMLStreamConstants.CDATA || prevEvent == XMLStreamConstants.ENTITY_REFERENCE)) { if (r.isWhiteSpace()) { String txt = r.getText(); if (txt.length() > 0) { // should always be true char c = txt.charAt(0); return (c == '\n' || c == '\r'); } } } } return false; } /** * If we ever get this far, we will still check that * the CHARACTERS event is not immediately followed by another * textual event. If so, we'll just remove the (all white space) * text event. */ public String textToIncludeBetween(XMLStreamReader r, int prevEvent, int nextEvent, String text) throws XMLStreamException { /* Only remove white space if neither preceding nor following * event is of non-ignorable textual type (CHARACTERS, CDATA, * ENTITY_REFERENCE; note that SPACE should never be adjacent * to CHARACTERS event). */ if (nextEvent == XMLStreamConstants.CHARACTERS || nextEvent == XMLStreamConstants.CDATA || nextEvent == XMLStreamConstants.ENTITY_REFERENCE) { return text; } /* If we got this far, we know it's indentation white space * and should just be removed completely: */ return null; } } // // // Testing /** * Trivial test driver for testing functionality. */ public static void main(String[] args) throws Exception { if (args.length != 1) { System.err.println("Usage: java ... [file]"); System.exit(1); } String filename = args[0]; java.io.Reader r = new java.io.FileReader(filename); javax.xml.stream.XMLInputFactory f = javax.xml.stream.XMLInputFactory.newInstance(); XMLStreamReader sr = f.createXMLStreamReader(r); StAXBuilder builder = new StAXBuilder(); Document domDoc = builder.build(sr); System.out.println("Done [with "+sr.getClass()+"]:"); System.out.println("----- JDom -----"); org.jdom.output.XMLOutputter outputter = new org.jdom.output.XMLOutputter(); java.io.PrintWriter pw = new java.io.PrintWriter(System.out); outputter.output(domDoc, pw); pw.flush(); System.out.println("----- /JDom -----"); } }