| XmlDocumentFormat.java |
1 /*
2 * XmlDocumentFormat.java
3 *
4 * Copyright (c) 1998-2004, The University of Sheffield.
5 *
6 * This file is part of GATE (see http://gate.ac.uk/), and is free
7 * software, licenced under the GNU Library General Public License,
8 * Version 2, June 1991 (in the distribution as file licence.html,
9 * and also available at http://gate.ac.uk/gate/licence.html).
10 *
11 * Cristian URSU, 26/May/2000
12 *
13 * $Id: XmlDocumentFormat.java,v 1.52 2004/10/22 15:35:25 kalina Exp $
14 */
15
16 package gate.corpora;
17
18 //import com.sun.xml.parser.* ;
19 import java.io.*;
20 import java.net.URLConnection;
21
22 import javax.xml.parsers.*;
23
24 import org.xml.sax.InputSource;
25 import org.xml.sax.SAXException;
26
27 import gate.*;
28 import gate.creole.ResourceInstantiationException;
29 import gate.event.StatusListener;
30 import gate.util.DocumentFormatException;
31 import gate.util.Out;
32 import gate.xml.*;
33 //import org.w3c.www.mime.*;
34
35 /** The format of Documents. Subclasses of DocumentFormat know about
36 * particular MIME types and how to unpack the information in any
37 * markup or formatting they contain into GATE annotations. Each MIME
38 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
39 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
40 * with a static index residing here when they are constructed. Static
41 * getDocumentFormat methods can then be used to get the appropriate
42 * format class for a particular document.
43 */
44 public class XmlDocumentFormat extends TextualDocumentFormat
45 {
46 /** Debug flag */
47 private static final boolean DEBUG = false;
48
49 /** Default construction */
50 public XmlDocumentFormat() { super(); }
51
52 /** We could collect repositioning information during XML parsing */
53 public Boolean supportsRepositioning() {
54 return new Boolean(true);
55 } // supportsRepositioning
56
57 /** Old style of unpackMarkup (without collecting of RepositioningInfo) */
58 public void unpackMarkup(Document doc) throws DocumentFormatException {
59 unpackMarkup(doc, (RepositioningInfo) null, (RepositioningInfo) null);
60 } // unpackMarkup
61
62
63 /** Unpack the markup in the document. This converts markup from the
64 * native format (e.g. XML) into annotations in GATE format.
65 * Uses the markupElementsMap to determine which elements to convert, and
66 * what annotation type names to use. If the document was created from a
67 * String, then is recomandable to set the doc's sourceUrl to <b>null</b>.
68 * So, if the document has a valid URL, then the parser will try to
69 * parse the XML document pointed by the URL.If the URL is not valid, or
70 * is null, then the doc's content will be parsed. If the doc's content is
71 * not a valid XML then the parser might crash.
72 *
73 * @param doc The gate document you want to parse. If
74 * <code>doc.getSourceUrl()</code> returns <b>null</b> then the content of
75 * doc will be parsed. Using a URL is recomended because the parser will
76 * report errors corectlly if the XML document is not well formed.
77 */
78 public void unpackMarkup(Document doc, RepositioningInfo repInfo,
79 RepositioningInfo ampCodingInfo) throws DocumentFormatException {
80 if( (doc == null) ||
81 (doc.getSourceUrl() == null && doc.getContent() == null)){
82
83 throw new DocumentFormatException(
84 "GATE document is null or no content found. Nothing to parse!");
85 }// End if
86
87 boolean docHasContentButNoValidURL = false;
88 // This is a test to see if the GATE document has a valid URL or a valid
89 // content. If doesn't has a valid URL then try to parse its content as XML
90 try{
91 if (doc.getSourceUrl() == null && doc.getContent() != null){
92 // The doc's url is null but there is a content.
93 docHasContentButNoValidURL = true;
94 }else {URLConnection conn = doc.getSourceUrl().openConnection();}
95 }catch (IOException ex1){
96 // The URL is not null but is not valid.
97 if(doc.getContent() == null)
98 // The document content is also null. There is nothing we can do.
99 throw new DocumentFormatException("The document doesn't have a" +
100 " valid URL and also no content");
101 docHasContentButNoValidURL = true;
102 }// End try
103
104 // Create a status listener
105 StatusListener statusListener = new StatusListener(){
106 public void statusChanged(String text){
107 // This is implemented in DocumentFormat.java and inherited here
108 fireStatusChanged(text);
109 }
110 };
111 GateFormatXmlDocumentHandler gateXmlHandler = null;
112 XmlDocumentHandler xmlDocHandler = null;
113 if (docHasContentButNoValidURL)
114 parseDocumentWithoutURL(doc, repInfo, ampCodingInfo);
115 else try {
116 // use Excerces XML parser with JAXP
117 // System.setProperty("javax.xml.parsers.SAXParserFactory",
118 // "org.apache.xerces.jaxp.SAXParserFactoryImpl");
119 // Get a parser factory.
120 SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
121 // Set up the factory to create the appropriate type of parser
122 // non validating one
123 saxParserFactory.setValidating(false);
124 // non namesapace aware one
125 saxParserFactory.setNamespaceAware(true);
126 // create it
127 SAXParser xmlParser = saxParserFactory.newSAXParser();
128 if (isGateXmlDocument){
129 // Construct the appropiate xml handler for the job.
130 gateXmlHandler = new GateFormatXmlDocumentHandler(doc);
131 // Register a status listener
132 gateXmlHandler.addStatusListener(statusListener);
133 // Parse the Gate Document
134 xmlParser.parse(doc.getSourceUrl().toString(), gateXmlHandler);
135 gateXmlHandler.removeStatusListener(statusListener);
136 }else{
137 // Create a new Xml document handler
138 xmlDocHandler = new XmlDocumentHandler( doc,
139 this.markupElementsMap,
140 this.element2StringMap);
141 // Register a status listener with it
142 xmlDocHandler.addStatusListener(statusListener);
143 // set repositioning object
144 xmlDocHandler.setRepositioningInfo(repInfo);
145 // set the object with ampersand coding positions
146 xmlDocHandler.setAmpCodingInfo(ampCodingInfo);
147
148 // Parse the document handler
149 /* Angel
150 xmlParser.parse(doc.getSourceUrl().toString(), xmlDocHandler );
151 Angel */
152 // try to choose concret parser (Xerces)
153 // Angel - start
154
155 org.xml.sax.XMLReader newxmlParser = xmlParser.getXMLReader();
156 //Niraj org.apache.xerces.parsers.SAXParser newxmlParser =
157 // Niraj new org.apache.xerces.parsers.SAXParser();
158 // Set up the factory to create the appropriate type of parser
159 // non validating one
160 // http://xml.org/sax/features/validation set to false
161 newxmlParser.setFeature("http://xml.org/sax/features/validation", false);
162 // namesapace aware one
163 // http://xml.org/sax/features/namespaces set to true
164 newxmlParser.setFeature("http://xml.org/sax/features/namespaces", true);
165 newxmlParser.setFeature("http://xml.org/sax/features/namespace-prefixes", true);
166 newxmlParser.setContentHandler(xmlDocHandler);
167 newxmlParser.setErrorHandler(xmlDocHandler);
168 newxmlParser.setDTDHandler(xmlDocHandler);
169 newxmlParser.setEntityResolver(xmlDocHandler);
170 newxmlParser.parse(doc.getSourceUrl().toString());
171 // Angel - end
172 ((DocumentImpl) doc).setNextAnnotationId(
173 xmlDocHandler.getCustomObjectsId());
174 xmlDocHandler.removeStatusListener(statusListener);
175 }// End if
176 } catch (ParserConfigurationException e){
177 throw
178 new DocumentFormatException("XML parser configuration exception ", e);
179 } catch (SAXException e){
180 doc.getFeatures().put("parsingError", new Boolean(true));
181
182 Boolean bThrow = (Boolean)
183 doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
184
185 if(bThrow != null && bThrow.booleanValue()) {
186 // the next line is commented to avoid Document creation fail on error
187 throw new DocumentFormatException(e);
188 }
189 else {
190 Out.println("Warning: Document remains unparsed. \n"
191 +"\n Stack Dump: ");
192 e.printStackTrace(Out.getPrintWriter());
193 } // if
194
195 } catch (IOException e){
196 throw new DocumentFormatException("I/O exception for " +
197 doc.getSourceUrl().toString());
198 }finally{
199 if(gateXmlHandler != null)
200 gateXmlHandler.removeStatusListener(statusListener);
201 if (xmlDocHandler != null)
202 xmlDocHandler.removeStatusListener(statusListener);
203 }// End if else try
204 }// unpackMarkup
205
206 /** Called from unpackMarkup() if the document have been created from a
207 * string
208 */
209 private void parseDocumentWithoutURL(gate.Document aDocument,
210 RepositioningInfo repInfo,
211 RepositioningInfo ampCodingInfo)
212 throws DocumentFormatException {
213
214 XmlDocumentHandler xmlDocHandler = null;
215 // Create a status listener
216 StatusListener statusList = new StatusListener(){
217 public void statusChanged(String text){
218 // this is implemented in DocumentFormat.java and inherited here
219 fireStatusChanged(text);
220 }
221 };
222 try{
223 Reader reader = new StringReader(aDocument.getContent().toString());
224 //
225 //
226 // new InputStreamReader(
227 // new ByteArrayInputStream(aDocument.getContent().toString().getBytes("UTF-8")),
228 // "UTF-8");
229 InputSource is = new InputSource(reader);
230
231
232 // use Excerces XML parser with JAXP
233 // System.setProperty("javax.xml.parsers.SAXParserFactory",
234 // "org.apache.xerces.jaxp.SAXParserFactoryImpl");
235 // Get a parser factory.
236 SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
237 // Set up the factory to create the appropriate type of parser
238 // non validating one
239 saxParserFactory.setValidating(false);
240 // non namesapace aware one
241 saxParserFactory.setNamespaceAware(true);
242 // create it
243 SAXParser xmlParser = saxParserFactory.newSAXParser();
244
245 // create a new Xml document handler
246 xmlDocHandler = new XmlDocumentHandler(aDocument,
247 this.markupElementsMap,
248 this.element2StringMap);
249 // Regsiter the statusListener with xmlDocHandler
250 xmlDocHandler.addStatusListener(statusList);
251 // set repositioning object
252 xmlDocHandler.setRepositioningInfo(repInfo);
253 // set the object with ampersand coding positions
254 xmlDocHandler.setAmpCodingInfo(ampCodingInfo);
255 // Parse the document handler
256 /* Angel
257 // xmlParser.parse(is, xmlDocHandler);
258 Angel */
259
260 // Angel - start
261 // try to choose concret parser
262 org.xml.sax.XMLReader newxmlParser = xmlParser.getXMLReader();
263 // Niraj org.apache.xerces.parsers.SAXParser newxmlParser =
264 // Niraj new org.apache.xerces.parsers.SAXParser();
265 // Set up the factory to create the appropriate type of parser
266 // non validating one
267 // http://xml.org/sax/features/validation set to false
268 newxmlParser.setFeature("http://xml.org/sax/features/validation", false);
269 // namesapace aware one
270 // http://xml.org/sax/features/namespaces set to true
271 newxmlParser.setFeature("http://xml.org/sax/features/namespaces", true);
272 newxmlParser.setFeature("http://xml.org/sax/features/namespace-prefixes", true);
273 newxmlParser.setContentHandler(xmlDocHandler);
274 newxmlParser.setErrorHandler(xmlDocHandler);
275 newxmlParser.setDTDHandler(xmlDocHandler);
276 newxmlParser.setEntityResolver(xmlDocHandler);
277 newxmlParser.parse(is);
278 // Angel - end
279
280 ((DocumentImpl) aDocument).setNextAnnotationId(
281 xmlDocHandler.getCustomObjectsId());
282 } catch (ParserConfigurationException e){
283 throw new DocumentFormatException(
284 "XML parser configuration exception ", e);
285 } catch (SAXException e){
286 throw new DocumentFormatException(e);
287 } catch (IOException e){
288 throw new DocumentFormatException(e);
289 }finally{
290 // Remove the statusListener with xmlDocHandler
291 xmlDocHandler.removeStatusListener(statusList);
292 }// End try
293 }// End parseDocumentWithoutURL()
294
295 /** Initialise this resource, and return it. */
296 public Resource init() throws ResourceInstantiationException{
297 // Register XML mime type
298 MimeType mime = new MimeType("text","xml");
299 // Register the class handler for this mime type
300 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
301 this);
302 // Register the mime type with mine string
303 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
304 //sometimes XML file appear as application/xml
305 mimeString2mimeTypeMap.put("application/xml", mime);
306 // Register file sufixes for this mime type
307 suffixes2mimeTypeMap.put("xml",mime);
308 suffixes2mimeTypeMap.put("xhtm",mime);
309 suffixes2mimeTypeMap.put("xhtml",mime);
310 // Register magic numbers for this mime type
311 magic2mimeTypeMap.put("<?xml",mime);
312 // Set the mimeType for this language resource
313 setMimeType(mime);
314 return this;
315 }// init()
316
317 }//class XmlDocumentFormat
318