| SgmlDocumentFormat.java |
1 /*
2 * SgmlDocumentFormat.java
3 *
4 * Copyright (c) 1998-2004, The University of Sheffield.
5 *
6 * This file is part of GATE (see http://gate.ac.uk/), and is free
7 * software, licenced under the GNU Library General Public License,
8 * Version 2, June 1991 (in the distribution as file licence.html,
9 * and also available at http://gate.ac.uk/gate/licence.html).
10 *
11 * Cristian URSU, 4/July/2000
12 *
13 * $Id: SgmlDocumentFormat.java,v 1.31 2004/07/21 17:10:03 akshay Exp $
14 */
15
16 package gate.corpora;
17
18 import java.io.IOException;
19
20 import javax.xml.parsers.*;
21
22 import org.xml.sax.SAXException;
23
24 import gate.Document;
25 import gate.Resource;
26 import gate.creole.ResourceInstantiationException;
27 import gate.event.StatusListener;
28 import gate.sgml.Sgml2Xml;
29 import gate.util.DocumentFormatException;
30 import gate.xml.XmlDocumentHandler;
31
32 /** The format of Documents. Subclasses of DocumentFormat know about
33 * particular MIME types and how to unpack the information in any
34 * markup or formatting they contain into GATE annotations. Each MIME
35 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
36 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
37 * with a static index residing here when they are constructed. Static
38 * getDocumentFormat methods can then be used to get the appropriate
39 * format class for a particular document.
40 */
41 public class SgmlDocumentFormat extends TextualDocumentFormat
42 {
43 /** Debug flag */
44 private static final boolean DEBUG = false;
45
46 /** Default construction */
47 public SgmlDocumentFormat() { super(); }
48
49 /** Unpack the markup in the document. This converts markup from the
50 * native format (e.g. SGML) into annotations in GATE format.
51 * Uses the markupElementsMap to determine which elements to convert, and
52 * what annotation type names to use.
53 * The doc's content is first converted to a wel formed XML.
54 * If this succeddes then the document is saved into a temp file and parsed
55 * as an XML document.
56 *
57 * @param doc The gate document you want to parse.
58 *
59 */
60 public void unpackMarkup(Document doc) throws DocumentFormatException{
61 if ( (doc == null) ||
62 (doc.getSourceUrl() == null && doc.getContent() == null)){
63
64 throw new DocumentFormatException(
65 "GATE document is null or no content found. Nothing to parse!");
66 }// End if
67 // Create a status listener
68 StatusListener statusListener = new StatusListener(){
69 public void statusChanged(String text){
70 fireStatusChanged(text);
71 }
72 };
73 XmlDocumentHandler xmlDocHandler = null;
74 try {
75 Sgml2Xml sgml2Xml = new Sgml2Xml(doc);
76
77 fireStatusChanged("Performing SGML to XML...");
78
79 // convert the SGML document
80 String xmlUri = sgml2Xml.convert();
81
82 fireStatusChanged("DONE !");
83
84 //Out.println("Conversion done..." + xmlUri);
85 //Out.println(sgml2Xml.convert());
86 // Get a parser factory.
87 SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
88 // Set up the factory to create the appropriate type of parser
89
90 // Set up the factory to create the appropriate type of parser
91 // non validating one
92 saxParserFactory.setValidating(false);
93 // non namesapace aware one
94 saxParserFactory.setNamespaceAware(true);
95
96 // Create a SAX parser
97 SAXParser parser = saxParserFactory.newSAXParser();
98
99 // use it
100 if (null != doc){
101 // create a new Xml document handler
102 xmlDocHandler = new XmlDocumentHandler(doc,
103 this.markupElementsMap,
104 this.element2StringMap);
105
106 // register a status listener with it
107 xmlDocHandler.addStatusListener(statusListener);
108
109 parser.parse(xmlUri, xmlDocHandler);
110 ((DocumentImpl) doc).setNextAnnotationId(
111 xmlDocHandler.getCustomObjectsId());
112 }// end if
113 } catch (ParserConfigurationException e){
114 throw
115 new DocumentFormatException("XML parser configuration exception ", e);
116 } catch (SAXException e){
117 throw new DocumentFormatException(e);
118 } catch (IOException e){
119 throw new DocumentFormatException("I/O exception for " +
120 doc.getSourceUrl().toString());
121 }finally{
122 if (xmlDocHandler != null)
123 xmlDocHandler.removeStatusListener(statusListener);
124 }// End try
125
126 }// unpackMarkup
127
128 /** This method converts the document's content from SGML 2 XML.*/
129 private String sgml2Xml(Document doc) {
130 String xmlUri = doc.getSourceUrl().toString ();
131
132 return xmlUri;
133 }// sgml2Xml()
134
135 /** Initialise this resource, and return it. */
136 public Resource init() throws ResourceInstantiationException{
137 // Register SGML mime type
138 MimeType mime = new MimeType("text","sgml");
139 // Register the class handler for this mime type
140 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
141 this);
142 // Register the mime type with mine string
143 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
144 // Register file sufixes for this mime type
145 suffixes2mimeTypeMap.put("sgm",mime);
146 suffixes2mimeTypeMap.put("sgml",mime);
147 setMimeType(mime);
148 return this;
149 }// init
150
151 }//class SgmlDocumentFormat
152