| RtfDocumentFormat.java |
1 /*
2 * RtfDocumentFormat.java
3 *
4 * Copyright (c) 1998-2004, The University of Sheffield.
5 *
6 * This file is part of GATE (see http://gate.ac.uk/), and is free
7 * software, licenced under the GNU Library General Public License,
8 * Version 2, June 1991 (in the distribution as file licence.html,
9 * and also available at http://gate.ac.uk/gate/licence.html).
10 *
11 * Cristian URSU, 26/July/2000
12 *
13 * $Id: RtfDocumentFormat.java,v 1.19 2004/07/21 17:10:03 akshay Exp $
14 */
15
16 package gate.corpora;
17
18 import java.io.*;
19
20 import javax.swing.text.*;
21 import javax.swing.text.rtf.RTFEditorKit;
22
23 import gate.Resource;
24 import gate.creole.ResourceInstantiationException;
25 import gate.util.DocumentFormatException;
26 //import org.w3c.www.mime.*;
27
28 /** The format of Documents. Subclasses of DocumentFormat know about
29 * particular MIME types and how to unpack the information in any
30 * markup or formatting they contain into GATE annotations. Each MIME
31 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
32 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
33 * with a static index residing here when they are constructed. Static
34 * getDocumentFormat methods can then be used to get the appropriate
35 * format class for a particular document.
36 */
37 public class RtfDocumentFormat extends TextualDocumentFormat{
38
39 /** Debug flag */
40 private static final boolean DEBUG = false;
41
42 /** Default construction */
43 public RtfDocumentFormat() { super(); }
44
45 /** Unpack the markup in the document. This converts markup from the
46 * native format (e.g.RTF) into annotations in GATE format.
47 * Uses the markupElementsMap to determine which elements to convert, and
48 * what annotation type names to use.
49 * It always tryes to parse te doc's content. It doesn't matter if the
50 * sourceUrl is null or not.
51 *
52 * @param doc The gate document you want to parse.
53 *
54 */
55 public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
56
57 if ( (doc == null) ||
58 (doc.getSourceUrl() == null && doc.getContent() == null)){
59
60 throw new DocumentFormatException(
61 "GATE document is null or no content found. Nothing to parse!");
62 }// End if
63
64 // create a RTF editor kit
65 RTFEditorKit aRtfEditorkit = new RTFEditorKit();
66
67 // create a Styled Document
68 // NOTE that RTF Kit works only with Systled Document interface
69 StyledDocument styledDoc = new DefaultStyledDocument();
70
71 // get an Input stream from the gate document
72 InputStream in = new ByteArrayInputStream(
73 doc.getContent().toString().getBytes()
74 );
75
76 try {
77 aRtfEditorkit.read(in, styledDoc, 0);
78 // replace the document content with the one without markups
79 doc.setContent(new DocumentContentImpl(
80 styledDoc.getText(0,styledDoc.getLength())
81 )
82 );
83 } catch (BadLocationException e) {
84 throw new DocumentFormatException(e);
85 } catch (IOException e){
86 throw new DocumentFormatException("I/O exception for " +
87 doc.getSourceUrl().toExternalForm(),e);
88 }
89 } // unpackMarkup(doc)
90
91 /** Initialise this resource, and return it. */
92 public Resource init() throws ResourceInstantiationException{
93 // Register RTF mime type
94 MimeType mime = new MimeType("text","rtf");
95 // Register the class handler for this mime type
96 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
97 this);
98 // Register the mime type with mine string
99 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
100 // Register file sufixes for this mime type
101 suffixes2mimeTypeMap.put("rtf",mime);
102 // Register magic numbers for this mime type
103 magic2mimeTypeMap.put("{\\rtf1",mime);
104 // Set the mimeType for this language resource
105 setMimeType(mime);
106 return this;
107 }// init()
108 }// class RtfDocumentFormat
109