| EmailDocumentFormat.java |
1 /*
2 * EmailDocumentFormat.java
3 *
4 * Copyright (c) 1998-2004, The University of Sheffield.
5 *
6 * This file is part of GATE (see http://gate.ac.uk/), and is free
7 * software, licenced under the GNU Library General Public License,
8 * Version 2, June 1991 (in the distribution as file licence.html,
9 * and also available at http://gate.ac.uk/gate/licence.html).
10 *
11 * Cristian URSU, 3/Aug/2000
12 *
13 * $Id: EmailDocumentFormat.java,v 1.28 2004/07/21 17:10:03 akshay Exp $
14 */
15
16 package gate.corpora;
17
18 import java.io.IOException;
19 import java.util.Iterator;
20
21 import gate.*;
22 import gate.creole.ResourceInstantiationException;
23 import gate.email.EmailDocumentHandler;
24 import gate.event.StatusListener;
25 import gate.util.DocumentFormatException;
26 import gate.util.InvalidOffsetException;
27
28 //import org.w3c.www.mime.*;
29
30 /** The format of Documents. Subclasses of DocumentFormat know about
31 * particular MIME types and how to unpack the information in any
32 * markup or formatting they contain into GATE annotations. Each MIME
33 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
34 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
35 * with a static index residing here when they are constructed. Static
36 * getDocumentFormat methods can then be used to get the appropriate
37 * format class for a particular document.
38 */
39 public class EmailDocumentFormat extends TextualDocumentFormat
40 {
41 /** Debug flag */
42 private static final boolean DEBUG = false;
43
44 /** Default construction */
45 public EmailDocumentFormat() { super();}
46
47 /** Unpack the markup in the document. This converts markup from the
48 * native format (e.g. EMAIL) into annotations in GATE format.
49 * Uses the markupElementsMap to determine which elements to convert, and
50 * what annotation type names to use.
51 * It always tryes to parse te doc's content. It doesn't matter if the
52 * sourceUrl is null or not.
53 *
54 * @param doc The gate document you want to parse.
55 *
56 */
57
58 public void unpackMarkup(gate.Document doc) throws DocumentFormatException{
59 if ( (doc == null) ||
60 (doc.getSourceUrl() == null && doc.getContent() == null)){
61
62 throw new DocumentFormatException(
63 "GATE document is null or no content found. Nothing to parse!");
64 }// End if
65
66 setNewLineProperty(doc);
67
68 // create an EmailDocumentHandler
69 EmailDocumentHandler emailDocHandler = null;
70 emailDocHandler = new gate.email.EmailDocumentHandler(
71 doc,
72 this.markupElementsMap,
73 this.element2StringMap);
74 StatusListener statusListener = new StatusListener(){
75 public void statusChanged(String text) {
76 // this is implemented in DocumentFormat.java and inherited here
77 fireStatusChanged(text);
78 }//statusChanged(String text)
79 };
80 // Register a status listener with it
81 emailDocHandler.addStatusListener(statusListener);
82 try{
83 // Call the method that creates annotations on the gate document
84 emailDocHandler.annotateMessages();
85 // Process the body annotations and search for paragraphs
86 AnnotationSet bodyAnnotations = doc.getAnnotations(
87 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get("body");
88 if (bodyAnnotations != null && !bodyAnnotations.isEmpty()){
89 Iterator iter = bodyAnnotations.iterator();
90 while(iter.hasNext()){
91 Annotation a = (Annotation)iter.next();
92 annotateParagraphs(doc,a.getStartNode().getOffset().intValue(),
93 a.getEndNode().getOffset().intValue(),
94 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
95 }// End while
96 }// End if
97 } catch (IOException e){
98 throw new DocumentFormatException("Couldn't create a buffered reader ",e);
99 } catch (InvalidOffsetException e){
100 throw new DocumentFormatException(e);
101 }finally{
102 emailDocHandler.removeStatusListener(statusListener);
103 }// End try
104 }//unpackMarkup(doc)
105
106 /** Initialise this resource, and return it. */
107 public Resource init() throws ResourceInstantiationException{
108 // Register EMAIL mime type
109 MimeType mime = new MimeType("text","email");
110 // Register the class handler for this mime type
111 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
112 this);
113 // Register the mime type with mine string
114 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
115 // Register file sufixes for this mime type
116 suffixes2mimeTypeMap.put("eml",mime);
117 suffixes2mimeTypeMap.put("email",mime);
118 suffixes2mimeTypeMap.put("mail",mime);
119 // Register magic numbers for this mime type
120 magic2mimeTypeMap.put("Subject:",mime);
121 // Set the mimeType for this language resource
122 setMimeType(mime);
123 return this;
124 }// init()
125 }// class EmailDocumentFormat
126
127