| TextualDocumentFormat.java |
1 /*
2 * TextualDocumentFormat.java
3 *
4 * Copyright (c) 1998-2004, The University of Sheffield.
5 *
6 * This file is part of GATE (see http://gate.ac.uk/), and is free
7 * software, licenced under the GNU Library General Public License,
8 * Version 2, June 1991 (in the distribution as file licence.html,
9 * and also available at http://gate.ac.uk/gate/licence.html).
10 *
11 * Cristian URSU, 26/May/2000
12 *
13 * $Id: TextualDocumentFormat.java,v 1.24 2004/07/21 17:10:03 akshay Exp $
14 */
15
16 package gate.corpora;
17
18 import gate.*;
19 import gate.creole.ResourceInstantiationException;
20 import gate.util.DocumentFormatException;
21
22 //import org.w3c.www.mime.*;
23
24 /** The format of Documents. Subclasses of DocumentFormat know about
25 * particular MIME types and how to unpack the information in any
26 * markup or formatting they contain into GATE annotations. Each MIME
27 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
28 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
29 * with a static index residing here when they are constructed. Static
30 * getDocumentFormat methods can then be used to get the appropriate
31 * format class for a particular document.
32 */
33 public class TextualDocumentFormat extends DocumentFormat
34 {
35
36 /** Debug flag */
37 private static final boolean DEBUG = false;
38
39 /** Default construction */
40 public TextualDocumentFormat() { super(); }
41
42 /** Initialise this resource, and return it. */
43 public Resource init() throws ResourceInstantiationException{
44 // Register plain text mime type
45 MimeType mime = new MimeType("text","plain");
46 // Register the class handler for this mime type
47 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
48 this);
49 // Register the mime type with mine string
50 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
51 // Register file sufixes for this mime type
52 suffixes2mimeTypeMap.put("txt",mime);
53 suffixes2mimeTypeMap.put("text",mime);
54 // Set the mimeType for this language resource
55 setMimeType(mime);
56 return this;
57 } // init()
58
59 /** Unpack the markup in the document. This converts markup from the
60 * native format (e.g. XML, RTF) into annotations in GATE format.
61 * Uses the markupElementsMap to determine which elements to convert, and
62 * what annotation type names to use.
63 */
64 public void unpackMarkup(Document doc) throws DocumentFormatException{
65 if (doc == null || doc.getContent() == null) return;
66 setNewLineProperty(doc);
67 // Create paragraph annotations in the specified annotation set
68 int endOffset = doc.getContent().toString().length();
69 int startOffset = 0;
70 annotateParagraphs(doc,startOffset,endOffset,
71 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
72 }//unpackMarkup
73
74 public void unpackMarkup(Document doc, RepositioningInfo repInfo,
75 RepositioningInfo ampCodingInfo)
76 throws DocumentFormatException {
77 unpackMarkup(doc);
78 } // unpackMarkup
79
80
81 /**
82 * Check the new line sequence and set document property.
83 * <BR>
84 * Possible values are CRLF, LFCR, CR, LF
85 */
86 protected void setNewLineProperty(Document doc) {
87 String content = doc.getContent().toString();
88 String newLineType = "";
89
90 char ch = ' ';
91 char lastch = ' ';
92 for(int i=0; i < content.length(); ++i) {
93 ch = content.charAt(i);
94 if(lastch == '\r') {
95 if(ch == '\n') {
96 newLineType = "CRLF";
97 break;
98 }
99 else {
100 newLineType = "CR";
101 break;
102 }
103 }
104 if(lastch == '\n') {
105 if(ch == '\r') {
106 newLineType = "LFCR";
107 break;
108 }
109 else {
110 newLineType = "LF";
111 break;
112 }
113 }
114 lastch = ch;
115 } // for
116
117 doc.getFeatures().put(GateConstants.DOCUMENT_NEW_LINE_TYPE, newLineType);
118 } // setNewLineProperty()
119
120 /** Delete '\r' in combination CRLF or LFCR in document content */
121 private void removeExtraNewLine(Document doc) {
122 String content = doc.getContent().toString();
123 StringBuffer buff = new StringBuffer(content);
124
125 char ch = ' ';
126 char lastch = ' ';
127 for(int i=content.length()-1; i > -1; --i) {
128 ch = content.charAt(i);
129 if(ch == '\n' && lastch == '\r') {
130 buff.deleteCharAt(i+1);
131 }
132 if(ch == '\r' && lastch == '\n') {
133 buff.deleteCharAt(i);
134 ch = lastch;
135 }
136 lastch = ch;
137 } // for
138
139 doc.setContent(new DocumentContentImpl(buff.toString()));
140 } // removeExtraNewLine(Document doc)
141
142 /** This method annotates paragraphs in a GATE document. The investigated text
143 * spans beetween start and end offsets and the paragraph annotations are
144 * created in the annotSetName. If annotSetName is null then they are creted
145 * in the default annotation set.
146 * @param aDoc is the gate document on which the paragraph detection would
147 * be performed.If it is null or its content it's null then the method woul
148 * simply return doing nothing.
149 * @param startOffset is the index form the document content from which the
150 * paragraph detection will start
151 * @param endOffset is the offset where the detection will end.
152 * @param annotSetName is the name of the set in which paragraph annotation
153 * would be created.The annotation type created will be "paragraph"
154 */
155 public void annotateParagraphs(Document aDoc,int startOffset,int endOffset,
156 String annotSetName)throws DocumentFormatException{
157 // Simply return if the document is null or its content
158 if (aDoc == null || aDoc.getContent() == null) return;
159 // Simply return if the start is > than the end
160 if (startOffset > endOffset) return;
161 // Decide where to put the newly detected annotations
162 AnnotationSet annotSet = null;
163 if (annotSetName == null)
164 annotSet = aDoc.getAnnotations();
165 else
166 annotSet = aDoc.getAnnotations(annotSetName);
167 // Extract the document content
168 String content = aDoc.getContent().toString();
169 // This is the offset marking the start of a para
170 int startOffsetPara = startOffset;
171 // This marks the ned of a para
172 int endOffsetPara = endOffset;
173 // The initial sate of the FSA
174 int state = 1;
175 // This field marks that a BR entity was read
176 // A BR entity can be NL or NL CR, depending on the operating system (UNIX
177 // or DOS)
178 boolean readBR = false;
179 int index = startOffset;
180 while (index < endOffset){
181 // Read the current char
182 char ch = content.charAt(index);
183 // Test if a BR entity was read
184 if (ch =='\n'){
185 readBR = true;
186 // If \n is followed by a \r then advance the index in order to read a
187 // BR entity
188 while ((index+1 < endOffset) && (content.charAt(index+1) == '\r'))
189 index ++;
190 }// End if
191 switch(state){
192 // It is the initial and also a final state
193 // Stay in state 1 while it reads whitespaces
194 case 1:{
195 // If reads a non whitespace char then move to state 2 and record
196 // the beggining of a paragraph
197 if (!Character.isWhitespace(ch)){
198 state = 2;
199 startOffsetPara = index;
200 }// End if
201 }break;
202 // It can be also a final state.
203 case 2:{
204 // Stay in state 2 while reading chars != BR entities
205 if (readBR){
206 // If you find a BR char go to state 3. The possible end of the para
207 // can be index. This will be confirmed by state 3. So, this is why
208 // the end of a para is recorded here.
209 readBR = false;
210 endOffsetPara = index;
211 state = 3;
212 }// End if
213 }break;
214 // It can be also a final state
215 // From state 3 there are only 2 possible ways: (state 2 or state1)
216 // In state 1 it needs to read a BR
217 // For state 2 it nead to read something different then a BR
218 case 3:{
219 if (readBR){
220 // A BR was read. Go to state 1
221 readBR = false;
222 state = 1;
223 // Create an annotation type paragraph
224 try{
225 annotSet.add( new Long(startOffsetPara),
226 new Long(endOffsetPara),
227 "paragraph",
228 Factory.newFeatureMap());
229 } catch (gate.util.InvalidOffsetException ioe){
230 throw new DocumentFormatException("Coudn't create a paragraph"+
231 " annotation",ioe);
232 }// End try
233 }else{
234 // Go to state 2 an keep reading chars
235 state = 2;
236 }// End if
237 }break;
238 }// End switch
239 // Prepare to read the next char.
240 index ++;
241 }// End while
242 endOffsetPara = index;
243 // Investigate where the finite automata has stoped
244 if ( state==2 || state==3 ){
245 // Create an annotation type paragraph
246 try{
247 annotSet.add( new Long(startOffsetPara),
248 // Create the final annotation using the endOffset
249 new Long(endOffsetPara),
250 "paragraph",
251 Factory.newFeatureMap());
252 } catch (gate.util.InvalidOffsetException ioe){
253 throw new DocumentFormatException("Coudn't create a paragraph"+
254 " annotation",ioe);
255 }// End try
256 }// End if
257 }// End annotateParagraphs();
258
259 public DataStore getDataStore(){ return null;}
260
261 } // class TextualDocumentFormat
262