| DocumentFormat.java |
1 /*
2 * DocumentFormat.java
3 *
4 * Copyright (c) 1998-2004, The University of Sheffield.
5 *
6 * This file is part of GATE (see http://gate.ac.uk/), and is free
7 * software, licenced under the GNU Library General Public License,
8 * Version 2, June 1991 (in the distribution as file licence.html,
9 * and also available at http://gate.ac.uk/gate/licence.html).
10 *
11 * Hamish Cunningham, 25/May/2000
12 *
13 * $Id: DocumentFormat.java,v 1.51 2004/07/21 17:10:02 akshay Exp $
14 */
15
16 package gate;
17
18 import java.io.*;
19 import java.net.URL;
20 import java.util.*;
21
22 import gate.corpora.MimeType;
23 import gate.corpora.RepositioningInfo;
24 import gate.creole.AbstractLanguageResource;
25 import gate.event.StatusListener;
26 import gate.util.DocumentFormatException;
27
28 /** The format of Documents. Subclasses of DocumentFormat know about
29 * particular MIME types and how to unpack the information in any
30 * markup or formatting they contain into GATE annotations. Each MIME
31 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
32 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
33 * with a static index residing here when they are constructed. Static
34 * getDocumentFormat methods can then be used to get the appropriate
35 * format class for a particular document.
36 */
37 public abstract class DocumentFormat
38 extends AbstractLanguageResource implements LanguageResource{
39 /** Debug flag */
40 private static final boolean DEBUG = false;
41
42 /** This fields indicates whether the document being processed is in a
43 * Gate XML custom format.
44 * Detection is done in runMagicNumbers().
45 */
46 protected static boolean isGateXmlDocument = false;
47
48 /** The MIME type of this format. */
49 private MimeType mimeType = null;
50
51 /** Map of MimeTypeString to ClassHandler class. This is used to find the
52 * language resource that deals with the specific Document format
53 */
54 protected static Map mimeString2ClassHandlerMap = new HashMap();
55 /** Map of MimeType to DocumentFormat Class. This is used to find the
56 * DocumentFormat subclass that deals with a particular MIME type.
57 */
58 protected static Map mimeString2mimeTypeMap = new HashMap();
59
60 /** Map of Set of file suffixes to MimeType. This is used to figure
61 * out what MIME type a document is from its file name.
62 */
63 protected static Map suffixes2mimeTypeMap = new HashMap();
64
65 /** Map of Set of magic numbers to MimeType. This is used to guess the
66 * MIME type of a document, when we don't have any other clues.
67 */
68 protected static Map magic2mimeTypeMap = new HashMap();
69
70 /** Map of markup elements to annotation types. If it is null, the
71 * unpackMarkup() method will convert all markup, using the element names
72 * for annotation types. If it is non-null, only those elements specified
73 * here will be converted.
74 */
75 protected Map markupElementsMap = null;
76
77 /** This map is used inside uppackMarkup() method...
78 * When an element from the map is encounted, The corresponding string
79 * element is added to the document content
80 */
81 protected Map element2StringMap = null;
82
83 /** The features of this resource */
84 private FeatureMap features = null;
85
86 /** Default construction */
87 public DocumentFormat() {}
88
89 /** listeners for status report */
90 private transient Vector statusListeners;
91
92 /** Flag for enable/disable collecting of repositioning information */
93 private Boolean shouldCollectRepositioning = new Boolean(false);
94
95 /** If the document format could collect repositioning information
96 * during the unpack phase this method will return <B>true</B>.
97 * <BR>
98 * You should override this method in the child class of the defined
99 * document format if it could collect the repositioning information.
100 */
101 public Boolean supportsRepositioning() {
102 return new Boolean(false);
103 } // supportsRepositioning
104
105 public void setShouldCollectRepositioning(Boolean b) {
106 if(supportsRepositioning().booleanValue() && b.booleanValue()) {
107 shouldCollectRepositioning = b;
108 }
109 else {
110 shouldCollectRepositioning = new Boolean(false);
111 } // if
112 } // setShouldCollectRepositioning
113
114 public Boolean getShouldCollectRepositioning() {
115 return shouldCollectRepositioning;
116 } //
117
118 /** Unpack the markup in the document. This converts markup from the
119 * native format (e.g. XML, RTF) into annotations in GATE format.
120 * Uses the markupElementsMap to determine which elements to convert, and
121 * what annotation type names to use.
122 */
123 abstract public void unpackMarkup(Document doc)
124 throws DocumentFormatException;
125
126 abstract public void unpackMarkup(Document doc, RepositioningInfo repInfo,
127 RepositioningInfo ampCodingInfo)
128 throws DocumentFormatException;
129 /** Unpack the markup in the document. This method calls unpackMarkup on the
130 * GATE document, but after it saves its content as a feature atached to
131 * the document. This method is usefull if one wants to save the content
132 * of the document being unpacked. After the markups have been unpacked,
133 * the content of the document will be replaced with a new one containing
134 * the text between markups.
135 *
136 * @param doc the document that will be upacked
137 * @param originalContentFeatureType the name of the feature that will hold
138 * the document's content.
139 */
140 public void unpackMarkup( Document doc,
141 String originalContentFeatureType )
142 throws DocumentFormatException{
143 FeatureMap fm = doc.getFeatures();
144 if (fm == null) fm = Factory.newFeatureMap();
145 fm.put(originalContentFeatureType, doc.getContent().toString());
146 doc.setFeatures(fm);
147 unpackMarkup(doc);
148 }// unpackMarkup();
149
150 /**
151 * Returns a MimeType having as input a fileSufix.
152 * If the file sufix is <b>null</b> or not recognised then,
153 * <b>null</b> will be returned.
154 * @param fileSufix The file sufix associated with a recognisabe mime type.
155 * @return The MimeType associated with this file suffix.
156 */
157 static private MimeType getMimeType(String fileSufix){
158 // Get a mimeType string associated with this fileSuffix
159 // Eg: for html returns MimeType("text/html"), for xml returns
160 // MimeType("text/xml")
161 if(fileSufix == null) return null;
162 return (MimeType) suffixes2mimeTypeMap.get(fileSufix.toLowerCase());
163 }//getMimeType
164
165 /**
166 * Returns a MymeType having as input a URL object. If the MimeType wasn't
167 * recognized it returns <b>null</b>.
168 * @param url The URL object from which the MimeType will be extracted
169 * @return A MimeType object for that URL, or <b>null</b> if the Mime Type is
170 * unknown.
171 */
172 static private MimeType getMimeType(URL url) {
173 String mimeTypeString = null;
174 String charsetFromWebServer = null;
175 String contentType = null;
176 InputStream is = null;
177 MimeType mimeTypeFromWebServer = null;
178 MimeType mimeTypeFromFileSuffix = null;
179 MimeType mimeTypeFromMagicNumbers = null;
180 String fileSufix = null;
181
182 if (url == null)
183 return null;
184 // Ask the web server for the content type
185 // We expect to get contentType something like this:
186 // "text/html; charset=iso-8859-1"
187 // Charset is optional
188 try{
189 is = url.openConnection().getInputStream();
190 contentType = url.openConnection().getContentType();
191 } catch (IOException e){
192 // Failed to get the content type with te Web server.
193 // Let's try some other methods like FileSuffix or magic numbers.
194 }
195 // If a content Type was returned by the server, try to get the mime Type
196 // string
197 // If contentType is something like this:"text/html; charset=iso-8859-1"
198 // try to get content Type string (text/html)
199 if (contentType != null){
200 StringTokenizer st = new StringTokenizer(contentType, ";");
201 // We assume that the first token is the mime type string...
202 // If this doesn't happen then BAD LUCK :(( ...
203 if (st.hasMoreTokens())
204 mimeTypeString = st.nextToken().toLowerCase();
205 // The next token it should be the CharSet
206 if (st.hasMoreTokens())
207 charsetFromWebServer = st.nextToken().toLowerCase();
208 if (charsetFromWebServer != null){
209 //We have something like : "charset=iso-8859-1" and let's extract the
210 // encoding.
211 st = new StringTokenizer(charsetFromWebServer, "=");
212 // Don't need this anymore
213 charsetFromWebServer = null;
214 // Discarding the first token which is : "charset"
215 if (st.hasMoreTokens())
216 st.nextToken().toUpperCase();
217 // Get the encoding : "ISO-8859-1"
218 if (st.hasMoreTokens())
219 charsetFromWebServer = st.nextToken().toUpperCase();
220 } // End if
221 }// end if
222 // Return the corresponding MimeType with WebServer from the associated MAP
223 mimeTypeFromWebServer = (MimeType)
224 mimeString2mimeTypeMap.get(mimeTypeString);
225 // Let's try a file suffix detection
226 // Get the file sufix from the URL.See method definition for more details
227 fileSufix = getFileSufix(url);
228 // Get the mime type based on the on file sufix
229 mimeTypeFromFileSuffix = getMimeType(fileSufix);
230
231 // Let's perform a magic numbers guess..
232 mimeTypeFromMagicNumbers = guessTypeUsingMagicNumbers(is,
233 charsetFromWebServer);
234 //All those types enter into a deciding system
235 return decideBetweenThreeMimeTypes( mimeTypeFromWebServer,
236 mimeTypeFromFileSuffix,
237 mimeTypeFromMagicNumbers);
238 }//getMimeType
239
240 /**
241 * This method decides what mimeType is in majority
242 * @param aMimeTypeFromWebServer a MimeType
243 * @param aMimeTypeFromFileSuffix a MimeType
244 * @param aMimeTypeFromMagicNumbers a MimeType
245 * @return the MimeType which occurs most. If all are null, then returns
246 * <b>null</b>
247 */
248 protected static MimeType decideBetweenThreeMimeTypes(
249 MimeType aMimeTypeFromWebServer,
250 MimeType aMimeTypeFromFileSuffix,
251 MimeType aMimeTypeFromMagicNumbers){
252
253 // First a voting system
254 if (areEqual(aMimeTypeFromWebServer,aMimeTypeFromFileSuffix))
255 return aMimeTypeFromFileSuffix;
256 if (areEqual(aMimeTypeFromFileSuffix,aMimeTypeFromMagicNumbers))
257 return aMimeTypeFromFileSuffix;
258 if (areEqual(aMimeTypeFromWebServer,aMimeTypeFromMagicNumbers))
259 return aMimeTypeFromWebServer;
260
261 // 1 is the highest priority
262 if (aMimeTypeFromFileSuffix != null)
263 aMimeTypeFromFileSuffix.addParameter("Priority","1");
264 // 2 is the second priority
265 if (aMimeTypeFromWebServer != null)
266 aMimeTypeFromWebServer.addParameter("Priority","2");
267 // 3 is the third priority
268 if (aMimeTypeFromMagicNumbers != null)
269 aMimeTypeFromMagicNumbers.addParameter("Priority","3");
270
271 return decideBetweenTwoMimeTypes(
272 decideBetweenTwoMimeTypes(aMimeTypeFromWebServer,
273 aMimeTypeFromFileSuffix),
274 aMimeTypeFromMagicNumbers);
275
276 }// decideBetweenThreeMimeTypes
277
278 /** Decide between two mimeTypes. The decistion is made on "Priority"
279 * parameter set into decideBetweenThreeMimeTypes method. If both mimeTypes
280 * doesn't have "Priority" paramether set, it will return one on them.
281 * @param aMimeType a MimeType object with "Prority" parameter set
282 * @param anotherMimeType a MimeType object with "Prority" parameter set
283 * @return One of the two mime types.
284 */
285 protected static MimeType decideBetweenTwoMimeTypes( MimeType aMimeType,
286 MimeType anotherMimeType){
287 if (aMimeType == null) return anotherMimeType;
288 if (anotherMimeType == null) return aMimeType;
289
290 int priority1 = 0;
291 int priority2 = 0;
292 // Both of them are not null
293 if (aMimeType.hasParameter("Priority"))
294 try{
295 priority1 =
296 new Integer(aMimeType.getParameterValue("Priority")).intValue();
297 }catch (NumberFormatException e){
298 return anotherMimeType;
299 }
300 if (anotherMimeType.hasParameter("Priority"))
301 try{
302 priority2 =
303 new Integer(anotherMimeType.getParameterValue("Priority")).intValue();
304 }catch (NumberFormatException e){
305 return aMimeType;
306 }
307
308 // The lower the number, the highest the priority
309 if (priority1 <= priority2)
310 return aMimeType;
311 else
312 return anotherMimeType;
313 }// decideBetweenTwoMimeTypes
314
315 /**
316 * Tests if two MimeType objects are equal.
317 * @return true only if boths MimeType objects are different than <b>null</b>
318 * and their Types and Subtypes are equals. The method is case sensitive.
319 */
320 protected static boolean areEqual( MimeType aMimeType,
321 MimeType anotherMimeType){
322 if (aMimeType == null || anotherMimeType == null)
323 return false;
324
325 if ( aMimeType.getType().equals(anotherMimeType.getType()) &&
326 aMimeType.getSubtype().equals(anotherMimeType.getSubtype())
327 ) return true;
328 else
329 return false;
330 }// are Equal
331
332 /**
333 * This method tries to guess the mime Type using some magic numbers.
334 * @param aInputStream a InputStream which has to be transformed into a
335 * InputStreamReader
336 * @param anEncoding the encoding. If is null or unknown then a
337 * InputStreamReader with default encodings will be created.
338 * @return the mime type associated with magic numbers
339 */
340 protected static MimeType guessTypeUsingMagicNumbers(InputStream aInputStream,
341 String anEncoding){
342
343 if (aInputStream == null) return null;
344 InputStreamReader reader = null;
345 if (anEncoding != null)
346 try{
347 reader = new InputStreamReader(aInputStream, anEncoding);
348 } catch (UnsupportedEncodingException e){
349 reader = null;
350 }
351 if (reader == null)
352 // Create a reader with the default encoding system
353 reader = new InputStreamReader(aInputStream);
354
355 // We have a input stream reader
356 return runMagicNumbers(reader);
357 }//guessTypeUsingMagicNumbers
358
359 /** Performs magic over Gate Document */
360 protected static MimeType runMagicNumbers(InputStreamReader aReader){
361 // No reader, nothing to detect
362 if( aReader == null) return null;
363
364 // Prepare to run the magic stuff
365 String strBuffer = null;
366 int bufferSize = 2048;
367 int charReads = 0;
368 char[] cbuf = new char[bufferSize];
369
370 try {
371 charReads = aReader.read(cbuf,0,bufferSize);
372 } catch (IOException e){
373 return null;
374 }// End try
375
376 if (charReads == -1)
377 // the document is empty
378 return null;
379
380 // Create a string form the buffer and perform some search on it.
381 strBuffer = new String(cbuf,0,charReads);
382
383 // If this fails then surrender
384 return getTypeFromContent(strBuffer);
385 }// runMagicNumbers
386
387 private static MimeType getTypeFromContent(String aContent){
388 MimeType detectedMimeType = null;
389 // Detect whether or not is a GateXmlDocument
390 if ( aContent.indexOf("<GateDocument") != -1 ||
391 aContent.indexOf(" GateDocument") != -1)
392 isGateXmlDocument = true;
393 else
394 isGateXmlDocument = false;
395
396 // Run the magic numbers test
397 Set magicSet = magic2mimeTypeMap.keySet();
398 Iterator iterator=magicSet.iterator();
399 String magic;
400 // change case to cover more variants
401 aContent = aContent.toLowerCase();
402 while (iterator.hasNext()){
403 magic = ((String) iterator.next()).toLowerCase();
404 if (aContent.indexOf(magic) != -1)
405 detectedMimeType = (MimeType) magic2mimeTypeMap.get(magic);
406 }// End while
407
408 // If this fails then surrender
409 return detectedMimeType;
410 }// getTypeFromContent
411
412 /**
413 * Return the fileSuffix or null if the url doesn't have a file suffix
414 * If the url is null then the file suffix will be null also
415 */
416 private static String getFileSufix(URL url){
417 String fileName = null;
418 String fileSuffix = null;
419
420 // GIGO test (garbage in garbage out)
421 if (url != null){
422 // get the file name from the URL
423 fileName = url.getFile();
424
425 // tokenize this file name with "." as separator...
426 // the last token will be the file suffix
427 StringTokenizer st = new StringTokenizer(fileName,".");
428
429 // fileSuffix is the last token
430 while (st.hasMoreTokens())
431 fileSuffix = st.nextToken();
432 // here fileSuffix is the last token
433 } // End if
434 return fileSuffix;
435 }//getFileSufix
436
437 /**
438 * Find a DocumentFormat implementation that deals with a particular
439 * MIME type, given that type.
440 * @param aGateDocument this document will receive as a feature
441 * the associated Mime Type. The name of the feature is
442 * MimeType and its value is in the format type/subtype
443 * @param mimeType the mime type that is given as input
444 */
445 static public DocumentFormat getDocumentFormat(gate.Document aGateDocument,
446 MimeType mimeType){
447 FeatureMap aFeatureMap = null;
448 if(mimeType == null) {
449 String content = aGateDocument.getContent().toString();
450 // reduce size for better performance
451 if(content.length() > 2048) content = content.substring(0, 2048);
452 mimeType = getTypeFromContent( content );
453 }
454
455 if (mimeType != null){
456 // If the Gate Document doesn't have a feature map atached then
457 // We will create and set one.
458 if(aGateDocument.getFeatures() == null){
459 aFeatureMap = Factory.newFeatureMap();
460 aGateDocument.setFeatures(aFeatureMap);
461 }// end if
462 aGateDocument.getFeatures().put("MimeType",mimeType.getType() + "/" +
463 mimeType.getSubtype());
464
465 return (DocumentFormat) mimeString2ClassHandlerMap.get(mimeType.getType()
466 + "/" + mimeType.getSubtype());
467 }// end If
468 return null;
469 } // getDocumentFormat(aGateDocument, MimeType)
470
471 /**
472 * Find a DocumentFormat implementation that deals with a particular
473 * MIME type, given the file suffix (e.g. ".txt") that the document came
474 * from.
475 * @param aGateDocument this document will receive as a feature
476 * the associated Mime Type. The name of the feature is
477 * MimeType and its value is in the format type/subtype
478 * @param fileSuffix the file suffix that is given as input
479 */
480 static public DocumentFormat getDocumentFormat(gate.Document aGateDocument,
481 String fileSuffix) {
482 return getDocumentFormat(aGateDocument, getMimeType(fileSuffix));
483 } // getDocumentFormat(String)
484
485 /**
486 * Find a DocumentFormat implementation that deals with a particular
487 * MIME type, given the URL of the Document. If it is an HTTP URL, we
488 * can ask the web server. If it has a recognised file extension, we
489 * can use that. Otherwise we need to use a map of magic numbers
490 * to MIME types to guess the type, and then look up the format using the
491 * type.
492 * @param aGateDocument this document will receive as a feature
493 * the associated Mime Type. The name of the feature is
494 * MimeType and its value is in the format type/subtype
495 * @param url the URL that is given as input
496 */
497 static public DocumentFormat getDocumentFormat(gate.Document aGateDocument,
498 URL url) {
499 return getDocumentFormat(aGateDocument, getMimeType(url));
500 } // getDocumentFormat(URL)
501
502 /** Get the feature set */
503 public FeatureMap getFeatures() { return features; }
504
505 /** Get the markup elements map */
506 public Map getMarkupElementsMap() { return markupElementsMap; }
507
508 /** Get the element 2 string map */
509 public Map getElement2StringMap() { return element2StringMap; }
510
511 /** Set the markup elements map */
512 public void setMarkupElementsMap(Map markupElementsMap) {
513 this.markupElementsMap = markupElementsMap;
514 }
515
516 /** Set the element 2 string map */
517 public void setElement2StringMap(Map anElement2StringMap) {
518 element2StringMap = anElement2StringMap;
519 }
520
521 /** Set the features map*/
522 public void setFeatures(FeatureMap features){this.features = features;}
523
524 /** Set the mime type*/
525
526 public void setMimeType(MimeType aMimeType){mimeType = aMimeType;}
527 /** Gets the mime Type*/
528 public MimeType getMimeType(){return mimeType;}
529
530 //StatusReporter Implementation
531
532
533 public synchronized void removeStatusListener(StatusListener l) {
534 if (statusListeners != null && statusListeners.contains(l)) {
535 Vector v = (Vector) statusListeners.clone();
536 v.removeElement(l);
537 statusListeners = v;
538 }
539 }
540 public synchronized void addStatusListener(StatusListener l) {
541 Vector v = statusListeners == null ? new Vector(2) : (Vector) statusListeners.clone();
542 if (!v.contains(l)) {
543 v.addElement(l);
544 statusListeners = v;
545 }
546 }
547 protected void fireStatusChanged(String e) {
548 if (statusListeners != null) {
549 Vector listeners = statusListeners;
550 int count = listeners.size();
551 for (int i = 0; i < count; i++) {
552 ((StatusListener) listeners.elementAt(i)).statusChanged(e);
553 }
554 }
555 }
556
557 } // class DocumentFormat
558