| CookBook.java |
1 /*
2 * CookBook.java
3 *
4 * Copyright (c) 1998-2004, The University of Sheffield.
5 *
6 * This file is part of GATE (see http://gate.ac.uk/), and is free
7 * software, licenced under the GNU Library General Public License,
8 * Version 2, June 1991 (in the distribution as file licence.html,
9 * and also available at http://gate.ac.uk/gate/licence.html).
10 *
11 * Hamish Cunningham, 16/Feb/2000
12 *
13 * $Id: CookBook.java,v 1.34 2004/07/21 17:10:02 akshay Exp $
14 */
15
16 package gate;
17
18 import java.io.*;
19 import java.util.*;
20
21 import junit.framework.*;
22
23 import gate.creole.*;
24 import gate.creole.gazetteer.DefaultGazetteer;
25 import gate.creole.orthomatcher.OrthoMatcher;
26 import gate.creole.splitter.SentenceSplitter;
27 import gate.creole.tokeniser.DefaultTokeniser;
28 import gate.util.*;
29
30
31 /**
32 * <P><B>NOTE: this class has been REPLACED by the GateExamples package;
33 * see
34 * <A HREF=http://gate.ac.uk/GateExamples/doc/>http://gate.ac.uk/GateExamples/doc/</A>.</B>
35 *
36 * <P>
37 * This class provides examples of using the GATE APIs.
38 * Read this documentation along with a copy of the
39 * <A HREF=http://gate.ac.uk/gate/doc/java2html/gate/CookBook.java.html>source
40 * code</A>.
41 *
42 * <P>
43 * The CookBook is set up as
44 * part of the GATE test suite (using the
45 * <A HREF="http://www.junit.org/">JUnit testing framework</A>), so there's
46 * an easy way to run the examples (viz.,
47 * <A HREF=../gate/TestGate.html>gate.TestGate</A>'s <TT>main</TT> method,
48 * which will invoke the
49 * JUnit test runner). Also, we can use JUnit's assert methods: e.g.
50 * <TT>assertTrue(corpus.isEmpty());</TT>
51 * tests that a corpus object is empty, and creates a test failure report if
52 * this is not the case. (To add a new test class to the suite, see the
53 * <A HREF=../gate/util/TestTemplate.html>gate.util.TestTemplate</A> class.)
54 *
55 * <P>
56 * Programming to the GATE Java API involves manipulating the classes and
57 * interfaces in the <A HREF=package-summary.html>gate package</A>
58 * (and to a lesser extent other packages). These are
59 * often interfaces; classes there are often to do with getting
60 * access to objects that implement the interfaces (without exposing those
61 * implementations). In other words, there's a lot of interface-based design
62 * around.
63 *
64 * <P>
65 * For more details and for a conceptual view, see
66 * <A HREF=http://gate.ac.uk/sale/tao/>Developing Language Processing
67 * Components with GATE</A> (for which this class provides some of the
68 * examples).
69 *
70 * <P>
71 * The rest of this documentation refers to methods in the code that
72 * provide examples of using the GATE API.
73 *
74 * <P>
75 * The <A HREF=#testResourceCreation()>testResourceCreation</A> method gives
76 * an example of creating a resource via
77 * <A HREF=../gate/Factory.html>gate.Factory</A>.
78 *
79 * <P>
80 * The <A HREF=Corpus.html>Corpus interface</A> represents collections of
81 * <A HREF=Document.html>Documents</A> (and takes the place of the old TIPSTER
82 * <TT>Collection</TT> class).
83 *
84 * <P>
85 * The <A HREF=#testCorpusConstruction()>testCorpusConstruction</A> method
86 * gives an example of how to create a new transient Corpus object.
87 *
88 * <P>
89 * The <A HREF=#testAddingDocuments()>testAddingDocuments</A> method gives
90 * examples of adding documents to corpora.
91 *
92 * <P>
93 * The <A HREF=#testAddingAnnotations()>testAddingAnnotations</A> method gives
94 * examples of adding annotations to documents.
95 *
96 *
97 * <P>
98 * The <A HREF=#testUsingFeatures()>testUsingFeatures</A> method gives
99 * examples of using features. <A HREF=FeatureMap.html>The FeatureMap
100 * interface</A> is a mechanism for associating arbitrary data with GATE
101 * entities. Corpora, documents and annotations all share this
102 * mechanism. Simple feature maps use Java's Map interface.
103 *
104 *
105 * <H3>Other sources of examples</H3>
106 *
107 * <P>
108 * See also the other test classes, although note that they also use methods
109 * that are not part of the public API. Test classes include:
110 * <A HREF=corpora/TestCreole.html>TestCreole</A>;
111 * <A HREF=corpora/TestCorpus.html>TestCorpus</A>;
112 * <A HREF=corpora/TestDocument.html>TestDocument</A>;
113 * <A HREF=corpora/TestAnnotation.html>TestAnnotation</A>; anything
114 * else starting "Test" - about 30 of them at the last count.
115 */
116 public class CookBook extends TestCase
117 {
118 /** Debug flag */
119 private static final boolean DEBUG = false;
120
121 /** A corpus */
122 Corpus corpus = null;
123
124 /** A document */
125 Document doc1 = null;
126
127 /** Another document */
128 Document doc2 = null;
129
130 /** Constructing a resource */
131 public void testResourceCreation() throws GateException {
132
133 // before creating a resource we need a feature map to store
134 // parameter values
135 FeatureMap params = Factory.newFeatureMap();
136
137 // to create a document we need a sourceUrlName parameter giving
138 // the location of the source for the document content
139 params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
140 Gate.getUrl("tests/doc0.html"));
141 params.put(Document.DOCUMENT_MARKUP_AWARE_PARAMETER_NAME,
142 new Boolean(true));
143 Resource res = Factory.createResource("gate.corpora.DocumentImpl", params);
144
145 // now we have a document
146 assertTrue(
147 "should be document but the class is: " + res.getClass().getName(),
148 res instanceof gate.Document
149 );
150 Document doc = (Document) res;
151 AnnotationSet markupAnnotations = doc.getAnnotations(
152 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
153 //this is useless as doc.getAnnotations() will never return null!
154 assertNotNull("no markup annotations on doc " + doc, markupAnnotations);
155 int numMarkupAnnotations = markupAnnotations.size();
156 if(DEBUG)
157 Out.prln("annotations on doc after unpack= " + numMarkupAnnotations);
158 assertTrue(
159 "wrong number annots on doc: " + doc + numMarkupAnnotations,
160 numMarkupAnnotations == 27
161 );
162
163 } // testResourceCreation
164
165 /** Constructing a corpus */
166 public void testCorpusConstruction() throws GateException {
167
168 // corpus constructors require a name
169 corpus = Factory.newCorpus("My example corpus");
170
171 // the corpus interface inherits all the sorted set methods
172 assertTrue(corpus.isEmpty());
173
174 } // testCorpusConstruction
175
176 /** Adding documents to a corpus */
177 public void testAddingDocuments() throws GateException {
178
179 corpus = Factory.newCorpus("My example corpus");
180
181 // add a document or two....
182 corpus.add(doc1);
183 corpus.add(doc2);
184
185 // iterate the corpus members and do some random tests
186 Iterator iter = corpus.iterator();
187 while(iter.hasNext()) {
188 Document doc = (Document) iter.next();
189 assertTrue(
190 "document url not as expected",
191 doc.getSourceUrl().toExternalForm().endsWith("doc0.html") ||
192 doc.getSourceUrl().toExternalForm().endsWith("test1.htm")
193 );
194 } // while
195
196 } // testAddingDocuments
197
198 /** Adding annotations to documents */
199 public void testAddingAnnotations() {
200 AnnotationSet as = doc1.getAnnotations();
201 FeatureMap fm = doc1.getFeatures();
202 Integer id;
203
204 // during creation of annotations offsets are checked and an invalid
205 // offset exception thrown if they are invalid
206 try {
207 id = as.add(new Long(10), new Long(20), "T1", fm);
208 } catch (InvalidOffsetException e) {
209 fail(e.toString());
210 }
211 } // testAddingAnnotations
212
213 /** Using the FeatureMap interface */
214 public void testUsingFeatures() {
215 AnnotationSet as = doc1.getAnnotations();
216 Integer id; // the id of new annotations
217
218 // putting features on documents
219 FeatureMap fm = Factory.newFeatureMap();
220 doc1.setFeatures(fm);
221 assertTrue(fm.size() == 0);
222 fm.put("author", "segovia");
223 assertTrue(fm.get("author").equals("segovia"));
224 fm.put("author", "brendl"); // map puts overwrite existing values
225 assertTrue(fm.get("author").equals("brendl"));
226 assertTrue(fm.size() == 1);
227
228 } // testUsingFeatures
229
230 /** String to print when wrong command-line args */
231 private static String usage =
232 "usage: CookBook [-dir directory-name | file(s)]";
233
234 /**
235 * Main function: an example of embedding GATE-based
236 * batch processing. The method:
237 * <UL>
238 * <LI>
239 * initialises the GATE library, and creates PRs for
240 * tokenisation, sentence splitting and part of speech tagging
241 * <LI>
242 * takes a directory name as argument (-dir option) or just a list
243 * of files
244 * <LI>
245 * creates a directory called "out" and an index.html file there
246 * <LI>
247 * for each .html file in that directory:
248 * <BR> create a GATE document from the file
249 * <BR> run the PRs on the document
250 * <BR> dump some output for the file to "out/gate__[file name].txt",
251 * and add a line to the index
252 * </UL>
253 */
254 public static void main(String[] args) throws Exception {
255 // say "hi"
256 Out.prln("CookBook.main");
257 Out.prln("processing command line arguments");
258
259 // check we have a directory name or list of files
260 List inputFiles = null;
261 if(args.length < 1) throw new GateException(usage);
262
263 // set up a list of all the files to process
264 if(args[0].equals("-dir")) { // list all the files in the dir
265 if(args.length < 2) throw new GateException(usage);
266 File dir = new File(args[1]);
267 File[] filesArray = dir.listFiles();
268 if(filesArray == null)
269 throw new GateException(
270 dir.getPath() + " is not a directory; " + usage
271 );
272 inputFiles = Arrays.asList(filesArray);
273
274 } else { // all args should be file names
275 inputFiles = new ArrayList();
276 for(int i = 0; i < args.length; i++)
277 inputFiles.add(new File(args[i]));
278 }
279
280 // did we get some file names?
281 if(inputFiles.isEmpty()) {
282 throw new GateException("No files to process!");
283 }
284
285 // initialise GATE
286 Out.prln("initialising GATE");
287 Gate.init();
288
289 // create some processing resources
290 Out.prln("creating PRs");
291 //create a tokeniser
292 DefaultTokeniser tokeniser = (DefaultTokeniser)Factory.createResource(
293 "gate.creole.tokeniser.DefaultTokeniser");
294 //create a sentence splitter
295 SentenceSplitter splitter = (SentenceSplitter)Factory.createResource(
296 "gate.creole.splitter.SentenceSplitter");
297 //create a POS tagger
298 POSTagger tagger = (POSTagger)Factory.createResource(
299 "gate.creole.POSTagger");
300
301 //create a gazetteer
302 DefaultGazetteer gazetteer = (DefaultGazetteer)Factory.createResource(
303 "gate.creole.gazetteer.DefaultGazetteer");
304
305 //create a grammar
306 ANNIETransducer transducer = (ANNIETransducer)Factory.createResource(
307 "gate.creole.ANNIETransducer");
308
309 //create an orthomatcher
310 OrthoMatcher orthomatcher = (OrthoMatcher) Factory.createResource(
311 "gate.creole.orthomatcher.OrthoMatcher");
312
313 // make the "out" directory that will contain the results.
314 String outDirName =
315 ((File) inputFiles.get(0)).getParent() + Strings.getFileSep() + "out";
316 if(! new File(outDirName).mkdir()){
317 throw new GateException("Could not create the output directory");
318 }
319
320 // construct a name for the output index file; open; dump header
321 String nl = Strings.getNl(); // shorthand for platform's newline
322 String fsep =
323 Strings.getFileSep(); // shorthand for platform's file separator
324 String indexName =
325 ( (File) inputFiles.get(0) ).getParent() + fsep + "index.html";
326 FileWriter indexWriter = new FileWriter(new File(indexName));
327 indexWriter.write("<HTML><HEAD><TITLE>Documents list</TITLE></HEAD>");
328 indexWriter.write(nl + "<BODY>" + nl + "<UL>" + nl);
329
330 // main loop:
331 // for each document
332 // create a gate doc
333 // set as the document for the PRs
334 // run the PRs
335 // dump output from the doc to out/gate__.....txt
336 // delete the doc
337
338 // loop on files list
339 Iterator filesIter = inputFiles.iterator();
340 Out.prln("looping on input files list");
341 while(filesIter.hasNext()) {
342 File inFile = (File) filesIter.next(); // the current file
343 Out.prln("processing file " + inFile.getPath());
344 FeatureMap params = Factory.newFeatureMap(); // params list for new doc
345
346 // set the source URL parameter to a "file:..." URL string
347 params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
348 inFile.toURL().toExternalForm());
349
350 // use the platform's default encoding rather than GATE's
351 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
352
353 // create the document
354 Document doc = (Document) Factory.createResource(
355 "gate.corpora.DocumentImpl", params
356 );
357
358 // set the document param on the PRs
359 tokeniser.setDocument(doc);
360 splitter.setDocument(doc);
361 tagger.setDocument(doc);
362 gazetteer.setDocument(doc);
363 transducer.setDocument(doc);
364 orthomatcher.setDocument(doc);
365
366 // run each PR
367 tokeniser.execute();
368 splitter.execute();
369 tagger.execute();
370 gazetteer.execute();
371 transducer.execute();
372 orthomatcher.execute();
373
374 // dump out results
375
376 // construct a name for the output file and open a stream
377 StringBuffer outFileName = new StringBuffer(inFile.getParent());
378 outFileName.append(fsep);
379 outFileName.append("out");
380 outFileName.append(fsep);
381 outFileName.append("gate__");
382 outFileName.append(inFile.getName());
383 outFileName.append(".txt");
384 File outFile = new File(outFileName.toString());
385 FileWriter outFileWriter = new FileWriter(outFile);
386 Out.prln("dumping " + outFile.getPath());
387
388 // iterate round the token annotations writing to the out file
389 // NOTE: to dump all to XML: outFileWriter.write(doc.toXml(tokens));
390 AnnotationSet tokens = doc.getAnnotations("nercAS").
391 get(ANNIEConstants.TOKEN_ANNOTATION_TYPE);
392 Iterator iter = tokens.iterator();
393 while(iter.hasNext()) {
394 Annotation token = (Annotation) iter.next();
395 FeatureMap tokFeats = token.getFeatures();
396 String tokStr = (String) tokFeats.
397 get(ANNIEConstants.TOKEN_STRING_FEATURE_NAME);
398 String tokPos = (String) tokFeats.
399 get(ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME);
400 outFileWriter.write(tokStr + "\t" + tokPos + nl);
401 }
402 outFileWriter.write(doc.getFeatures().get("entitySet").toString());
403
404 // close the out file stream; add an index line
405 outFileWriter.close();
406 indexWriter.write(
407 "<LI><A href=\"" + inFile.getName() + "\">" + inFile.getName() +
408 "</a>" + " -> " + "<a href=\"" + "out" + fsep + outFile.getName() +
409 "\">" + "out" + fsep + outFile.getName() + "</a></LI>\n"
410 );
411
412 // make the doc a candidate for garbage collection
413 Out.prln("deleting gate doc");
414
415 Factory.deleteResource(doc);
416 } // input files loop
417
418 // finish the index file
419 indexWriter.write(nl + "</UL>" + nl + "</BODY></HTML>" + nl);
420 indexWriter.close();
421
422 Out.prln("The End (roll credits)");
423 } // main
424
425 /** Fixture set up: initialise members before each test method */
426 public void setUp() throws GateException, IOException {
427 corpus = Factory.newCorpus("My example corpus");
428
429 doc1 = Factory.newDocument(Gate.getUrl("tests/doc0.html"));
430 doc2 = Factory.newDocument(Gate.getUrl("tests/html/test1.htm"));
431 } // setUp
432
/**
 * Construction: passes the given name straight on to the TestCase
 * constructor.
 *
 * @param name the name handed to the superclass
 */
public CookBook(String name) {
  super(name);
}
435
436 /** Test suite routine for the test runner */
437 public static Test suite() {
438 return new TestSuite(CookBook.class);
439 } // suite
440
441 } // class CookBook
442