| Batch.java |
/*
 * Batch.java - transducer class
 *
 * Copyright (c) 1998-2004, The University of Sheffield.
 *
 * This file is part of GATE (see http://gate.ac.uk/), and is free
 * software, licenced under the GNU Library General Public License,
 * Version 2, June 1991 (in the distribution as file licence.html,
 * and also available at http://gate.ac.uk/gate/licence.html).
 *
 * Hamish Cunningham, 10/08/98
 *
 * $Id: Batch.java,v 1.36 2004/07/21 17:10:07 akshay Exp $
 *
 * DEVELOPER NOTES:
 *
 * This is one that got away; the relation between constructors,
 * initTransducer and parseTransducer is totally screwy and gets worse
 * every time I add something (e.g. support for resource loading).
 * We should probably junk this whole thing and start again....
 */
22
23 package gate.jape;
24
25 import java.net.URL;
26 import java.util.Iterator;
27 import java.util.Vector;
28
29 import gate.*;
30 import gate.creole.ExecutionException;
31 import gate.event.ProgressListener;
32 import gate.event.StatusListener;
33 import gate.util.Err;
34 import gate.util.Out;
35
36 /** Batch processing of JAPE transducers against documents or collections.
37 * Construction will parse or deserialise a transducer as required.
38 */
39 public class Batch implements JapeConstants {
40 /** Debug flag */
41 private static final boolean DEBUG = false;
42
43 /** The name of the transducer file, a .jape or .ser. */
44 // private String japeFileName;
45
46 /** The URL that points to a .jape file */
47 private URL japeURL;
48
49 /**The encoding used for reading the grammar file(s)*/
50 private String encoding;
51
52 /** The JAPE transducer. */
53 private Transducer transducer;
54
55 /** A stream connected to the JAPE file (often null). */
56 // private InputStream japeStream = null;
57
/**
 * Creates a non-initialised instance: no grammar URL, no encoding and no
 * transducer are set. Private; originally meant for use from main.
 */
private Batch() {
  // intentionally empty
}
60
/**
 * Creates a fully initialised instance by parsing a JAPE grammar.
 * Once the grammar is parsed, status and progress events fired by the
 * resulting transducer are re-fired to the listeners registered on this
 * batch.
 *
 * @param url the URL that points to the .jape grammar file
 * @param encoding the encoding used for reading the grammar file(s)
 * @throws JapeException if the grammar cannot be opened or parsed
 */
public Batch(URL url, String encoding) throws JapeException {
  this.japeURL = url;
  this.encoding = encoding;
  parseJape();
  forwardTransducerEvents();
} // full init constructor

/**
 * Creates a fully initialised instance by parsing a JAPE grammar, with a
 * status listener registered up front. The listener is added before parsing
 * starts, so it also receives the status messages re-fired while the
 * grammar is being parsed.
 *
 * @param url the URL that points to the .jape grammar file
 * @param encoding the encoding used for reading the grammar file(s)
 * @param sListener the status listener to register on this batch; a
 *        <CODE>null</CODE> value is ignored
 * @throws JapeException if the grammar cannot be opened or parsed
 */
public Batch(URL url, String encoding, StatusListener sListener)
    throws JapeException {
  // A null listener stored in the listener list would blow up with a
  // NullPointerException the first time a status event is fired.
  if (sListener != null) {
    this.addStatusListener(sListener);
  }
  this.japeURL = url;
  this.encoding = encoding;
  parseJape();
  forwardTransducerEvents();
} // full init constructor with status listener

/**
 * Hooks this batch up to its transducer so that every status and progress
 * event of the transducer is re-fired to this batch's own listeners.
 * Does nothing when no transducer is available.
 */
private void forwardTransducerEvents() {
  if (transducer == null) {
    return;
  }

  transducer.addStatusListener(new StatusListener() {
    public void statusChanged(String text) {
      fireStatusChanged(text);
    }
  });

  transducer.addProgressListener(new ProgressListener() {
    public void progressChanged(int value) {
      fireProgressChanged(value);
    }

    public void processFinished() {
      fireProcessFinished();
    }
  });
}
115
116 /**
117 * Notifies this PR that it should stop its execution as soon as possible.
118 */
119 public synchronized void interrupt(){
120 transducer.interrupt();
121 }
122 /** Create a fully initialised instance.
123 * <P><CODE>japeFileName</CODE>: the name of a .jape or .ser transducer
124 * file. This may be an absolute path, or may a .jar
125 * that lives somewhere on the classpath.
126 */
127 /*
128 public Batch(String japeFileName) throws JapeException {
129 this.japeFileName = japeFileName;
130 initTransducer();
131 } // full init constructor
132 */
133 /*
134 public Batch(String japeFileName, StatusListener sListener)
135 throws JapeException {
136 this.japeFileName = japeFileName;
137 this.addStatusListener(sListener);
138 initTransducer();
139 } // full init constructor
140 */
141
142 /** Create a fully initialised instance from an InputStream connected
143 * to the JAPE file.
144 */
145 /*
146 public Batch(InputStream japeStream) throws JapeException {
147 if(japeStream == null)
148 throw new JapeException(
149 "attempt to create a batch parser with null input stream"
150 );
151 this.japeFileName = "stream";
152 this.japeStream = japeStream;
153 initTransducer();
154 } // full init constructor
155 */
156 /** Create a fully initialised instance from a resource path and resource
157 * name.
158 */
159 /*
160 public Batch(String resPath, String resName) throws JapeException {
161 fromResource = true;
162 this.japeFileName = resName;
163 this.resPath = resPath;
164 initTransducer();
165 } // full init constructor
166 */
167
168 /** Get the transducer. */
169 public Transducer getTransducer() { return transducer; }
170
171 /** Instantiate transducer member as necessary. */
172 /*
173 private void initTransducer()
174 throws JapeException {
175 if(fromResource) {
176 parseJape(resPath, japeFileName);
177 } else if(japeFileName.endsWith(".ser") || japeFileName.endsWith(".SER"))
178 deserialiseJape(new File(japeFileName));
179 else if(japeFileName.endsWith(".jape") || japeFileName.endsWith(".JAPE"))
180 parseJape();
181 else if(japeFileName.endsWith(".jar") || japeFileName.endsWith(".JAR"))
182 deserialiseJape();
183 else if(japeFileName.equals("stream"))
184 parseJape(japeStream);
185 else
186 throw new JapeException(
187 "unknown file type (not .jape, .ser or .jar):" + japeFileName
188 );
189 if(transducer != null) transducer.addStatusListener(new StatusListener() {
190 public void statusChanged(String text){
191 fireStatusChangedEvent(text);
192 }
193 });
194 }
195 */
196 /** Parse a jape file from {@link #japeURL} and store the transducer. */
197 private void parseJape() throws JapeException {
198 try {
199 gate.jape.parser.ParseCpsl parser =
200 new gate.jape.parser.ParseCpsl(japeURL, encoding);
201
202 StatusListener listener = null;
203 listener = new StatusListener(){
204 public void statusChanged(String text){
205 fireStatusChanged(text);
206 }
207 };
208 parser.addStatusListener(listener);
209 transducer = parser.MultiPhaseTransducer();
210 parser.removeStatusListener(listener);
211 //the call to finish needs to be handled from here now as it
212 //was removed from the .jj file
213 transducer.addStatusListener(listener);
214 transducer.finish();
215 transducer.removeStatusListener(listener);
216
217 } catch (gate.jape.parser.ParseException e) {
218 throw new
219 JapeException("Batch: error parsing transducer: " + e.getMessage());
220 } catch (java.io.IOException e) {
221 throw new
222 JapeException("Batch: couldn't open JAPE file: " + e.getMessage());
223 }
224 } // parseJape
225
226 /** Parse a jape file from an InputStream and store the transducer. */
227 /*
228 private void parseJape(InputStream japeStream) throws JapeException {
229 try {
230 gate.jape.parser.ParseCpsl parser =
231 new gate.jape.parser.ParseCpsl(japeFileName, japeStream);
232 transducer = parser.MultiPhaseTransducer();
233 } catch (gate.jape.parser.ParseException e) {
234 throw new
235 JapeException("Batch: error parsing transducer: " + e.getMessage());
236 } catch (java.io.IOException e) {
237 throw new
238 JapeException("Batch: couldn't read JAPE stream: " + e.getMessage());
239 }
240 } // parseJape(InputStream)
241 */
242 /** Parse a jape file from a resource and store the transducer. */
243 /*
244 private void parseJape(String resPath, String resName) throws JapeException {
245 try {
246 gate.jape.parser.ParseCpsl parser =
247 new gate.jape.parser.ParseCpsl(resPath, resName);
248 transducer = parser.MultiPhaseTransducer();
249 } catch (gate.jape.parser.ParseException e) {
250 throw new
251 JapeException("Batch: error parsing transducer: " + e.getMessage());
252 } catch (java.io.IOException e) {
253 throw new
254 JapeException("Batch: couldn't read JAPE resource: " + e.getMessage());
255 }
256 } // parseJape(resPath, resName)
257 */
258
259 /** Deserialise from a .ser file. */
260 /*
261 private void deserialiseJape(File japeFile) throws JapeException {
262
263 // set up a file input stream
264 FileInputStream japeInputStream = null;
265 try {
266 japeInputStream = new FileInputStream(japeFile.getPath());
267 } catch (IOException e) {
268 throw new JapeException(
269 "Can't read from " + japeFile.getPath() + ": " + e.getMessage()
270 );
271 }
272
273 // call the input stream deserialise method
274 deserialiseJape(japeInputStream);
275 } // deserialiseJape(File)
276 */
277 /** Deserialise from a JAR file. */
278 /*
279 private void deserialiseJape() throws JapeException {
280 // find the jar from CLASSPATH
281 //SearchPath classPath =
282 // new SearchPath(System.getProperty("java.class.path"), ".");
283 File jarFile = new File(japeFileName); //classPath.getFile(japeFileName);
284 if(jarFile == null)
285 throw new JapeException("Batch: can't find " + japeFileName);
286
287 // get a byte array input stream with the .ser in out of the jar file
288 JarFile jar = null;
289 BufferedInputStream japeInputStream = null;
290 try {
291 jar = new JarFile(jarFile.getPath());
292 japeInputStream = new BufferedInputStream(
293 jar.getInputStream(jar.getJarEntry(jarNameToSerName(japeFileName)))
294 );
295 } catch(IOException e) {
296 throw new JapeException("couldn't read jar file " + japeFileName);
297 }
298
299
300 // call the input stream deserialise method
301 deserialiseJape(japeInputStream);
302 } // deserialiseJape()
303 */
304 /** Create a transducer from an object input stream (deserialisation). */
305 /*
306 private void deserialiseJape(InputStream japeInputStream)
307 throws JapeException {
308 try {
309 ObjectInputStream ois = new ObjectInputStream(japeInputStream);
310 transducer = (Transducer) ois.readObject();
311 ois.close();
312 japeInputStream.close(); // redundant?
313 } catch (IOException e) {
314 throw new JapeException(
315 "Batch: can't deserialise InputStream (1): " + e.getMessage()
316 );
317 } catch (ClassNotFoundException e) {
318 throw new JapeException(
319 "Batch: can't deserialise InputStream (2): " + e.getMessage()
320 );
321 }
322 } // deserialise(OIS)
323 */
324 /** Create a .ser name from a .jar name. */
325 /*
326 private String jarNameToSerName(String jarName) {
327 return jarName.substring(0, jarName.length() - 4) + ".ser";
328 } // jarNameToSerName
329 */
330
331 /** Process the given collection. */
332 public void transduce(Corpus coll) throws JapeException, ExecutionException {
333 // for each doc run the transducer
334 Iterator iter = coll.iterator();
335 while(iter.hasNext()) {
336 Document doc = (Document) iter.next();
337 // transducer.transduce(doc);
338 transduce(doc, doc.getAnnotations(), doc.getAnnotations());
339 }
340 } // transduce(coll)
341
342 /** Process a single document. */
343 public void transduce(Document doc) throws JapeException, ExecutionException {
344 transducer.transduce(doc, doc.getAnnotations(), doc.getAnnotations());
345 } // transduce(doc)
346
347 /** Process a single document. */
348 public void transduce(Document doc, AnnotationSet inputAS,
349 AnnotationSet outputAS) throws JapeException,
350 ExecutionException {
351 //no need to transduce empty document
352 if (inputAS == null || inputAS.isEmpty())
353 return;
354 transducer.transduce(doc, inputAS, outputAS);
355
356 } // transduce(doc)
357
358 /** Process a single text. */
359 /*
360 public Document transduce(String text) throws JapeException {
361 Document doc = null;
362 try {
363 doc = Factory.newDocument(text);
364 } catch (ResourceInstantiationException e) {
365 throw new JapeException(e.toString());
366 }
367 transducer.transduce(doc, doc.getAnnotations());
368 return doc;
369 } // transduce(text)
370 */
371 /** Process a single file. */
372 /*
373 public Document transduce(File textFile) throws JapeException {
374 String text = null;
375 try {
376 text = gate.util.Files.getString(textFile);
377 } catch(IOException e) { throw new JapeException(e.toString()); }
378 return transduce(text);
379 } // transduce(textFile)
380 */
381 /** Process a set of files. */
382 /*
383 public Corpus transduce(String[] textFileNames) throws JapeException {
384 Corpus coll = null;
385 try {
386 coll = Factory.newCorpus("JAPE batch corpus");
387 Document doc = null;
388 for(int i = 0; i < textFileNames.length; i++) {
389 doc = Factory.newDocument(textFileNames[i]);
390 doc.setFeatures(Factory.newFeatureMap());
391 /*coll.createDocument(
392 textFileNames[i],
393 null, // the text - should get read from disk
394 new AnnotationSetImpl(doc),
395 Factory.newFeatureMap(),
396 Document.COPIED
397 );*/
398 /*
399 transducer.transduce(doc, doc.getAnnotations());
400 }
401 } catch(ResourceInstantiationException e) {
402 throw new JapeException(e.toString());
403 }
404 return coll;
405 } // transduce(textFileNames)
406 */
407 /** This is where it all happens. This is <I>the</I> place to be. Take
408 * your summer holidays here. Visit on Saturday nights. Buy a season
409 * ticket from <CODE>www.programmer.gone.insane.com</CODE>.
410 * <P>
411 * Takes a .jape/.jar/.ser
412 * file name (-j option) which is assumed to hold a pattern
413 * grammar for a multi-phase transducer, and a collection
414 * name (-c option) or a list of files. As needed it then parses and
415 * compiles the transducer, then transduces all the documents in the
416 * collection and saves it to disk.
417 */
418 public static void main(String args[]) {
419 /*
420 // oh great bug in the sky give us this day our daily fuckup
421 //gate.util.Debug.setDebug(true);
422 //gate.util.Debug.setDebug(Rule.class, true);
423 //gate.util.Debug.setDebug(LeftHandSide.class, true);
424 //gate.util.Debug.setDebug(BasicPatternElement.class, true);
425 //gate.util.Debug.setDebug(AnnotationSet.class, true);
426
427 // The persistent name of the collection.
428 String persCollName = null;;
429
430 // The collection to process.
431 Corpus collection = null;
432
433 // create one of us
434 Batch batch = new Batch();
435
436 // process the options
437 int i = 0;
438 for( ; i<args.length; i++) {
439 if(args[i].equals("-c") && ++i < args.length) // -c = coll name
440 persCollName = args[i];
441 else if(args[i].equals("-j") && ++i < args.length)// -j = transducer name
442 batch.japeFileName = args[i];
443 else if(args[i].equals("-v")) // -v = verbose
444 batch.setVerbose(true);
445 else if(args[i].startsWith("-"))
446 batch.usage("unknown option " + args[i]);
447 else
448 break;
449 } // for each arg
450
451 // file name list
452 String[] fileNames = null;
453 if(args.length > i) {
454 fileNames = new String[args.length - i];
455 for(int j = 0; i<args.length; j++, i++)
456 fileNames[j] = args[i];
457 }
458
459 // did they give valid options?
460 if(batch.japeFileName == null)
461 batch.usage("you must supply a transducer name");
462 if(fileNames != null && persCollName != null)
463 batch.usage("can't read a collection AND process a file list");
464
465 // parse the transducer or bomb
466 batch.message("parsing the transducer");
467 try { batch.initTransducer(); }
468 catch(JapeException e) {
469 batch.usage("oops: " + e.toString());
470 }
471
472 Corpus coll = null;
473 if(persCollName != null) { // we got a collection name, not a list of files
474
475 // open the collection or bomb
476 coll = null;
477 batch.message("opening the collection");
478 try {
479 coll = Factory.newCorpus(persCollName);
480 } catch(ResourceInstantiationException e) {
481 batch.usage("oops (x): " + e);
482 }
483
484 // transduce
485 batch.message("calling transducer");
486 try { batch.transduce(coll); }
487 catch(JapeException e) {
488 batch.usage("oops (1): " + e.toString());
489 }
490
491 // save to disk
492 batch.message("saving the collection");
493 batch.usage("couldn't sync coll ");
494
495 // we got a list of files, not a collection
496 } else {
497 batch.message("transducing transient collection");
498 try {
499 coll = batch.transduce(fileNames);
500 } catch(JapeException e) {
501 batch.usage("oops (2): " + e.toString());
502 }
503 }
504
505 // we won! we won! we can smash up all the computers now!
506 batch.message("done");
507 //System.exit(0);
508 */
509 } // main
510
511
512 /** Whether to print progress messages or not. */
513 private boolean verbose = false;
514
515 /** Set verbosity. */
516 public void setVerbose(boolean turtleSoup) { verbose = turtleSoup; }
517
518 /** You got something wrong, dumbo. */
519 public void usage(String errorMessage) {
520 String usageMessage =
521 "usage: java gate.jape.Batch.main [-v] " +
522 "-j japefile(.ser|.jape|.jar) " +
523 "(-c CollectionName | filenames)";
524
525 Err.println(errorMessage);
526 Err.println(usageMessage);
527 // System.exit(1);
528
529 } // usage
530
531 /** Hello? Anybody there?? */
532 public void message(String mess) {
533 if(verbose) Out.println("Batch: " + mess);
534 } // message
535
536 public void setFeatures(gate.FeatureMap newFeatures) {
537 features = newFeatures;
538 }
539 public gate.FeatureMap getFeatures() {
540 return features;
541 }
542 public synchronized void removeProgressListener(ProgressListener l) {
543 if (progressListeners != null && progressListeners.contains(l)) {
544 Vector v = (Vector) progressListeners.clone();
545 v.removeElement(l);
546 progressListeners = v;
547 }
548 }
549 public synchronized void addProgressListener(ProgressListener l) {
550 Vector v = progressListeners == null ? new Vector(2) : (Vector) progressListeners.clone();
551 if (!v.contains(l)) {
552 v.addElement(l);
553 progressListeners = v;
554 }
555 }
556
557 //ProcessProgressReporter implementation ends here
558
559 /** Are we initialising from a resource? */
560 // private boolean fromResource = false;
561
562 /** Path to the resources tree */
563 // private String resPath = null;
564
565
566 private gate.FeatureMap features;
567 private transient Vector progressListeners;
568 private transient Vector statusListeners;
569 private boolean enableDebugging;
570
571 protected void fireProgressChanged(int e) {
572 if (progressListeners != null) {
573 Vector listeners = progressListeners;
574 int count = listeners.size();
575 for (int i = 0; i < count; i++) {
576 ((ProgressListener) listeners.elementAt(i)).progressChanged(e);
577 }
578 }
579 }
580 protected void fireProcessFinished() {
581 if (progressListeners != null) {
582 Vector listeners = progressListeners;
583 int count = listeners.size();
584 for (int i = 0; i < count; i++) {
585 ((ProgressListener) listeners.elementAt(i)).processFinished();
586 }
587 }
588 }
589 public synchronized void removeStatusListener(StatusListener l) {
590 if (statusListeners != null && statusListeners.contains(l)) {
591 Vector v = (Vector) statusListeners.clone();
592 v.removeElement(l);
593 statusListeners = v;
594 }
595 }
596 public synchronized void addStatusListener(StatusListener l) {
597 Vector v = statusListeners == null ? new Vector(2) : (Vector) statusListeners.clone();
598 if (!v.contains(l)) {
599 v.addElement(l);
600 statusListeners = v;
601 }
602 }
603 protected void fireStatusChanged(String e) {
604 if (statusListeners != null) {
605 Vector listeners = statusListeners;
606 int count = listeners.size();
607 for (int i = 0; i < count; i++) {
608 ((StatusListener) listeners.elementAt(i)).statusChanged(e);
609 }
610 }
611 }
612
613 /**
614 * Sets the ontology to be used by the transducers
615 * @param ontology
616 */
617 public void setOntology(gate.creole.ontology.Ontology ontology) {
618 transducer.setOntology(ontology);
619 }
620 public boolean isEnableDebugging() {
621 return enableDebugging;
622 }
623 public void setEnableDebugging(boolean enableDebugging) {
624 this.enableDebugging = enableDebugging;
625 //propagate
626 if(transducer != null) transducer.setEnableDebugging(enableDebugging);
627 }
628
629
630 /*
631 private void writeObject(ObjectOutputStream oos) throws IOException {
632 Out.prln("writing batch");
633 oos.defaultWriteObject();
634 Out.prln("finished writing batch");
635 } // writeObject
636 */
637
638 } // class Batch
639
640