| Morph.java |
1 package gate.creole.morph;
2
3
4 /*
5 * Morph.java
6 *
7 * Copyright (c) 1998-2004, The University of Sheffield.
8 *
9 * This file is part of GATE (see http://gate.ac.uk/), and is free
10 * software, licenced under the GNU Library General Public License,
11 * Version 2, June 1991.
12 *
13 * A copy of this licence is included in the distribution in the file
14 * licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
15 *
16 * Niraj Aswani, 13/10/2003
17 *
18 * $Id: Morph.java,v 1.11 2004/07/30 16:57:54 niraj Exp $
19 */
20
21
22 import java.net.URL;
23 import java.util.Iterator;
24
25 import gate.*;
26 import gate.creole.*;
27 import gate.util.GateRuntimeException;
28
29 /**
30 * Description: This class is a wrapper for {@link gate.creole.morph.Interpret},
31 * the Morphological Analyzer.
32 */
33 public class Morph
34 extends AbstractLanguageAnalyser
35 implements ProcessingResource {
36
37
38 /** Document to be processed by the morpher, must be provided at Runtime. */
39 private gate.Document document;
40
41 /** File which cotains rules to be processed */
42 private URL rulesFile;
43
44 /** Instance of BaseWord class - English Morpher */
45 private Interpret interpret;
46
47 /** Feature Name that should be displayed for the root word */
48 private String rootFeatureName;
49
50 /** Feature Name that should be displayed for the affix */
51 private String affixFeatureName;
52
53 /** The name of the annotation set used for input */
54 private String annotationSetName;
55
56 /** Boolean value that tells if parser should behave in caseSensitive mode */
57 private Boolean caseSensitive;
58
59 private Boolean considerPOSTag;
60
61 /** Default Constructor */
62 public Morph() {
63 }
64
65 /**
66 * This method creates the instance of the BaseWord - English Morpher and
67 * returns the instance of current class with different attributes and
68 * the instance of BaseWord class wrapped into it.
69 * @return Resource
70 * @throws ResourceInstantiationException
71 */
72 public Resource init() throws ResourceInstantiationException {
73 interpret = new Interpret();
74 if (rulesFile == null) {
75 // no rule file is there, simply run the interpret to interpret it and
76 throw new ResourceInstantiationException("\n\n No Rule File Provided");
77 }
78
79 // compile the rules
80 interpret.init(rulesFile);
81
82 return this;
83 }
84
85 /**
86 * Method is executed after the init() method has finished its execution.
87 * <BR>Method does the following operations:
88 * <OL type="1">
89 * <LI> creates the annotationSet
90 * <LI> fetches word tokens from the document, one at a time
91 * <LI> runs the morpher on each individual word token
92 * <LI> finds the root and the affix for that word
93 * <LI> adds them as features to the current token
94 * @throws ExecutionException
95 */
96 public void execute() throws ExecutionException {
97 // lets start the progress and initialize the progress counter
98 fireProgressChanged(0);
99
100 // If no document provided to process throw an exception
101 if (document == null) {
102 fireProcessFinished();
103 throw new GateRuntimeException("No document to process!");
104 }
105
106 // get the annotationSet name provided by the user, or otherwise use the
107 // default method
108 AnnotationSet inputAs = (annotationSetName == null ||
109 annotationSetName.length() == 0) ?
110 document.getAnnotations() :
111 document.getAnnotations(annotationSetName);
112
113 // Morpher requires English tokenizer to be run before running the Morpher
114 // Fetch tokens from the document
115 AnnotationSet tokens = inputAs.get(TOKEN_ANNOTATION_TYPE);
116 if (tokens == null || tokens.isEmpty()) {
117 fireProcessFinished();
118 throw new ExecutionException("Either "+document.getName()+" does not have any contents or \n run the POS Tagger first and then Morpher");
119 //javax.swing.JOptionPane.showMessageDialog(null, "Either "+document.getName()+" does not have any contents or \n run the POS Tagger first and then Morpher"); ;
120 //return;
121 }
122
123 // create iterator to get access to each and every individual token
124 Iterator tokensIter = tokens.iterator();
125
126 // variables used to keep track on progress
127 int tokenSize = tokens.size();
128 int tokensProcessed = 0;
129 int lastReport = 0;
130
131 //lets process each token one at a time
132 while (tokensIter != null && tokensIter.hasNext()) {
133 Annotation currentToken = (Annotation) tokensIter.next();
134 String tokenValue = (String) (currentToken.getFeatures().
135 get(TOKEN_STRING_FEATURE_NAME));
136 if(considerPOSTag != null && considerPOSTag.booleanValue() && !currentToken.getFeatures().containsKey(TOKEN_CATEGORY_FEATURE_NAME)) {
137 fireProcessFinished();
138 throw new ExecutionException("please run the POS Tagger first and then Morpher");
139 //javax.swing.JOptionPane.showMessageDialog(null, "please run the POS Tagger first and then Morpher"); ;
140 //return;
141 }
142
143 String posCategory = (String) (currentToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME));
144 if(posCategory == null) {
145 posCategory = "*";
146 }
147
148 if(considerPOSTag == null || !considerPOSTag.booleanValue()) {
149 posCategory = "*";
150 }
151
152 // run the Morpher
153 if(!caseSensitive.booleanValue()) {
154 tokenValue = tokenValue.toLowerCase();
155 }
156
157 String baseWord = interpret.runMorpher(tokenValue, posCategory);
158 String affixWord = interpret.getAffix();
159
160 // no need to add affix feature if it is null
161 if (affixWord != null) {
162 currentToken.getFeatures().put(affixFeatureName, affixWord);
163 }
164 // add the root word as a feature
165 currentToken.getFeatures().put(rootFeatureName, baseWord);
166
167 // measure the progress and update every after 100 tokens
168 tokensProcessed++;
169 if(tokensProcessed - lastReport > 100){
170 lastReport = tokensProcessed;
171 fireProgressChanged(tokensProcessed * 100 /tokenSize);
172 }
173 }
174 // process finished, acknowledge user about this.
175 fireProcessFinished();
176 }
177
178 // getter and setter method
179 /**
180 * Sets the document to be processed
181 * @param document - document to be processed
182 */
183 public void setDocument(gate.Document document) {
184 this.document = document;
185 }
186
187
188 /**
189 * This method should only be called after init()
190 * @param word
191 * @return the rootWord
192 */
193 public String findBaseWord(String word, String cat) {
194 return interpret.runMorpher(word, cat);
195 }
196
197 /**
198 * This method should only be called after init()
199 * @param word
200 * @return the afix of the rootWord
201 */
202 public String findAffix(String word, String cat) {
203 interpret.runMorpher(word, cat);
204 return interpret.getAffix();
205 }
206
207
208 /**
209 * Returns the document under process
210 */
211 public gate.Document getDocument() {
212 return this.document;
213 }
214
215 /**
216 * Sets the rule file to be processed
217 * @param rulesFileURL - rule File name to be processed
218 */
219 public void setRulesFile(URL rulesFile) {
220 this.rulesFile = rulesFile;
221 }
222
223 /**
224 * Returns the document under process
225 */
226 public URL getRulesFile() {
227 return this.rulesFile;
228 }
229
230 /**
231 * Returns the feature name that has been currently set to display the root
232 * word
233 */
234 public String getRootFeatureName() {
235 return rootFeatureName;
236 }
237
238 /**
239 * Sets the feature name that should be displayed for the root word
240 * @param rootFeatureName
241 */
242 public void setRootFeatureName(String rootFeatureName) {
243 this.rootFeatureName = rootFeatureName;
244 }
245
246 /**
247 * Returns the feature name that has been currently set to display the affix
248 * word
249 */
250 public String getAffixFeatureName() {
251 return affixFeatureName;
252 }
253
254 /**
255 * Sets the feature name that should be displayed for the affix
256 * @param affixFeatureName
257 */
258 public void setAffixFeatureName(String affixFeatureName) {
259 this.affixFeatureName = affixFeatureName;
260 }
261
262 /**
263 * Returns the name of the AnnotationSet that has been provided to create
264 * the AnnotationSet
265 */
266 public String getAnnotationSetName() {
267 return annotationSetName;
268 }
269
270 /**
271 * Sets the AnnonationSet name, that is used to create the AnnotationSet
272 * @param annotationSetName
273 */
274 public void setAnnotationSetName(String annotationSetName) {
275 this.annotationSetName = annotationSetName;
276 }
277
278 /**
279 * A method which returns if the parser is in caseSenstive mode
280 * @return a {@link Boolean} value.
281 */
282 public Boolean getCaseSensitive() {
283 return this.caseSensitive;
284 }
285
286 /**
287 * Sets the caseSensitive value, that is used to tell parser if it should
288 * convert document to lowercase before parsing
289 */
290 public void setCaseSensitive(java.lang.Boolean value) {
291 this.caseSensitive = value;
292 }
293
294 public Boolean getConsiderPOSTag() {
295 return this.considerPOSTag;
296 }
297
298 public void setConsiderPOSTag(Boolean value) {
299 this.considerPOSTag = value;
300 }
301 }
302