| FlexibleGazetteer.java |
1 /*
2 * FlexibleGazetteer.java
3 *
4 * Copyright (c) 2004, The University of Sheffield.
5 *
6 * This file is part of GATE (see http://gate.ac.uk/), and is free
7 * software, licenced under the GNU Library General Public License,
8 * Version 2, June 1991.
9 *
10 * A copy of this licence is included in the distribution in the file
11 * licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
12 *
13 * Niraj Aswani 02/2002
14 *
15 */
16
17 package gate.creole.gazetteer;
18
19 import java.util.*;
20 import gate.util.*;
21 import gate.*;
22 import gate.creole.*;
23
24 /**
25 * <p>Title: Flexible Gazetteer </p>
26 * <p> The Flexible Gazetteer provides users with the flexibility to choose </p>
27 * <p> their own customized input and an external Gazetteer. For example, </p>
28 * <p> the user might want to replace words in the text with their base </p>
29 * <p> forms (which is an output of the Morphological Analyser) or to segment </p>
30 * <p> a Chinese text (using the Chinese Tokeniser) before running the </p>
31 * <p> Gazetteer on the Chinese text. </p>
32 *
33 * <p> The Flexible Gazetteer performs lookup over a document based on the </p>
34 * <p> values of an arbitrary feature of an arbitrary annotation type, by </p>
35 * <p> using an externally provided gazetteer. It is important to use an </p>
36 * <p> external gazetteer as this allows the use of any type of gazetteer </p>
37 * <p> (e.g. an Ontological gazetteer). </p>
38 * <p>Copyright: Copyright (c) 2003</p>
39 * <p>Company: </p>
40 * @author Niraj Aswani
41 * @version 1.0
42 */
43
44 public class FlexibleGazetteer
45 extends AbstractLanguageAnalyser
46 implements ProcessingResource {
47
48 /**
49 * Constructor
50 */
51 public FlexibleGazetteer() {
52 changedNodes = new ArrayList();
53 }
54
55 /** Does the actual loading and parsing of the lists. This method must be
56 * called before the gazetteer can be used
57 */
58 public Resource init() throws ResourceInstantiationException {
59
60 /*
61 if (listsURL == null) {
62 throw new ResourceInstantiationException(
63 "No URL provided for gazetteer creation!");
64 }
65
66 if (gazetteerClassName == null) {
67 throw new ResourceInstantiationException(
68 "No Gazetter Name provided");
69 }
70 */
71 return this;
72 }
73
74 /**
75 * This method runs the gazetteer. It assumes that all the needed parameters
76 * are set. If they are not, an exception will be fired.
77 */
78 public void execute() throws ExecutionException {
79 fireProgressChanged(0);
80 fireStatusChanged("Checking Document...");
81 if (document == null) {
82 throw new ExecutionException(
83 "No document to process!"
84 );
85 }
86
87 fireStatusChanged("Creating temporary Document...");
88 StringBuffer newdocString = new StringBuffer(document.getContent().toString());
89 Document tempDoc = null;
90 boolean chineseSplit = false;
91
92 if (inputFeatureNames == null || inputFeatureNames.size() == 0) {
93 inputFeatureNames = new ArrayList();
94 }
95
96 Iterator tokenIter = getTokenIterator(document, inputAnnotationSetName);
97 long totalDeductedSpaces = 0;
98 fireStatusChanged("Replacing contents with the feature value...");
99
100 outer:while (tokenIter != null && tokenIter.hasNext()) {
101 Annotation currentToken = (Annotation) tokenIter.next();
102
103 // check if it is a chinesesplit
104 // if it is, replace no space character with a single space
105 if (currentToken.getType().equals(ANNIEConstants.
106 SPACE_TOKEN_ANNOTATION_TYPE) &&
107 ( (String) (currentToken.getFeatures().get(ANNIEConstants.
108 TOKEN_KIND_FEATURE_NAME))).equals("ChineseSplit")) {
109
110 // for chinese split startnode and end node are same
111 long startOffset = currentToken.getStartNode().getOffset().
112 longValue();
113
114 // because we are adding a space in place of chinesesplit
115 // the endoffset will become newStartOffset + 1
116 long newStartOffset = startOffset - totalDeductedSpaces;
117 long newEndOffset = newStartOffset + 1;
118 NodePosition newNode = new NodePosition(startOffset, startOffset,
119 newStartOffset, newEndOffset,
120 totalDeductedSpaces);
121 chineseSplit = true;
122
123 // here is the addition of space in the document
124 totalDeductedSpaces--;
125 changedNodes.add(newNode);
126 newdocString = newdocString.insert( (int) newStartOffset, ' ');
127 continue outer;
128 }
129
130 // search in the provided inputFeaturesNames
131 // if the current token has a feature value that user
132 // wants to paste on and replace the original string of the token
133 inner:for (int i = 0; i < inputFeatureNames.size(); i++) {
134 String[] keyVal = ( (String) (inputFeatureNames.get(i))).split("[.]");
135
136 if (keyVal.length == 2) {
137 // val is the feature name
138 // key is the annotationName
139 if (currentToken.getType().equals(keyVal[0])) {
140 FeatureMap features = currentToken.getFeatures();
141 String newTokenValue = (String) (features.get(keyVal[1]));
142
143 // what if provided feature doesnot exist
144 if (newTokenValue == null) {
145 continue;
146
147 }
148 else {
149 // feature value found so we need to replace it
150 // find the start and end offsets for this token
151 long startOffset = currentToken.getStartNode().getOffset().
152 longValue();
153 long endOffset = currentToken.getEndNode().getOffset().
154 longValue();
155
156 // what is the actual string
157 String actualString = (String) (features.get(ANNIEConstants.
158 TOKEN_STRING_FEATURE_NAME));
159
160 // if the feature value and the actual string both are same
161 // we don't need to replace it
162 if (actualString.equals(newTokenValue)) {
163 // there is no need to change anything for this
164 break inner;
165 }
166
167 // let us find the difference between the lengths of the
168 // actual string and the newTokenValue
169 long lengthDifference = actualString.length() -
170 newTokenValue.length();
171
172 // so lets find the new startOffset and endOffset
173 long newStartOffset = startOffset - totalDeductedSpaces;
174 long newEndOffset = newStartOffset + newTokenValue.length();
175
176 // and make the entry for this
177 NodePosition newNode = new NodePosition(startOffset,
178 endOffset,
179 newStartOffset, newEndOffset, totalDeductedSpaces);
180 changedNodes.add(newNode);
181 // how many spaces have been added or removed till the current
182 // position of the token
183 totalDeductedSpaces += lengthDifference;
184
185 // and finally replace the actual string in the document
186 // with the new document
187 newdocString = newdocString.replace( (int) newStartOffset,
188 (int) newStartOffset +
189 actualString.length(),
190 newTokenValue);
191 break inner;
192 }
193 }
194 }
195 }
196 }
197
198 fireStatusChanged("New Document to be processed with Gazetteer...");
199 try {
200 FeatureMap params = Factory.newFeatureMap();
201 params.put("stringContent", newdocString.toString());
202 FeatureMap features = Factory.newFeatureMap();
203 Gate.setHiddenAttribute(features, true);
204 tempDoc = (Document) Factory.createResource("gate.corpora.DocumentImpl",
205 params, features);
206 }
207 catch (ResourceInstantiationException rie) {
208 throw new ExecutionException("Temporary document cannot be created");
209 }
210
211 // lets create the gazetteer based on the provided gazetteer name
212 FeatureMap params = Factory.newFeatureMap();
213 gazetteerInst.setDocument(tempDoc);
214 gazetteerInst.setAnnotationSetName(this.outputAnnotationSetName);
215
216 fireStatusChanged("Executing Gazetteer...");
217 gazetteerInst.execute();
218
219 // now the tempDoc has been looked up, we need to shift the tokens from
220 // this temp document to the original document
221 fireStatusChanged("Transfering new tags to the original one...");
222 Iterator tokensIter = getTokenIterator(tempDoc, outputAnnotationSetName);
223 AnnotationSet original = (outputAnnotationSetName == null) ?
224 document.getAnnotations() :
225 document.getAnnotations(outputAnnotationSetName);
226 long totalSpaceAdded = 0;
227 long difference = 0;
228
229 int foundNode = -1;
230 while (tokensIter != null && tokensIter.hasNext()) {
231 Annotation currentToken = (Annotation) (tokensIter.next());
232 long startOffset = currentToken.getStartNode().getOffset().longValue();
233 long endOffset = currentToken.getEndNode().getOffset().longValue();
234
235 // search through the changedNodes and if it is found we will have to
236 // find the new offsets
237 int i = foundNode + 1;
238 boolean found = false;
239 inner1:for (; i < changedNodes.size(); i++) {
240
241 NodePosition tempNode = (NodePosition) (changedNodes.get(i));
242
243 // all the nodes are in the sorted order based on there offsets
244 // so if we reach beyond the position of the current text
245 // under consideration, simply terminate the loop
246 if (tempNode.getNewStartNode() > startOffset) {
247 // so we lets point to the node whose startOffset
248 // is less than the startOffset of the current node
249 // this will allow us to find out how many
250 // extra spaces were added or removed before the current token
251 i = i - 1;
252 break inner1;
253 }
254
255 // how do we know if we want to change the offset
256 if (tempNode.getNewStartNode() == startOffset) {
257 // yes it is available
258
259 // lets find the end node
260 int k = i;
261 for (;
262 k >= 0 && k < changedNodes.size() &&
263 endOffset >
264 ( (NodePosition) changedNodes.get(k)).getNewStartNode(); k++)
265 ;
266 long spacesToAdd = 0;
267 if (k - 1 == i && k - 1 >= 0) {
268 spacesToAdd = (tempNode.getOldEndNode() - tempNode.getNewEndNode());
269 }
270 else if (k - 1 < 0) {
271 spacesToAdd = 0;
272 }
273 else {
274 spacesToAdd = ( (NodePosition) changedNodes.get(k - 1)).
275 getOldEndNode() -
276 ( (NodePosition) changedNodes.get(k - 1)).
277 getNewEndNode();
278 }
279
280 // and how many to be added before the endnode
281 // as any look up notation can be for the text with one or more tokens
282 FeatureMap newFeatureMap = currentToken.getFeatures();
283 try {
284
285 original.add(new Long(startOffset +
286 (tempNode.getOldStartNode() -
287 tempNode.getNewStartNode())),
288 new Long(endOffset + spacesToAdd),
289 //new Long(endOffset + (tempNode.getOldEndNode()
290 // - tempNode.getNewEndNode())),
291 ANNIEConstants.LOOKUP_ANNOTATION_TYPE,
292 newFeatureMap);
293
294 }
295 catch (InvalidOffsetException ioe) {
296 throw new ExecutionException("Offset Error");
297 }
298 found = true;
299 foundNode = i;
300 break inner1;
301 }
302 }
303
304 if (!found) {
305 long totalStartSpaces = 0;
306 long totalEndSpaces = 0;
307
308 // check if we have reached at the end of the changedNodes
309 // if yes we need to find the last node
310 i = (changedNodes.size() == i) ? i - 1 : i;
311
312 // lets find the end node
313 int k = i;
314 for (;
315 k > 0 && k < changedNodes.size() &&
316 endOffset > ( (NodePosition) changedNodes.get(k)).getNewStartNode();
317 k++)
318 ;
319 long spacesToAdd = 0;
320 if (k - 1 == i && k - 1 >= 0) {
321 spacesToAdd = ( ( (NodePosition) changedNodes.get(i)).getOldEndNode() -
322 ( (NodePosition) changedNodes.get(i)).getNewEndNode());
323 }
324 else if (k - 1 < 0) {
325 spacesToAdd = 0;
326 }
327 else {
328 spacesToAdd = ( (NodePosition) changedNodes.get(k - 1)).
329 getOldEndNode() -
330 ( (NodePosition) changedNodes.get(k - 1)).getNewEndNode();
331 }
332
333 if (i >= 0) {
334 //totalStartSpaces = ((NodePosition)
335 // changedNodes.get(i)).getOldStartNode()
336 // - ((NodePosition) changedNodes.get(i)).getNewStartNode();
337 totalStartSpaces = ( (NodePosition) changedNodes.get(i)).
338 getOldEndNode() -
339 ( (NodePosition) changedNodes.get(i)).
340 getNewEndNode();
341 //totalEndSpaces = ((NodePosition)
342 // changedNodes.get(i)).getOldEndNode() -
343 // ((NodePosition) changedNodes.get(i)).getNewEndNode();
344 totalEndSpaces = spacesToAdd;
345 foundNode = i;
346 }
347
348 // no it is not available
349 FeatureMap newFeatureMap = currentToken.getFeatures();
350 try {
351 original.add(new Long(startOffset + totalStartSpaces),
352 new Long(endOffset + totalEndSpaces),
353 ANNIEConstants.LOOKUP_ANNOTATION_TYPE,
354 newFeatureMap);
355 }
356 catch (InvalidOffsetException ioe) {
357 throw new ExecutionException("Offset Error");
358 }
359
360 }
361 }
362
363 // now remove the newDoc
364 Factory.deleteResource(tempDoc);
365 fireProcessFinished();
366 }
367
368 /**
369 * Sets the document to work on
370 * @param doc
371 */
372 public void setDocument(gate.Document doc) {
373 this.document = doc;
374 }
375
376 /**
377 * Returns the document set up by user to work on
378 * @return a {@link Document}
379 */
380 public gate.Document getDocument() {
381 return this.document;
382 }
383
384 /**
385 * sets the outputAnnotationSetName
386 * @param annName
387 */
388 public void setOutputAnnotationSetName(String annName) {
389 this.outputAnnotationSetName = annName;
390 }
391
392 /**
393 * Returns the outputAnnotationSetName
394 * @return a {@link String} value.
395 */
396 public String getOutputAnnotationSetName() {
397 return this.outputAnnotationSetName;
398 }
399
400 /**
401 * sets the inputAnnotationSetName
402 * @param annName
403 */
404 public void setInputAnnotationSetName(String annName) {
405 this.inputAnnotationSetName = annName;
406 }
407
408 /**
409 * Returns the inputAnnotationSetName
410 * @return a {@link String} value.
411 */
412 public String getInputAnnotationSetName() {
413 return this.inputAnnotationSetName;
414 }
415
416 /**
417 * Feature names for example: Token.string, Token.root etc... Values of these
418 * features should be used to replace the actual string of these features. This
419 * method allows a user to set the name of such features
420 * @param inputs
421 */
422 public void setInputFeatureNames(java.util.List inputs) {
423 this.inputFeatureNames = inputs;
424 }
425
426 /**
427 * Returns the feature names that are provided by the user to use their values
428 * to replace their actual strings in the document
429 * @return a {@link List} value.
430 */
431 public java.util.List getInputFeatureNames() {
432 return this.inputFeatureNames;
433 }
434
435 public Gazetteer getGazetteerInst() {
436 return this.gazetteerInst;
437 }
438
439 public void setGazetteerInst(gate.creole.gazetteer.Gazetteer gazetteerInst) {
440 this.gazetteerInst = gazetteerInst;
441 }
442
443 /**
444 * This method takes the document and the annotationSetName and then creates
445 * a interator for the annotations available in the document under the
446 * provided annotationSetName
447 * @param doc
448 * @param annotationSetName
449 * @return an {@link Iterator}
450 */
451 public Iterator getTokenIterator(gate.Document doc, String annotationSetName) {
452 AnnotationSet inputAs = (annotationSetName == null) ? doc.getAnnotations() :
453 doc.getAnnotations(annotationSetName);
454 AnnotationSet tempSet = inputAs.get();
455 if(tempSet == null)
456 return null;
457
458 List tokens = new ArrayList(inputAs.get());
459
460 if(tokens == null)
461 return null;
462
463 Comparator offsetComparator = new OffsetComparator();
464 Collections.sort(tokens, offsetComparator);
465 Iterator tokenIter = tokens.iterator();
466 return tokenIter;
467 }
468
469 // Gazetteer Runtime parameters
470 private gate.Document document;
471 private java.lang.String outputAnnotationSetName;
472 private java.lang.String inputAnnotationSetName;
473
474 // Flexible Gazetteer parameter
475 private Gazetteer gazetteerInst;
476 private java.util.List inputFeatureNames;
477
478 // parameters required within the program
479 private ArrayList changedNodes;
480 }