| POSTagger.java |
1 /*
2 * Copyright (c) 1998-2004, The University of Sheffield.
3 *
4 * This file is part of GATE (see http://gate.ac.uk/), and is free
5 * software, licenced under the GNU Library General Public License,
6 * Version 2, June 1991 (in the distribution as file licence.html,
7 * and also available at http://gate.ac.uk/gate/licence.html).
8 *
9 * Valentin Tablan, 01 Feb 2000
10 *
11 * $Id: POSTagger.java,v 1.20 2004/07/21 17:10:03 akshay Exp $
12 */
13
14 package gate.creole;
15
16 import java.text.NumberFormat;
17 import java.util.*;
18
19 import gate.*;
20 import gate.util.GateRuntimeException;
21 import gate.util.OffsetComparator;
22 /**
23 * This class is a wrapper for HepTag, Mark Hepple's POS tagger.
24 */
25 public class POSTagger extends AbstractLanguageAnalyser {
26
27 public static final String
28 TAG_DOCUMENT_PARAMETER_NAME = "document";
29
30 public static final String
31 TAG_INPUT_AS_PARAMETER_NAME = "inputASName";
32
33 public static final String
34 TAG_LEXICON_URL_PARAMETER_NAME = "lexiconURL";
35
36 public static final String
37 TAG_RULES_URL_PARAMETER_NAME = "rulesURL";
38
39 public POSTagger() {
40 }
41
42 public Resource init()throws ResourceInstantiationException{
43 if(lexiconURL == null){
44 throw new ResourceInstantiationException(
45 "NoURL provided for the lexicon!");
46 }
47 if(rulesURL == null){
48 throw new ResourceInstantiationException(
49 "No URL provided for the rules!");
50 }
51 try{
52 tagger = new hepple.postag.POSTagger(lexiconURL,rulesURL);
53 }catch(Exception e){
54 throw new ResourceInstantiationException(e);
55 }
56 return this;
57 }
58
59
60 public void execute() throws ExecutionException{
61 try{
62 //check the parameters
63 if(document == null) throw new GateRuntimeException(
64 "No document to process!");
65 if(inputASName != null && inputASName.equals("")) inputASName = null;
66 AnnotationSet inputAS = (inputASName == null) ?
67 document.getAnnotations() :
68 document.getAnnotations(inputASName);
69
70
71 AnnotationSet sentencesAS = inputAS.get(SENTENCE_ANNOTATION_TYPE);
72 AnnotationSet tokensAS = inputAS.get(TOKEN_ANNOTATION_TYPE);
73 if(sentencesAS != null && sentencesAS.size() > 0
74 && tokensAS != null && tokensAS.size() > 0){
75 long startTime = System.currentTimeMillis();
76 fireStatusChanged("POS tagging " + document.getName());
77 fireProgressChanged(0);
78 //prepare the input for HepTag
79 List sentenceForTagger = new ArrayList();
80 List sentencesForTagger = new ArrayList(1);
81 sentencesForTagger.add(sentenceForTagger);
82
83 //define a comparator for annotations by start offset
84 Comparator offsetComparator = new OffsetComparator();
85
86 //read all the tokens and all the sentences
87 List sentencesList = new ArrayList(sentencesAS);
88 Collections.sort(sentencesList, offsetComparator);
89 List tokensList = new ArrayList(tokensAS);
90 Collections.sort(tokensList, offsetComparator);
91
92 Iterator sentencesIter = sentencesList.iterator();
93 ListIterator tokensIter = tokensList.listIterator();
94
95 List tokensInCurrentSentence = new ArrayList();
96 Annotation currentToken = (Annotation)tokensIter.next();
97 int sentIndex = 0;
98 int sentCnt = sentencesAS.size();
99 while(sentencesIter.hasNext()){
100 Annotation currentSentence = (Annotation)sentencesIter.next();
101 tokensInCurrentSentence.clear();
102 sentenceForTagger.clear();
103 while(currentToken != null
104 &&
105 currentToken.getEndNode().getOffset().compareTo(
106 currentSentence.getEndNode().getOffset()) <= 0){
107 tokensInCurrentSentence.add(currentToken);
108 sentenceForTagger.add(currentToken.getFeatures().
109 get(TOKEN_STRING_FEATURE_NAME));
110 currentToken = (Annotation)(tokensIter.hasNext() ?
111 tokensIter.next() : null);
112 }
113 //run the POS tagger
114 List taggerResults = (List)tagger.runTagger(sentencesForTagger).get(0);
115 //add the results
116 //make sure no malfunction accured
117 if(taggerResults.size() != tokensInCurrentSentence.size())
118 throw new GateRuntimeException(
119 "POS Tagger malfunction: the output size (" +
120 taggerResults.size() +
121 ") is different from the input size (" +
122 tokensInCurrentSentence.size() + ")!");
123 Iterator resIter = taggerResults.iterator();
124 Iterator tokIter = tokensInCurrentSentence.iterator();
125 while(resIter.hasNext()){
126 ((Annotation)tokIter.next()).getFeatures().
127 put(TOKEN_CATEGORY_FEATURE_NAME ,((String[])resIter.next())[1]);
128 }
129 fireProgressChanged(sentIndex++ * 100 / sentCnt);
130 }//while(sentencesIter.hasNext())
131 if(currentToken != null){
132 //we have remaining tokens after the last sentence
133 tokensInCurrentSentence.clear();
134 sentenceForTagger.clear();
135 while(currentToken != null){
136 tokensInCurrentSentence.add(currentToken);
137 sentenceForTagger.add(currentToken.getFeatures().
138 get(TOKEN_STRING_FEATURE_NAME));
139 currentToken = (Annotation)(tokensIter.hasNext() ?
140 tokensIter.next() : null);
141 }
142 //run the POS tagger
143 List taggerResults = (List)tagger.runTagger(sentencesForTagger).get(0);
144 //add the results
145 //make sure no malfunction accured
146 if(taggerResults.size() != tokensInCurrentSentence.size())
147 throw new GateRuntimeException(
148 "POS Tagger malfunction: the output size (" +
149 taggerResults.size() +
150 ") is different from the input size (" +
151 tokensInCurrentSentence.size() + ")!");
152 Iterator resIter = taggerResults.iterator();
153 Iterator tokIter = tokensInCurrentSentence.iterator();
154 while(resIter.hasNext()){
155 ((Annotation)tokIter.next()).getFeatures().
156 put(TOKEN_CATEGORY_FEATURE_NAME ,((String[])resIter.next())[1]);
157 }
158 }//if(currentToken != null)
159 fireProcessFinished();
160 fireStatusChanged(
161 document.getName() + " tagged in " +
162 NumberFormat.getInstance().format(
163 (double)(System.currentTimeMillis() - startTime) / 1000) +
164 " seconds!");
165 }else{
166 throw new GateRuntimeException("No sentences or tokens to process!\n" +
167 "Please run a sentence splitter "+
168 "and tokeniser first!");
169 }
170
171 //OLD version
172 /*
173 AnnotationSet as = inputAS.get(SENTENCE_ANNOTATION_TYPE);
174 if(as != null && as.size() > 0){
175 List sentences = new ArrayList(as);
176 Collections.sort(sentences, offsetComparator);
177 Iterator sentIter = sentences.iterator();
178 int sentIndex = 0;
179 int sentCnt = sentences.size();
180 long startTime= System.currentTimeMillis();
181 while(sentIter.hasNext()){
182 start = System.currentTimeMillis();
183 Annotation sentenceAnn = (Annotation)sentIter.next();
184 AnnotationSet rangeSet = inputAS.get(
185 sentenceAnn.getStartNode().getOffset(),
186 sentenceAnn.getEndNode().getOffset());
187 if(rangeSet == null) continue;
188 AnnotationSet tokensSet = rangeSet.get(TOKEN_ANNOTATION_TYPE);
189 if(tokensSet == null) continue;
190 List tokens = new ArrayList(tokensSet);
191 Collections.sort(tokens, offsetComparator);
192
193 // List tokens = (List)sentenceAnn.getFeatures().get("tokens");
194 List sentence = new ArrayList(tokens.size());
195 Iterator tokIter = tokens.iterator();
196 while(tokIter.hasNext()){
197 Annotation token = (Annotation)tokIter.next();
198 String text = (String)token.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
199 sentence.add(text);
200 }//while(tokIter.hasNext())
201
202 //run the POSTagger over this sentence
203 List sentences4tagger = new ArrayList(1);
204 sentences4tagger.add(sentence);
205 prepTime += System.currentTimeMillis() - start;
206 start = System.currentTimeMillis();
207 List taggerResults = tagger.runTagger(sentences4tagger);
208 posTime += System.currentTimeMillis() - start;
209 start = System.currentTimeMillis();
210 //add the results to the output annotation set
211 //we only get one sentence
212 List sentenceFromTagger = (List)taggerResults.get(0);
213 if(sentenceFromTagger.size() != sentence.size()){
214 String taggerResult = "";
215 for(int i = 0; i< sentenceFromTagger.size(); i++){
216 taggerResult += ((String[])sentenceFromTagger.get(i))[1] + ", ";
217 }
218 throw new GateRuntimeException(
219 "POS Tagger malfunction: the output size (" +
220 sentenceFromTagger.size() +
221 ") is different from the input size (" +
222 sentence.size() + ")!" +
223 "\n Input: " + sentence + "\nOutput: " + taggerResult);
224 }
225 for(int i = 0; i< sentence.size(); i++){
226 String category = ((String[])sentenceFromTagger.get(i))[1];
227 Annotation token = (Annotation)tokens.get(i);
228 token.getFeatures().
229 put(TOKEN_CATEGORY_FEATURE_NAME, category);
230 }//for(i = 0; i<= sentence.size(); i++)
231 postTime += System.currentTimeMillis() - start;
232 fireProgressChanged(sentIndex++ * 100 / sentCnt);
233 }//while(sentIter.hasNext())
234 Out.prln("POS preparation time:" + prepTime);
235 Out.prln("POS execution time:" + posTime);
236 Out.prln("POS after execution time:" + postTime);
237 fireProcessFinished();
238 long endTime = System.currentTimeMillis();
239 fireStatusChanged(document.getName() + " tagged in " +
240 NumberFormat.getInstance().format(
241 (double)(endTime - startTime) / 1000) + " seconds!");
242 }else{
243 throw new GateRuntimeException("No sentences to process!\n" +
244 "Please run a sentence splitter first!");
245 }//if(as != null && as.size() > 0)
246 */
247 }catch(Exception e){
248 throw new ExecutionException(e);
249 }
250 }
251
252
253 public void setLexiconURL(java.net.URL newLexiconURL) {
254 lexiconURL = newLexiconURL;
255 }
256 public java.net.URL getLexiconURL() {
257 return lexiconURL;
258 }
259 public void setRulesURL(java.net.URL newRulesURL) {
260 rulesURL = newRulesURL;
261 }
262 public java.net.URL getRulesURL() {
263 return rulesURL;
264 }
265 public void setInputASName(String newInputASName) {
266 inputASName = newInputASName;
267 }
268 public String getInputASName() {
269 return inputASName;
270 }
271
272 protected hepple.postag.POSTagger tagger;
273 private java.net.URL lexiconURL;
274 private java.net.URL rulesURL;
275 private String inputASName;
276 }