| POSTagger.java |
1 /*
2 * Copyright (c) 1998-2004, The University of Sheffield.
3 *
4 * This file is part of GATE (see http://gate.ac.uk/), and is free
5 * software, licenced under the GNU Library General Public License,
6 * Version 2, June 1991 (in the distribution as file licence.html,
7 * and also available at http://gate.ac.uk/gate/licence.html).
8 *
9 * Valentin Tablan, 01 Feb 2000
10 *
11 * $Id: POSTagger.java,v 1.22 2004/12/01 15:34:54 niraj Exp $
12 */
13
14 package gate.creole;
15
16 import java.text.NumberFormat;
17 import java.util.*;
18
19 import gate.*;
20 import gate.util.GateRuntimeException;
21 import gate.util.OffsetComparator;
22 /**
23 * This class is a wrapper for HepTag, Mark Hepple's POS tagger.
24 */
25 public class POSTagger extends AbstractLanguageAnalyser {
26
27 public static final String
28 TAG_DOCUMENT_PARAMETER_NAME = "document";
29
30 public static final String
31 TAG_INPUT_AS_PARAMETER_NAME = "inputASName";
32
33 public static final String
34 TAG_LEXICON_URL_PARAMETER_NAME = "lexiconURL";
35
36 public static final String
37 TAG_RULES_URL_PARAMETER_NAME = "rulesURL";
38
39 public static final String
40 TAG_ENCODING_PARAMETER_NAME = "encoding";
41
42 public POSTagger() {
43 }
44
45 public Resource init()throws ResourceInstantiationException{
46 if(lexiconURL == null){
47 throw new ResourceInstantiationException(
48 "NoURL provided for the lexicon!");
49 }
50 if(rulesURL == null){
51 throw new ResourceInstantiationException(
52 "No URL provided for the rules!");
53 }
54 try{
55 tagger = new hepple.postag.POSTagger(lexiconURL,rulesURL);
56 }catch(Exception e){
57 throw new ResourceInstantiationException(e);
58 }
59 return this;
60 }
61
62
63 public void execute() throws ExecutionException{
64 try{
65 //check the parameters
66 if(document == null) throw new GateRuntimeException(
67 "No document to process!");
68 if(inputASName != null && inputASName.equals("")) inputASName = null;
69 AnnotationSet inputAS = (inputASName == null) ?
70 document.getAnnotations() :
71 document.getAnnotations(inputASName);
72
73
74 AnnotationSet sentencesAS = inputAS.get(SENTENCE_ANNOTATION_TYPE);
75 AnnotationSet tokensAS = inputAS.get(TOKEN_ANNOTATION_TYPE);
76 if(sentencesAS != null && sentencesAS.size() > 0
77 && tokensAS != null && tokensAS.size() > 0){
78 long startTime = System.currentTimeMillis();
79 fireStatusChanged("POS tagging " + document.getName());
80 fireProgressChanged(0);
81 //prepare the input for HepTag
82 List sentenceForTagger = new ArrayList();
83 List sentencesForTagger = new ArrayList(1);
84 sentencesForTagger.add(sentenceForTagger);
85
86 //define a comparator for annotations by start offset
87 Comparator offsetComparator = new OffsetComparator();
88
89 //read all the tokens and all the sentences
90 List sentencesList = new ArrayList(sentencesAS);
91 Collections.sort(sentencesList, offsetComparator);
92 List tokensList = new ArrayList(tokensAS);
93 Collections.sort(tokensList, offsetComparator);
94
95 Iterator sentencesIter = sentencesList.iterator();
96 ListIterator tokensIter = tokensList.listIterator();
97
98 List tokensInCurrentSentence = new ArrayList();
99 Annotation currentToken = (Annotation)tokensIter.next();
100 int sentIndex = 0;
101 int sentCnt = sentencesAS.size();
102 while(sentencesIter.hasNext()){
103 Annotation currentSentence = (Annotation)sentencesIter.next();
104 tokensInCurrentSentence.clear();
105 sentenceForTagger.clear();
106 while(currentToken != null
107 &&
108 currentToken.getEndNode().getOffset().compareTo(
109 currentSentence.getEndNode().getOffset()) <= 0){
110 tokensInCurrentSentence.add(currentToken);
111 sentenceForTagger.add(currentToken.getFeatures().
112 get(TOKEN_STRING_FEATURE_NAME));
113 currentToken = (Annotation)(tokensIter.hasNext() ?
114 tokensIter.next() : null);
115 }
116 tagger.setEncoding(this.encoding);
117 //run the POS tagger
118 List taggerList = tagger.runTagger(sentencesForTagger);
119 if(taggerList != null && taggerList.size() > 0){
120 List taggerResults = (List) taggerList.get(0);
121 //add the results
122 //make sure no malfunction occurred
123 if(taggerResults.size() != tokensInCurrentSentence.size())
124 throw new GateRuntimeException(
125 "POS Tagger malfunction: the output size (" +
126 taggerResults.size() +
127 ") is different from the input size (" +
128 tokensInCurrentSentence.size() + ")!");
129 Iterator resIter = taggerResults.iterator();
130 Iterator tokIter = tokensInCurrentSentence.iterator();
131 while(resIter.hasNext()){
132 ((Annotation)tokIter.next()).getFeatures().
133 put(TOKEN_CATEGORY_FEATURE_NAME ,((String[])resIter.next())[1]);
134 }
135 }
136 fireProgressChanged(sentIndex++ * 100 / sentCnt);
137 }//while(sentencesIter.hasNext())
138 if(currentToken != null){
139 //we have remaining tokens after the last sentence
140 tokensInCurrentSentence.clear();
141 sentenceForTagger.clear();
142 while(currentToken != null){
143 tokensInCurrentSentence.add(currentToken);
144 sentenceForTagger.add(currentToken.getFeatures().
145 get(TOKEN_STRING_FEATURE_NAME));
146 currentToken = (Annotation)(tokensIter.hasNext() ?
147 tokensIter.next() : null);
148 }
149 //run the POS tagger
150 List taggerResults = (List)tagger.runTagger(sentencesForTagger).get(0);
151 //add the results
152 //make sure no malfunction accured
153 if(taggerResults.size() != tokensInCurrentSentence.size())
154 throw new GateRuntimeException(
155 "POS Tagger malfunction: the output size (" +
156 taggerResults.size() +
157 ") is different from the input size (" +
158 tokensInCurrentSentence.size() + ")!");
159 Iterator resIter = taggerResults.iterator();
160 Iterator tokIter = tokensInCurrentSentence.iterator();
161 while(resIter.hasNext()){
162 ((Annotation)tokIter.next()).getFeatures().
163 put(TOKEN_CATEGORY_FEATURE_NAME ,((String[])resIter.next())[1]);
164 }
165 }//if(currentToken != null)
166 fireProcessFinished();
167 fireStatusChanged(
168 document.getName() + " tagged in " +
169 NumberFormat.getInstance().format(
170 (double)(System.currentTimeMillis() - startTime) / 1000) +
171 " seconds!");
172 }else{
173 throw new GateRuntimeException("No sentences or tokens to process!\n" +
174 "Please run a sentence splitter "+
175 "and tokeniser first!");
176 }
177
178 //OLD version
179 /*
180 AnnotationSet as = inputAS.get(SENTENCE_ANNOTATION_TYPE);
181 if(as != null && as.size() > 0){
182 List sentences = new ArrayList(as);
183 Collections.sort(sentences, offsetComparator);
184 Iterator sentIter = sentences.iterator();
185 int sentIndex = 0;
186 int sentCnt = sentences.size();
187 long startTime= System.currentTimeMillis();
188 while(sentIter.hasNext()){
189 start = System.currentTimeMillis();
190 Annotation sentenceAnn = (Annotation)sentIter.next();
191 AnnotationSet rangeSet = inputAS.get(
192 sentenceAnn.getStartNode().getOffset(),
193 sentenceAnn.getEndNode().getOffset());
194 if(rangeSet == null) continue;
195 AnnotationSet tokensSet = rangeSet.get(TOKEN_ANNOTATION_TYPE);
196 if(tokensSet == null) continue;
197 List tokens = new ArrayList(tokensSet);
198 Collections.sort(tokens, offsetComparator);
199
200 // List tokens = (List)sentenceAnn.getFeatures().get("tokens");
201 List sentence = new ArrayList(tokens.size());
202 Iterator tokIter = tokens.iterator();
203 while(tokIter.hasNext()){
204 Annotation token = (Annotation)tokIter.next();
205 String text = (String)token.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
206 sentence.add(text);
207 }//while(tokIter.hasNext())
208
209 //run the POSTagger over this sentence
210 List sentences4tagger = new ArrayList(1);
211 sentences4tagger.add(sentence);
212 prepTime += System.currentTimeMillis() - start;
213 start = System.currentTimeMillis();
214 List taggerResults = tagger.runTagger(sentences4tagger);
215 posTime += System.currentTimeMillis() - start;
216 start = System.currentTimeMillis();
217 //add the results to the output annotation set
218 //we only get one sentence
219 List sentenceFromTagger = (List)taggerResults.get(0);
220 if(sentenceFromTagger.size() != sentence.size()){
221 String taggerResult = "";
222 for(int i = 0; i< sentenceFromTagger.size(); i++){
223 taggerResult += ((String[])sentenceFromTagger.get(i))[1] + ", ";
224 }
225 throw new GateRuntimeException(
226 "POS Tagger malfunction: the output size (" +
227 sentenceFromTagger.size() +
228 ") is different from the input size (" +
229 sentence.size() + ")!" +
230 "\n Input: " + sentence + "\nOutput: " + taggerResult);
231 }
232 for(int i = 0; i< sentence.size(); i++){
233 String category = ((String[])sentenceFromTagger.get(i))[1];
234 Annotation token = (Annotation)tokens.get(i);
235 token.getFeatures().
236 put(TOKEN_CATEGORY_FEATURE_NAME, category);
237 }//for(i = 0; i<= sentence.size(); i++)
238 postTime += System.currentTimeMillis() - start;
239 fireProgressChanged(sentIndex++ * 100 / sentCnt);
240 }//while(sentIter.hasNext())
241 Out.prln("POS preparation time:" + prepTime);
242 Out.prln("POS execution time:" + posTime);
243 Out.prln("POS after execution time:" + postTime);
244 fireProcessFinished();
245 long endTime = System.currentTimeMillis();
246 fireStatusChanged(document.getName() + " tagged in " +
247 NumberFormat.getInstance().format(
248 (double)(endTime - startTime) / 1000) + " seconds!");
249 }else{
250 throw new GateRuntimeException("No sentences to process!\n" +
251 "Please run a sentence splitter first!");
252 }//if(as != null && as.size() > 0)
253 */
254 }catch(Exception e){
255 throw new ExecutionException(e);
256 }
257 }
258
259
260 public void setLexiconURL(java.net.URL newLexiconURL) {
261 lexiconURL = newLexiconURL;
262 }
263 public java.net.URL getLexiconURL() {
264 return lexiconURL;
265 }
266 public void setRulesURL(java.net.URL newRulesURL) {
267 rulesURL = newRulesURL;
268 }
269 public void setEncoding(String encoding) {
270 this.encoding = encoding;
271 }
272
273 public java.net.URL getRulesURL() {
274 return rulesURL;
275 }
276 public void setInputASName(String newInputASName) {
277 inputASName = newInputASName;
278 }
279 public String getInputASName() {
280 return inputASName;
281 }
282 public String getEncoding() {
283 return this.encoding;
284 }
285
286 protected hepple.postag.POSTagger tagger;
287 private java.net.URL lexiconURL;
288 private java.net.URL rulesURL;
289 private String inputASName;
290 private String encoding;
291 }
292