| WebCrimeReportAnalyser.java |
1 package gate.util.web;
2
3 import java.io.IOException;
4 import java.net.MalformedURLException;
5 import java.net.URL;
6 import java.util.*;
7
8 import javax.servlet.ServletContext;
9
10 import gate.*;
11 import gate.annotation.AnnotationSetImpl;
12 import gate.corpora.RepositioningInfo;
13 import gate.creole.SerialAnalyserController;
14 import gate.util.GateException;
15
16
17 public class WebCrimeReportAnalyser {
18
19 public static final String SOCIS_CONTROLLER_KEY = "socis.controller";
20 public static final String GATE_INIT_KEY = "gate.init";
21
22
23 public String filePath = "";
24
25 private SerialAnalyserController controller;
26
27 public void initCrimeReportAnalyser() throws GateException {
28 controller = (SerialAnalyserController)
29 Factory.createResource("gate.creole.SerialAnalyserController",
30 Factory.newFeatureMap(),
31 Factory.newFeatureMap(),
32 "Crime Report Analyser");
33
34 ProcessingResource tokeniser = (ProcessingResource)
35 Factory.createResource("gate.creole.tokeniser.DefaultTokeniser",
36 Factory.newFeatureMap());
37
38 controller.add(tokeniser);
39
40 ProcessingResource split = (ProcessingResource)
41 Factory.createResource("gate.creole.splitter.SentenceSplitter",
42 Factory.newFeatureMap());
43
44 controller.add(split);
45
46 ProcessingResource postagger = (ProcessingResource)
47 Factory.createResource("gate.creole.POSTagger",
48 Factory.newFeatureMap());
49
50 controller.add(postagger);
51
52 /* ProcessingResource prechunking = (ProcessingResource)
53 Factory.createResource("chunking.PreChunking",
54 Factory.newFeatureMap());
55
56 controller.add(prechunking);
57 System.out.println("after tokeniser");
58 System.out.println("Freemem: " + Runtime.getRuntime().freeMemory());
59
60 FeatureMap fm = Factory.newFeatureMap();
61 fm.put("inputASName","ChunkAnnotations");
62 fm.put("outputASName","ChunkAnnotations");
63
64 try {
65 URL urlnp = new URL("jar:file:" + filePath + "files.jar!/resources/grammars/Chunk/mainNPChunk.jape");
66 fm.put("grammarURL",urlnp);
67 } catch(MalformedURLException e) {
68
69 e.printStackTrace();
70 }
71
72 ProcessingResource npchunk = (ProcessingResource)
73 Factory.createResource("gate.creole.ANNIETransducer",
74 fm);
75
76 controller.add(npchunk);
77
78 FeatureMap fm1 = Factory.newFeatureMap();
79 fm1.put("inputASName","ChunkAnnotations");
80 fm1.put("outputASName","ChunkAnnotations");
81
82 try {
83 URL urlvp = new URL("jar:file:" + filePath + "files.jar!/resources/grammars/Chunk/mainNPChunk.jape");
84 fm1.put("grammarURL",urlvp);
85 } catch(MalformedURLException e) {
86 e.printStackTrace();
87 }
88
89 ProcessingResource vpchunk = (ProcessingResource)
90 Factory.createResource("gate.creole.ANNIETransducer",
91 fm1);
92
93 controller.add(vpchunk);*/
94
95 FeatureMap fm_gaz = Factory.newFeatureMap();
96 fm_gaz.put("encoding","ISO-8859-1");
97
98 try {
99 URL urlgaz = new URL("jar:file:" + filePath + "files.jar!/resources/gazetters/general/lists.def");
100 fm_gaz.put("listsURL",urlgaz);
101 } catch(MalformedURLException e) {
102 e.printStackTrace();
103 }
104
105 ProcessingResource gazetteer = (ProcessingResource)
106 Factory.createResource("gate.creole.gazetteer.DefaultGazetteer",
107 fm_gaz);
108
109 controller.add(gazetteer);
110
111 FeatureMap fm_gra = Factory.newFeatureMap();
112
113 try {
114 URL urlgra = new URL("jar:file:" + filePath + "files.jar!/resources/grammars/NamedEntities/socismain.jape");
115 fm_gra.put("grammarURL",urlgra);
116 } catch(MalformedURLException e) {
117 e.printStackTrace();
118 }
119
120 ProcessingResource grammar = (ProcessingResource)
121 Factory.createResource("gate.creole.ANNIETransducer",
122 fm_gra);
123
124 controller.add(grammar);
125
126 } // initIndexAnalyser()
127
128 public String process(ServletContext app, String url, String[] annotations)
129 throws GateException, IOException {
130
131 long start;
132
133 // Is this the first time a gate demo has been run? If so,
134 // initiali[s|z]e gate. It's a very heavy process, so only do
135 // it once.
136
137 if (app.getAttribute(GATE_INIT_KEY) == null) {
138 Gate.setLocalWebServer(false);
139 Gate.setNetConnected(false);
140
141 System.setProperty("java.protocol.handler.pkgs",
142 "gate.util.protocols");
143
144 // Do the deed
145 Gate.init();
146
147 app.setAttribute(GATE_INIT_KEY, "true");
148 }
149
150 // Now do the same for the SOCIS controller
151
152 if (app.getAttribute(SOCIS_CONTROLLER_KEY) == null) {
153
154 CreoleRegister reg = Gate.getCreoleRegister();
155
156 filePath = app.getInitParameter("files.path");
157
158 // URL filesURL = new URL("jar:file:" + filePath + "files.jar!/");
159 // try {
160 // reg.registerDirectories(filesURL);
161 // } catch(GateException e) {
162 // System.out.println(e.getMessage());
163 // }
164
165 initCrimeReportAnalyser();
166
167 app.setAttribute(SOCIS_CONTROLLER_KEY, controller);
168 }
169 else {
170 // The SOCIS demo has already run, so take the existing
171 // controller from the application attribute hash
172
173 controller = (SerialAnalyserController)
174 app.getAttribute(SOCIS_CONTROLLER_KEY);
175 }
176
177 Corpus corpus =
178 (Corpus) Factory.createResource("gate.corpora.CorpusImpl");
179
180 /* here the url specified by the user */
181 URL textURL = new URL(url);
182
183 FeatureMap params = Factory.newFeatureMap();
184 params.put("sourceUrl", textURL);
185 params.put("preserveOriginalContent", new Boolean(true));
186 params.put("collectRepositioningInfo", new Boolean(true));
187
188 Document doc = (Document)
189 Factory.createResource("gate.corpora.DocumentImpl",params);
190
191 corpus.add(doc);
192
193 controller.setCorpus(corpus);
194 controller.execute();
195
196 AnnotationSet defaultAnnotSet = doc.getAnnotations();
197 AnnotationSet chunkAnnotSet = doc.getAnnotations("ChunkAnnotations");
198 Set annotTypesRequired = new HashSet();
199 Set chunkTypesRequired = new HashSet();
200
201 for (int i=0;i<annotations.length;i++) {
202 annotTypesRequired.add(annotations[i]);
203 }
204
205 /* socis stuff */
206 /*annotTypesRequired.add("Location");
207 annotTypesRequired.add("Time");
208 annotTypesRequired.add("Organization");
209 annotTypesRequired.add("Person");
210 annotTypesRequired.add("Id_No");
211 annotTypesRequired.add("Date");
212 annotTypesRequired.add("Money");
213 annotTypesRequired.add("Percent");
214 annotTypesRequired.add("Conv_make");
215 annotTypesRequired.add("Offence");
216 annotTypesRequired.add("Age");
217 annotTypesRequired.add("Drug");
218 annotTypesRequired.add("Address"); */
219
220 /* required chunks */
221 /*
222 chunkTypesRequired.add("NPCHUNK");
223 chunkTypesRequired.add("VPCHUNK"); */
224
225 AnnotationSet socis = defaultAnnotSet.get(annotTypesRequired);
226
227 //AnnotationSet chunks = chunkAnnotSet.get(chunkTypesRequired);
228
229 FeatureMap features = doc.getFeatures();
230 String originalContent = (String)
231 features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
232
233 RepositioningInfo info = (RepositioningInfo)
234 features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);
235
236 Annotation currAnnot;
237 SortedAnnotationList sortedAnnotationsNamedEntities =
238 new SortedAnnotationList();
239
240 // The AnnotationSet socis can be null if no annotations have
241 // been found
242 if (socis != null) {
243 Iterator it = socis.iterator();
244 while(it.hasNext()) {
245 currAnnot = (Annotation) it.next();
246 sortedAnnotationsNamedEntities.addSortedExclusive(currAnnot);
247 }
248 }
249
250 AnnotationSet uniqueNamedEntities =
251 new AnnotationSetImpl(doc);
252
253 uniqueNamedEntities.addAll(sortedAnnotationsNamedEntities);
254
255 SortedAnnotationList sortedAnnotationsChunks =
256 new SortedAnnotationList();
257
258 /*it = chunks.iterator();
259 while(it.hasNext()) {
260 currAnnot = (Annotation) it.next();
261 sortedAnnotationsChunks.addSortedExclusive(currAnnot);
262 } //while
263
264 AnnotationSet uniqueChunks = new AnnotationSetImpl((Document) null);
265
266 uniqueChunks.addAll(sortedAnnotationsChunks); */
267
268 String xmlDocumentNamedEntities = doc.toXml(uniqueNamedEntities, true);
269 //String xmlDocumentChunks = doc.toXml(uniqueChunks,true);
270
271 //delete the used resources
272 Factory.deleteResource(doc);
273 Factory.deleteResource(corpus);
274 return xmlDocumentNamedEntities;
275
276 }
277
278 public static class SortedAnnotationList extends Vector {
279
280 public SortedAnnotationList() {
281 super();
282 }
283 public boolean addSortedExclusive(Annotation annot) {
284 Annotation currAnnot = null;
285 for(int i=0; i<size() ; ++i) {
286 currAnnot = (Annotation) get(i);
287 if(annot.overlaps(currAnnot)) {
288 return false;
289
290 } //if
291
292 } //for
293 long annotStart = annot.getStartNode().getOffset().longValue();
294 long currStart;
295 for (int i=0; i < size(); ++i) {
296 currAnnot = (Annotation) get(i);
297 currStart = currAnnot.getStartNode().getOffset().longValue();
298 if(annotStart < currStart) {
299 insertElementAt(annot, i);
300 return true;
301
302 } //if
303
304 } //for
305
306 int size = size();
307 insertElementAt(annot, size);
308 return true;
309 } //addSortedExclusive
310
311 } //SortedAnnotationList
312
313 }
314
315