| HtmlDocumentHandler.java |
1 /*
2 * HtmlDocumentHandler.java
3 *
4 * Copyright (c) 1998-2004, The University of Sheffield.
5 *
6 * This file is part of GATE (see http://gate.ac.uk/), and is free
7 * software, licenced under the GNU Library General Public License,
8 * Version 2, June 1991 (in the distribution as file licence.html,
9 * and also available at http://gate.ac.uk/gate/licence.html).
10 *
11 * Cristian URSU, 12/June/2000
12 *
13 * $Id: HtmlDocumentHandler.java,v 1.36 2004/07/26 14:59:31 valyt Exp $
14 */
15
16 package gate.html;
17
18 import java.util.*;
19
20 import javax.swing.text.BadLocationException;
21 import javax.swing.text.MutableAttributeSet;
22 import javax.swing.text.html.HTML;
23 import javax.swing.text.html.HTMLEditorKit.ParserCallback;
24
25 import gate.*;
26 import gate.corpora.DocumentContentImpl;
27 import gate.corpora.RepositioningInfo;
28 import gate.event.StatusListener;
29 import gate.util.Err;
30 import gate.util.InvalidOffsetException;
31
32
33 /** Implements the behaviour of the HTML reader.
34 * Methods of an object of this class are called by the HTML parser as
35 * parsing events occur.
36 * The idea is to parse the HTML document and construct GATE annotation
37 * objects.
38 * This class will also replace the content of the GATE document with a
39 * new one containing only the text from the HTML document.
40 */
41 public class HtmlDocumentHandler extends ParserCallback {
42
43 /** Debug flag */
44 private static final boolean DEBUG = false;
45
/**
 * Creates a handler without an explicit target annotation set.
 * Delegates to the three-argument constructor, passing <code>null</code>
 * as the annotation set.
 *
 * @param aDocument the GATE document that will be processed
 * @param aMarkupElementsMap maps element names to the annotation types to
 *        create for them
 */
public HtmlDocumentHandler(gate.Document aDocument, Map aMarkupElementsMap) {
  this(aDocument, aMarkupElementsMap, (gate.AnnotationSet) null);
}
55
56 /** Constructor initialises all the private memeber data
57 * @param aDocument The gate document that will be processed
58 * @param aMarkupElementsMap The map containing the elements that will
59 * transform into annotations
60 * @param anAnnotationSet The annotation set that will contain annotations
61 * resulted from the processing of the gate document
62 */
63 public HtmlDocumentHandler(gate.Document aDocument,
64 Map aMarkupElementsMap,
65 gate.AnnotationSet anAnnotationSet) {
66 // init stack
67 stack = new java.util.Stack();
68
69 // this string contains the plain text (the text without markup)
70 tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue());
71
72 // colector is used later to transform all custom objects into
73 // annotation objects
74 colector = new LinkedList();
75
76 // the Gate document
77 doc = aDocument;
78
79 // this map contains the elements name that we want to create
80 // if it's null all the elements from the XML documents will be transformed
81 // into Gate annotation objects
82 markupElementsMap = aMarkupElementsMap;
83
84 // init an annotation set for this gate document
85 basicAS = anAnnotationSet;
86
87 customObjectsId = 0;
88 }//HtmlDocumentHandler
89
90 /** Keep the refference to this structure */
91 private RepositioningInfo reposInfo = null;
92
93 /** Keep the refference to this structure */
94 private RepositioningInfo ampCodingInfo = null;
95
96 /** Set repositioning information structure refference. If you set this
97 * refference to <B>null</B> information wouldn't be collected.
98 */
99 public void setRepositioningInfo(RepositioningInfo info) {
100 reposInfo = info;
101 } // setRepositioningInfo
102
103 /** Return current RepositioningInfo object */
104 public RepositioningInfo getRepositioningInfo() {
105 return reposInfo;
106 } // getRepositioningInfo
107
108 /** Set repositioning information structure refference for ampersand coding.
109 * If you set this refference to <B>null</B> information wouldn't be used.
110 */
111 public void setAmpCodingInfo(RepositioningInfo info) {
112 ampCodingInfo = info;
113 } // setRepositioningInfo
114
115 /** Return current RepositioningInfo object for ampersand coding. */
116 public RepositioningInfo getAmpCodingInfo() {
117 return ampCodingInfo;
118 } // getRepositioningInfo
119
120 /** The text inside the STYLE tag is processed with <code>handleText()</code>.
121 * We should skip inserting of this text in the document. */
122 private boolean isInsideStyleTag = false;
123
124 /** This method is called when the HTML parser encounts the beginning
125 * of a tag that means that the tag is paired by an end tag and it's
126 * not an empty one.
127 */
128 public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
129 // Fire the status listener if the elements processed exceded the rate
130 if (0 == (++elements % ELEMENTS_RATE))
131 fireStatusChangedEvent("Processed elements : " + elements);
132
133 // Start of STYLE tag
134 if(HTML.Tag.STYLE.equals(t)) {
135 isInsideStyleTag = true;
136 } // if
137
138 // Construct a feature map from the attributes list
139 FeatureMap fm = Factory.newFeatureMap();
140
141 // Take all the attributes an put them into the feature map
142 if (0 != a.getAttributeCount()){
143 Enumeration enumeration = a.getAttributeNames();
144 while (enumeration.hasMoreElements()){
145 Object attribute = enumeration.nextElement();
146 fm.put(attribute.toString(),(a.getAttribute(attribute)).toString());
147 }// while
148 }// if
149
150 // Just analize the tag t and add some\n chars and spaces to the
151 // tmpDocContent.The reason behind is that we need to have a readable form
152 // for the final document.
153 customizeAppearanceOfDocumentWithStartTag(t);
154
155 // If until here the "tmpDocContent" ends with a NON whitespace char,
156 // then we add a space char before calculating the START index of this
157 // tag.
158 // This is done in order not to concatenate the content of two separate tags
159 // and obtain a different NEW word.
160 int tmpDocContentSize = tmpDocContent.length();
161 if ( tmpDocContentSize != 0 &&
162 !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))
163 ) tmpDocContent.append(" ");
164
165 // create the start index of the annotation
166 Long startIndex = new Long(tmpDocContent.length());
167
168 // initialy the start index is equal with the End index
169 CustomObject obj = new CustomObject(t.toString(),fm,startIndex,startIndex);
170
171 // put it into the stack
172 stack.push (obj);
173
174 }//handleStartTag
175
176 /** This method is called when the HTML parser encounts the end of a tag
177 * that means that the tag is paired by a beginning tag
178 */
179 public void handleEndTag(HTML.Tag t, int pos){
180 // obj is for internal use
181 CustomObject obj = null;
182
183 // end of STYLE tag
184 if(HTML.Tag.STYLE.equals(t)) {
185 isInsideStyleTag = false;
186 } // if
187
188 // If the stack is not empty then we get the object from the stack
189 if (!stack.isEmpty()){
190 obj = (CustomObject) stack.pop();
191 // Before adding it to the colector, we need to check if is an
192 // emptyAndSpan one. See CustomObject's isEmptyAndSpan field.
193 if (obj.getStart().equals(obj.getEnd())){
194 // The element had an end tag and its start was equal to its end. Hence
195 // it is anEmptyAndSpan one.
196 obj.getFM().put("isEmptyAndSpan","true");
197 }// End iff
198 // we add it to the colector
199 colector.add(obj);
200 }// End if
201
202 // If element has text between, then customize its apearance
203 if ( obj != null &&
204 obj.getStart().longValue() != obj.getEnd().longValue()
205 )
206 // Customize the appearance of the document
207 customizeAppearanceOfDocumentWithEndTag(t);
208
209 // if t is the </HTML> tag then we reached the end of theHTMLdocument
210 if (t == HTML.Tag.HTML){
211 // replace the old content with the new one
212 doc.setContent (new DocumentContentImpl(tmpDocContent.toString()));
213
214 // If basicAs is null then get the default annotation
215 // set from this gate document
216 if (basicAS == null)
217 basicAS = doc.getAnnotations(
218 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
219
220 // sort colector ascending on its id
221 Collections.sort(colector);
222 // iterate through colector and construct annotations
223 while (!colector.isEmpty()){
224 obj = (CustomObject) colector.getFirst();
225 colector.remove(obj);
226 // Construct an annotation from this obj
227 try{
228 if (markupElementsMap == null){
229 basicAS.add( obj.getStart(),
230 obj.getEnd(),
231 obj.getElemName(),
232 obj.getFM()
233 );
234 }else{
235 String annotationType =
236 (String) markupElementsMap.get(obj.getElemName());
237 if (annotationType != null)
238 basicAS.add( obj.getStart(),
239 obj.getEnd(),
240 annotationType,
241 obj.getFM()
242 );
243 }
244 }catch (InvalidOffsetException e){
245 Err.prln("Error creating an annot :" + obj + " Discarded...");
246 }// end try
247 // }// end if
248 }//while
249
250 // notify the listener about the total amount of elements that
251 // has been processed
252 fireStatusChangedEvent("Total elements : " + elements);
253
254 }//else
255
256 }//handleEndTag
257
258 /** This method is called when the HTML parser encounts an empty tag
259 */
260 public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos){
261 // fire the status listener if the elements processed exceded the rate
262 if ((++elements % ELEMENTS_RATE) == 0)
263 fireStatusChangedEvent("Processed elements : " + elements);
264
265 // construct a feature map from the attributes list
266 // these are empty elements
267 FeatureMap fm = Factory.newFeatureMap();
268
269 // take all the attributes an put them into the feature map
270 if (0 != a.getAttributeCount ()){
271
272 // Out.println("HAS attributes = " + a.getAttributeCount ());
273 Enumeration enumeration = a.getAttributeNames ();
274 while (enumeration.hasMoreElements ()){
275 Object attribute = enumeration.nextElement ();
276 fm.put ( attribute.toString(),(a.getAttribute(attribute)).toString());
277
278 }//while
279
280 }//if
281
282 // create the start index of the annotation
283 Long startIndex = new Long(tmpDocContent.length());
284
285 // initialy the start index is equal with the End index
286 CustomObject obj = new CustomObject(t.toString(),fm,startIndex,startIndex);
287
288 // we add the object directly into the colector
289 // we don't add it to the stack because this is an empty tag
290 colector.add(obj);
291
292 // Just analize the tag t and add some\n chars and spaces to the
293 // tmpDocContent.The reason behind is that we need to have a readable form
294 // for the final document.
295 customizeAppearanceOfDocumentWithSimpleTag(t);
296
297 } // handleSimpleTag
298
299 /** This method is called when the HTML parser encounts text (PCDATA)
300 */
301 public void handleText(char[] text, int pos){
302
303 // Skip the STYLE tag content
304 if(isInsideStyleTag) return;
305
306 // create a string object based on the reported text
307 String content = new String(text);
308
309 // remove the difference between JDK 1.3 and JDK 1.4
310 String trimContent = content.trim();
311 if(trimContent.length() == 0) {
312 return;
313 } // if
314
315 int trimCorrection = content.indexOf(trimContent.charAt(0));
316 content = trimContent;
317
318 StringBuffer contentBuffer = new StringBuffer("");
319 int tmpDocContentSize = tmpDocContent.length();
320 boolean incrementStartIndex = false;
321 // If the first char of the text just read "text[0]" is NOT whitespace AND
322 // the last char of the tmpDocContent[SIZE-1] is NOT whitespace then
323 // concatenation "tmpDocContent + content" will result into a new different
324 // word... and we want to avoid that...
325 if ( tmpDocContentSize != 0 &&
326 content.length() != 0 &&
327 !Character.isWhitespace(content.charAt(0)) &&
328 !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))){
329
330 contentBuffer.append(" ");
331 incrementStartIndex = true;
332 }// End if
333 // update the document content
334
335 // put the repositioning information
336 if(reposInfo != null) {
337 int extractedPos = tmpDocContent.length() + contentBuffer.length();
338 addRepositioningInfo(content, pos + trimCorrection, extractedPos);
339 } // if
340
341 contentBuffer.append(content);
342 // calculate the End index for all the elements of the stack
343 // the expression is : End index = Current doc length + text length
344 Long end = new Long(tmpDocContent.length() + contentBuffer.length());
345
346 CustomObject obj = null;
347 // Iterate through stack to modify the End index of the existing elements
348
349 java.util.Iterator anIterator = stack.iterator();
350 while (anIterator.hasNext ()){
351 // get the object and move to the next one
352 obj = (CustomObject) anIterator.next ();
353 if (incrementStartIndex && obj.getStart().equals(obj.getEnd())){
354 obj.setStart(new Long(obj.getStart().longValue() + 1));
355 }// End if
356 // sets its End index
357 obj.setEnd(end);
358 }// End while
359
360 tmpDocContent.append(contentBuffer.toString());
361 }// end handleText();
362
363 /** For given content the list with shrink position information is searched
364 * and on the corresponding positions the correct repositioning information
365 * is calculated and generated.
366 */
367 public void addRepositioningInfo(String content, int pos, int extractedPos) {
368 int contentLength = content.length();
369
370 // wrong way (without correction and analysing)
371 //reposInfo.addPositionInfo(pos, contentLength, extractedPos, contentLength);
372
373 RepositioningInfo.PositionInfo pi = null;
374 long startPos = pos;
375 long correction = 0;
376 long substituteStart;
377 long remainingLen;
378 long offsetInExtracted;
379
380 for(int i = 0; i < ampCodingInfo.size(); ++i) {
381 pi = (RepositioningInfo.PositionInfo) ampCodingInfo.get(i);
382 substituteStart = pi.getOriginalPosition();
383
384 if(substituteStart >= startPos) {
385 if(substituteStart > pos + contentLength + correction) {
386 break; // outside the current text
387 } // if
388
389 // should create two repositioning information records
390 remainingLen = substituteStart - (startPos + correction);
391 offsetInExtracted = startPos - pos;
392 if(remainingLen > 0) {
393 reposInfo.addPositionInfo(startPos + correction, remainingLen,
394 extractedPos + offsetInExtracted, remainingLen);
395 } // if
396 // record for shrank text
397 reposInfo.addPositionInfo(substituteStart, pi.getOriginalLength(),
398 extractedPos + offsetInExtracted + remainingLen,
399 pi.getCurrentLength());
400 startPos = startPos + remainingLen + pi.getCurrentLength();
401 correction += pi.getOriginalLength() - pi.getCurrentLength();
402 } // if
403 } // for
404
405 // there is some text remaining for repositioning
406 offsetInExtracted = startPos - pos;
407 remainingLen = contentLength - offsetInExtracted;
408 if(remainingLen > 0) {
409 reposInfo.addPositionInfo(startPos + correction, remainingLen,
410 extractedPos + offsetInExtracted, remainingLen);
411 } // if
412 } // addRepositioningInfo
413
414 /** This method analizes the tag t and adds some \n chars and spaces to the
415 * tmpDocContent.The reason behind is that we need to have a readable form
416 * for the final document. This method modifies the content of tmpDocContent.
417 * @param t the Html tag encounted by the HTML parser
418 */
419 protected void customizeAppearanceOfDocumentWithSimpleTag(HTML.Tag t){
420 boolean modification = false;
421 // if the HTML tag is BR then we add a new line character to the document
422 if (HTML.Tag.BR == t){
423 tmpDocContent.append("\n");
424 modification = true;
425 }// End if
426 if (modification == true){
427 Long end = new Long (tmpDocContent.length());
428 java.util.Iterator anIterator = stack.iterator();
429 while (anIterator.hasNext ()){
430 // get the object and move to the next one
431 CustomObject obj = (CustomObject) anIterator.next();
432 // sets its End index
433 obj.setEnd(end);
434 }// End while
435 }//End if
436 }// customizeAppearanceOfDocumentWithSimpleTag
437
438 /** This method analizes the tag t and adds some \n chars and spaces to the
439 * tmpDocContent.The reason behind is that we need to have a readable form
440 * for the final document. This method modifies the content of tmpDocContent.
441 * @param t the Html tag encounted by the HTML parser
442 */
443 protected void customizeAppearanceOfDocumentWithStartTag(HTML.Tag t){
444 boolean modification = false;
445 if (HTML.Tag.P == t){
446 int tmpDocContentSize = tmpDocContent.length();
447 if ( tmpDocContentSize >= 2 &&
448 '\n' != tmpDocContent.charAt(tmpDocContentSize - 2)
449 ) { tmpDocContent.append("\n"); modification = true;}
450 }// End if
451 if (modification == true){
452 Long end = new Long (tmpDocContent.length());
453 java.util.Iterator anIterator = stack.iterator();
454 while (anIterator.hasNext ()){
455 // get the object and move to the next one
456 CustomObject obj = (CustomObject) anIterator.next();
457 // sets its End index
458 obj.setEnd(end);
459 }// End while
460 }//End if
461 }// customizeAppearanceOfDocumentWithStartTag
462
463 /** This method analizes the tag t and adds some \n chars and spaces to the
464 * tmpDocContent.The reason behind is that we need to have a readable form
465 * for the final document. This method modifies the content of tmpDocContent.
466 * @param t the Html tag encounted by the HTML parser
467 */
468 protected void customizeAppearanceOfDocumentWithEndTag(HTML.Tag t){
469 boolean modification = false;
470 // if the HTML tag is BR then we add a new line character to the document
471 if ( (HTML.Tag.P == t) ||
472
473 (HTML.Tag.H1 == t) ||
474 (HTML.Tag.H2 == t) ||
475 (HTML.Tag.H3 == t) ||
476 (HTML.Tag.H4 == t) ||
477 (HTML.Tag.H5 == t) ||
478 (HTML.Tag.H6 == t) ||
479 (HTML.Tag.TR == t) ||
480 (HTML.Tag.CENTER == t) ||
481 (HTML.Tag.LI == t)
482 ){ tmpDocContent.append("\n"); modification = true;}
483
484 if (HTML.Tag.TITLE == t){
485 tmpDocContent.append("\n\n");
486 modification = true;
487 }// End if
488
489 if (modification == true){
490 Long end = new Long (tmpDocContent.length());
491 java.util.Iterator anIterator = stack.iterator();
492 while (anIterator.hasNext ()){
493 // get the object and move to the next one
494 CustomObject obj = (CustomObject) anIterator.next();
495 // sets its End index
496 obj.setEnd(end);
497 }// End while
498 }//End if
499 }// customizeAppearanceOfDocumentWithEndTag
500
501 /**
502 * This method is called when the HTML parser encounts an error
503 * it depends on the programmer if he wants to deal with that error
504 */
505 public void handleError(String errorMsg, int pos) {
506 //Out.println ("ERROR CALLED : " + errorMsg);
507 }
508
509 /** This method is called once, when the HTML parser reaches the end
510 * of its input streamin order to notify the parserCallback that there
511 * is nothing more to parse.
512 */
513 public void flush() throws BadLocationException{
514 }// flush
515
516 /** This method is called when the HTML parser encounts a comment
517 */
518 public void handleComment(char[] text, int pos) {
519 }
520
521 //StatusReporter Implementation
522
523 public void addStatusListener(StatusListener listener) {
524 myStatusListeners.add(listener);
525 }
526
527 public void removeStatusListener(StatusListener listener) {
528 myStatusListeners.remove(listener);
529 }
530
531 protected void fireStatusChangedEvent(String text) {
532 Iterator listenersIter = myStatusListeners.iterator();
533 while(listenersIter.hasNext())
534 ((StatusListener)listenersIter.next()).statusChanged(text);
535 }
536
537 /**
538 * This method verifies if data contained by the CustomObject can be used
539 * to create a GATE annotation.
540 */
541 /* private boolean canCreateAnnotation(CustomObject aCustomObject){
542 long start = aCustomObject.getStart().longValue();
543 long end = aCustomObject.getEnd().longValue();
544 long gateDocumentSize = doc.getContent().size().longValue();
545
546 if (start < 0 || end < 0 ) return false;
547 if (start > end ) return false;
548 if ((start > gateDocumentSize) || (end > gateDocumentSize)) return false;
549 return true;
550 }// canCreateAnnotation
551 */
552
553 // HtmlDocumentHandler member data
554
555 // this constant indicates when to fire the status listener
556 // this listener will add an overhead and we don't want a big overhead
557 // this listener will be callled from ELEMENTS_RATE to ELEMENTS_RATE
558 final static int ELEMENTS_RATE = 128;
559
560 // this map contains the elements name that we want to create
561 // if it's null all the elements from the HTML documents will be transformed
562 // into Gate annotation objects otherwise only the elements it contains will
563 // be transformed
564 private Map markupElementsMap = null;
565
566 // the content of the HTML document, without any tag
567 // for internal use
568 private StringBuffer tmpDocContent = null;
569
570 // a stack used to remember elements and to keep the order
571 private java.util.Stack stack = null;
572
573 // a gate document
574 private gate.Document doc = null;
575
576 // an annotation set used for creating annotation reffering the doc
577 private gate.AnnotationSet basicAS;
578
579 // listeners for status report
580 protected List myStatusListeners = new LinkedList();
581
582 // this reports the the number of elements that have beed processed so far
583 private int elements = 0;
584
585 protected long customObjectsId = 0;
586 // we need a colection to retain all the CustomObjects that will be
587 // transformed into annotation over the gate document...
588 // the transformation will take place inside onDocumentEnd() method
589 private LinkedList colector = null;
590
591 // Inner class
592 /**
593 * The objects belonging to this class are used inside the stack.
594 * This class is for internal needs
595 */
596 class CustomObject implements Comparable {
597
598 // constructor
599 public CustomObject(String anElemName, FeatureMap aFm,
600 Long aStart, Long anEnd) {
601 elemName = anElemName;
602 fm = aFm;
603 start = aStart;
604 end = anEnd;
605 id = new Long(customObjectsId ++);
606 }// End CustomObject()
607
608 // Methos implemented as required by Comparable interface
609 public int compareTo(Object o){
610 CustomObject obj = (CustomObject) o;
611 return this.id.compareTo(obj.getId());
612 }// compareTo();
613
614 // accesor
615 public String getElemName() {
616 return elemName;
617 }// getElemName()
618
619 public FeatureMap getFM() {
620 return fm;
621 }// getFM()
622
623 public Long getStart() {
624 return start;
625 }// getStart()
626
627 public Long getEnd() {
628 return end;
629 }// getEnd()
630
631 public Long getId(){ return id;}
632
633 // mutator
634 public void setElemName(String anElemName) {
635 elemName = anElemName;
636 }// getElemName()
637
638 public void setFM(FeatureMap aFm) {
639 fm = aFm;
640 }// setFM();
641
642 public void setStart(Long aStart) {
643 start = aStart;
644 }// setStart();
645
646 public void setEnd(Long anEnd) {
647 end = anEnd;
648 }// setEnd();
649
650 // data fields
651 private String elemName = null;
652 private FeatureMap fm = null;
653 private Long start = null;
654 private Long end = null;
655 private Long id = null;
656
657 } // End inner class CustomObject
658
659 }//End class HtmlDocumentHandler
660
661
662
663