| DocumentImpl.java |
1 /*
2 * DocumentImpl.java
3 *
4 * Copyright (c) 1998-2004, The University of Sheffield.
5 *
6 * This file is part of GATE (see http://gate.ac.uk/), and is free
7 * software, licenced under the GNU Library General Public License,
8 * Version 2, June 1991 (in the distribution as file licence.html,
9 * and also available at http://gate.ac.uk/gate/licence.html).
10 *
11 * Hamish Cunningham, 11/Feb/2000
12 *
13 * $Id: DocumentImpl.java,v 1.131 2004/07/23 11:33:20 kalina Exp $
14 */
15
16 package gate.corpora;
17
18 import java.io.IOException;
19 import java.net.URL;
20 import java.util.*;
21
22 import gate.*;
23 import gate.annotation.AnnotationSetImpl;
24 import gate.creole.AbstractLanguageResource;
25 import gate.creole.ResourceInstantiationException;
26 import gate.event.*;
27 import gate.util.*;
28
29 /** Represents the commonalities between all sorts of documents.
30 *
31 * <H2>Editing</H2>
32 *
33 * <P>
34 * The DocumentImpl class implements the Document interface.
35 * The DocumentContentImpl class models the textual or audio-visual
36 * materials which are the source and content of Documents.
37 * The AnnotationSetImpl class supplies annotations on Documents.
38 *
39 * <P>
40 * Abbreviations:
41 *
42 * <UL>
43 * <LI>
44 * DC = DocumentContent
45 * <LI>
46 * D = Document
47 * <LI>
48 * AS = AnnotationSet
49 * </UL>
50 *
51 * <P>
52 * We add an edit method to each of these classes; for DC and AS
53 * the methods are package private; D has the public method.
54 *
55 * <PRE>
56 * void edit(Long start, Long end, DocumentContent replacement)
57 * throws InvalidOffsetException;
58 * </PRE>
59 *
60 * <P>
61 * D receives edit requests and forwards them to DC and AS.
62 * On DC, this method makes a change to the content - e.g. replacing
63 * a String range from start to end with replacement. (Deletions
64 * are catered for by having replacement = null.) D then calls
65 * AS.edit on each of its annotation sets.
66 *
67 * <P>
68 * On AS, edit calls replacement.size() (i.e. DC.size()) to
69 * figure out how long the replacement is (0 for null). It then
70 * considers annotations that terminate (start or end) in
71 * the altered or deleted range as invalid; annotations that
72 * terminate after the range have their offsets adjusted.
73 * I.e.:
74 * <UL>
75 * <LI>
76 * the nodes that pointed inside the old modified area are invalid now and
77 * will be deleted along with the connected annotations;
78 * <LI>
79 * the nodes that are before the start of the modified area remain
80 * untouched;
81 * <LI>
82 * the nodes that are after the end of the affected area will have the
83 * offset changed according to the formula below.
84 * </UL>
85 *
86 * <P>
87 * A note re. AS and annotations: annotations no longer have
88 * offsets as in the old model, they now have nodes, and nodes
89 * have offsets.
90 *
91 * <P>
92 * To implement AS.edit, we have several indices:
93 * <PRE>
94 * HashMap annotsByStartNode, annotsByEndNode;
95 * </PRE>
96 * which map node ids to annotations;
97 * <PRE>
98 * RBTreeMap nodesByOffset;
99 * </PRE>
100 * which maps offset to Nodes.
101 *
102 * <P>
103 * When we get an edit request, we traverse that part of the
104 * nodesByOffset tree representing the altered or deleted
105 * range of the DC. For each node found, we delete any annotations
106 * that terminate on the node, and then delete the node itself.
107 * We then traverse the rest of the tree, changing the offset
108 * on all remaining nodes by:
109 * <PRE>
110 * newOffset =
111 * oldOffset -
112 * (
113 * (end - start) - // size of mod
114 * ( (replacement == null) ? 0 : replacement.size() ) // size of repl
115 * );
116 * </PRE>
117 * Note that we use the same convention as e.g. java.lang.String: start
118 * offsets are inclusive; end offsets are exclusive. I.e. for string "abcd"
119 * range 1-3 = "bc". Examples, for a node with offset 4:
120 * <PRE>
121 * edit(1, 3, "BC");
122 * newOffset = 4 - ( (3 - 1) - 2 ) = 4
123 *
124 * edit(1, 3, null);
125 * newOffset = 4 - ( (3 - 1) - 0 ) = 2
126 *
127 * edit(1, 3, "BBCC");
128 * newOffset = 4 - ( (3 - 1) - 4 ) = 6
129 * </PRE>
130 */
131 public class DocumentImpl
132 extends AbstractLanguageResource implements TextualDocument, CreoleListener,
133 DatastoreListener {
134 /** Debug flag */
135 private static final boolean DEBUG = false;
136
137 /** If you set this flag to true the original content of the document will
138 * be kept in the document feature. <br>
139 * Default value is false to avoid the unnecessary waste of memory */
140 private Boolean preserveOriginalContent = new Boolean(false);
141
142 /** If you set this flag to true the repositioning information for
143 * the document will be kept in the document feature. <br>
144 * Default value is false to avoid the unnecessary waste of time and memory
145 */
146 private Boolean collectRepositioningInfo = new Boolean(false);
147
148 /**
149 * This is a variable which contains the latest crossed over annotation
150 * found during export with preserving format, i.e., toXml(annotations)
151 * method.
152 */
153 private Annotation crossedOverAnnotation = null;
154
/** Default construction: starts with an empty string as the textual
 * content and a freshly created, empty DocumentContentImpl. */
public DocumentImpl() {
  this.stringContent = "";
  this.content = new DocumentContentImpl();
} // default construction
160
161 /** Cover unpredictable Features creation */
162 public FeatureMap getFeatures() {
163 if (features == null) {
164 features = new SimpleFeatureMapImpl();
165 }
166 return features;
167 }
168
169 /** Initialise this resource, and return it. */
170 public Resource init() throws ResourceInstantiationException {
171 // set up the source URL and create the content
172 if(sourceUrl == null) {
173 if(stringContent == null) {
174 throw new ResourceInstantiationException(
175 "The sourceURL and document's content were null."
176 );
177 }
178
179 content = new DocumentContentImpl(stringContent);
180 getFeatures().put("gate.SourceURL", "created from String");
181 } else {
182 try {
183 content = new DocumentContentImpl(
184 sourceUrl, getEncoding(), sourceUrlStartOffset, sourceUrlEndOffset);
185 getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm());
186 } catch(IOException e) {
187 e.printStackTrace();
188 throw new ResourceInstantiationException("DocumentImpl.init: " + e);
189 }
190
191 if(preserveOriginalContent.booleanValue() && content != null) {
192 String originalContent = new String(
193 ((DocumentContentImpl) content).getOriginalContent());
194 getFeatures().put(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME,
195 originalContent);
196 } // if
197 }
198
199 // set up a DocumentFormat if markup unpacking required
200 if(getMarkupAware().booleanValue()) {
201 DocumentFormat docFormat =
202 DocumentFormat.getDocumentFormat(this, sourceUrl);
203 try {
204 if(docFormat != null){
205 StatusListener sListener = (StatusListener)
206 gate.gui.MainFrame.getListeners().
207 get("gate.event.StatusListener");
208 if(sListener != null) docFormat.addStatusListener(sListener);
209
210 // set the flag if true and if the document format support collecting
211 docFormat.setShouldCollectRepositioning(collectRepositioningInfo);
212
213 if(docFormat.getShouldCollectRepositioning().booleanValue()) {
214 // unpack with collectiong of repositioning information
215 RepositioningInfo info = new RepositioningInfo();
216
217 String origContent = (String) getFeatures().get(
218 GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
219
220 RepositioningInfo ampCodingInfo = new RepositioningInfo();
221 if(origContent != null) {
222 boolean shouldCorrectCR = docFormat instanceof XmlDocumentFormat;
223 collectInformationForAmpCodding(origContent, ampCodingInfo,
224 shouldCorrectCR);
225 if(docFormat instanceof HtmlDocumentFormat) {
226 collectInformationForWS(origContent, ampCodingInfo);
227 } // if
228 } // if
229
230 docFormat.unpackMarkup(this, info, ampCodingInfo);
231
232 if(origContent != null
233 && docFormat instanceof XmlDocumentFormat) {
234 // CRLF correction of RepositioningInfo
235 correctRepositioningForCRLFInXML(origContent, info);
236 } // if
237
238 getFeatures().put(
239 GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME, info);
240 }
241 else {
242 // normal old fashioned unpack
243 docFormat.unpackMarkup(this);
244 }
245 docFormat.removeStatusListener(sListener);
246 } //if format != null
247 } catch(DocumentFormatException e) {
248 throw new ResourceInstantiationException(
249 "Couldn't unpack markup in document " + sourceUrl.toExternalForm() +
250 " " + e
251 );
252 }
253 } // if markup aware
254
255 //try{
256 // FileWriter fw = new FileWriter("d:/temp/doccontent.txt");
257 // fw.write(getContent().toString());
258 // fw.flush();
259 // fw.close();
260 //}catch(IOException ioe){
261 // ioe.printStackTrace();
262 //}
263
264 return this;
265 } // init()
266
267 /**
268 * Correct repositioning information for substitution of "\r\n" with "\n"
269 */
270 private void correctRepositioningForCRLFInXML(String content,
271 RepositioningInfo info) {
272 int index = -1;
273
274 do {
275 index = content.indexOf("\r\n", index+1);
276 if(index != -1) {
277 info.correctInformationOriginalMove(index, 1);
278 } // if
279 } while(index != -1);
280 } // correctRepositioningForCRLF
281
282 /**
283 * Collect information for substitution of "&xxx;" with "y"
284 *
285 * It couldn't be collected a position information about
286 * some unicode and &-coded symbols during parsing. The parser "hide" the
287 * information about the position of such kind of parsed text.
288 * So, there is minimal chance to have &-coded symbol inside the covered by
289 * repositioning records area. The new record should be created for every
290 * coded symbol outside the existing records.
291 * <BR>
292 * If <code>shouldCorrectCR</code> flag is <code>true</code> the correction
293 * for CRLF substitution is performed.
294 */
295 private void collectInformationForAmpCodding(String content,
296 RepositioningInfo info,
297 boolean shouldCorrectCR) {
298
299 if(content == null || info == null) return;
300
301 int ampIndex = -1;
302 int semiIndex;
303
304 do {
305 ampIndex = content.indexOf('&', ampIndex+1);
306 if(ampIndex != -1) {
307 semiIndex = content.indexOf(';', ampIndex+1);
308 // have semicolon and it is near enough for amp codding
309 if(semiIndex != -1 && (semiIndex-ampIndex) < 8) {
310 info.addPositionInfo(ampIndex, semiIndex-ampIndex+1, 0, 1);
311 }
312 else {
313 // no semicolon or it is too far
314 // analyse for amp codding without semicolon
315 int maxEnd = Math.min(ampIndex+8, content.length());
316 String ampCandidate = content.substring(ampIndex, maxEnd);
317 int ampCodingSize = analyseAmpCodding(ampCandidate);
318
319 if(ampCodingSize != -1) {
320 info.addPositionInfo(ampIndex, ampCodingSize, 0, 1);
321 } // if
322
323 } // if - semicolon found
324 } // if - ampersand found
325 } while (ampIndex != -1);
326
327 // correct the collected information to adjust it's positions
328 // with reported by the parser
329 int index = -1;
330
331 if(shouldCorrectCR) {
332 do {
333 index = content.indexOf("\r\n", index+1);
334 if(index != -1) {
335 info.correctInformationOriginalMove(index, -1);
336 } // if
337 } while(index != -1);
338 } // if
339 } // collectInformationForAmpCodding
340
341 /**
342 * This function compute size of the ampersand codded sequence when
343 * semicolin is not present.
344 */
345 private int analyseAmpCodding(String content) {
346 int result = -1;
347
348 try {
349 char ch = content.charAt(1);
350
351 switch(ch) {
352 case 'l' : // <
353 case 'L' : // <
354 if(content.charAt(2) == 't' || content.charAt(2) == 'T') {
355 result = 3;
356 } // if
357 break;
358 case 'g' : // >
359 case 'G' : // >
360 if(content.charAt(2) == 't' || content.charAt(2) == 'T') {
361 result = 3;
362 } // if
363 break;
364 case 'a' : // &
365 case 'A' : // &
366 if(content.substring(2, 4).equalsIgnoreCase("mp")) {
367 result = 4;
368 } // if
369 break;
370 case 'q' : // "
371 case 'Q' : // "
372 if(content.substring(2, 5).equalsIgnoreCase("uot")) {
373 result = 5;
374 } // if
375 break;
376 case '#' : // #number (example ‘, 䰸)
377 int endIndex = 2;
378 boolean hexCoded = false;
379 if(content.charAt(2) == 'x' || content.charAt(2) == 'X') {
380 // Hex codding
381 ++endIndex;
382 hexCoded = true;
383 } // if
384
385 while (endIndex < 8
386 && isNumber(content.charAt(endIndex), hexCoded) ) {
387 ++endIndex;
388 } // while
389 result = endIndex;
390 break;
391 } // switch
392 } catch (StringIndexOutOfBoundsException ex) {
393 // do nothing
394 } // catch
395
396 return result;
397 } // analyseAmpCodding
398
399 /** Check for numeric range. If hex is true the A..F range is included */
400 private boolean isNumber(char ch, boolean hex) {
401 if(ch >= '0' && ch <= '9') return true;
402
403 if(hex) {
404 if(ch >= 'A' && ch <= 'F') return true;
405 if(ch >= 'a' && ch <= 'f') return true;
406 } // if
407
408 return false;
409 } // isNumber
410
411 /** HTML parser perform substitution of multiple whitespaces (WS) with
412 * a single WS. To create correct repositioning information structure we
413 * should keep the information for such multiple WS.
414 * <BR>
415 * The criteria for WS is <code>(ch <= ' ')</code>.
416 */
417 private void collectInformationForWS(String content, RepositioningInfo info) {
418
419 if(content == null || info == null) return;
420
421 // analyse the content and correct the repositioning information
422 char ch;
423 int startWS, endWS;
424
425 startWS = endWS = -1;
426 int contentLength = content.length();
427
428 for(int i=0; i<contentLength; ++i) {
429 ch = content.charAt(i);
430
431 // is whitespace
432 if(ch <= ' ') {
433 if(startWS == -1) {
434 startWS = i;
435 } // if
436 endWS = i;
437 }
438 else {
439 if(endWS - startWS > 0) {
440 // put the repositioning information about the WS substitution
441 info.addPositionInfo(
442 (long)startWS, (long)(endWS - startWS + 1), 0, 1);
443 } // if
444 // clear positions
445 startWS = endWS = -1;
446 }// if
447 } // for
448 } // collectInformationForWS
449
450 /** Clear all the data members of the object. */
451 public void cleanup() {
452
453 defaultAnnots = null;
454 if ( (namedAnnotSets != null) && (!namedAnnotSets.isEmpty()))
455 namedAnnotSets.clear();
456 if (DEBUG) Out.prln("Document cleanup called");
457 if (this.lrPersistentId != null)
458 Gate.getCreoleRegister().removeCreoleListener(this);
459 if(this.getDataStore() != null)
460 this.getDataStore().removeDatastoreListener(this);
461 } // cleanup()
462
463
464 /** Documents are identified by URLs */
465 public URL getSourceUrl() { return sourceUrl; }
466
467 /** Set method for the document's URL */
468 public void setSourceUrl(URL sourceUrl) {
469 this.sourceUrl = sourceUrl;
470 } // setSourceUrl
471
472 /** Documents may be packed within files; in this case an optional pair of
473 * offsets refer to the location of the document.
474 */
475 public Long[] getSourceUrlOffsets() {
476 Long[] sourceUrlOffsets = new Long[2];
477 sourceUrlOffsets[0] = sourceUrlStartOffset;
478 sourceUrlOffsets[1] = sourceUrlEndOffset;
479 return sourceUrlOffsets;
480 } // getSourceUrlOffsets
481
482 /**
483 * Allow/disallow preserving of the original document content.
484 * If is <B>true</B> the original content will be retrieved from
485 * the DocumentContent object and preserved as document feature.
486 */
487 public void setPreserveOriginalContent(Boolean b) {
488 preserveOriginalContent = b;
489 } // setPreserveOriginalContent
490
491 /** Get the preserving of content status of the Document.
492 *
493 * @return whether the Document should preserve it's original content.
494 */
495 public Boolean getPreserveOriginalContent() {
496 return preserveOriginalContent;
497 } // getPreserveOriginalContent
498
499 /**
500 * Allow/disallow collecting of repositioning information.
501 * If is <B>true</B> information will be retrieved and preserved
502 * as document feature.<BR>
503 * Preserving of repositioning information give the possibilities
504 * for converting of coordinates between the original document content and
505 * extracted from the document text.
506 */
507 public void setCollectRepositioningInfo(Boolean b) {
508 collectRepositioningInfo = b;
509 } // setCollectRepositioningInfo
510
511 /** Get the collectiong and preserving of repositioning information
512 * for the Document. <BR>
513 * Preserving of repositioning information give the possibilities
514 * for converting of coordinates between the original document content and
515 * extracted from the document text.
516 *
517 * @return whether the Document should collect and preserve information.
518 */
519 public Boolean getCollectRepositioningInfo() {
520 return collectRepositioningInfo;
521 } // getCollectRepositioningInfo
522
523 /** Documents may be packed within files; in this case an optional pair of
524 * offsets refer to the location of the document. This method gets the
525 * start offset.
526 */
527 public Long getSourceUrlStartOffset() { return sourceUrlStartOffset; }
528
529 /** Documents may be packed within files; in this case an optional pair of
530 * offsets refer to the location of the document. This method sets the
531 * start offset.
532 */
533 public void setSourceUrlStartOffset(Long sourceUrlStartOffset) {
534 this.sourceUrlStartOffset = sourceUrlStartOffset;
535 } // setSourceUrlStartOffset
536
537 /** Documents may be packed within files; in this case an optional pair of
538 * offsets refer to the location of the document. This method gets the
539 * end offset.
540 */
541 public Long getSourceUrlEndOffset() { return sourceUrlEndOffset; }
542
543 /** Documents may be packed within files; in this case an optional pair of
544 * offsets refer to the location of the document. This method sets the
545 * end offset.
546 */
547 public void setSourceUrlEndOffset(Long sourceUrlEndOffset) {
548 this.sourceUrlEndOffset = sourceUrlEndOffset;
549 } // setSourceUrlStartOffset
550
551 /** The content of the document: a String for text; MPEG for video; etc. */
552 public DocumentContent getContent() { return content; }
553
554 /** Set method for the document content */
555 public void setContent(DocumentContent content) {
556 this.content = content;
557 this.stringContent = content.toString();
558 }
559
560 /** Get the encoding of the document content source */
561 public String getEncoding() {
562 //we need to make sure we ALWAYS have an encoding
563 if(encoding == null || encoding.trim().length() == 0){
564 //no encoding definded: use the platform default
565 encoding = java.nio.charset.Charset.forName(
566 System.getProperty("file.encoding")).name();
567 }
568 return encoding;
569 }
570
571 /** Set the encoding of the document content source */
572 public void setEncoding(String encoding) { this.encoding = encoding; }
573
574 /** Get the default set of annotations. The set is created if it
575 * doesn't exist yet.
576 */
577 public AnnotationSet getAnnotations() {
578 if(defaultAnnots == null){
579 defaultAnnots = new AnnotationSetImpl(this);
580 fireAnnotationSetAdded(new DocumentEvent(
581 this, DocumentEvent.ANNOTATION_SET_ADDED, null));
582 }//if
583 return defaultAnnots;
584 } // getAnnotations()
585
586 /** Get a named set of annotations. Creates a new set if one with this
587 * name doesn't exist yet.
588 * If the provided name is null then it returns the default annotation set.
589 */
590 public AnnotationSet getAnnotations(String name) {
591 if(name == null) return getAnnotations();
592 if(namedAnnotSets == null)
593 namedAnnotSets = new HashMap();
594 AnnotationSet namedSet = (AnnotationSet) namedAnnotSets.get(name);
595
596 if(namedSet == null) {
597 namedSet = new AnnotationSetImpl(this, name);
598 namedAnnotSets.put(name, namedSet);
599
600 DocumentEvent evt = new DocumentEvent(
601 this, DocumentEvent.ANNOTATION_SET_ADDED, name
602 );
603 fireAnnotationSetAdded(evt);
604 }
605 return namedSet;
606 } // getAnnotations(name)
607
608 /** Make the document markup-aware. This will trigger the creation
609 * of a DocumentFormat object at Document initialisation time; the
610 * DocumentFormat object will unpack the markup in the Document and
611 * add it as annotations. Documents are <B>not</B> markup-aware by default.
612 *
613 * @param newMarkupAware markup awareness status.
614 */
615 public void setMarkupAware(Boolean newMarkupAware) {
616 this.markupAware = newMarkupAware;
617 }
618
619 /** Get the markup awareness status of the Document.
620 * <B>Documents are markup-aware by default.</B>
621 * @return whether the Document is markup aware.
622 */
623 public Boolean getMarkupAware() { return markupAware; }
624
625 /** Returns an XML document aming to preserve the original markups(
626 * the original markup will be in the same place and format as it was
627 * before processing the document) and include (if possible)
628 * the annotations specified in the aSourceAnnotationSet.
629 * It is equivalent to toXml(aSourceAnnotationSet, true).
630 */
631 public String toXml(Set aSourceAnnotationSet){
632 return toXml(aSourceAnnotationSet, true);
633 }
634
635 /** Returns an XML document aming to preserve the original markups(
636 * the original markup will be in the same place and format as it was
637 * before processing the document) and include (if possible)
638 * the annotations specified in the aSourceAnnotationSet.
639 * <b>Warning:</b> Annotations from the aSourceAnnotationSet will be lost
640 * if they will cause a crosed over situation.
641 * @param aSourceAnnotationSet is an annotation set containing all the
642 * annotations that will be combined with the original marup set. If the
643 * param is <code>null</code> it will only dump the original markups.
644 * @param includeFeatures is a boolean that controls whether the annotation
645 * features should be included or not. If false, only the annotation type
646 * is included in the tag.
647 * @return a string representing an XML document containing the original
648 * markup + dumped annotations form the aSourceAnnotationSet
649 */
650 public String toXml(Set aSourceAnnotationSet, boolean includeFeatures){
651
652 if(hasOriginalContentFeatures()) {
653 return saveAnnotationSetAsXmlInOrig(aSourceAnnotationSet,includeFeatures);
654 } // if
655
656 AnnotationSet originalMarkupsAnnotSet =
657 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
658
659 // Create a dumping annotation set on the document. It will be used for
660 // dumping annotations...
661 // AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this);
662 List dumpingList = new ArrayList(originalMarkupsAnnotSet.size());
663
664 // This set will be constructed inside this method. If is not empty, the
665 // annotation contained will be lost.
666 /* if (!dumpingSet.isEmpty()){
667 Out.prln("WARNING: The dumping annotation set was not empty."+
668 "All annotation it contained were lost.");
669 dumpingSet.clear();
670 }// End if
671 */
672 StatusListener sListener = (StatusListener)
673 gate.gui.MainFrame.getListeners().
674 get("gate.event.StatusListener");
675 // Construct the dumping set in that way that all annotations will verify
676 // the condition that there are not annotations which are crossed.
677 // First add all annotation from the original markups
678 if(sListener != null)
679 sListener.statusChanged("Constructing the dumping annotation set.");
680 // dumpingSet.addAll(originalMarkupsAnnotSet);
681 dumpingList.addAll(originalMarkupsAnnotSet);
682 // Then take all the annotations from aSourceAnnotationSet and verify if
683 // they can be inserted safely into the dumpingSet. Where not possible,
684 // report.
685 if (aSourceAnnotationSet != null){
686 Iterator iter = aSourceAnnotationSet.iterator();
687 while (iter.hasNext()){
688 Annotation currentAnnot = (Annotation) iter.next();
689 if(insertsSafety(dumpingList,currentAnnot)){
690 // dumpingSet.add(currentAnnot);
691 dumpingList.add(currentAnnot);
692 }else if (crossedOverAnnotation != null && DEBUG){
693 try {
694 Out.prln("Warning: Annotations were found to violate the " +
695 "crossed over condition: \n" +
696 "1. [" +
697 getContent().getContent(
698 crossedOverAnnotation.getStartNode().getOffset(),
699 crossedOverAnnotation.getEndNode().getOffset()) +
700 " (" + crossedOverAnnotation.getType() + ": " +
701 crossedOverAnnotation.getStartNode().getOffset() +
702 ";" + crossedOverAnnotation.getEndNode().getOffset() +
703 ")]\n" +
704 "2. [" +
705 getContent().getContent(
706 currentAnnot.getStartNode().getOffset(),
707 currentAnnot.getEndNode().getOffset()) +
708 " (" + currentAnnot.getType() + ": " +
709 currentAnnot.getStartNode().getOffset() +
710 ";" + currentAnnot.getEndNode().getOffset() +
711 ")]\nThe second one will be discarded.\n" );
712 } catch (gate.util.InvalidOffsetException ex) {
713 throw new GateRuntimeException(ex.getMessage());
714 }
715 }// End if
716 }// End while
717 }// End if
718
719 //kalina: order the dumping list by start offset
720 Collections.sort(dumpingList, new gate.util.OffsetComparator());
721
722 // The dumpingSet is ready to be exported as XML
723 // Here we go.
724 if(sListener != null) sListener.statusChanged("Dumping annotations as XML");
725 StringBuffer xmlDoc = new StringBuffer(
726 DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
727
728 // Add xml header if original format was xml
729 String mimeType = getFeatures() == null ?
730 null :
731 (String)getFeatures().get("MimeType");
732 boolean wasXML = mimeType != null && mimeType.equalsIgnoreCase("text/xml");
733
734 if(wasXML){
735 xmlDoc.append("<?xml version=\"1.0\" encoding=\"");
736 xmlDoc.append(getEncoding());
737 xmlDoc.append("\" ?>");
738 xmlDoc.append(Strings.getNl());
739 }// ENd if
740 // Identify and extract the root annotation from the dumpingSet.
741 theRootAnnotation = identifyTheRootAnnotation(dumpingList);
742 // If a root annotation has been identified then add it eplicitley at the
743 // beginning of the document
744 if (theRootAnnotation != null){
745 dumpingList.remove(theRootAnnotation);
746 xmlDoc.append(writeStartTag(theRootAnnotation,includeFeatures));
747 }// End if
748 // Construct and append the rest of the document
749 xmlDoc.append(saveAnnotationSetAsXml(dumpingList, includeFeatures));
750 // If a root annotation has been identified then add it eplicitley at the
751 // end of the document
752 if (theRootAnnotation != null){
753 xmlDoc.append(writeEndTag(theRootAnnotation));
754 }// End if
755
756 if(sListener != null) sListener.statusChanged("Done.");
757 return xmlDoc.toString();
758 }//End toXml()
759
760 /** This method verifies if aSourceAnnotation can ve inserted safety into the
761 * aTargetAnnotSet. Safety means that it doesn't violate the crossed over
762 * contition with any annotation from the aTargetAnnotSet.
763 * @param aTargetAnnotSet the annotation set to include the aSourceAnnotation
764 * @param aSourceAnnotation the annotation to be inserted into the
765 * aTargetAnnotSet
766 * @return true if the annotation inserts safety, or false otherwise.
767 */
768 private boolean insertsSafety(AnnotationSet aTargetAnnotSet,
769 Annotation aSourceAnnotation){
770
771 if (aTargetAnnotSet == null || aSourceAnnotation == null) {
772 this.crossedOverAnnotation = null;
773 return false;
774 }
775 if (aSourceAnnotation.getStartNode() == null ||
776 aSourceAnnotation.getStartNode().getOffset()== null) {
777 this.crossedOverAnnotation = null;
778 return false;
779 }
780 if (aSourceAnnotation.getEndNode() == null ||
781 aSourceAnnotation.getEndNode().getOffset()== null) {
782 this.crossedOverAnnotation = null;
783 return false;
784 }
785
786 // Get the start and end offsets
787 Long start = aSourceAnnotation.getStartNode().getOffset();
788 Long end = aSourceAnnotation.getEndNode().getOffset();
789 // Read aSourceAnnotation offsets long
790 long s2 = start.longValue();
791 long e2 = end.longValue();
792
793 // Obtain a set with all annotations annotations that overlap
794 // totaly or partially with the interval defined by the two provided offsets
795 AnnotationSet as = aTargetAnnotSet.get(start,end);
796
797 // Investigate all the annotations from as to see if there is one that
798 // comes in conflict with aSourceAnnotation
799 Iterator it = as.iterator();
800 while(it.hasNext()){
801 Annotation ann = (Annotation) it.next();
802 // Read ann offsets
803 long s1 = ann.getStartNode().getOffset().longValue();
804 long e1 = ann.getEndNode().getOffset().longValue();
805
806 if (s1<s2 && s2<e1 && e1<e2) {
807 this.crossedOverAnnotation = ann;
808 return false;
809 }
810 if (s2<s1 && s1<e2 && e2<e1) {
811 this.crossedOverAnnotation = ann;
812 return false;
813 }
814 }// End while
815 return true;
816 }// insertsSafety()
817
818 private boolean insertsSafety(List aTargetAnnotList,
819 Annotation aSourceAnnotation){
820
821 if (aTargetAnnotList == null || aSourceAnnotation == null) {
822 this.crossedOverAnnotation = null;
823 return false;
824 }
825 if (aSourceAnnotation.getStartNode() == null ||
826 aSourceAnnotation.getStartNode().getOffset()== null) {
827 this.crossedOverAnnotation = null;
828 return false;
829 }
830 if (aSourceAnnotation.getEndNode() == null ||
831 aSourceAnnotation.getEndNode().getOffset()== null) {
832 this.crossedOverAnnotation = null;
833 return false;
834 }
835
836 // Get the start and end offsets
837 Long start = aSourceAnnotation.getStartNode().getOffset();
838 Long end = aSourceAnnotation.getEndNode().getOffset();
839 // Read aSourceAnnotation offsets long
840 long s2 = start.longValue();
841 long e2 = end.longValue();
842
843 // Obtain a set with all annotations annotations that overlap
844 // totaly or partially with the interval defined by the two provided offsets
845 List as = new ArrayList();
846 for (int i=0; i < aTargetAnnotList.size(); i++) {
847 Annotation annot = (Annotation) aTargetAnnotList.get(i);
848 if (annot.getStartNode().getOffset().longValue() >= s2
849 &&
850 annot.getStartNode().getOffset().longValue() <= e2)
851 as.add(annot);
852 else if (annot.getEndNode().getOffset().longValue() >= s2
853 &&
854 annot.getEndNode().getOffset().longValue() <= e2)
855 as.add(annot);
856 }
857
858 // Investigate all the annotations from as to see if there is one that
859 // comes in conflict with aSourceAnnotation
860 Iterator it = as.iterator();
861 while(it.hasNext()){
862 Annotation ann = (Annotation) it.next();
863 // Read ann offsets
864 long s1 = ann.getStartNode().getOffset().longValue();
865 long e1 = ann.getEndNode().getOffset().longValue();
866
867 if (s1<s2 && s2<e1 && e1<e2) {
868 this.crossedOverAnnotation = ann;
869 return false;
870 }
871 if (s2<s1 && s1<e2 && e2<e1) {
872 this.crossedOverAnnotation = ann;
873 return false;
874 }
875 }// End while
876 return true;
877 }// insertsSafety()
878
879 /** This method saves all the annotations from aDumpAnnotSet and combines
880 * them with the document content.
881 * @param aDumpAnnotSet is a GATE annotation set prepared to be used
882 * on the raw text from document content. If aDumpAnnotSet is <b>null<b>
883 * then an empty string will be returned.
884 * @param includeFeatures is a boolean, which controls whether the annotation
885 * features and gate ID are included or not.
886 * @return The XML document obtained from raw text + the information from
887 * the dump annotation set.
888 */
889 private String saveAnnotationSetAsXml(AnnotationSet aDumpAnnotSet,
890 boolean includeFeatures){
891 String content = null;
892 if (this.getContent()== null)
893 content = new String("");
894 else
895 content = this.getContent().toString();
896 StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
897 if (aDumpAnnotSet == null) return docContStrBuff.toString();
898
899 TreeMap offsets2CharsMap = new TreeMap();
900 if (this.getContent().size().longValue() != 0){
901 // Fill the offsets2CharsMap with all the indices where
902 // special chars appear
903 buildEntityMapFromString(content,offsets2CharsMap);
904 }//End if
905 // The saving alghorithm is as follows:
906 ///////////////////////////////////////////
907 // Construct a set of annot with all IDs in asc order.
908 // All annotations that end at that offset swap their place in descending
909 // order. For each node write all the tags from left to right.
910
911 // Construct the node set
912 TreeSet offsets = new TreeSet();
913 Iterator iter = aDumpAnnotSet.iterator();
914 while (iter.hasNext()){
915 Annotation annot = (Annotation) iter.next();
916 offsets.add(annot.getStartNode().getOffset());
917 offsets.add(annot.getEndNode().getOffset());
918 }// End while
919
920 // ofsets is sorted in ascending order.
921 // Iterate this set in descending order and remove an offset at each
922 // iteration
923 while (!offsets.isEmpty()){
924 Long offset = (Long)offsets.last();
925 // Remove the offset from the set
926 offsets.remove(offset);
927 // Now, use it.
928 // Returns a list with annotations that needs to be serialized in that
929 // offset.
930 List annotations = getAnnotationsForOffset(aDumpAnnotSet,offset);
931 // Attention: the annotation are serialized from left to right
932 // StringBuffer tmpBuff = new StringBuffer("");
933 StringBuffer tmpBuff = new StringBuffer(
934 DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
935 Stack stack = new Stack();
936 // Iterate through all these annotations and serialize them
937 Iterator it = annotations.iterator();
938 while(it.hasNext()){
939 Annotation a = (Annotation) it.next();
940 it.remove();
941 // Test if a Ends at offset
942 if ( offset.equals(a.getEndNode().getOffset()) ){
943 // Test if a Starts at offset
944 if ( offset.equals(a.getStartNode().getOffset()) ){
945 // Here, the annotation a Starts and Ends at the offset
946 if ( null != a.getFeatures().get("isEmptyAndSpan") &&
947 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
948
949 // Assert: annotation a with start == end and isEmptyAndSpan
950 tmpBuff.append(writeStartTag(a, includeFeatures));
951 stack.push(a);
952 }else{
953 // Assert annotation a with start == end and an empty tag
954 tmpBuff.append(writeEmptyTag(a));
955 // The annotation is removed from dumped set
956 aDumpAnnotSet.remove(a);
957 }// End if
958 }else{
959 // Here the annotation a Ends at the offset.
960 // In this case empty the stack and write the end tag
961 if (!stack.isEmpty()){
962 while(!stack.isEmpty()){
963 Annotation a1 = (Annotation)stack.pop();
964 tmpBuff.append(writeEndTag(a1));
965 }// End while
966 }// End if
967 tmpBuff.append(writeEndTag(a));
968 }// End if
969 }else{
970 // The annotation a does NOT end at the offset. Let's see if it starts
971 // at the offset
972 if ( offset.equals(a.getStartNode().getOffset()) ){
973 // The annotation a starts at the offset.
974 // In this case empty the stack and write the end tag
975 if (!stack.isEmpty()){
976 while(!stack.isEmpty()){
977 Annotation a1 = (Annotation)stack.pop();
978 tmpBuff.append(writeEndTag(a1));
979 }// End while
980 }// End if
981 tmpBuff.append(writeStartTag(a, includeFeatures));
982 // The annotation is removed from dumped set
983 aDumpAnnotSet.remove(a);
984 }// End if ( offset.equals(a.getStartNode().getOffset()) )
985 }// End if ( offset.equals(a.getEndNode().getOffset()) )
986 }// End while(it.hasNext()){
987
988 // In this case empty the stack and write the end tag
989 if (!stack.isEmpty()){
990 while(!stack.isEmpty()){
991 Annotation a1 = (Annotation)stack.pop();
992 tmpBuff.append(writeEndTag(a1));
993 }// End while
994 }// End if
995
996 // Before inserting tmpBuff into docContStrBuff we need to check
997 // if there are chars to be replaced and if there are, they would be
998 // replaced.
999 if (!offsets2CharsMap.isEmpty()){
1000 Long offsChar = (Long) offsets2CharsMap.lastKey();
1001 while( !offsets2CharsMap.isEmpty() &&
1002 offsChar.intValue() >= offset.intValue()){
1003 // Replace the char at offsChar with its corresponding entity form
1004 // the entitiesMap.
1005 docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
1006 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1007 // Discard the offsChar after it was used.
1008 offsets2CharsMap.remove(offsChar);
1009 // Investigate next offsChar
1010 if (!offsets2CharsMap.isEmpty())
1011 offsChar = (Long) offsets2CharsMap.lastKey();
1012 }// End while
1013 }// End if
1014 // Insert tmpBuff to the location where it belongs in docContStrBuff
1015 docContStrBuff.insert(offset.intValue(),tmpBuff.toString());
1016 }// End while(!offsets.isEmpty())
1017 // Need to replace the entities in the remaining text, if there is any text
1018 // So, if there are any more items in offsets2CharsMap they need to be
1019 // replaced
1020 while (!offsets2CharsMap.isEmpty()){
1021 Long offsChar = (Long) offsets2CharsMap.lastKey();
1022 // Replace the char with its entity
1023 docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
1024 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1025 // remove the offset from the map
1026 offsets2CharsMap.remove(offsChar);
1027 }// End while
1028 return docContStrBuff.toString();
1029 }// saveAnnotationSetAsXml()
1030
1031 private String saveAnnotationSetAsXml(List aDumpAnnotList,
1032 boolean includeFeatures){
1033 String content = null;
1034 if (this.getContent()== null)
1035 content = new String("");
1036 else
1037 content = this.getContent().toString();
1038 StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
1039 if (aDumpAnnotList == null) return docContStrBuff.toString();
1040
1041 StringBuffer resultStrBuff = new StringBuffer(
1042 DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
1043 // last offset position used to extract portions of text
1044 Long lastOffset = new Long(0);
1045
1046 TreeMap offsets2CharsMap = new TreeMap();
1047 HashMap annotsForOffset = new HashMap(100);
1048 if (this.getContent().size().longValue() != 0){
1049 // Fill the offsets2CharsMap with all the indices where
1050 // special chars appear
1051 buildEntityMapFromString(content,offsets2CharsMap);
1052 }//End if
1053 // The saving alghorithm is as follows:
1054 ///////////////////////////////////////////
1055 // Construct a set of annot with all IDs in asc order.
1056 // All annotations that end at that offset swap their place in descending
1057 // order. For each node write all the tags from left to right.
1058
1059 // Construct the node set
1060 TreeSet offsets = new TreeSet();
1061 Iterator iter = aDumpAnnotList.iterator();
1062 Annotation annot;
1063 Long start;
1064 Long end;
1065 while (iter.hasNext()){
1066 annot = (Annotation) iter.next();
1067 start = annot.getStartNode().getOffset();
1068 end = annot.getEndNode().getOffset();
1069 offsets.add(start);
1070 offsets.add(end);
1071 if (annotsForOffset.containsKey(start)) {
1072 ((List) annotsForOffset.get(start)).add(annot);
1073 } else {
1074 List newList = new ArrayList(10);
1075 newList.add(annot);
1076 annotsForOffset.put(start, newList);
1077 }
1078 if (annotsForOffset.containsKey(end)) {
1079 ((List) annotsForOffset.get(end)).add(annot);
1080 } else {
1081 List newList = new ArrayList(10);
1082 newList.add(annot);
1083 annotsForOffset.put(end, newList);
1084 }
1085 }// End while
1086
1087 // ofsets is sorted in ascending order.
1088 // Iterate this set in descending order and remove an offset at each
1089 // iteration
1090 Iterator offsetIt = offsets.iterator();
1091 Long offset;
1092 List annotations;
1093 // This don't have to be a large buffer - just for tags
1094 StringBuffer tmpBuff = new StringBuffer(255);
1095 Stack stack = new Stack();
1096 while (offsetIt.hasNext()){
1097 offset = (Long)offsetIt.next();
1098 // Now, use it.
1099 // Returns a list with annotations that needs to be serialized in that
1100 // offset.
1101 annotations = (List) annotsForOffset.get(offset);
1102 // order annotations in list for offset to print tags in correct order
1103 annotations = getAnnotationsForOffset(annotations, offset);
1104 // clear structures
1105 tmpBuff.setLength(0);
1106 stack.clear();
1107
1108 // Iterate through all these annotations and serialize them
1109 Iterator it = annotations.iterator();
1110 Annotation a;
1111 Annotation annStack;
1112 while(it.hasNext()){
1113 a = (Annotation) it.next();
1114 // Test if a Ends at offset
1115 if ( offset.equals(a.getEndNode().getOffset()) ){
1116 // Test if a Starts at offset
1117 if ( offset.equals(a.getStartNode().getOffset()) ){
1118 // Here, the annotation a Starts and Ends at the offset
1119 if ( null != a.getFeatures().get("isEmptyAndSpan") &&
1120 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
1121
1122 // Assert: annotation a with start == end and isEmptyAndSpan
1123 tmpBuff.append(writeStartTag(a, includeFeatures));
1124 stack.push(a);
1125 }else{
1126 // Assert annotation a with start == end and an empty tag
1127 tmpBuff.append(writeEmptyTag(a));
1128 // The annotation is removed from dumped set
1129 aDumpAnnotList.remove(a);
1130 }// End if
1131 }else{
1132 // Here the annotation a Ends at the offset.
1133 // In this case empty the stack and write the end tag
1134 if (!stack.isEmpty()){
1135 while(!stack.isEmpty()){
1136 annStack = (Annotation)stack.pop();
1137 tmpBuff.append(writeEndTag(annStack));
1138 }// End while
1139 }// End if
1140 tmpBuff.append(writeEndTag(a));
1141 }// End if
1142 }else{
1143 // The annotation a does NOT end at the offset. Let's see if it starts
1144 // at the offset
1145 if ( offset.equals(a.getStartNode().getOffset()) ){
1146 // The annotation a starts at the offset.
1147 // In this case empty the stack and write the end tag
1148 if (!stack.isEmpty()){
1149 while(!stack.isEmpty()){
1150 annStack = (Annotation)stack.pop();
1151 tmpBuff.append(writeEndTag(annStack));
1152 }// End while
1153 }// End if
1154 tmpBuff.append(writeStartTag(a, includeFeatures));
1155 // The annotation is removed from dumped set
1156 }// End if ( offset.equals(a.getStartNode().getOffset()) )
1157 }// End if ( offset.equals(a.getEndNode().getOffset()) )
1158 }// End while(it.hasNext()){
1159
1160 // In this case empty the stack and write the end tag
1161 if (!stack.isEmpty()){
1162 while(!stack.isEmpty()){
1163 annStack = (Annotation)stack.pop();
1164 tmpBuff.append(writeEndTag(annStack));
1165 }// End while
1166 }// End if
1167
1168 // extract text from content and replace spec chars
1169 StringBuffer partText = new StringBuffer();
1170 SortedMap offsetsInRange =
1171 offsets2CharsMap.subMap(lastOffset, offset);
1172 Long tmpOffset;
1173 Long tmpLastOffset = lastOffset;
1174 String replacement;
1175
1176 // Before inserting tmpBuff into the buffer we need to check
1177 // if there are chars to be replaced in range
1178 if(!offsetsInRange.isEmpty()) {
1179 tmpOffset = (Long) offsetsInRange.firstKey();
1180 replacement =
1181 (String)entitiesMap.get((Character)offsets2CharsMap.get(tmpOffset));
1182 partText.append(docContStrBuff.substring(tmpLastOffset.intValue(),
1183 tmpOffset.intValue()));
1184 partText.append(replacement);
1185 tmpLastOffset = new Long(tmpOffset.longValue()+1);
1186 }
1187 partText.append(docContStrBuff.substring(tmpLastOffset.intValue(),
1188 offset.intValue()));
1189 resultStrBuff.append(partText);
1190 // Insert tmpBuff to the result string
1191 resultStrBuff.append(tmpBuff.toString());
1192 lastOffset = offset;
1193 }// End while(!offsets.isEmpty())
1194
1195 // get text to the end of content
1196 // extract text from content and replace spec chars
1197 StringBuffer partText = new StringBuffer();
1198 SortedMap offsetsInRange =
1199 offsets2CharsMap.subMap(lastOffset, new Long(docContStrBuff.length()));
1200 Long tmpOffset;
1201 Long tmpLastOffset = lastOffset;
1202 String replacement;
1203
1204 // Need to replace the entities in the remaining text, if there is any text
1205 // So, if there are any more items in offsets2CharsMap for remaining text
1206 // they need to be replaced
1207 if(!offsetsInRange.isEmpty()) {
1208 tmpOffset = (Long) offsetsInRange.firstKey();
1209 replacement =
1210 (String)entitiesMap.get((Character)offsets2CharsMap.get(tmpOffset));
1211 partText.append(docContStrBuff.substring(tmpLastOffset.intValue(),
1212 tmpOffset.intValue()));
1213 partText.append(replacement);
1214 tmpLastOffset = new Long(tmpOffset.longValue()+1);
1215 }
1216 partText.append(docContStrBuff.substring(tmpLastOffset.intValue(),
1217 docContStrBuff.length()));
1218 resultStrBuff.append(partText);
1219
1220 return resultStrBuff.toString();
1221 }// saveAnnotationSetAsXml()
1222
1223/* Old method created by Cristian. Create content backward.
1224
1225 private String saveAnnotationSetAsXml(List aDumpAnnotList,
1226 boolean includeFeatures){
1227 String content = null;
1228 if (this.getContent()== null)
1229 content = new String("");
1230 else
1231 content = this.getContent().toString();
1232 StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
1233 if (aDumpAnnotList == null) return docContStrBuff.toString();
1234
1235 TreeMap offsets2CharsMap = new TreeMap();
1236 HashMap annotsForOffset = new HashMap(100);
1237 if (this.getContent().size().longValue() != 0){
1238 // Fill the offsets2CharsMap with all the indices where
1239 // special chars appear
1240 buildEntityMapFromString(content,offsets2CharsMap);
1241 }//End if
1242 // The saving alghorithm is as follows:
1243 ///////////////////////////////////////////
1244 // Construct a set of annot with all IDs in asc order.
1245 // All annotations that end at that offset swap their place in descending
1246 // order. For each node write all the tags from left to right.
1247
1248 // Construct the node set
1249 TreeSet offsets = new TreeSet();
1250 Iterator iter = aDumpAnnotList.iterator();
1251 while (iter.hasNext()){
1252 Annotation annot = (Annotation) iter.next();
1253 offsets.add(annot.getStartNode().getOffset());
1254 offsets.add(annot.getEndNode().getOffset());
1255 if (annotsForOffset.containsKey(annot.getStartNode().getOffset())) {
1256 ((List) annotsForOffset.get(annot.getStartNode().getOffset())).add(annot);
1257 } else {
1258 List newList = new ArrayList(10);
1259 newList.add(annot);
1260 annotsForOffset.put(annot.getStartNode().getOffset(), newList);
1261 }
1262 if (annotsForOffset.containsKey(annot.getEndNode().getOffset())) {
1263 ((List) annotsForOffset.get(annot.getEndNode().getOffset())).add(annot);
1264 } else {
1265 List newList = new ArrayList(10);
1266 newList.add(annot);
1267 annotsForOffset.put(annot.getEndNode().getOffset(), newList);
1268 }
1269 }// End while
1270
1271 // ofsets is sorted in ascending order.
1272 // Iterate this set in descending order and remove an offset at each
1273 // iteration
1274 while (!offsets.isEmpty()){
1275 Long offset = (Long)offsets.last();
1276 // Remove the offset from the set
1277 offsets.remove(offset);
1278 // Now, use it.
1279 // Returns a list with annotations that needs to be serialized in that
1280 // offset.
1281// List annotations = getAnnotationsForOffset(aDumpAnnotList,offset);
1282 List annotations = (List) annotsForOffset.get(offset);
1283 annotations = getAnnotationsForOffset(annotations,offset);
1284 // Attention: the annotation are serialized from left to right
1285// StringBuffer tmpBuff = new StringBuffer("");
1286 StringBuffer tmpBuff = new StringBuffer(
1287 DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
1288 Stack stack = new Stack();
1289 // Iterate through all these annotations and serialize them
1290 Iterator it = annotations.iterator();
1291 while(it.hasNext()){
1292 Annotation a = (Annotation) it.next();
1293 it.remove();
1294 // Test if a Ends at offset
1295 if ( offset.equals(a.getEndNode().getOffset()) ){
1296 // Test if a Starts at offset
1297 if ( offset.equals(a.getStartNode().getOffset()) ){
1298 // Here, the annotation a Starts and Ends at the offset
1299 if ( null != a.getFeatures().get("isEmptyAndSpan") &&
1300 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
1301
1302 // Assert: annotation a with start == end and isEmptyAndSpan
1303 tmpBuff.append(writeStartTag(a, includeFeatures));
1304 stack.push(a);
1305 }else{
1306 // Assert annotation a with start == end and an empty tag
1307 tmpBuff.append(writeEmptyTag(a));
1308 // The annotation is removed from dumped set
1309 aDumpAnnotList.remove(a);
1310 }// End if
1311 }else{
1312 // Here the annotation a Ends at the offset.
1313 // In this case empty the stack and write the end tag
1314 if (!stack.isEmpty()){
1315 while(!stack.isEmpty()){
1316 Annotation a1 = (Annotation)stack.pop();
1317 tmpBuff.append(writeEndTag(a1));
1318 }// End while
1319 }// End if
1320 tmpBuff.append(writeEndTag(a));
1321 }// End if
1322 }else{
1323 // The annotation a does NOT end at the offset. Let's see if it starts
1324 // at the offset
1325 if ( offset.equals(a.getStartNode().getOffset()) ){
1326 // The annotation a starts at the offset.
1327 // In this case empty the stack and write the end tag
1328 if (!stack.isEmpty()){
1329 while(!stack.isEmpty()){
1330 Annotation a1 = (Annotation)stack.pop();
1331 tmpBuff.append(writeEndTag(a1));
1332 }// End while
1333 }// End if
1334 tmpBuff.append(writeStartTag(a, includeFeatures));
1335 // The annotation is removed from dumped set
1336 aDumpAnnotList.remove(a);
1337 }// End if ( offset.equals(a.getStartNode().getOffset()) )
1338 }// End if ( offset.equals(a.getEndNode().getOffset()) )
1339 }// End while(it.hasNext()){
1340
1341 // In this case empty the stack and write the end tag
1342 if (!stack.isEmpty()){
1343 while(!stack.isEmpty()){
1344 Annotation a1 = (Annotation)stack.pop();
1345 tmpBuff.append(writeEndTag(a1));
1346 }// End while
1347 }// End if
1348
1349 // Before inserting tmpBuff into docContStrBuff we need to check
1350 // if there are chars to be replaced and if there are, they would be
1351 // replaced.
1352 if (!offsets2CharsMap.isEmpty()){
1353 Long offsChar = (Long) offsets2CharsMap.lastKey();
1354 while( !offsets2CharsMap.isEmpty() &&
1355 offsChar.intValue() >= offset.intValue()){
1356 // Replace the char at offsChar with its corresponding entity form
1357 // the entitiesMap.
1358 docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
1359 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1360 // Discard the offsChar after it was used.
1361 offsets2CharsMap.remove(offsChar);
1362 // Investigate next offsChar
1363 if (!offsets2CharsMap.isEmpty())
1364 offsChar = (Long) offsets2CharsMap.lastKey();
1365 }// End while
1366 }// End if
1367 // Insert tmpBuff to the location where it belongs in docContStrBuff
1368 docContStrBuff.insert(offset.intValue(),tmpBuff.toString());
1369 }// End while(!offsets.isEmpty())
1370 // Need to replace the entities in the remaining text, if there is any text
1371 // So, if there are any more items in offsets2CharsMap they need to be
1372 // replaced
1373 while (!offsets2CharsMap.isEmpty()){
1374 Long offsChar = (Long) offsets2CharsMap.lastKey();
1375 // Replace the char with its entity
1376 docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
1377 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1378 // remove the offset from the map
1379 offsets2CharsMap.remove(offsChar);
1380 }// End while
1381 return docContStrBuff.toString();
1382 }// saveAnnotationSetAsXml()
1383*/
1384
1385 /**
1386 * Return true only if the document has features for original content and
1387 * repositioning information.
1388 */
1389 private boolean hasOriginalContentFeatures() {
1390 FeatureMap features = getFeatures();
1391 boolean result = false;
1392
1393 result =
1394 (features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME) != null)
1395 &&
1396 (features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME)
1397 != null);
1398
1399 return result;
1400 } // hasOriginalContentFeatures
1401
1402 /** This method saves all the annotations from aDumpAnnotSet and combines
1403 * them with the original document content, if preserved as feature.
1404 * @param aSourceAnnotationSet is a GATE annotation set prepared to be used
1405 * on the raw text from document content. If aDumpAnnotSet is <b>null<b>
1406 * then an empty string will be returned.
1407 * @param includeFeatures is a boolean, which controls whether the annotation
1408 * features and gate ID are included or not.
1409 * @return The XML document obtained from raw text + the information from
1410 * the dump annotation set.
1411 */
1412 private String saveAnnotationSetAsXmlInOrig(Set aSourceAnnotationSet,
1413 boolean includeFeatures){
1414 StringBuffer docContStrBuff;
1415
1416 String origContent;
1417
1418 origContent =
1419 (String)features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
1420 if(origContent == null) {
1421 origContent = "";
1422 } // if
1423
1424 long originalContentSize = origContent.length();
1425
1426 RepositioningInfo repositioning = (RepositioningInfo)
1427 getFeatures().get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);
1428
1429 docContStrBuff = new StringBuffer(origContent);
1430 if (aSourceAnnotationSet == null) return docContStrBuff.toString();
1431
1432 StatusListener sListener = (StatusListener)
1433 gate.gui.MainFrame.getListeners().
1434 get("gate.event.StatusListener");
1435
1436 AnnotationSet originalMarkupsAnnotSet =
1437 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1438 // Create a dumping annotation set on the document. It will be used for
1439 // dumping annotations...
1440 AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this);
1441 if(sListener != null)
1442 sListener.statusChanged("Constructing the dumping annotation set.");
1443 // Then take all the annotations from aSourceAnnotationSet and verify if
1444 // they can be inserted safely into the dumpingSet. Where not possible,
1445 // report.
1446 if (aSourceAnnotationSet != null){
1447 Iterator iter = aSourceAnnotationSet.iterator();
1448 Annotation currentAnnot;
1449 while (iter.hasNext()){
1450 currentAnnot = (Annotation) iter.next();
1451 if(insertsSafety(originalMarkupsAnnotSet, currentAnnot)
1452 && insertsSafety(dumpingSet, currentAnnot)){
1453 dumpingSet.add(currentAnnot);
1454 }else{
1455 Out.prln("Warning: Annotation with ID=" + currentAnnot.getId() +
1456 ", startOffset=" + currentAnnot.getStartNode().getOffset() +
1457 ", endOffset=" + currentAnnot.getEndNode().getOffset() +
1458 ", type=" + currentAnnot.getType()+ " was found to violate the" +
1459 " crossed over condition. It will be discarded");
1460 }// End if
1461 }// End while
1462 }// End if
1463
1464 // The dumpingSet is ready to be exported as XML
1465 // Here we go.
1466 if(sListener != null) sListener.statusChanged("Dumping annotations as XML");
1467
1468 ///////////////////////////////////////////
1469 // Construct a set of annot with all IDs in asc order.
1470 // All annotations that end at that offset swap their place in descending
1471 // order. For each node write all the tags from left to right.
1472
1473 // Construct the node set
1474 TreeSet offsets = new TreeSet();
1475 Iterator iter = aSourceAnnotationSet.iterator();
1476 while (iter.hasNext()){
1477 Annotation annot = (Annotation) iter.next();
1478 offsets.add(annot.getStartNode().getOffset());
1479 offsets.add(annot.getEndNode().getOffset());
1480 }// End while
1481
1482 // ofsets is sorted in ascending order.
1483 // Iterate this set in descending order and remove an offset at each
1484 // iteration
1485 while (!offsets.isEmpty()){
1486 Long offset = (Long)offsets.last();
1487 // Remove the offset from the set
1488 offsets.remove(offset);
1489 // Now, use it.
1490 // Returns a list with annotations that needs to be serialized in that
1491 // offset.
1492 List annotations = getAnnotationsForOffset(aSourceAnnotationSet,offset);
1493 // Attention: the annotation are serialized from left to right
1494 StringBuffer tmpBuff = new StringBuffer("");
1495 Stack stack = new Stack();
1496 // Iterate through all these annotations and serialize them
1497 Iterator it = annotations.iterator();
1498 Annotation a = null;
1499 while(it.hasNext()) {
1500 a = (Annotation) it.next();
1501 it.remove();
1502 // Test if a Ends at offset
1503 if ( offset.equals(a.getEndNode().getOffset()) ){
1504 // Test if a Starts at offset
1505 if ( offset.equals(a.getStartNode().getOffset()) ){
1506 // Here, the annotation a Starts and Ends at the offset
1507 if ( null != a.getFeatures().get("isEmptyAndSpan") &&
1508 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
1509
1510 // Assert: annotation a with start == end and isEmptyAndSpan
1511 tmpBuff.append(writeStartTag(a, includeFeatures, false));
1512 stack.push(a);
1513 }else{
1514 // Assert annotation a with start == end and an empty tag
1515 tmpBuff.append(writeEmptyTag(a, false));
1516 // The annotation is removed from dumped set
1517 aSourceAnnotationSet.remove(a);
1518 }// End if
1519 }else{
1520 // Here the annotation a Ends at the offset.
1521 // In this case empty the stack and write the end tag
1522 while(!stack.isEmpty()){
1523 Annotation a1 = (Annotation)stack.pop();
1524 tmpBuff.append(writeEndTag(a1));
1525 }// End while
1526 tmpBuff.append(writeEndTag(a));
1527 }// End if
1528 }else{
1529 // The annotation a does NOT end at the offset. Let's see if it starts
1530 // at the offset
1531 if ( offset.equals(a.getStartNode().getOffset()) ){
1532 // The annotation a starts at the offset.
1533 // In this case empty the stack and write the end tag
1534 while(!stack.isEmpty()){
1535 Annotation a1 = (Annotation)stack.pop();
1536 tmpBuff.append(writeEndTag(a1));
1537 }// End while
1538
1539 tmpBuff.append(writeStartTag(a, includeFeatures, false));
1540 // The annotation is removed from dumped set
1541 aSourceAnnotationSet.remove(a);
1542 }// End if ( offset.equals(a.getStartNode().getOffset()) )
1543 }// End if ( offset.equals(a.getEndNode().getOffset()) )
1544 }// End while(it.hasNext()){
1545
1546 // In this case empty the stack and write the end tag
1547 while(!stack.isEmpty()){
1548 Annotation a1 = (Annotation)stack.pop();
1549 tmpBuff.append(writeEndTag(a1));
1550 }// End while
1551
1552 long originalPosition = -1;
1553 boolean backPositioning =
1554 a != null && offset.equals(a.getEndNode().getOffset());
1555 if ( backPositioning ) {
1556 // end of the annotation correction
1557 originalPosition =
1558 repositioning.getOriginalPos(offset.intValue(), true);
1559 } // if
1560
1561 if(originalPosition == -1) {
1562 originalPosition = repositioning.getOriginalPos(offset.intValue());
1563 } // if
1564
1565 // Insert tmpBuff to the location where it belongs in docContStrBuff
1566 if(originalPosition != -1 && originalPosition <= originalContentSize ) {
1567 docContStrBuff.insert((int) originalPosition, tmpBuff.toString());
1568 }
1569 else {
1570 Out.prln("Error in the repositioning. The offset ("+offset.intValue()
1571 +") could not be positioned in the original document. \n"
1572 +"Calculated position is: "+originalPosition
1573 +" placed back: "+backPositioning);
1574 } // if
1575
1576 }// End while(!offsets.isEmpty())
1577 if (theRootAnnotation != null)
1578 docContStrBuff.append(writeEndTag(theRootAnnotation));
1579 return docContStrBuff.toString();
1580 } // saveAnnotationSetAsXmlInOrig()
1581
1582 /** This method returns a list with annotations ordered that way that
1583 * they can be serialized from left to right, at the offset. If one of the
1584 * params is null then an empty list will be returned.
1585 * @param aDumpAnnotSet is a set containing all annotations that will be
1586 * dumped.
1587 * @param offset represent the offset at witch the annotation must start
1588 * AND/OR end.
1589 * @return a list with those annotations that need to be serialized.
1590 */
1591 private List getAnnotationsForOffset(Set aDumpAnnotSet, Long offset){
1592 List annotationList = new LinkedList();
1593 if (aDumpAnnotSet == null || offset == null) return annotationList;
1594 Set annotThatStartAtOffset = new TreeSet(
1595 new AnnotationComparator(ORDER_ON_END_OFFSET,DESC));
1596 Set annotThatEndAtOffset = new TreeSet(
1597 new AnnotationComparator(ORDER_ON_START_OFFSET,DESC));
1598 Set annotThatStartAndEndAtOffset = new TreeSet(
1599 new AnnotationComparator(ORDER_ON_ANNOT_ID,ASC));
1600
1601 // Fill these tree lists with annotation tat start, end or start and
1602 // end at the offset.
1603 Iterator iter = aDumpAnnotSet.iterator();
1604 while(iter.hasNext()){
1605 Annotation ann = (Annotation) iter.next();
1606 if (offset.equals(ann.getStartNode().getOffset())){
1607 if (offset.equals(ann.getEndNode().getOffset()))
1608 annotThatStartAndEndAtOffset.add(ann);
1609 else
1610 annotThatStartAtOffset.add(ann);
1611 }else{
1612 if (offset.equals(ann.getEndNode().getOffset()))
1613 annotThatEndAtOffset.add(ann);
1614 }// End if
1615 }// End while
1616 annotationList.addAll(annotThatEndAtOffset);
1617 annotThatEndAtOffset = null;
1618 annotationList.addAll(annotThatStartAtOffset);
1619 annotThatStartAtOffset = null;
1620 iter = annotThatStartAndEndAtOffset.iterator();
1621 while(iter.hasNext()){
1622 Annotation ann = (Annotation) iter.next();
1623 Iterator it = annotationList.iterator();
1624 boolean breaked = false;
1625 while (it.hasNext()){
1626 Annotation annFromList = (Annotation) it.next();
1627 if (annFromList.getId().intValue() > ann.getId().intValue()){
1628 annotationList.add(annotationList.indexOf(annFromList),ann);
1629 breaked = true;
1630 break;
1631 }// End if
1632 }// End while
1633 if (!breaked)
1634 annotationList.add(ann);
1635 iter.remove();
1636 }// End while
1637 return annotationList;
1638 }// getAnnotationsForOffset()
1639
1640 private List getAnnotationsForOffset(List aDumpAnnotList, Long offset){
1641 List annotationList = new ArrayList();
1642 if (aDumpAnnotList == null || offset == null) return annotationList;
1643 Set annotThatStartAtOffset;
1644 Set annotThatEndAtOffset;
1645 Set annotThatStartAndEndAtOffset;
1646 annotThatStartAtOffset = new TreeSet(
1647 new AnnotationComparator(ORDER_ON_END_OFFSET, DESC));
1648 annotThatEndAtOffset = new TreeSet(
1649 new AnnotationComparator(ORDER_ON_START_OFFSET, DESC));
1650 annotThatStartAndEndAtOffset = new TreeSet(
1651 new AnnotationComparator(ORDER_ON_ANNOT_ID, ASC));
1652
1653 // Fill these tree lists with annotation tat start, end or start and
1654 // end at the offset.
1655 Iterator iter = aDumpAnnotList.iterator();
1656 while(iter.hasNext()){
1657 Annotation ann = (Annotation) iter.next();
1658 if (offset.equals(ann.getStartNode().getOffset())){
1659 if (offset.equals(ann.getEndNode().getOffset()))
1660 annotThatStartAndEndAtOffset.add(ann);
1661 else
1662 annotThatStartAtOffset.add(ann);
1663 }else{
1664 if (offset.equals(ann.getEndNode().getOffset()))
1665 annotThatEndAtOffset.add(ann);
1666 }// End if
1667 }// End while
1668
1669 annotationList.addAll(annotThatEndAtOffset);
1670 annotationList.addAll(annotThatStartAtOffset);
1671 annotThatEndAtOffset = null;
1672 annotThatStartAtOffset = null;
1673
1674 iter = annotThatStartAndEndAtOffset.iterator();
1675 while(iter.hasNext()){
1676 Annotation ann = (Annotation) iter.next();
1677 Iterator it = annotationList.iterator();
1678 boolean breaked = false;
1679 while (it.hasNext()){
1680 Annotation annFromList = (Annotation) it.next();
1681 if (annFromList.getId().intValue() > ann.getId().intValue()){
1682 annotationList.add(annotationList.indexOf(annFromList),ann);
1683 breaked = true;
1684 break;
1685 }// End if
1686 }// End while
1687 if (!breaked)
1688 annotationList.add(ann);
1689 iter.remove();
1690 }// End while
1691 return annotationList;
1692 }// getAnnotationsForOffset()
1693
1694 private String writeStartTag(Annotation annot, boolean includeFeatures){
1695 return writeStartTag(annot, includeFeatures, true);
1696 } // writeStartTag
1697
1698 /** Returns a string representing a start tag based on the input annot*/
1699 private String writeStartTag(Annotation annot, boolean includeFeatures,
1700 boolean includeNamespace){
1701 AnnotationSet originalMarkupsAnnotSet =
1702 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1703
1704 StringBuffer strBuff = new StringBuffer("");
1705 if (annot == null) return strBuff.toString();
1706// if (!addGatePreserveFormatTag && isRootTag){
1707 if (theRootAnnotation != null && annot.getId().equals(theRootAnnotation.getId())){
1708 //the features are included either if desired or if that's an annotation
1709 //from the original markup of the document. We don't want for example to
1710 //spoil all links in an HTML file!
1711 if (includeFeatures) {
1712 strBuff.append("<");
1713 strBuff.append(annot.getType());
1714 strBuff.append(" ");
1715 if(includeNamespace) {
1716 strBuff.append(" xmlns:gate=\"http://www.gate.ac.uk\"");
1717 strBuff.append(" gate:");
1718 }
1719 strBuff.append("gateId=\"");
1720 strBuff.append(annot.getId());
1721 strBuff.append("\"");
1722 strBuff.append(" ");
1723 if(includeNamespace) {
1724 strBuff.append("gate:");
1725 }
1726 strBuff.append("annotMaxId=\"");
1727 strBuff.append(nextAnnotationId);
1728 strBuff.append("\"");
1729 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1730 strBuff.append(">");
1731 }
1732 else if (originalMarkupsAnnotSet.contains(annot)) {
1733 strBuff.append("<");
1734 strBuff.append(annot.getType());
1735 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1736 strBuff.append(">");
1737 }
1738 else {
1739 strBuff.append("<");
1740 strBuff.append(annot.getType());
1741 strBuff.append(">");
1742 }
1743
1744 }else{
1745 //the features are included either if desired or if that's an annotation
1746 //from the original markup of the document. We don't want for example to
1747 //spoil all links in an HTML file!
1748 if (includeFeatures) {
1749 strBuff.append("<");
1750 strBuff.append(annot.getType());
1751 strBuff.append(" ");
1752 if(includeNamespace) {
1753 strBuff.append("gate:");
1754 } // if includeNamespaces
1755 strBuff.append("gateId=\"");
1756 strBuff.append(annot.getId());
1757 strBuff.append("\"");
1758 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1759 strBuff.append(">");
1760 }
1761 else if (originalMarkupsAnnotSet.contains(annot)) {
1762 strBuff.append("<");
1763 strBuff.append(annot.getType());
1764 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1765 strBuff.append(">");
1766 }
1767 else {
1768 strBuff.append("<");
1769 strBuff.append(annot.getType());
1770 strBuff.append(">");
1771 }
1772 }// End if
1773 return strBuff.toString();
1774 }// writeStartTag()
1775
1776 /**
1777 * Identifies the root annotations inside an annotation set.
1778 * The root annotation is the one that starts at offset 0, and has the
1779 * greatest span. If there are more than one with this function, then the
1780 * annotation with the smalled ID wil be selected as root.
1781 * If none is identified it will return null.
1782 * @param anAnnotationSet The annotation set possibly containing
1783 * the root annotation.
1784 * @return The root annotation or null is it fails
1785 */
1786 private Annotation identifyTheRootAnnotation(AnnotationSet anAnnotationSet){
1787 if (anAnnotationSet == null) return null;
1788 // If the starting node of this annotation is not null, then the annotation
1789 // set will not have a root annotation.
1790 Node startNode = anAnnotationSet.firstNode();
1791 Node endNode = anAnnotationSet.lastNode();
1792 // This is placed here just to speed things up. The alghorithm bellow can
1793 // can identity the annotation that span over the entire set and with the
1794 // smallest ID. However the root annotation will have to have the start
1795 // offset equal to 0.
1796 if (startNode.getOffset().longValue() != 0) return null;
1797 // Go anf find the annotation.
1798 Annotation theRootAnnotation = null;
1799 // Check if there are annotations starting at offset 0. If there are, then
1800 // check all of them to see which one has the greatest span. Basically its
1801 // END offset should be the bigest offset from the input annotation set.
1802 long start = startNode.getOffset().longValue();
1803 long end = endNode.getOffset().longValue();
1804 for(Iterator it = anAnnotationSet.iterator(); it.hasNext();){
1805 Annotation currentAnnot = (Annotation) it.next();
1806 // If the currentAnnot has both its Start and End equals to the Start and
1807 // end of the AnnotationSet then check to see if its ID is the smallest.
1808 if (
1809 (start == currentAnnot.getStartNode().getOffset().longValue()) &&
1810 (end == currentAnnot.getEndNode().getOffset().longValue())
1811 ){
1812 // The currentAnnotation has is a potencial root one.
1813 if (theRootAnnotation == null)
1814 theRootAnnotation = currentAnnot;
1815 else{
1816 // If its ID is greater that the currentAnnot then update the root
1817 if ( theRootAnnotation.getId().intValue() > currentAnnot.getId().intValue())
1818 theRootAnnotation = currentAnnot;
1819 }// End if
1820 }// End if
1821 }// End for
1822 return theRootAnnotation;
1823 }// End identifyTheRootAnnotation()
1824
1825 private Annotation identifyTheRootAnnotation(List anAnnotationList){
1826 if (anAnnotationList == null || anAnnotationList.isEmpty()) return null;
1827 // If the first annotation in the list (which is sorted by start offset)
1828 //does not have an offset = 0, then there's no root tag.
1829 if(((Annotation)anAnnotationList.get(0)).
1830 getStartNode().getOffset().longValue() > 0) return null;
1831
1832 //find the limits
1833 long start = 0; //we know this already
1834 long end = 0; //end = 0 will be improved by the next loop
1835 for(int i = 0; i < anAnnotationList.size(); i++){
1836 Annotation anAnnotation = (Annotation)anAnnotationList.get(i);
1837 long localEnd = anAnnotation.getEndNode().getOffset().longValue();
1838 if(localEnd > end) end = localEnd;
1839 }
1840
1841 // Go and find the annotation.
1842 //look at all annotations that start at 0 and end at end
1843 //if there are several, choose the one with the smallest ID
1844 Annotation theRootAnnotation = null;
1845 for(int i = 0; i < anAnnotationList.size(); i++){
1846 Annotation currentAnnot = (Annotation) anAnnotationList.get(i);
1847 long localStart = currentAnnot.getStartNode().getOffset().longValue();
1848 long localEnd = currentAnnot.getEndNode().getOffset().longValue();
1849 // If the currentAnnot has both its Start and End equals to the Start and
1850 // end of the AnnotationSet then check to see if its ID is the smallest.
1851 if (
1852 (start == localStart) && (end == localEnd)){
1853 // The currentAnnotation has is a potential root one.
1854 if (theRootAnnotation == null) theRootAnnotation = currentAnnot;
1855 else{
1856 // If root's ID is greater that the currentAnnot then update the root
1857 if (theRootAnnotation.getId().intValue() > currentAnnot.getId().intValue())
1858 theRootAnnotation = currentAnnot;
1859 }// End if
1860 }// End if
1861 }// End for
1862 return theRootAnnotation;
1863 }// End identifyTheRootAnnotation()
1864
1865
1866 /** This method takes aScanString and searches for those chars from
1867 * entitiesMap that appear in the string. A tree map(offset2Char) is filled
1868 * using as key the offsets where those Chars appear and the Char.
1869 * If one of the params is null the method simply returns.
1870 */
1871 private void buildEntityMapFromString(String aScanString, TreeMap aMapToFill){
1872 if (aScanString == null || aMapToFill == null) return;
1873 if (entitiesMap == null || entitiesMap.isEmpty()){
1874 Err.prln("WARNING: Entities map was not initialised !");
1875 return;
1876 }// End if
1877 // Fill the Map with the offsets of the special chars
1878 Iterator entitiesMapIterator = entitiesMap.keySet().iterator();
1879 Character c;
1880 int fromIndex;
1881 while(entitiesMapIterator.hasNext()){
1882 c = (Character) entitiesMapIterator.next();
1883 fromIndex = 0;
1884 while (-1 != fromIndex){
1885 fromIndex = aScanString.indexOf(c.charValue(),fromIndex);
1886 if (-1 != fromIndex){
1887 aMapToFill.put(new Long(fromIndex),c);
1888 fromIndex ++;
1889 }// End if
1890 }// End while
1891 }// End while
1892 }//buildEntityMapFromString();
1893
1894 private String writeEmptyTag(Annotation annot){
1895 return writeEmptyTag(annot, true);
1896 } // writeEmptyTag
1897
1898 /** Returns a string representing an empty tag based on the input annot*/
1899 private String writeEmptyTag(Annotation annot, boolean includeNamespace){
1900 StringBuffer strBuff = new StringBuffer("");
1901 if (annot == null) return strBuff.toString();
1902
1903 strBuff.append("<");
1904 strBuff.append(annot.getType());
1905
1906 AnnotationSet originalMarkupsAnnotSet =
1907 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1908 if (! originalMarkupsAnnotSet.contains(annot)) {
1909 strBuff.append(" gateId=\"");
1910 strBuff.append(annot.getId());
1911 strBuff.append("\"");
1912 }
1913 strBuff.append(writeFeatures(annot.getFeatures(),includeNamespace));
1914 strBuff.append("/>");
1915
1916 return strBuff.toString();
1917 }// writeEmptyTag()
1918
1919 /** Returns a string representing an end tag based on the input annot*/
1920 private String writeEndTag(Annotation annot){
1921 StringBuffer strBuff = new StringBuffer("");
1922 if (annot == null) return strBuff.toString();
1923/*
1924 if (annot.getType().indexOf(" ") != -1)
1925 Out.prln("Warning: Truncating end tag to first word for annot type \""
1926 +annot.getType()+ "\". ");
1927*/
1928 strBuff.append("</"+annot.getType()+">");
1929
1930 return strBuff.toString();
1931 }// writeEndTag()
1932
1933 /** Returns a string representing a FeatureMap serialized as XML attributes*/
1934 private String writeFeatures(FeatureMap feat, boolean includeNamespace){
1935 StringBuffer strBuff = new StringBuffer("");
1936 if (feat == null) return strBuff.toString();
1937 Iterator it = feat.keySet().iterator();
1938 while (it.hasNext()){
1939 Object key = it.next();
1940 Object value = feat.get(key);
1941 if ( (key != null) && (value != null) ){
1942 // Eliminate a feature inserted at reading time and which help to
1943 // take some decissions at saving time
1944 if ("isEmptyAndSpan".equals(key.toString()))
1945 continue;
1946 if( !(String.class.isAssignableFrom(key.getClass()) ||
1947 Number.class.isAssignableFrom(key.getClass()))){
1948
1949 Out.prln("Warning:Found a feature NAME("+key+") that doesn't came"+
1950 " from String or Number.(feature discarded)");
1951 continue;
1952 }// End if
1953 if ( !(String.class.isAssignableFrom(value.getClass()) ||
1954 Number.class.isAssignableFrom(value.getClass()) ||
1955 java.util.Collection.class.isAssignableFrom(value.getClass()))){
1956
1957 Out.prln("Warning:Found a feature VALUE("+value+") that doesn't came"+
1958 " from String, Number or Collection.(feature discarded)");
1959 continue;
1960 }// End if
1961 if ("matches".equals(key)) {
1962 strBuff.append(" ");
1963 if(includeNamespace) {
1964 strBuff.append("gate:");
1965 }
1966// strBuff.append(key);
1967 // replace non XML chars in attribute name
1968 strBuff.append(
1969 filterNonXmlChars(replaceCharsWithEntities(key.toString())));
1970 strBuff.append("=\"");
1971 }
1972 else {
1973 strBuff.append(" ");
1974// strBuff.append(key);
1975 // replace non XML chars in attribute name
1976 strBuff.append(
1977 filterNonXmlChars(replaceCharsWithEntities(key.toString())));
1978 strBuff.append("=\"");
1979 }
1980 if (java.util.Collection.class.isAssignableFrom(value.getClass())){
1981 Iterator valueIter = ((Collection)value).iterator();
1982 while(valueIter.hasNext()){
1983 Object item = valueIter.next();
1984 if (!(String.class.isAssignableFrom(item.getClass()) ||
1985 Number.class.isAssignableFrom(item.getClass())))
1986 continue;
1987// strBuff.append(item);
1988 // replace non XML chars in collection item
1989 strBuff.append(
1990 filterNonXmlChars(replaceCharsWithEntities(item.toString())));
1991 strBuff.append(";");
1992 }// End while
1993 if (strBuff.charAt(strBuff.length()-1) == ';')
1994 strBuff.deleteCharAt(strBuff.length()-1);
1995 }else{
1996// strBuff.append(value);
1997 // replace non XML chars in attribute value
1998 strBuff.append(
1999 filterNonXmlChars(replaceCharsWithEntities(value.toString())));
2000 }// End if
2001 strBuff.append("\"");
2002 }// End if
2003 }// End while
2004 return strBuff.toString();
2005 }// writeFeatures()
2006
2007 /** Returns a GateXml document that is a custom XML format for wich there is
2008 * a reader inside GATE called gate.xml.GateFormatXmlHandler.
2009 * What it does is to serialize a GATE document in an XML format.
2010 * @return a string representing a Gate Xml document.
2011 */
2012 public String toXml(){
2013 // Initialize the xmlContent with 3 time the size of the current document.
2014 // This is because of the tags size. This measure is made to increase the
2015 // performance of StringBuffer.
2016 StringBuffer xmlContent = new StringBuffer(
2017 DOC_SIZE_MULTIPLICATION_FACTOR*(getContent().size().intValue()));
2018 // Add xml header
2019 xmlContent.append("<?xml version=\"1.0\" encoding=\"");
2020 xmlContent.append(getEncoding());
2021 xmlContent.append("\" ?>");
2022 xmlContent.append(Strings.getNl());
2023
2024 // Add the root element
2025 xmlContent.append("<GateDocument>\n");
2026 xmlContent.append("<!-- The document's features-->\n\n");
2027 xmlContent.append("<GateDocumentFeatures>\n");
2028
2029 xmlContent.append(featuresToXml(this.getFeatures()));
2030 xmlContent.append("</GateDocumentFeatures>\n");
2031 xmlContent.append("<!-- The document content area with serialized"+
2032 " nodes -->\n\n");
2033 // Add plain text element
2034 xmlContent.append("<TextWithNodes>");
2035 xmlContent.append(textWithNodes(this.getContent().toString()));
2036 xmlContent.append("</TextWithNodes>\n");
2037 // Serialize as XML all document's annotation sets
2038 // Serialize the default AnnotationSet
2039 StatusListener sListener = (StatusListener)
2040 gate.gui.MainFrame.getListeners().
2041 get("gate.event.StatusListener");
2042 if(sListener != null)
2043 sListener.statusChanged("Saving the default annotation set ");
2044 xmlContent.append("<!-- The default annotation set -->\n\n");
2045 xmlContent.append(annotationSetToXml(this.getAnnotations()));
2046 // Serialize all others AnnotationSets
2047 // namedAnnotSets is a Map containing all other named Annotation Sets.
2048 if (namedAnnotSets != null){
2049 Iterator iter = namedAnnotSets.values().iterator();
2050 while(iter.hasNext()){
2051 AnnotationSet annotSet = (AnnotationSet) iter.next();
2052 xmlContent.append("<!-- Named annotation set -->\n\n");
2053 // Serialize it as XML
2054 if(sListener != null) sListener.statusChanged("Saving " +
2055 annotSet.getName()+
2056 " annotation set ");
2057 xmlContent.append(annotationSetToXml(annotSet));
2058 }// End while
2059 }// End if
2060 // Add the end of GateDocument
2061 xmlContent.append("</GateDocument>");
2062 if(sListener != null) sListener.statusChanged("Done !");
2063 // return the XmlGateDocument
2064 return xmlContent.toString();
2065 }// toXml
2066
2067 /** This method filters any non XML char
2068 * see: http://www.w3c.org/TR/2000/REC-xml-20001006#charsets
2069 * All non XML chars will be replaced with 0x20 (space char) This assures
2070 * that the next time the document is loaded there won't be any problems.
2071 * @param aStrBuffer represents the input String that is filtred. If the
2072 * aStrBuffer is null then an empty string will be returend
2073 * @return the "purified" StringBuffer version of the aStrBuffer
2074 */
2075 private StringBuffer filterNonXmlChars(StringBuffer aStrBuffer){
2076 if (aStrBuffer == null) return new StringBuffer("");
2077// String space = new String(" ");
2078 char space = ' ';
2079 for (int i=aStrBuffer.length()-1;i>=0; i--){
2080 if (!isXmlChar(aStrBuffer.charAt(i)))
2081 aStrBuffer.setCharAt(i, space);
2082 }// End for
2083 return aStrBuffer;
2084 }// filterNonXmlChars()
2085
2086 /** This method decide if a char is a valid XML one or not
2087 * @param ch the char to be tested
2088 * @return true if is a valid XML char and fals if is not.
2089 */
2090 public static boolean isXmlChar(char ch){
2091 if (ch == 0x9 || ch == 0xA || ch ==0xD) return true;
2092 if ((0x20 <= ch) && (ch <= 0xD7FF)) return true;
2093 if ((0xE000 <= ch) && (ch <= 0xFFFD)) return true;
2094 if ((0x10000 <= ch) && (ch <= 0x10FFFF)) return true;
2095 return false;
2096 }// End isXmlChar()
2097
2098 /** This method saves a FeatureMap as XML elements.
2099 * @param aFeatureMap the feature map that has to be saved as XML.
2100 * @return a String like this: <Feature><Name>...</Name>
2101 * <Value>...</Value></Feature><Feature>...</Feature>
2102 */
2103 private String featuresToXml(FeatureMap aFeatureMap){
2104 StringBuffer str = new StringBuffer("");
2105
2106 if (aFeatureMap == null) return str.toString();
2107
2108 Set keySet = aFeatureMap.keySet();
2109 Iterator keyIterator = keySet.iterator();
2110 while(keyIterator.hasNext()){
2111 Object key = keyIterator.next();
2112 Object value = aFeatureMap.get(key);
2113 if ((key != null) && (value != null)){
2114 String keyClassName = null;
2115 String keyItemClassName = null;
2116 String valueClassName = null;
2117 String valueItemClassName = null;
2118 String key2String = key.toString();
2119 String value2String = value.toString();
2120
2121 Object item = null;
2122 // Test key if it is String, Number or Collection
2123 if (key instanceof java.lang.String ||
2124 key instanceof java.lang.Number ||
2125 key instanceof java.util.Collection)
2126 keyClassName = key.getClass().getName();
2127
2128 // Test value if it is String, Number or Collection
2129 if (value instanceof java.lang.String ||
2130 value instanceof java.lang.Number ||
2131 value instanceof java.util.Collection)
2132 valueClassName = value.getClass().getName();
2133
2134 // Features and values that are not Strings, Numbers or collections
2135 // will be discarded.
2136 if (keyClassName == null || valueClassName == null) continue;
2137
2138 // If key is collection serialize the colection in a specific format
2139 if (key instanceof java.util.Collection){
2140 StringBuffer keyStrBuff = new StringBuffer("");
2141 Iterator iter = ((Collection) key).iterator();
2142 if (iter.hasNext()){
2143 item = iter.next();
2144 if (item instanceof java.lang.Number)
2145 keyItemClassName = item.getClass().getName();
2146 else
2147 keyItemClassName = String.class.getName();
2148 keyStrBuff.append(item.toString());
2149 }// End if
2150 while (iter.hasNext()){
2151 item = iter.next();
2152 keyStrBuff.append(";" + item.toString());
2153 }// End while
2154 key2String = keyStrBuff.toString();
2155 }// End if
2156 // If key is collection serialize the colection in a specific format
2157 if (value instanceof java.util.Collection){
2158 StringBuffer valueStrBuff = new StringBuffer("");
2159 Iterator iter = ((Collection) value).iterator();
2160 if (iter.hasNext()){
2161 item = iter.next();
2162 if (item instanceof java.lang.Number)
2163 valueItemClassName = item.getClass().getName();
2164 else
2165 valueItemClassName = String.class.getName();
2166 valueStrBuff.append(item.toString());
2167 }// End if
2168 while (iter.hasNext()){
2169 item = iter.next();
2170 valueStrBuff.append(";" + item.toString());
2171 }// End while
2172 value2String = valueStrBuff.toString();
2173 }// End if
2174 str.append("<Feature>\n <Name");
2175 if (keyClassName != null)
2176 str.append(" className=\""+keyClassName+"\"");
2177 if (keyItemClassName != null)
2178 str.append(" itemClassName=\""+keyItemClassName+"\"");
2179 str.append(">");
2180 str.append(filterNonXmlChars(replaceCharsWithEntities(key2String)));
2181 str.append("</Name>\n <Value");
2182 if (valueClassName != null)
2183 str.append(" className=\"" + valueClassName + "\"");
2184 if (valueItemClassName != null)
2185 str.append(" itemClassName=\"" + valueItemClassName + "\"");
2186 str.append(">");
2187 str.append(filterNonXmlChars(replaceCharsWithEntities(value2String)));
2188 str.append("</Value>\n</Feature>\n");
2189 }// End if
2190 }// end While
2191 return str.toString();
2192 }//featuresToXml
2193
2194 /** This method replace all chars that appears in the anInputString and also
2195 * that are in the entitiesMap with their corresponding entity
2196 * @param anInputString the string analyzed. If it is null then returns the
2197 * empty string
2198 * @return a string representing the input string with chars replaced with
2199 * entities
2200 */
2201 private StringBuffer replaceCharsWithEntities(String anInputString){
2202 if (anInputString == null) return new StringBuffer("");
2203 StringBuffer strBuff = new StringBuffer(anInputString);
2204 for (int i=strBuff.length()-1; i>=0; i--){
2205 Character ch = new Character(strBuff.charAt(i));
2206 if (entitiesMap.keySet().contains(ch)){
2207 strBuff.replace(i,i+1,(String) entitiesMap.get(ch));
2208 }// End if
2209 }// End for
2210 return strBuff;
2211 }//replaceCharsWithEntities()
2212
2213 /** This method creates Node XML elements and inserts them at the
2214 * corresponding offset inside the text. Nodes are created from the default
2215 * annotation set, as well as from all existing named annotation sets.
2216 * @param aText The text representing the document's plain text.
2217 * @return The text with empty <Node id="NodeId"/> elements.
2218 */
2219 private String textWithNodes(String aText){
2220 if (aText == null) return new String("");
2221 StringBuffer textWithNodes = filterNonXmlChars(new StringBuffer(aText));
2222
2223 // Construct a map from offsets to Chars
2224 TreeMap offsets2CharsMap = new TreeMap();
2225 if (aText.length()!= 0){
2226 // Fill the offsets2CharsMap with all the indices where special chars appear
2227 buildEntityMapFromString(aText,offsets2CharsMap);
2228 }//End if
2229 // Construct the offsetsSet for all nodes belonging to this document
2230 TreeSet offsetsSet = new TreeSet();
2231 Iterator annotSetIter = this.getAnnotations().iterator();
2232 while (annotSetIter.hasNext()){
2233 Annotation annot = (Annotation) annotSetIter.next();
2234 offsetsSet.add(annot.getStartNode().getOffset());
2235 offsetsSet.add(annot.getEndNode().getOffset());
2236 }// end While
2237 // Get the nodes from all other named annotation sets.
2238 if (namedAnnotSets != null){
2239 Iterator iter = namedAnnotSets.values().iterator();
2240 while(iter.hasNext()){
2241 AnnotationSet annotSet = (AnnotationSet) iter.next();
2242 Iterator iter2 = annotSet.iterator();
2243 while(iter2.hasNext()){
2244 Annotation annotTmp = (Annotation) iter2.next();
2245 offsetsSet.add(annotTmp.getStartNode().getOffset());
2246 offsetsSet.add(annotTmp.getEndNode().getOffset());
2247 }// End while
2248 }// End while
2249 }// End if
2250 // offsetsSet is ordered in ascending order because the structure
2251 // is a TreeSet
2252
2253 if (offsetsSet.isEmpty()){
2254 return replaceCharsWithEntities(aText).toString();
2255 }// End if
2256 // Iterate through all nodes from anAnnotSet and transform them to
2257 // XML elements. Then insert those elements at the node's offset into the
2258 // textWithNodes .
2259 while (!offsetsSet.isEmpty()){
2260 Long offset = (Long) offsetsSet.last();
2261 // Eliminate the offset from the list in order to create more memory space
2262 offsetsSet.remove(offset);
2263 // Use offset
2264 int offsetValue = offset.intValue();
2265 String strNode = "<Node id=\"" + offsetValue + "\"/>";
2266 // Before inserting this string into the textWithNodes, check to see if
2267 // there are any chars to be replaced with their corresponding entities
2268 if (!offsets2CharsMap.isEmpty()){
2269 Long offsChar = (Long) offsets2CharsMap.lastKey();
2270 while( !offsets2CharsMap.isEmpty() &&
2271 offsChar.intValue() >= offset.intValue()){
2272 // Replace the char at offsChar with its corresponding entity form
2273 // the entitiesMap.
2274 textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1,
2275 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
2276 // Discard the offsChar after it was used because this offset will
2277 // never appear again
2278 offsets2CharsMap.remove(offsChar);
2279 // Investigate next offsChar
2280 if (!offsets2CharsMap.isEmpty())
2281 offsChar = (Long) offsets2CharsMap.lastKey();
2282 }// End while
2283 }// End if
2284 // Now it is safe to insert the node
2285 textWithNodes.insert(offsetValue,strNode);
2286 }// end while
2287 // Need to replace the entities in the remaining text, if there is any text
2288 // So, if there are any more items in offsets2CharsMap they need to be
2289 // replaced
2290 while (!offsets2CharsMap.isEmpty()){
2291 Long offsChar = (Long) offsets2CharsMap.lastKey();
2292 // Replace the char with its entity
2293 textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1,
2294 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
2295 // remove the offset from the map
2296 offsets2CharsMap.remove(offsChar);
2297 }// End while
2298 return textWithNodes.toString();
2299 }//textWithNodes()
2300
2301 /** This method saves an AnnotationSet as XML.
2302 * @param anAnnotationSet The annotation set that has to be saved as XML.
2303 * @return a String like this: <AnnotationSet> <Annotation>....
2304 * </AnnotationSet>
2305 */
2306 private String annotationSetToXml(AnnotationSet anAnnotationSet){
2307 StringBuffer str = new StringBuffer("");
2308
2309 if (anAnnotationSet == null){
2310 str.append("<AnnotationSet>\n");
2311 str.append("</AnnotationSet>\n");
2312 return str.toString();
2313 }// End if
2314 if (anAnnotationSet.getName() == null)
2315 str.append("<AnnotationSet>\n");
2316 else str.append("<AnnotationSet Name=\"" + anAnnotationSet.getName()+
2317 "\" >\n");
2318 // Iterate through AnnotationSet and save each Annotation as XML
2319 Iterator iterator = anAnnotationSet.iterator();
2320 while (iterator.hasNext()){
2321 Annotation annot = (Annotation) iterator.next();
2322 str.append("<Annotation " + "Type=\"" + annot.getType() +
2323 "\" StartNode=\"" + annot.getStartNode().getOffset() +
2324 "\" EndNode=\"" + annot.getEndNode().getOffset() + "\">\n");
2325 str.append(featuresToXml(annot.getFeatures()));
2326 str.append("</Annotation>\n");
2327 }// End while
2328
2329 str.append("</AnnotationSet>\n");
2330 return str.toString();
2331 }// annotationSetToXml
2332
2333 /** Returns a map with the named annotation sets. It returns <code>null</code>
2334 * if no named annotaton set exists. */
2335 public Map getNamedAnnotationSets() {
2336 return namedAnnotSets;
2337 } // getNamedAnnotationSets
2338
2339 /** Returns a set of all named annotation sets in existence
2340 */
2341 public Set getAnnotationSetNames(){
2342 return namedAnnotSets.keySet();
2343 }
2344
2345
2346 /**
2347 * Removes one of the named annotation sets.
2348 * Note that the default annotation set cannot be removed.
2349 * @param name the name of the annotation set to be removed
2350 */
2351 public void removeAnnotationSet(String name){
2352 Object removed = namedAnnotSets.remove(name);
2353 if(removed != null){
2354 fireAnnotationSetRemoved(
2355 new DocumentEvent(this, DocumentEvent.ANNOTATION_SET_REMOVED, name));
2356 }
2357 }
2358
2359 /** Propagate edit changes to the document content and annotations. */
2360 public void edit(Long start, Long end, DocumentContent replacement)
2361 throws InvalidOffsetException
2362 {
2363 if(! isValidOffsetRange(start, end))
2364 throw new InvalidOffsetException();
2365
2366 if(content != null)
2367 ((DocumentContentImpl) content).edit(start, end, replacement);
2368
2369 if(defaultAnnots != null)
2370 ((AnnotationSetImpl) defaultAnnots).edit(start, end, replacement);
2371
2372 if(namedAnnotSets != null) {
2373 Iterator iter = namedAnnotSets.values().iterator();
2374 while(iter.hasNext())
2375 ((AnnotationSetImpl) iter.next()).edit(start, end, replacement);
2376 }
2377 //let the listeners know
2378 fireContentEdited(new DocumentEvent(this, DocumentEvent.CONTENT_EDITED,
2379 start, end));
2380 } // edit(start,end,replacement)
2381
2382 /** Check that an offset is valid, i.e. it is non-null, greater than
2383 * or equal to 0 and less than the size of the document content.
2384 */
2385 public boolean isValidOffset(Long offset) {
2386 if(offset == null)
2387 return false;
2388
2389 long o = offset.longValue();
2390 if(o > getContent().size().longValue() || o < 0)
2391 return false;
2392
2393 return true;
2394 } // isValidOffset
2395
2396 /** Check that both start and end are valid offsets and that
2397 * they constitute a valid offset range, i.e. start is greater
2398 * than or equal to long.
2399 */
2400 public boolean isValidOffsetRange(Long start, Long end) {
2401 return
2402 isValidOffset(start) && isValidOffset(end) &&
2403 start.longValue() <= end.longValue();
2404 } // isValidOffsetRange(start,end)
2405
2406 /** Sets the nextAnnotationId */
2407 public void setNextAnnotationId(int aNextAnnotationId){
2408 nextAnnotationId = aNextAnnotationId;
2409 }// setNextAnnotationId();
2410
2411 /** Generate and return the next annotation ID */
2412 public Integer getNextAnnotationId() {
2413 return new Integer(nextAnnotationId++);
2414 } // getNextAnnotationId
2415
2416 /** Generate and return the next node ID */
2417 public Integer getNextNodeId() { return new Integer(nextNodeId++); }
2418
2419 /** Ordering based on URL.toString() and the URL offsets (if any) */
2420 public int compareTo(Object o) throws ClassCastException {
2421 DocumentImpl other = (DocumentImpl) o;
2422 return getOrderingString().compareTo(other.getOrderingString());
2423 } // compareTo
2424
2425 /** Utility method to produce a string for comparison in ordering.
2426 * String is based on the source URL and offsets.
2427 */
2428 protected String getOrderingString() {
2429 if(sourceUrl == null) return toString();
2430
2431 StringBuffer orderingString = new StringBuffer(sourceUrl.toString());
2432 if(sourceUrlStartOffset != null && sourceUrlEndOffset != null) {
2433 orderingString.append(sourceUrlStartOffset.toString());
2434 orderingString.append(sourceUrlEndOffset.toString());
2435 }
2436
2437 return orderingString.toString();
2438 } // getOrderingString()
2439
2440 /** The id of the next new annotation */
2441 protected int nextAnnotationId = 0;
2442
2443 /** The id of the next new node */
2444 protected int nextNodeId = 0;
2445 /** The source URL */
2446 protected URL sourceUrl;
2447
2448 /** The document's URL name. */
2449
2450 /** The content of the document */
2451 protected DocumentContent content;
2452
2453 /** The encoding of the source of the document content */
2454 protected String encoding = null;
2455
2456 // Data needed by the toXml(AnnotationSet) methods
2457
2458 /** This field indicates whether or not to add the tag
2459 * called GatePreserveFormat to the document. HTML, XML, SGML docs won't
2460 * have this tag added
2461 */
2462// private boolean addGatePreserveFormatTag = false;
2463
2464 /**
2465 * Used by the XML dump preserving format method
2466 */
2467 private Annotation theRootAnnotation = null;
2468
2469 /** This field is used when creating StringBuffers for the toXml() methods.
2470 * The size of the StringBuffer will be the document content size multiplied
2471 * by this value. It is meant to improve the performance of the StringBuffer.
2472 */
2473 private final int DOC_SIZE_MULTIPLICATION_FACTOR = 2;
2474
2475 /** Constant used in the inner class AnnotationComparator to order
2476 * annotations on their start offset
2477 */
2478 private final int ORDER_ON_START_OFFSET = 0;
2479 /** Constant used in the inner class AnnotationComparator to order
2480 * annotations on their end offset
2481 */
2482 private final int ORDER_ON_END_OFFSET = 1;
2483 /** Constant used in the inner class AnnotationComparator to order
2484 * annotations on their ID
2485 */
2486 private final int ORDER_ON_ANNOT_ID = 2;
2487 /** Constant used in the inner class AnnotationComparator to order
2488 * annotations ascending
2489 */
2490 private final int ASC = 3;
2491 /** Constant used in the inner class AnnotationComparator to order
2492 * annotations descending
2493 */
2494 private final int DESC = -3;
2495
2496 /** A map initialized in init() containing entities that needs to be
2497 * replaced in strings
2498 */
2499 private static Map entitiesMap = null;
2500 // Initialize the entities map use when saving as xml
2501 static{
2502 entitiesMap = new HashMap();
2503 entitiesMap.put(new Character('<'),"<");
2504 entitiesMap.put(new Character('>'),">");
2505 entitiesMap.put(new Character('&'),"&");
2506 entitiesMap.put(new Character('\''),"'");
2507 entitiesMap.put(new Character('"'),""");
2508 entitiesMap.put(new Character((char)160)," ");
2509 entitiesMap.put(new Character((char)169),"©");
2510 }//static
2511
2512 /** The range that the content comes from at the source URL
2513 * (or null if none).
2514 */
2515 //protected Long[] sourceUrlOffsets;
2516
2517 /** The start of the range that the content comes from at the source URL
2518 * (or null if none).
2519 */
2520 protected Long sourceUrlStartOffset;
2521
2522 /** The end of the range that the content comes from at the source URL
2523 * (or null if none).
2524 */
2525 protected Long sourceUrlEndOffset;
2526
2527 /** The default annotation set */
2528 protected AnnotationSet defaultAnnots;
2529
2530 /** Named sets of annotations */
2531 protected Map namedAnnotSets;
2532
2533 /**
2534 * A property of the document that will be set when the user
2535 * wants to create the document from a string, as opposed to from
2536 * a URL.
2537 */
2538 private String stringContent;
2539
2540 /**
2541 * The stringContent of a document is
2542 * a property of the document that will be set when the user
2543 * wants to create the document from a string, as opposed to from
2544 * a URL.
2545 * <B>Use the <TT>getContent</TT> method instead to get the actual document
2546 * content.</B>
2547 */
2548 public String getStringContent() { return stringContent; }
2549
2550 /**
2551 * The stringContent of a document is
2552 * a property of the document that will be set when the user
2553 * wants to create the document from a string, as opposed to from
2554 * a URL.
2555 * <B>Use the <TT>setContent</TT> method instead to update the actual
2556 * document content.</B>
2557 */
2558 public void setStringContent(String stringContent) {
2559 this.stringContent = stringContent;
2560 } // set StringContent
2561
2562 /** Is the document markup-aware? */
2563 protected Boolean markupAware = new Boolean(false);
2564// /** Hash code */
2565// public int hashCode() {
2566// int code = getContent().hashCode();
2567// int memberCode = (defaultAnnots == null) ? 0 : defaultAnnots.hashCode();
2568// code += memberCode;
2569// memberCode = (encoding == null) ? 0 : encoding.hashCode();
2570// code += memberCode;
2571// memberCode = (features == null) ? 0 : features.hashCode();
2572// code += memberCode;
2573// code += (markupAware.booleanValue()) ? 0 : 1;
2574// memberCode = (namedAnnotSets == null) ? 0 : namedAnnotSets.hashCode();
2575// code += memberCode;
2576// code += nextAnnotationId;
2577// code += nextNodeId;
2578// memberCode = (sourceUrl == null) ? 0 : sourceUrl.hashCode();
2579// code += memberCode;
2580// memberCode =
2581// (sourceUrlStartOffset == null) ? 0 : sourceUrlStartOffset.hashCode();
2582// code += memberCode;
2583// memberCode =
2584// (sourceUrlEndOffset == null) ? 0 : sourceUrlEndOffset.hashCode();
2585// code += memberCode;
2586// return code;
2587// } // hashcode
2588
2589 /** String respresentation */
2590 public String toString() {
2591 String n = Strings.getNl();
2592 StringBuffer s = new StringBuffer("DocumentImpl: " + n);
2593 s.append(" content:" + content + n);
2594 s.append(" defaultAnnots:" + defaultAnnots + n);
2595 s.append(" encoding:" + encoding + n);
2596 s.append(" features:" + features + n);
2597 s.append(" markupAware:" + markupAware + n);
2598 s.append(" namedAnnotSets:" + namedAnnotSets + n);
2599 s.append(" nextAnnotationId:" + nextAnnotationId + n);
2600 s.append(" nextNodeId:" + nextNodeId + n);
2601 s.append(" sourceUrl:" + sourceUrl + n);
2602 s.append(" sourceUrlStartOffset:" + sourceUrlStartOffset + n);
2603 s.append(" sourceUrlEndOffset:" + sourceUrlEndOffset + n);
2604 s.append(n);
2605
2606 return s.toString();
2607 } // toString
2608
2609 /** Freeze the serialization UID. */
2610 static final long serialVersionUID = -8456893608311510260L;
2611
2612 /** Inner class needed to compare annotations*/
2613 class AnnotationComparator implements java.util.Comparator {
2614 int orderOn = -1;
2615 int orderType = ASC;
2616 /** Constructs a comparator according to one of three sorter types:
2617 * ORDER_ON_ANNOT_TYPE, ORDER_ON_END_OFFSET, ORDER_ON_START_OFFSET
2618 */
2619 public AnnotationComparator(int anOrderOn, int anOrderType){
2620 orderOn = anOrderOn;
2621 orderType = anOrderType;
2622 }// AnnotationComparator()
2623
2624 /**This method must be implemented according to Comparator interface */
2625 public int compare(Object o1, Object o2){
2626 Annotation a1 = (Annotation) o1;
2627 Annotation a2 = (Annotation) o2;
2628 // ORDER_ON_START_OFFSET ?
2629 if (orderOn == ORDER_ON_START_OFFSET){
2630 int result = a1.getStartNode().getOffset().compareTo(
2631 a2.getStartNode().getOffset());
2632 if (orderType == ASC){
2633 // ASC
2634 // If they are equal then their ID will decide.
2635 if (result == 0)
2636 return a1.getId().compareTo(a2.getId());
2637 return result;
2638 }else{
2639 // DESC
2640 if (result == 0)
2641 return - (a1.getId().compareTo(a2.getId()));
2642 return -result;
2643 }// End if (orderType == ASC)
2644 }// End if (orderOn == ORDER_ON_START_OFFSET)
2645
2646 // ORDER_ON_END_OFFSET ?
2647 if (orderOn == ORDER_ON_END_OFFSET){
2648 int result = a1.getEndNode().getOffset().compareTo(
2649 a2.getEndNode().getOffset());
2650 if (orderType == ASC){
2651 // ASC
2652 // If they are equal then their ID will decide.
2653 if (result == 0)
2654 return - (a1.getId().compareTo(a2.getId()));
2655 return result;
2656 }else{
2657 // DESC
2658 // If they are equal then their ID will decide.
2659 if (result == 0)
2660 return a1.getId().compareTo(a2.getId());
2661 return - result;
2662 }// End if (orderType == ASC)
2663 }// End if (orderOn == ORDER_ON_END_OFFSET)
2664
2665 // ORDER_ON_ANNOT_ID ?
2666 if (orderOn == ORDER_ON_ANNOT_ID){
2667 if (orderType == ASC)
2668 return a1.getId().compareTo(a2.getId());
2669 else
2670 return -(a1.getId().compareTo(a2.getId()));
2671 }// End if
2672 return 0;
2673 }//compare()
2674 } // End inner class AnnotationComparator
2675
2676
2677 private transient Vector documentListeners;
2678 private transient Vector gateListeners;
2679
2680 public synchronized void removeDocumentListener(DocumentListener l) {
2681 if (documentListeners != null && documentListeners.contains(l)) {
2682 Vector v = (Vector) documentListeners.clone();
2683 v.removeElement(l);
2684 documentListeners = v;
2685 }
2686 }
2687 public synchronized void addDocumentListener(DocumentListener l) {
2688 Vector v = documentListeners == null ? new Vector(2) : (Vector) documentListeners.clone();
2689 if (!v.contains(l)) {
2690 v.addElement(l);
2691 documentListeners = v;
2692 }
2693 }
2694
2695 protected void fireAnnotationSetAdded(DocumentEvent e) {
2696 if (documentListeners != null) {
2697 Vector listeners = documentListeners;
2698 int count = listeners.size();
2699 for (int i = 0; i < count; i++) {
2700 ((DocumentListener) listeners.elementAt(i)).annotationSetAdded(e);
2701 }
2702 }
2703 }
2704
2705 protected void fireAnnotationSetRemoved(DocumentEvent e) {
2706 if (documentListeners != null) {
2707 Vector listeners = documentListeners;
2708 int count = listeners.size();
2709 for (int i = 0; i < count; i++) {
2710 ((DocumentListener) listeners.elementAt(i)).annotationSetRemoved(e);
2711 }
2712 }
2713 }
2714
2715 protected void fireContentEdited(DocumentEvent e) {
2716 if (documentListeners != null) {
2717 Vector listeners = documentListeners;
2718 int count = listeners.size();
2719 for (int i = 0; i < count; i++) {
2720 ((DocumentListener) listeners.elementAt(i)).contentEdited(e);
2721 }
2722 }
2723 }
2724
2725 public void resourceLoaded(CreoleEvent e) {
2726 }
2727 public void resourceUnloaded(CreoleEvent e) {
2728 }
2729 public void datastoreOpened(CreoleEvent e) {
2730 }
2731 public void datastoreCreated(CreoleEvent e) {
2732 }
2733 public void resourceRenamed(Resource resource, String oldName,
2734 String newName){
2735 }
2736 public void datastoreClosed(CreoleEvent e) {
2737 if (! e.getDatastore().equals(this.getDataStore()))
2738 return;
2739 //close this lr, since it cannot stay open when the DS it comes from
2740 //is closed
2741 Factory.deleteResource(this);
2742 }
2743 public void setLRPersistenceId(Object lrID) {
2744 super.setLRPersistenceId( lrID);
2745 //make persistent documents listen to the creole register
2746 //for events about their DS
2747 Gate.getCreoleRegister().addCreoleListener(this);
2748 }
2749 public void resourceAdopted(DatastoreEvent evt) {
2750 }
2751 public void resourceDeleted(DatastoreEvent evt) {
2752 if(! evt.getSource().equals(this.getDataStore()))
2753 return;
2754 //if an open document is deleted from a DS, then
2755 //it must close itself immediately, as is no longer valid
2756 if(evt.getResourceID().equals(this.getLRPersistenceId()))
2757 Factory.deleteResource(this);
2758 }
2759 public void resourceWritten(DatastoreEvent evt) {
2760 }
2761 public void setDataStore(DataStore dataStore) throws gate.persist.PersistenceException {
2762 super.setDataStore( dataStore);
2763 if (this.dataStore != null)
2764 this.dataStore.addDatastoreListener(this);
2765 }
2766
2767 /**
2768 * This method added by Shafirin Andrey, to allow access to
2769 * protected member {@link #defaultAnnots}
2770 * Required for JAPE-Debugger.
2771 * */
2772 public void setDefaultAnnotations(AnnotationSet defaultAnnotations) {
2773 defaultAnnots = defaultAnnotations;
2774 }
2775
2776} // class DocumentImpl
2777