| Sgml2Xml.java |
1 /*
2 * Sgml2Xml.java
3 *
4 * Copyright (c) 1998-2004, The University of Sheffield.
5 *
6 * This file is part of GATE (see http://gate.ac.uk/), and is free
7 * software, licenced under the GNU Library General Public License,
8 * Version 2, June 1991 (in the distribution as file licence.html,
9 * and also available at http://gate.ac.uk/gate/licence.html).
10 *
11 * Cristian URSU, 4/July/2000
12 *
13 * $Id: Sgml2Xml.java,v 1.15 2004/07/21 17:10:09 akshay Exp $
14 */
15
16 package gate.sgml;
17
18 import java.io.File;
19 import java.io.IOException;
20 import java.net.MalformedURLException;
21 import java.util.*;
22
23 import gate.Document;
24 import gate.util.Files;
25
26
27 /**
28 * Not so fast...
29 * This class is not a realy Sgml2Xml convertor.
30 * It takes an SGML document and tries to prepare it for an XML parser
31 * For a true conversion we need an Java SGML parser...
32 * If you know one let me know....
33 *
34 * What does it do:
35 * <ul>
36 * <li>If it finds something like this : <element attribute = value>
37 * it will produce: <element attribute = "value">
38 * <li>If it finds something like this : <element something
39 * attribute2=value>it will produce : <element
40 * defaultAttribute="something" attribute2="value">
41 * <li>If it finds : <element att1='value1 value2' att2="value2
42 * value3"> it will produce: <element att1="value1 value2"
43 * att2="value2 value3">
44 * <li>If it finds : <element1> <elem>text </element1>
45 * will produce: <element1> <elem>text<elem>
46 * </element1>
47 * <li>If it find : <element1> <elem>[white spaces]
48 * </element1>,
49 * it will produce:<element1> <elem/>[white spaces]<
50 * /element1>
51 * </ul>
52 * What doesn't:
53 * <ul>
54 * <li>Doesn't expand the entities. So the entities from the SGML document
55 * must be resolved by the XML parser
56 * <li>Doesn't replace internal entities with their corresponding value
57 * </ul>
58 */
59
60 public class Sgml2Xml{
61
62 /** Debug flag */
63 private static final boolean DEBUG = false;
64
65 /**
66 * The constructor initialises some member fields
67 * @param SgmlDoc the content of the Sgml document that will be modified
68 */
69 public Sgml2Xml(String SgmlDoc){
70 // create a new modifier
71 m_modifier = new StringBuffer(SgmlDoc);
72 // create a new dobiousElements list
73 // se the explanatin at the end of the class
74 dubiousElements = new ArrayList();
75 stack = new Stack();
76 }
77
78 /**
79 * The other constructor
80 * @param doc The Gate document that will be transformed to XML
81 */
82 public Sgml2Xml(Document doc){
83 // set as a member
84 m_doc = doc;
85
86 // create a new modifier
87 m_modifier = new StringBuffer(m_doc.getContent().toString());
88
89 // create a new dobiousElements list
90 // se the explanatin at the end of the class
91 dubiousElements = new ArrayList();
92 stack = new Stack();
93
94 }
95
96 /* I keep this just in case I need some more debuging
97
98 public static void main(String[] args){
99 Sgml2Xml convertor =
100 new Sgml2Xml("<w VVI='res trtetre\" relu = \"stop\">say
101 <w VBZ>is\n<trunc> <w UNC>th </trunc>");
102 try{
103 Out.println(convertor.convert());
104 } catch (Exception e){
105 e.printStackTrace(Err.getPrintWriter());
106 }
107 }
108 */
109
110 /**
111 * It analises the char that was red in state 1
112 * If it finds '<' it then goes to state 2
113 * Otherwise it stays in state 1 and keeps track about the text that is not
114 * white spaces.
115 */
116 private void doState1(char currChar){
117 if ('<' == currChar){
118 // change to state 2
119 m_currState = 2;
120 if (!stack.isEmpty()){
121 // peek the element from the top of the stack
122 CustomObject o = (CustomObject) stack.peek();
123 // set some properties for this element
124 // first test to find out if text folows this element charPos > 0
125 if (charPos > 0){
126 // this is not an empty element because there is text that follows
127 // set the element from the top of the stack to be a non empty one
128 o.setClosePos(charPos);
129 o.setEmpty(false);
130 // reset the charPos
131 charPos = 0;
132 }//if (charPos > 0)
133 }//if (!stack.isEmpty())
134 }//if ('<' == m_currChar)
135 // if currChar is not whiteSpace then save the position of the last
136 // char that was read
137 if (('<' != currChar) && !isWhiteSpace(currChar))
138 charPos = m_cursor;
139 }//doState1
140
141 /**
142 We came from state 1 and just read '<'
143 If currChar == '/' -> state 11
144 If is a char != white spaces -> state 3
145 stay in state 2 while there are only white spaces
146 */
147 private void doState2(char currChar){
148 if ('/' == currChar){
149 // go to state 11
150 m_currState = 11;
151 }
152 // if currChar is a char != white spaces then go to state 3
153 if (('/' != m_currChar) && !isWhiteSpace(m_currChar)){
154 // save the position where starts the element's name
155 // we need that in order to be able to read the current tag name
156 // this name it will be read from m_modifier using the substring() method
157 elemNameStart = m_cursor -1;
158 // go to state 3
159 m_currState = 3;
160 }
161 }// doState2
162
163 /**
164 * Just read the first char from the element's name and now analize the next
165 * char.
166 * If '>' the elem name was a single char -> state 1
167 * IF is WhiteSpaces -> state 4
168 * Otherwise stay in state 3 and read the elemnt's name
169 */
170 private void doState3(char currChar){
171 if ( '>' == currChar ){
172
173 // save the pos where the element's name ends
174 elemNameEnd = m_cursor - 1;
175
176 // this is also the pos where to insert '/' for empty elements.
177 // In this case we have this situation <w> sau < w>
178 closePos = m_cursor - 1;
179
180 // get the name of the element
181 elemName = m_modifier.substring(elemNameStart,elemNameEnd);
182
183 // we put the element into stack
184 // we think in this point that the element is empty...
185 performFinalAction(elemName, closePos);
186
187 // go to state 1
188 m_currState = 1;
189 }
190 if (isWhiteSpace(currChar)){
191 // go to state 4
192 m_currState = 4;
193
194 // save the pos where the element's name ends
195 elemNameEnd = m_cursor - 1;
196
197 // get the name of the element
198 elemName = m_modifier.substring(elemNameStart,elemNameEnd);
199 }
200 }// doState3
201
202 /**
203 * We read the name of the element and we prepare for '>' or attributes
204 * '>' -> state 1
205 * any char !- white space -> state 5
206 */
207 private void doState4(char currChar){
208 if ( '>' == currChar ){
209 // this is also the pos where to insert '/' for empty elements in this case
210 closePos = m_cursor -1 ;
211
212 // we put the element into stack
213 // we think in this point that the element is empty...
214 performFinalAction(elemName, closePos);
215
216 // go to state 1
217 m_currState = 1;
218 }
219 if (( '>' != currChar ) && !isWhiteSpace(currChar)){
220 // we just read the first char from the attrib name or attrib value..
221 // go to state 5
222 m_currState = 5;
223
224 // remember the position where starts the attrib or the value of an attrib
225 attrStart = m_cursor - 1;
226 }
227 } // doState4
228
229 /**
230 * '=' -> state 6
231 * '>' -> state 4 (we didn't read an attribute but a value of the
232 * defaultAtt )
233 * WS (white spaces) we don't know yet if we read an attribute or the value
234 * of the defaultAttr -> state 10
235 * This state modifies the content onf m_modifier ... it adds text
236 */
237 private void doState5(char currChar){
238 if ( '=' == currChar )
239 m_currState = 6;
240 if ( '>' == currChar ){
241 // this mean that the attribute was a value and we have to create
242 // a default attribute
243 // the same as in state 10
244 attrEnd = m_cursor - 1 ;
245 m_modifier.insert(attrEnd,'"');
246 m_modifier.insert(attrStart,"defaultAttr=\"");
247
248 // go to state 4
249 m_currState = 4;
250
251 // parse again the entire sequence from state 4 before reading any char
252 m_cursor = attrStart;
253 }
254 if (isWhiteSpace(currChar)){
255 // go to state 10
256 m_currState = 10;
257
258 // record the position where ends this attribute
259 attrEnd = m_cursor - 1;
260 }
261 } // doState5
262
263 /**
264 * IF we read ' or " then we have to get prepared to read everything until
265 * the next ' or "
266 * If we read a char then -> state 8;
267 * Stay here while we read WS
268 */
269 private void doState6(char currChar){
270 if ( ('\'' == currChar) || ('"' == currChar) ){
271 endPair = currChar;
272 if ('\'' == currChar){
273
274 // we have to replace ' with "
275 m_modifier = m_modifier.replace(m_cursor - 1, m_cursor,"\"");
276 }
277 m_currState = 7;
278 }
279 if ( ('\'' != currChar) && ('"' != currChar) && !isWhiteSpace(currChar)){
280
281 // this means that curChar is any char
282 m_currState = 8;
283
284 // every value must be inside this pair""
285 m_modifier.insert(m_cursor - 1, '"');
286
287 // insert implies the modification of m_cursor
288 // we increment m_cursor in order to say in the same position and to
289 // anulate the efect of insert.
290 m_cursor ++;
291 }
292 }// doState6
293
294 /**
295 * If we find the pair ' or " go to state 9
296 * Otherwhise read everything and stay in state 7
297 * If in state 7 we read '>' then we add automaticaly a " at the end and go
298 * to state 1
299 */
300 private void doState7(char currChar){
301 //if ( ('\'' == currChar) || ('"' == currChar) ){
302 if ( endPair == currChar ){
303 if ('\'' == currChar){
304
305 // we have to replace ' with "
306 m_modifier = m_modifier.replace(m_cursor - 1, m_cursor,"\"");
307 }
308 // reset the endPair
309 endPair = ' ';
310 m_currState = 9;
311 }
312
313 if ('>' == currChar){
314 // go to state 1
315 m_currState = 1;
316
317 // insert the final " ata the end
318 m_modifier.insert(m_cursor - 1, '"');
319
320 // go to te current possition (because of insert)
321 m_cursor ++;
322
323 performFinalAction(elemName, m_cursor - 1);
324 }
325
326 }// doState7
327
328 /**
329 * If '>' go to state 1
330 * If WS go to state 9
331 * Stays in state 8 and read the attribute's value
332 */
333 private void doState8(char currChar){
334
335 if ('>' == currChar){
336 // go to state 1
337 m_currState = 1;
338
339 // complete the end " ( <elem attr="value> )
340 m_modifier.insert(m_cursor - 1, '"');
341
342 // go to te current possition (because of insert)
343 m_cursor ++;
344
345 // we finished to read a beggining tag
346 // see the method definition for more details
347 performFinalAction(elemName, m_cursor - 1);
348 }
349 if (isWhiteSpace(currChar)){
350 // go to state 9
351 m_currState = 9;
352
353 // add the ending " char
354 m_modifier.insert(m_cursor - 1, '"');
355
356 // increment the cursor in order to anulate the effect of insert
357 m_cursor ++;
358 }
359 } // doState8
360 /**
361 * Here we prepare to read another attrib, value pair (any char -> state 5)
362 * If '>' we just read a beggining tag -> state 1
363 * Stay here while read WS
364 */
365 private void doState9(char currChar){
366 if ('>' == currChar){
367 // go to state 1
368 m_currState = 1;
369
370 // add the object to the stack
371 performFinalAction(elemName, m_cursor - 1);
372 }
373 if (('>' != currChar) && !isWhiteSpace(m_currChar)){
374 // this is the same as state 4->5
375 m_currState = 5;
376 attrStart = m_cursor - 1;
377 }
378 }//doState9
379
380 /**
381 * If any C -> state 4
382 * If '=' state 6
383 * Stays here while reads WS
384 */
385 private void doState10(char currChar){
386 if ('=' == currChar)
387 m_currState = 6;
388 if ( ('=' != currChar) && !isWhiteSpace(currChar)){
389 // this mean that the attribute was a value and we have to create
390 // a default attribute
391 m_modifier.insert(attrEnd,'"');
392 m_modifier.insert(attrStart,"defaultAttr=\"");
393
394 // go to state 4
395 m_currState = 4;
396
397 m_cursor = attrStart;
398 }
399 }// doState10
400
401 /**
402 * We are preparing to read the and definition of an element
403 * Stays in this state while reading WS
404 */
405 private void doState11(char currChar){
406 if (!isWhiteSpace(currChar)){
407 m_currState = 12;
408 elemNameStart = m_cursor - 1;
409 }
410 } // doState11
411
412 /**
413 * Here we read the element's name ...this is an end tag
414 * Stays here while reads a char
415 */
416 private void doState12(char currChar) {
417 if ('>' == currChar){
418 elemNameEnd = m_cursor - 1;
419 elemName = m_modifier.substring(elemNameStart,elemNameEnd);
420 performActionWithEndElem(elemName);
421 m_currState = 1;
422 }
423 if (isWhiteSpace(currChar)){
424 m_currState = 13;
425 elemNameEnd = m_cursor - 1;
426 }
427 }//doState12
428
429 /**
430 * If '>' -> state 1
431 * Stays here while reads WS
432 */
433 private void doState13(char currChar) {
434 if ('>' == currChar){
435 elemName = m_modifier.substring(elemNameStart,elemNameEnd);
436 performActionWithEndElem(elemName);
437 m_currState = 1;
438 }
439 } // doState13
440
441 /**
442 This method is responsable with document conversion
443 */
444 public String convert()throws IOException,MalformedURLException {
445 while (thereAreCharsToBeProcessed()) {
446 // read() gets the next char and increment the m_cursor
447 m_currChar = read();
448 switch(m_currState){
449 case 1: doState1(m_currChar);break;
450 case 2: doState2(m_currChar);break;
451 case 3: doState3(m_currChar);break;
452 case 4: doState4(m_currChar);break;
453 case 5: doState5(m_currChar);break;
454 case 6: doState6(m_currChar);break;
455 case 7: doState7(m_currChar);break;
456 case 8: doState8(m_currChar);break;
457 case 9: doState9(m_currChar);break;
458 case 10: doState10(m_currChar);break;
459 case 11: doState11(m_currChar);break;
460 case 12: doState12(m_currChar);break;
461 case 13: doState13(m_currChar);break;
462 }// switch(m_currState)
463 }// while (thereAreCharsToBeProcessed())
464
465 // put all the elements from the stack into the dubiousElements list
466 // we do that in order to colect all the dubious elements
467 while (!stack.isEmpty()) {
468 CustomObject obj = (CustomObject) stack.pop();
469 dubiousElements.add(obj);
470 }
471
472 // sort the dubiousElements list descending on closePos...
473 // This is vital for the alghorithm because we have to make
474 // all the modifications from the bottom to the top...
475 // If we fail to do that, insert will change indices and
476 // CustomObject.getClosePos() will not be acurate anymore.
477 Collections.sort(dubiousElements, new MyComparator());
478
479 //here we resolve all the dubious Elements...
480 // see the description of makeFinalModifications() method
481 ListIterator listIterator = dubiousElements.listIterator();
482 while (listIterator.hasNext()){
483 CustomObject obj = (CustomObject) listIterator.next();
484 makeFinalModifications(obj);
485 }
486
487 //finally add the XML prolog
488 m_modifier.insert(0,"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
489 //Out.println(m_modifier.toString());
490 /*
491 // get a InputStream from m_modifier and write it into a temp file
492 // finally return the URI of the new XML document
493 ByteArrayInputStream is = new ByteArrayInputStream(
494 m_modifier.toString().getBytes()
495 );
496 */
497 // this method is in gate.util package
498 File file = Files.writeTempFile(m_modifier.toString(),"UTF-8");
499
500 //return m_doc.getSourceURL().toString();
501 return file.toURL().toString();
502 }// convert()
503
504 /**
505 * This method tests to see if there are more char to be read
506 * It will return false when there are no more chars to be read
507 */
508 private boolean thereAreCharsToBeProcessed() {
509 if (m_cursor < m_modifier.length()) return true;
510 else return false;
511 }//thereAreCharsToBeProcessed
512
513 /**
514 * This method reads a char and increments the m_cursor
515 */
516 private char read(){
517 return m_modifier.charAt(m_cursor ++);
518 }//read
519
520 /**
521 * This is the action when we finished to read the entire tag
522 * The action means that we put the tag into stack and consider that is empty
523 * as default
524 */
525 private void performFinalAction(String elemName, int pos) {
526 // create anew CustomObject
527 CustomObject obj = new CustomObject();
528
529 // set its properties
530 obj.setElemName(elemName);
531 obj.setClosePos(pos);
532
533 // default we consider every element to be empty
534 // in state 1 we modify that if the element is followed by text
535 obj.setEmpty(true);
536 stack.push(obj);
537 } // performFinalAction
538
539 /**
540 * This is the action performed when an end tag is read.
541 * The action consists in colecting all the dubiosElements(elements without
542 * an end tag). They are considered dubious because we don't know if they
543 * are empty or may be closed... Only the DTD can provide this information.
544 * We don't have a DTD so we will consider that all dubious elements
545 * followed by text will close at the end of the text...
546 * If a dubious element is followed by another element then is
547 * automaticaly considered an empty element.
548 *
549 * @param elemName is the the name of the end tag that was read
550 */
551 private void performActionWithEndElem(String elemName) {
552 CustomObject obj = null;
553 boolean stop = false;
554
555 // get all the elements that are dubious from the stack
556 // the iteration will stop when an element is equal with elemName
557 while (!stack.isEmpty() && !stop){
558
559 // eliminate the object from the stack
560 obj = (CustomObject) stack.pop();
561
562 //if its elemName is equal with the param elemName we stop the itteration
563 if (obj.getElemName().equalsIgnoreCase(elemName)) stop = true;
564
565 // otherwhise add the element to the doubiousElements list
566 else dubiousElements.add(obj);
567 }
568 }//performActionWithEndElem
569
570 /**
571 * This method is called after we read the entire SGML document
572 * It resolves the dobious Elements this way:
573 * <ul>
574 * <li>
575 * 1. We don't have a DTD so we will consider that all dubious elements
576 * followed by text will close at the end of the text...
577 * <li>
578 * 2. If a dubious element is followed by another element then is
579 automaticaly considered an empty element.
580 *
581 * An element is considered dubious when we don't know if it is empty
582 * or may be closed...
583 *
584 * @param aCustomObject an object from the dubiousElements list
585 */
586 private void makeFinalModifications(CustomObject aCustomObject) {
587 String endElement = null;
588 // if the element is empty then we add / before > like this:
589 // <w> -> <w/>
590 if (aCustomObject.isEmpty())
591 m_modifier.insert(aCustomObject.getClosePos(),"/");
592 // otherwhise we create an end element
593 // <w> -> </w>
594 else{
595 // create the end element
596 endElement = "</" + aCustomObject.getElemName() + ">";
597 // insert it where the closePos indicates
598 m_modifier.insert(aCustomObject.getClosePos(), endElement);
599 }
600 } // makeFinalModifications
601
602 /**
603 * Tests if c is a white space char
604 */
605 private boolean isWhiteSpace(char c) {
606 return Character.isWhitespace(c);
607 }
608
609 // this is a gate Document... It's content will be transferred to
610 // m_modifier
611 private Document m_doc = null;
612
613 // this is the modifier that will transform an SGML document into an
614 // XML document
615 private StringBuffer m_modifier = null;
616
617 // we need the stack to be able to remember the order of the tags
618 private Stack stack = null;
619
620 // this is a list with all the tags that are not colsed...
621 // some of them are empty tags and some of them are not...
622 private List dubiousElements = null;
623
624 // this is tre current position inside the modifier
625 private int m_cursor = 0;
626
627 // the current state of the SGML2XML automata
628 private int m_currState = 1;
629
630 // the char that was read from the m_modifier @ position m_cursor
631 private char m_currChar = ' ';
632
633 // the fields above are used by the convert method and its auxiliary functions
634 // like doState1...13()
635
636 // indicates the last position of a text character (one which is not a white
637 // space)
638 // it is used in doState1() when we have to decide if an element is empty or
639 // not
640 // We decide that based on this field
641 // If the charPos > 0 then it means that the object from the top of stack
642 // is followed by text and we consider that is not empty
643 private int charPos = 0;
644
645 // is the current tag name
646 private String elemName = null;
647
648 // indicates where in the m_modifier begins the current tag elemName
649 private int elemNameStart = 0;
650
651 // indicates where in the m_modifier ends the current tag elemName
652 // we need that in order to be able to read the current tag name
653 // this name it will be read from m_modifier using the substring() method
654 // it will be something like this :
655 // elemName = m_modifier.substring(elemNameStart,elemNameEnd)
656 // Eg: <w attr1=val1> -> <[elemNameStart]w[elemNameEnd] [attr1=val1>
657 private int elemNameEnd = 0;
658
659 // this is the position there a start tag ends like this:
660 // Eg: <w attr1=val1> -> <w attr1=val1 [closePos]>
661 private int closePos = 0;
662
663 //this is the position where an attribute starts...
664 // we need it when we have to add the defaultAttr (see state 5)
665 private int attrStart = 0;
666
667 //this is the position where an attribute ends...
668 // we need it when we have to add the defaultAttr (see state 5) or to add "
669 // Eg: <w attr1=val1> -> <w [attrStart]attr1[attrEnd]=val1>
670 private int attrEnd = 0;
671
672 // endPair field is used in states 6 and 7....
673 // When we read something like this :
674 // attr=' val1 val2 val3' endPair remembers what is the pair for the beginning
675 // string
676 // Note that a combination like: attr = ' val1 val2 " will have an unexpected
677 // behaviour...
678 // We need this field when we have the following situation
679 // attr1 = " val1 val2 ' val3" . We need to know what is the end pair for ".
680 // In this case we can't allow ' to be the endPair
681 private char endPair = ' ';
682
683 } // class Sgml2Xml
684
/**
 * Describes a start tag that was read by the converter: the name of the
 * element, a position inside the converted text and a flag telling whether
 * the element is considered empty.
 * The objects belonging to this class are used inside the stack.
 */
class CustomObject {

  // the name of the element
  private String elemName;

  // the position associated with the element
  private int closePos;

  // true when the element is considered empty
  private boolean empty;

  /** Creates an object with no name, position 0 and the empty flag off. */
  public CustomObject() {
    this.elemName = null;
    this.closePos = 0;
    this.empty = false;
  }

  /** @return the name of the element, or null if it was never set */
  public String getElemName() {
    return this.elemName;
  }

  /** @return the position stored for this element */
  public int getClosePos() {
    return this.closePos;
  }

  /** @return the value of the empty flag */
  public boolean isEmpty() {
    return this.empty;
  }

  /** Stores the name of the element. */
  void setElemName(String name) {
    this.elemName = name;
  }

  /** Stores the position associated with the element. */
  void setClosePos(int position){
    this.closePos = position;
  }

  /** Sets or clears the empty flag. */
  void setEmpty(boolean flag) {
    this.empty = flag;
  }

} // CustomObject
731
732 class MyComparator implements Comparator {
733
734 public MyComparator() {
735 }
736
737 public int compare(Object o1, Object o2) {
738 if ( !(o1 instanceof CustomObject) ||
739 !(o2 instanceof CustomObject)) return 0;
740
741 CustomObject co1 = (CustomObject) o1;
742 CustomObject co2 = (CustomObject) o2;
743 int result = 0;
744 if (co1.getClosePos() < co2.getClosePos()) result = -1;
745 if (co1.getClosePos() == co2.getClosePos()) result = 0;
746 if (co1.getClosePos() > co2.getClosePos()) result = 1;
747
748 return -result;
749 } // compare
750
751 }// class MyComparator
752