| DocumentContentImpl.java |
1 /*
2 * DocumentContentImpl.java
3 *
4 * Copyright (c) 1998-2004, The University of Sheffield.
5 *
6 * This file is part of GATE (see http://gate.ac.uk/), and is free
7 * software, licenced under the GNU Library General Public License,
8 * Version 2, June 1991 (in the distribution as file licence.html,
9 * and also available at http://gate.ac.uk/gate/licence.html).
10 *
11 * Hamish Cunningham, 11/Feb/2000
12 *
13 * $Id: DocumentContentImpl.java,v 1.28 2004/07/21 17:10:03 akshay Exp $
14 */
15
16 package gate.corpora;
17
18 import java.io.*;
19 import java.net.URL;
20
21 import gate.DocumentContent;
22 import gate.util.InvalidOffsetException;
23
24 /** Represents the commonalities between all sorts of document contents.
25 */
26 public class DocumentContentImpl implements DocumentContent
27 {
28 /** Debug flag */
29 private static final boolean DEBUG = false;
30
31 /** Buffer size for reading
32 * 16k is 4 times the block size on most filesystems
33 * so it should be efficient for most cases
34 * */
35 private static final int INTERNAL_BUFFER_SIZE = 16*1024;
36
37 /** Default construction */
38 public DocumentContentImpl() {
39 content = new String();
40 } // default construction
41
42 /** Contruction from URL and offsets. */
43 public DocumentContentImpl(URL u, String encoding, Long start, Long end)
44 throws IOException {
45
46 int readLength = 0;
47 char[] readBuffer = new char[INTERNAL_BUFFER_SIZE];
48
49 BufferedReader uReader = null;
50 StringBuffer buf = new StringBuffer();
51 char c;
52 long s = 0, e = Long.MAX_VALUE, counter = 0;
53 if(start != null && end != null) {
54 s = start.longValue();
55 e = end.longValue();
56 }
57
58 if(encoding != null && !encoding.equalsIgnoreCase("")) {
59 uReader = new BufferedReader(
60 new InputStreamReader(u.openStream(), encoding), INTERNAL_BUFFER_SIZE
61 );
62 } else {
63 uReader = new BufferedReader(
64 new InputStreamReader(u.openStream()), INTERNAL_BUFFER_SIZE
65 );
66 };
67
68 // 1. skip S characters
69 uReader.skip(s);
70
71 // 2. how many character shall I read?
72 long toRead = e - s;
73
74 // 3. read gtom source into buffer
75 while (
76 toRead > 0 &&
77 (readLength = uReader.read(readBuffer, 0, INTERNAL_BUFFER_SIZE)) != -1
78 ) {
79 if (toRead < readLength) {
80 //well, if toRead(long) is less than readLenght(int)
81 //then there can be no overflow, so the cast is safe
82 readLength = (int)toRead;
83 }
84
85 buf.append(readBuffer, 0, readLength);
86 toRead -= readLength;
87 }
88
89 // 4.close reader
90 uReader.close();
91
92 content = new String(buf);
93 originalContent = content;
94 } // Contruction from URL and offsets
95
96 /** Propagate changes to the document content. */
97 void edit(Long start, Long end, DocumentContent replacement)
98 {
99 int s = start.intValue(), e = end.intValue();
100 String repl = ((DocumentContentImpl) replacement).content;
101 StringBuffer newContent = new StringBuffer(content);
102 newContent.replace(s, e, repl);
103 content = newContent.toString();
104 } // edit(start,end,replacement)
105
106 /** The contents under a particular span. */
107 public DocumentContent getContent(Long start, Long end)
108 throws InvalidOffsetException
109 {
110 if(! isValidOffsetRange(start, end))
111 throw new InvalidOffsetException();
112
113 return new DocumentContentImpl(
114 content.substring(start.intValue(), end.intValue())
115 );
116 } // getContent(start, end)
117
118 /** Returns the String representing the content in case of a textual document.
119 * NOTE: this is a temporary solution until we have a more generic one.
120 */
121 public String toString(){
122 return content;
123 }
124
125 /** The size of this content (e.g. character length for textual
126 * content).
127 */
128 public Long size() {
129 return new Long(content.length());
130 } // size()
131
132 /** Check that an offset is valid */
133 boolean isValidOffset(Long offset) {
134 if(offset == null)
135 return false;
136
137 long o = offset.longValue();
138 long len = content.length();
139 if(o > len || o < 0)
140 return false;
141
142 return true;
143 } // isValidOffset
144
145 /** Check that both start and end are valid offsets and that
146 * they constitute a valid offset range
147 */
148 boolean isValidOffsetRange(Long start, Long end) {
149 return
150 isValidOffset(start) && isValidOffset(end) &&
151 start.longValue() <= end.longValue();
152 } // isValidOffsetRange(start,end)
153
154 /** Two documents are the same if their contents is the same
155 */
156 public boolean equals(Object other) {
157 if (!(other instanceof DocumentContentImpl)) return false;
158
159 DocumentContentImpl docImpl = (DocumentContentImpl) other;
160 return content.equals(docImpl.toString());
161 } // equals
162
163 /** Calculate the hash value for the object. */
164 public int hashCode(){ return toString().hashCode(); }
165
166 /** Just for now - later we have to cater for different types of
167 * content.
168 */
169 String content;
170
171 /**
172 * For preserving the original content of the document.
173 * The edit command didn't affect on the original content.
174 * If you construct the content by URL the originalContent will keep
175 * whole information retrieved by URL even you set some start and end.
176 */
177 String originalContent;
178
179 /**
180 * Return the original content of the document received during the loading
181 * phase or on construction from string.
182 */
183 public String getOriginalContent() { return originalContent; }
184
185 /** For ranges */
186 public DocumentContentImpl(String s)
187 { content = s; originalContent = content; }
188
189 /** Freeze the serialization UID. */
190 static final long serialVersionUID = -1426940535575467461L;
191 } // class DocumentContentImpl
192