| RepositioningInfo.java |
1 /*
2 * RepositioningInfo.java
3 *
4 * Copyright (c) 1998-2004, The University of Sheffield.
5 *
6 * This file is part of GATE (see http://gate.ac.uk/), and is free
7 * software, licenced under the GNU Library General Public License,
8 * Version 2, June 1991 (in the distribution as file licence.html,
9 * and also available at http://gate.ac.uk/gate/licence.html).
10 *
11 * Angel Kirilov, 04/January/2002
12 *
13 * $Id: RepositioningInfo.java,v 1.10 2004/07/21 17:10:03 akshay Exp $
14 */
15
16 package gate.corpora;
17
18 import java.io.Serializable;
19 import java.util.ArrayList;
20
21 import gate.util.Out;
22
23 /**
24 * RepositioningInfo keep information about correspondence of positions
25 * between the original and extracted document content. With this information
26 * this class could be used for computing of this correspondence in the strict
27 * way (return -1 where is no correspondence)
28 * or in "flow" way (return near computable position)
29 */
30
31 public class RepositioningInfo extends ArrayList {
32
33 /** Freeze the serialization UID. */
34 static final long serialVersionUID = -2895662600168468559L;
35 /** Debug flag */
36 private static final boolean DEBUG = false;
37
38 /**
39 * Just information keeper inner class. No significant functionality.
40 */
41 public class PositionInfo implements Serializable {
42
43 /** Freeze the serialization UID. */
44 static final long serialVersionUID = -7747351720249898499L;
45
46 /** Data members for one peace of text information */
47 private long m_origPos, m_origLength, m_currPos, m_currLength;
48
49 /** The only constructor. We haven't set methods for data members. */
50 public PositionInfo(long orig, long origLen, long curr, long currLen) {
51 m_origPos = orig;
52 m_origLength = origLen;
53 m_currPos = curr;
54 m_currLength = currLen;
55 } // PositionInfo
56
57 /** Position in the extracted (and probably changed) content */
58 public long getCurrentPosition() {
59 return m_currPos;
60 } // getCurrentPosition
61
62 /** Position in the original content */
63 public long getOriginalPosition() {
64 return m_origPos;
65 } // getOriginalPosition
66
67 /** Length of peace of text in the original content */
68 public long getOriginalLength() {
69 return m_origLength;
70 } // getOriginalLength
71
72 /** Length of peace of text in the extracted content */
73 public long getCurrentLength() {
74 return m_currLength;
75 } // getCurrentLength
76
77 /** For debug purposes */
78 public String toString() {
79 return "("+m_origPos+","+m_origLength+","
80 +m_currPos+","+m_currLength+")";
81 } // toString
82 } // class PositionInfo
83
84 /** Default constructor */
85 public RepositioningInfo() {
86 super();
87 } // RepositioningInfo
88
89 /** Create a new position information record. */
90 public void addPositionInfo(long origPos, long origLength,
91 long currPos, long currLength) {
92 // sorted add of new position
93 int insertPos = 0;
94 PositionInfo lastPI;
95
96 for(int i = size(); i>0; i--) {
97 lastPI = (PositionInfo) get(i-1);
98 if(lastPI.getOriginalPosition() < origPos) {
99 insertPos = i;
100 break;
101 } // if - sort key
102 } // for
103
104 add(insertPos, new PositionInfo(origPos, origLength, currPos, currLength));
105 } // addPositionInfo
106
107 /** Compute position in extracted content by position in the original content.
108 * If there is no correspondence return -1.
109 */
110 public long getExtractedPos(long absPos) {
111 long result = absPos;
112 PositionInfo currPI = null;
113 int size = size();
114
115 if(size != 0) {
116 long origPos, origLen;
117 boolean found = false;
118
119 for(int i=0; i<size; ++i) {
120 currPI = (PositionInfo) get(i);
121 origPos = currPI.getOriginalPosition();
122 origLen = currPI.getOriginalLength();
123
124 if(absPos <= origPos+origLen) {
125 if(absPos < origPos) {
126 // outside the range of information
127 result = -1;
128 }
129 else {
130 // current position + offset in this PositionInfo record
131 result = currPI.getCurrentPosition() + absPos - origPos;
132 } // if
133 found = true;
134 break;
135 } // if
136 } // for
137
138 if(!found) {
139 // after the last repositioning info
140 result = -1;
141 } // if - !found
142 } // if
143
144 return result;
145 } // getExtractedPos
146
147 public long getOriginalPos(long relPos) {
148 return getOriginalPos(relPos, false);
149 } // getOriginalPos
150
151 /** Compute position in original content by position in the extracted content.
152 * If there is no correspondence return -1.
153 */
154 public long getOriginalPos(long relPos, boolean afterChar) {
155 long result = relPos;
156 PositionInfo currPI = null;
157 int size = size();
158
159 if(size != 0) {
160 long currPos, currLen;
161 boolean found = false;
162
163 for(int i=0; i<size; ++i) {
164 currPI = (PositionInfo) get(i);
165 currPos = currPI.getCurrentPosition();
166 currLen = currPI.getCurrentLength();
167
168 if(afterChar && relPos == currPos+currLen) {
169 result = currPI.getOriginalPosition() + currPI.getOriginalLength();
170 found = true;
171 break;
172 } // if
173
174 if(relPos < currPos+currLen) {
175 if(relPos < currPos) {
176 // outside the range of information
177 result = -1;
178 }
179 else {
180 // current position + offset in this PositionInfo record
181 result = currPI.getOriginalPosition() + relPos - currPos;
182 } // if
183 found = true;
184 break;
185 } // if
186 } // for
187
188 if(!found) {
189 // after the last repositioning info
190 result = -1;
191 } // if - !found
192 } // if
193
194 return result;
195 } // getOriginalPos
196
197 /** Not finished yet */
198 public long getExtractedPosFlow(long absPos) {
199 long result = -1;
200 return result;
201 } // getExtractedPosFlow
202
203 /** Not finished yet */
204 public long getOriginalPosFlow(long relPos) {
205 long result = -1;
206 return result;
207 } // getOriginalPosFlow
208
209 /**
210 * Return the position info index containing <B>@param absPos</B>
211 * If there is no such position info return -1.
212 */
213 public int getIndexByOriginalPosition(long absPos) {
214 PositionInfo currPI = null;
215 int result = -1;
216
217 int size = size();
218 long origPos, origLen;
219
220 // Find with the liniear algorithm. Could be extended to binary search.
221 for(int i=0; i<size; ++i) {
222 currPI = (PositionInfo) get(i);
223 origPos = currPI.getOriginalPosition();
224 origLen = currPI.getOriginalLength();
225
226 if(absPos <= origPos+origLen) {
227 if(absPos >= origPos) {
228 result = i;
229 } // if
230 break;
231 } // if
232 } // for
233
234 return result;
235 } // getItemByOriginalPosition
236
237 /**
238 * Return the position info index containing <B>@param absPos</B>
239 * or the index of record before this position.
240 * Result is -1 if the position is before the first record.
241 * Rezult is size() if the position is after the last record.
242 */
243 public int getIndexByOriginalPositionFlow(long absPos) {
244 PositionInfo currPI = null;
245
246 int size = size();
247 int result = size;
248 long origPos, origLen;
249
250 // Find with the liniear algorithm. Could be extended to binary search.
251 for(int i=0; i<size; ++i) {
252 currPI = (PositionInfo) get(i);
253 origPos = currPI.getOriginalPosition();
254 origLen = currPI.getOriginalLength();
255
256 if(absPos <= origPos+origLen) {
257 // is inside of current record
258 if(absPos >= origPos) {
259 result = i;
260 }
261 else {
262 // not inside the current recort - return previous
263 result = i-1;
264 } // if
265 break;
266 } // if
267 } // for
268
269 return result;
270 } // getItemByOriginalPositionFlow
271
272 /**
273 * Correct the RepositioningInfo structure for shrink/expand changes.
274 * <br>
275 *
276 * Normaly the text peaces have same sizes in both original text and
277 * extracted text. But in some cases there are nonlinear substitutions.
278 * For example the sequence "<" is converted to "<".
279 * <br>
280 *
281 * The correction will split the corresponding PositionInfo structure to
282 * 3 new records - before correction, correction record and after correction.
283 * Front and end records are the same maner like the original record -
284 * m_origLength == m_currLength, since the middle record has different
285 * values because of shrink/expand changes. All records after this middle
286 * record should be corrected with the difference between these values.
287 * <br>
288 *
289 * All m_currPos above the current information record should be corrected
290 * with (origLen - newLen) i.e.
291 * <code> m_currPos -= origLen - newLen; </code>
292 * <br>
293 *
294 * @param originalPos Position of changed text in the original content.
295 * @param origLen Length of changed peace of text in the original content.
296 * @param newLen Length of new peace of text substiting the original peace.
297 */
298 public void correctInformation(long originalPos, long origLen, long newLen) {
299 PositionInfo currPI;
300 PositionInfo frontPI, correctPI, endPI;
301
302 int index = getIndexByOriginalPositionFlow(originalPos);
303
304 // correct the index when the originalPos precede all records
305 if(index == -1) {
306 index = 0;
307 } // if
308
309 // correction of all other information records
310 // All m_currPos above the current record should be corrected with
311 // (origLen - newLen) i.e. <code> m_currPos -= origLen - newLen; </code>
312
313 for(int i=index; i<size(); ++i) {
314 currPI = (PositionInfo) get(i);
315 currPI.m_currPos -= origLen - newLen;
316 } // for
317
318 currPI = (PositionInfo) get(index);
319 if(originalPos >= currPI.m_origPos
320 && currPI.m_origPos + currPI.m_origLength >= originalPos + origLen) {
321 long frontLen = originalPos - currPI.m_origPos;
322
323 frontPI = new PositionInfo(currPI.m_origPos,
324 frontLen,
325 currPI.m_currPos,
326 frontLen);
327 correctPI = new PositionInfo(originalPos,
328 origLen,
329 currPI.m_currPos + frontLen,
330 newLen);
331 long endLen = currPI.m_origLength - frontLen - origLen;
332 endPI = new PositionInfo(originalPos + origLen,
333 endLen,
334 currPI.m_currPos + frontLen + newLen,
335 endLen);
336
337 set(index, frontPI); // substitute old element
338 if(endPI.m_origLength > 0) {
339 add(index+1, endPI); // insert new end element
340 } // if
341 if(correctPI.m_origLength > 0) {
342 add(index+1, correctPI); // insert middle new element
343 } // if
344 } // if - substitution range check
345 } // correctInformation
346
347 /**
348 * Correct the original position information in the records. When some text
349 * is shrinked/expanded by the parser. With this method is corrected the
350 * substitution of "\r\n" with "\n".
351 */
352 public void correctInformationOriginalMove(long originalPos, long moveLen) {
353 PositionInfo currPI;
354
355 if(DEBUG) {
356 if(originalPos < 380) // debug information restriction
357 Out.println("Before correction: "+this);
358 } // DEBUG
359
360 int index = getIndexByOriginalPositionFlow(originalPos);
361
362 // correct the index when the originalPos precede all records
363 if(index == -1) {
364 index = 0;
365 } // if
366
367 // position is after all records in list
368 if(index == size()) {
369 return;
370 } // if
371
372 for(int i = index+1; i<size(); ++i) {
373 currPI = (PositionInfo) get(i);
374 currPI.m_origPos += moveLen;
375 } // for
376
377 currPI = (PositionInfo) get(index);
378
379 // should we split this record to two new records (inside the record)
380 if(originalPos > currPI.m_origPos) {
381 if(originalPos < currPI.m_origPos + currPI.m_origLength) {
382 PositionInfo frontPI, endPI;
383 long frontLen = originalPos - currPI.m_origPos;
384 frontPI = new PositionInfo(currPI.m_origPos,
385 frontLen,
386 currPI.m_currPos,
387 frontLen);
388
389 long endLen = currPI.m_origLength - frontLen;
390 endPI = new PositionInfo(originalPos + moveLen,
391 endLen,
392 currPI.m_currPos + frontLen,
393 endLen);
394 set(index, frontPI); // substitute old element
395 if(endPI.m_origLength != 0) {
396 add(index+1, endPI); // insert new end element
397 } // if - should add this record
398
399 if(DEBUG) {
400 if(originalPos < 380) { // debug information restriction
401 Out.println("Point 2. Current: "+currPI);
402 Out.println("Point 2. frontPI: "+frontPI);
403 Out.println("Point 2. endPI: "+endPI);
404 }
405 } // DEBUG
406 } // if - inside the record
407 } // if
408 else {
409 // correction if the position is before the current record
410 currPI.m_origPos += moveLen;
411 }
412
413 if(DEBUG) {
414 if(originalPos < 380) {
415 Out.println("Correction move: "+originalPos+", "+moveLen);
416 Out.println("Corrected: "+this);
417 Out.println("index: "+index);
418 /*
419 Exception ex = new Exception();
420 Out.println("Call point: ");
421 ex.printStackTrace();
422 */
423 }
424 } // DEBUG
425 } // correctInformationOriginalMove
426
427 } // class RepositioningInfo