| EntityDescriptor.java |
1 /*
2 * EntityDescriptor.java
3 *
4 * Copyright (c) 1998-2004, The University of Sheffield.
5 *
6 * This file is part of GATE (see http://gate.ac.uk/), and is free
7 * software, licenced under the GNU Library General Public License,
8 * Version 2, June 1991 (in the distribution as file licence.html,
9 * and also available at http://gate.ac.uk/gate/licence.html).
10 *
11 * Valentin Tablan, July/2000
12 *
13 * $Id: EntityDescriptor.java,v 1.7 2004/07/21 17:10:05 akshay Exp $
14 */
15
16 package gate.creole.nerc;
17
18 import java.io.Serializable;
19
20 import gate.Annotation;
21 import gate.Document;
22 import gate.util.InvalidOffsetException;
23
24 /** Represents a single named entity */
25 public class EntityDescriptor implements Serializable{
26
27 /** Constructs a new entity descriptor */
28 public EntityDescriptor(String string, String category, int start, int end) {
29 this.string = normaliseString(string);
30 this.category = category;
31 offsets = new int[2];
32 offsets[0] = start;
33 offsets[1] = end;
34 }
35
36 /** Constructs a new entity descriptor starting from a Gate annotation */
37 public EntityDescriptor(Document document, Annotation annotation) {
38 offsets = new int[2];
39 offsets[0] = annotation.getStartNode().getOffset().intValue();
40 offsets[1] = annotation.getEndNode().getOffset().intValue();
41 try{
42 string = normaliseString(document.getContent().getContent(
43 annotation.getStartNode().getOffset(),
44 annotation.getEndNode().getOffset()).
45 toString());
46 } catch(InvalidOffsetException ioe){
47 ioe.printStackTrace();
48 }
49 category = annotation.getType();
50 }
51
52 /** Returns a normalised string for the entity. This is the string from the
53 * text document the entity was descovered in, with all whitespace sequences
54 * replaced by a single space character
55 */
56 public String getString(){
57 return string;
58 }
59
60 /** Returns the category of the entity*/
61 public String getCategory(){
62 return category;
63 }
64
65 /** Returns a pair of integers specifying the character offsets in the
66 * original file where the entity occured
67 */
68 public int[] getOffsets(){
69 return offsets;
70 }
71
72 /** Returns a string giving the category, offsets and normalised string for
73 * the entity, with no newlines.
74 */
75 public String toString(){
76 return category + " " + offsets[0] + " " + offsets[1] + " " + string;
77 }
78
79 String string;
80 String category;
81 int[] offsets;
82
83 /** Normalises a string. That is removes all the leading and trailing
84 * whitespace characters and replaces all inner whitespace sequences with a
85 * single space character
86 */
87 protected String normaliseString(String text){
88 /// String res = "";
89 StringBuffer res = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE);
90 if(text == null) return null;
91 int charIdx = 0;
92 boolean lastWasSpace = false;
93 //skip the leading spaces
94 while(charIdx < text.length() &&
95 Character.isWhitespace(text.charAt(charIdx))) charIdx++;
96 //parse the rest of the text
97 while(charIdx < text.length()){
98 if(Character.isWhitespace(text.charAt(charIdx))){
99 //reading spaces
100 lastWasSpace = true;
101 }else{
102 //reading non-spaces
103 if(lastWasSpace) ///res += " ";
104 res.append(" ");
105 /// res += text.charAt(charIdx);
106 res.append(text.charAt(charIdx));
107 lastWasSpace = false;
108 }
109 charIdx++;
110 }//while(charIdx < text.length())
111 return res.toString();
112 }
113
114 }
115