1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.archive.util;
20
21 import java.io.BufferedReader;
22 import java.io.IOException;
23 import java.io.PrintWriter;
24 import java.io.StringReader;
25 import java.io.StringWriter;
26 import java.util.HashMap;
27 import java.util.Map;
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
30
31 import javax.servlet.jsp.JspWriter;
32
33 import org.apache.commons.lang.StringEscapeUtils;
34
35 public class TextUtils {
36 private static final String FIRSTWORD = "^([^//s]*).*$";
37
38 /***
39 * Allowable range between & and ;
40 */
41 private static final int MAX_ENTITY_WIDTH = 9;
42
43 private static final ThreadLocal<Map<String,Matcher>> TL_MATCHER_MAP
44 = new ThreadLocal<Map<String,Matcher>>() {
45 protected Map<String,Matcher> initialValue() {
46 return new HashMap<String,Matcher>(50);
47 }
48 };
49
50 /***
51 * Get a matcher object for a precompiled regex pattern.
52 *
53 * This method tries to reuse Matcher objects for efficiency.
54 * It can hold for recycling one Matcher per pattern per thread.
55 *
56 * Matchers retrieved should be returned for reuse via the
57 * recycleMatcher() method, but no errors will occur if they
58 * are not.
59 *
60 * This method is a hotspot frequently accessed.
61 *
62 * @param pattern the string pattern to use
63 * @param input the character sequence the matcher should be using
64 * @return a matcher object loaded with the submitted character sequence
65 */
66 public static Matcher getMatcher(String pattern, CharSequence input) {
67 if (pattern == null) {
68 throw new IllegalArgumentException("String 'pattern' must not be null");
69 }
70 input = new InterruptibleCharSequence(input);
71 final Map<String,Matcher> matchers = TL_MATCHER_MAP.get();
72 Matcher m = (Matcher)matchers.get(pattern);
73 if(m == null) {
74 m = Pattern.compile(pattern).matcher(input);
75 } else {
76 matchers.put(pattern,null);
77 m.reset(input);
78 }
79 return m;
80 }
81
82 public static void recycleMatcher(Matcher m) {
83 final Map<String,Matcher> matchers = TL_MATCHER_MAP.get();
84 matchers.put(m.pattern().pattern(),m);
85 }
86
87 /***
88 * Utility method using a precompiled pattern instead of using the
89 * replaceAll method of the String class. This method will also be reusing
90 * Matcher objects.
91 *
92 * @see java.util.regex.Pattern
93 * @param pattern precompiled Pattern to match against
94 * @param input the character sequence to check
95 * @param replacement the String to substitute every match with
96 * @return the String with all the matches substituted
97 */
98 public static String replaceAll(
99 String pattern, CharSequence input, String replacement) {
100 input = new InterruptibleCharSequence(input);
101 Matcher m = getMatcher(pattern, input);
102 String res = m.replaceAll(replacement);
103 recycleMatcher(m);
104 return res;
105 }
106
107 /***
108 * Utility method using a precompiled pattern instead of using the
109 * replaceFirst method of the String class. This method will also be reusing
110 * Matcher objects.
111 *
112 * @see java.util.regex.Pattern
113 * @param pattern precompiled Pattern to match against
114 * @param input the character sequence to check
115 * @param replacement the String to substitute the first match with
116 * @return the String with the first match substituted
117 */
118 public static String replaceFirst(
119 String pattern, CharSequence input, String replacement) {
120 input = new InterruptibleCharSequence(input);
121 Matcher m = getMatcher(pattern, input);
122 String res = m.replaceFirst(replacement);
123 recycleMatcher(m);
124 return res;
125 }
126
127 /***
128 * Utility method using a precompiled pattern instead of using the matches
129 * method of the String class. This method will also be reusing Matcher
130 * objects.
131 *
132 * @see java.util.regex.Pattern
133 * @param pattern precompiled Pattern to match against
134 * @param input the character sequence to check
135 * @return true if character sequence matches
136 */
137 public static boolean matches(String pattern, CharSequence input) {
138 input = new InterruptibleCharSequence(input);
139 Matcher m = getMatcher(pattern, input);
140 boolean res = m.matches();
141 recycleMatcher(m);
142 return res;
143 }
144
145 /***
146 * Utility method using a precompiled pattern instead of using the split
147 * method of the String class.
148 *
149 * @see java.util.regex.Pattern
150 * @param pattern precompiled Pattern to split by
151 * @param input the character sequence to split
152 * @return array of Strings split by pattern
153 */
154 public static String[] split(String pattern, CharSequence input) {
155 Matcher m = getMatcher(pattern,input);
156 String[] retVal = m.pattern().split(input);
157 recycleMatcher(m);
158 return retVal;
159 }
160
161 /***
162 * @param s String to find first word in (Words are delimited by
163 * whitespace).
164 * @return First word in the passed string else null if no word found.
165 */
166 public static String getFirstWord(String s) {
167 Matcher m = getMatcher(FIRSTWORD, s);
168 String retVal = (m != null && m.matches())? m.group(1): null;
169 recycleMatcher(m);
170 return retVal;
171 }
172
173 /***
174 * Escapes a string so that it can be passed as an argument to a javscript
175 * in a JSP page. This method takes a string and returns the same string
176 * with any single quote escaped by prepending the character with a
177 * backslash. Linebreaks are also replaced with '\n'. Also,
178 * less-than signs and ampersands are replaced with HTML entities.
179 *
180 * @param s The string to escape
181 * @return The same string escaped.
182 */
183 public static String escapeForHTMLJavascript(String s) {
184 return escapeForHTML(StringEscapeUtils.escapeJavaScript(s));
185 }
186
187 /***
188 * Escapes a string so that it can be placed inside XML/HTML attribute.
189 * Replaces ampersand, less-than, greater-than, single-quote, and
190 * double-quote with escaped versions.
191 * @param s The string to escape
192 * @return The same string escaped.
193 */
194 public static String escapeForMarkupAttribute(String s) {
195 return StringEscapeUtils.escapeXml(s);
196 }
197
198 /***
199 * Minimally escapes a string so that it can be placed inside XML/HTML
200 * attribute.
201 * Escapes lt and amp.
202 * @param s The string to escape
203 * @return The same string escaped.
204 */
205 public static String escapeForHTML(String s) {
206
207 String escaped = s.replaceAll("&","&");
208 return escaped.replaceAll("<","<");
209 }
210
211 /***
212 * Utility method for writing a (potentially large) String to a JspWriter,
213 * escaping it for HTML display, without constructing another large String
214 * of the whole content.
215 * @param s String to write
216 * @param out destination JspWriter
217 * @throws IOException
218 */
219 public static void writeEscapedForHTML(String s, JspWriter out)
220 throws IOException {
221 BufferedReader reader = new BufferedReader(new StringReader(s));
222 String line;
223 while((line=reader.readLine()) != null){
224 out.println(StringEscapeUtils.escapeHtml(line));
225 }
226 }
227
228 /***
229 * Replaces HTML Entity Encodings.
230 * @param cs The CharSequence to remove html codes from
231 * @return the same CharSequence or an escaped String.
232 */
233 public static CharSequence unescapeHtml(final CharSequence cs) {
234 if (cs == null) {
235 return cs;
236 }
237
238 return StringEscapeUtils.unescapeHtml(cs.toString());
239 }
240
241 /***
242 * @param message Message to put at top of the string returned. May be
243 * null.
244 * @param e Exception to write into a string.
245 * @return Return formatted string made of passed message and stack trace
246 * of passed exception.
247 */
248 public static String exceptionToString(String message, Throwable e) {
249 StringWriter sw = new StringWriter();
250 if (message == null || message.length() == 0) {
251 sw.write(message);
252 sw.write("\n");
253 }
254 e.printStackTrace(new PrintWriter(sw));
255 return sw.toString();
256 }
257 }