View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   */
19  package org.archive.util;
20  
21  import java.io.BufferedReader;
22  import java.io.IOException;
23  import java.io.PrintWriter;
24  import java.io.StringReader;
25  import java.io.StringWriter;
26  import java.util.HashMap;
27  import java.util.Map;
28  import java.util.regex.Matcher;
29  import java.util.regex.Pattern;
30  
31  import javax.servlet.jsp.JspWriter;
32  
33  import org.apache.commons.lang.StringEscapeUtils;
34  
35  public class TextUtils {
36      private static final String FIRSTWORD = "^([^//s]*).*$";
37      
38      /***
39       * Allowable range between & and ;
40       */
41      private static final int MAX_ENTITY_WIDTH = 9;
42      
43      private static final ThreadLocal<Map<String,Matcher>> TL_MATCHER_MAP
44       = new ThreadLocal<Map<String,Matcher>>() {
45          protected Map<String,Matcher> initialValue() {
46              return new HashMap<String,Matcher>(50);
47          }
48      };
49  
50      /***
51       * Get a matcher object for a precompiled regex pattern.
52       * 
53       * This method tries to reuse Matcher objects for efficiency.
54       * It can hold for recycling one Matcher per pattern per thread. 
55       * 
56       * Matchers retrieved should be returned for reuse via the
57       * recycleMatcher() method, but no errors will occur if they
58       * are not.
59       * 
60       * This method is a hotspot frequently accessed.
61       *
62       * @param pattern the string pattern to use
63       * @param input the character sequence the matcher should be using
64       * @return a matcher object loaded with the submitted character sequence
65       */
66      public static Matcher getMatcher(String pattern, CharSequence input) {
67          if (pattern == null) {
68              throw new IllegalArgumentException("String 'pattern' must not be null");
69          }
70          input = new InterruptibleCharSequence(input);
71          final Map<String,Matcher> matchers = TL_MATCHER_MAP.get();
72          Matcher m = (Matcher)matchers.get(pattern);
73          if(m == null) {
74              m = Pattern.compile(pattern).matcher(input);
75          } else {
76              matchers.put(pattern,null);
77              m.reset(input);
78          }
79          return m;
80      }
81  
82      public static void recycleMatcher(Matcher m) {
83          final Map<String,Matcher> matchers = TL_MATCHER_MAP.get();
84          matchers.put(m.pattern().pattern(),m);
85      }
86      
87      /***
88       * Utility method using a precompiled pattern instead of using the
89       * replaceAll method of the String class. This method will also be reusing
90       * Matcher objects.
91       * 
92       * @see java.util.regex.Pattern
93       * @param pattern precompiled Pattern to match against
94       * @param input the character sequence to check
95       * @param replacement the String to substitute every match with
96       * @return the String with all the matches substituted
97       */
98      public static String replaceAll(
99              String pattern, CharSequence input, String replacement) {
100         input = new InterruptibleCharSequence(input);
101         Matcher m = getMatcher(pattern, input);
102         String res = m.replaceAll(replacement);
103         recycleMatcher(m);
104         return res;
105     }
106 
107     /***
108      * Utility method using a precompiled pattern instead of using the
109      * replaceFirst method of the String class. This method will also be reusing
110      * Matcher objects.
111      * 
112      * @see java.util.regex.Pattern
113      * @param pattern precompiled Pattern to match against
114      * @param input the character sequence to check
115      * @param replacement the String to substitute the first match with
116      * @return the String with the first match substituted
117      */
118     public static String replaceFirst(
119             String pattern, CharSequence input, String replacement) {
120         input = new InterruptibleCharSequence(input);
121         Matcher m = getMatcher(pattern, input);
122         String res = m.replaceFirst(replacement);
123         recycleMatcher(m);
124         return res;
125     }
126 
127     /***
128      * Utility method using a precompiled pattern instead of using the matches
129      * method of the String class. This method will also be reusing Matcher
130      * objects.
131      * 
132      * @see java.util.regex.Pattern
133      * @param pattern precompiled Pattern to match against
134      * @param input the character sequence to check
135      * @return true if character sequence matches
136      */
137     public static boolean matches(String pattern, CharSequence input) {
138         input = new InterruptibleCharSequence(input);
139         Matcher m = getMatcher(pattern, input);
140         boolean res = m.matches();
141         recycleMatcher(m);
142         return res;
143     }
144 
145     /***
146      * Utility method using a precompiled pattern instead of using the split
147      * method of the String class.
148      * 
149      * @see java.util.regex.Pattern
150      * @param pattern precompiled Pattern to split by
151      * @param input the character sequence to split
152      * @return array of Strings split by pattern
153      */
154     public static String[] split(String pattern, CharSequence input) {
155         Matcher m = getMatcher(pattern,input);
156         String[] retVal = m.pattern().split(input); 
157         recycleMatcher(m);
158         return retVal;
159     }
160     
161     /***
162      * @param s String to find first word in (Words are delimited by
163      * whitespace).
164      * @return First word in the passed string else null if no word found.
165      */
166     public static String getFirstWord(String s) {
167         Matcher m = getMatcher(FIRSTWORD, s);
168         String retVal = (m != null && m.matches())? m.group(1): null;
169         recycleMatcher(m);
170         return retVal;
171     }
172 
173     /***
174      * Escapes a string so that it can be passed as an argument to a javscript
175      * in a JSP page. This method takes a string and returns the same string
176      * with any single quote escaped by prepending the character with a
177      * backslash. Linebreaks are also replaced with '\n'.  Also,
178      * less-than signs and ampersands are replaced with HTML entities.
179      * 
180      * @param s The string to escape
181      * @return The same string escaped.
182      */
183     public static String escapeForHTMLJavascript(String s) {
184         return escapeForHTML(StringEscapeUtils.escapeJavaScript(s));
185     }
186     
187     /***
188      * Escapes a string so that it can be placed inside XML/HTML attribute.
189      * Replaces ampersand, less-than, greater-than, single-quote, and 
190      * double-quote with escaped versions.
191      * @param s The string to escape
192      * @return The same string escaped.
193      */
194     public static String escapeForMarkupAttribute(String s) {
195         return StringEscapeUtils.escapeXml(s);
196     }
197     
198     /***
199      * Minimally escapes a string so that it can be placed inside XML/HTML
200      * attribute.
201      * Escapes lt and amp.
202      * @param s The string to escape
203      * @return The same string escaped.
204      */
205     public static String escapeForHTML(String s) {
206         // TODO: do this in a single pass instead of creating 5 junk strings
207         String escaped = s.replaceAll("&","&amp;");
208         return escaped.replaceAll("<","&lt;");
209     }
210 
211     /***
212      * Utility method for writing a (potentially large) String to a JspWriter,
213      * escaping it for HTML display, without constructing another large String
214      * of the whole content. 
215      * @param s String to write
216      * @param out destination JspWriter
217      * @throws IOException
218      */
219     public static void writeEscapedForHTML(String s, JspWriter out)
220     throws IOException {
221         BufferedReader reader = new BufferedReader(new StringReader(s));
222         String line;
223         while((line=reader.readLine()) != null){
224             out.println(StringEscapeUtils.escapeHtml(line));
225         }
226     }
227     
228     /***
229      * Replaces HTML Entity Encodings.
230      * @param cs The CharSequence to remove html codes from
231      * @return the same CharSequence or an escaped String.
232      */
233     public static CharSequence unescapeHtml(final CharSequence cs) {
234         if (cs == null) {
235             return cs;
236         }
237         
238         return StringEscapeUtils.unescapeHtml(cs.toString());
239     }
240     
241     /***
242      * @param message Message to put at top of the string returned. May be
243      * null.
244      * @param e Exception to write into a string.
245      * @return Return formatted string made of passed message and stack trace
246      * of passed exception.
247      */
248     public static String exceptionToString(String  message, Throwable e) {
249         StringWriter sw = new StringWriter();
250         if (message == null || message.length() == 0) {
251             sw.write(message);
252             sw.write("\n");
253         }
254         e.printStackTrace(new PrintWriter(sw));
255         return sw.toString();
256     }
257 }