1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.archive.crawler.filter;
28
29 import java.util.logging.Level;
30 import java.util.logging.Logger;
31
32 import javax.management.AttributeNotFoundException;
33
34 import org.archive.crawler.datamodel.CrawlURI;
35 import org.archive.crawler.settings.ComplexType;
36 import org.archive.crawler.settings.MapType;
37 import org.archive.crawler.settings.SimpleType;
38
39 /***
40 * Compares suffix of a passed CrawlURI, UURI, or String against a regular
41 * expression pattern accepting matches.
42 *
43 * @author Igor Ranitovic
44 * @deprecated As of release 1.10.0. Replaced by
45 * {@link MatchesFilePatternDecideRule}.
46 */
47 public class FilePatternFilter extends URIRegExpFilter {
48
49 private static final long serialVersionUID = -4019256104085004651L;
50
51 private static final Logger logger =
52 Logger.getLogger(FilePatternFilter.class.getName());
53 public static final String ATTR_USE_DEFAULT = "use-default-patterns";
54 public static final String IMAGES_PATTERNS = ".*(?i)(//.(bmp|gif|jpe?g" +
55 "|png|tiff?))$";
56 public static final String AUDIO_PATTERNS = ".*(?i)(//.(mid|mp2|mp3|mp4" +
57 "|wav))$";
58 public static final String VIDEO_PATTERNS = ".*(?i)(//.(avi|mov|mpeg|ram" +
59 "|rm|smil|wmv))$";
60 public static final String MISC_PATTERNS = ".*(?i)(//.(doc|pdf|ppt|swf))$";
61 public static final String ALL_DEFAULT_PATTERNS = ".*(?i)(//.(bmp|gif" +
62 "|jpe?g|png|tiff?|mid|mp2|mp3|mp4|wav|avi|mov|mpeg|ram|rm|smil|wmv" +
63 "|doc|pdf|ppt|swf))$";
64
65 public static final String ALL = "All";
66 public static final String IMAGES = "Images";
67 public static final String AUDIO = "Audio";
68 public static final String VIDEO = "Video";
69 public static final String MISC = "Miscellaneous";
70 public static final String CUSTOM = "Custom";
71
72 /***
73 * @param name
74 */
75 public FilePatternFilter(String name) {
76 super(name);
77 setDescription("A URI path suffix filter *Deprecated* Use" +
78 "DecidingFilter and MatchesFilePatternDecideRule instead. " +
79 "All URLs that end with the specified pattern(s) will be added " +
80 "to the scope's focus. Default file patterns are:\n.avi, .bmp, " +
81 ".doc, .gif, .jp(e)g, .mid, .mov, .mp2, .mp3, .mp4, .mpeg, " +
82 ".pdf, .png, .ppt, .ram, .rm,.smil, .swf, .tif(f), .wav, .wmv\n" +
83 "It is also possible to specifiy custom regular expressions " +
84 "for this filter, turning it into (effectively) a generic " +
85 "regular expression filter.");
86
87 String[] options = new String[] {ALL, IMAGES, AUDIO, VIDEO, MISC,
88 CUSTOM};
89
90 addElementToDefinition(
91 new SimpleType(ATTR_USE_DEFAULT, "URLs that match selected file " +
92 "patterns will be crawled. Default file patterns are:\n" +
93 "Images: .bmp, .gif, .jp(e)g, .png, .tif(f)\nAudio: .mid, " +
94 ".mp2, .mp3, .mp4, .wav\nVideo: .avi, .mov, .mpeg, .ram, " +
95 ".rm, .smil, .wmv\nMiscellaneous: .doc, .pdf, .ppt, .swf\n" +
96 "All: All above patterns\nChoose 'Custom' to specify your own" +
97 " pattern. These default patterns are case insensitive.",
98 "All", options));
99
100 addElementToDefinition(
101 new SimpleType(ATTR_REGEXP, "Custom java regular expression.+n " +
102 "This regular expression will be used instead of the " +
103 "supplied pattern groups for matching.\nAn example " +
104 "of such a regular expression (Miscellaneous):\n" +
105 ".*(?i)(//.(doc|pdf|ppt|swf))$\n" +
106 "Any arbitrary reg.expr. is valid though and will be " +
107 "applied to the URI.", ""));
108
109
110 }
111
112 /***
113 * @see org.archive.crawler.filter.URIRegExpFilter#getRegexp(java.lang.Object)
114 */
115 protected String getRegexp(Object o) {
116 try {
117 String patternType = (String)getAttribute(o, ATTR_USE_DEFAULT);
118
119 if (patternType.equals(ALL)) {
120 return ALL_DEFAULT_PATTERNS;
121 } else if (patternType.equals(IMAGES)) {
122 return IMAGES_PATTERNS;
123 }else if (patternType.equals(AUDIO)) {
124 return AUDIO_PATTERNS;
125 }else if(patternType.equals(VIDEO)) {
126 return VIDEO_PATTERNS;
127 }else if(patternType.equals(MISC)) {
128 return MISC_PATTERNS;
129 }else if(patternType.equals(CUSTOM)) {
130 return (String) getAttribute(o, ATTR_REGEXP);
131 }else {
132 assert false : "Unrecognized pattern type " + patternType +
133 ". Should never happened!";
134 }
135
136 } catch (AttributeNotFoundException e) {
137 logger.log(Level.SEVERE,"necessary setting missing",e);
138 }
139
140
141 return null;
142 }
143
144 /***
145 * @see org.archive.crawler.framework.Filter#accepts(java.lang.Object)
146 */
147 public boolean accepts(Object o) {
148 CrawlURI curi = (o instanceof CrawlURI) ? (CrawlURI) o : null;
149
150
151
152
153
154 try {
155 if (!((Boolean) getAttribute(ATTR_ENABLED, curi)).booleanValue()) {
156 return false;
157 }
158 } catch (AttributeNotFoundException e) {
159 logger.severe(e.getMessage());
160 }
161
162 boolean accept = returnTrueIfMatches(curi) == innerAccepts(o);
163
164 if (accept && logger.isLoggable(Level.FINEST)) {
165
166 ComplexType p = this.getParent();
167 if (p instanceof MapType) {
168 p = p.getParent();
169 }
170 String msg = this.toString() + " belonging to " + p.toString()
171 + " accepted " + o.toString();
172 logger.finest(msg);
173 }
174
175 return accept;
176 }
177
178 }