View Javadoc

1   /* UriUtilsTest
2    *
3    * $Id: ArchiveUtilsTest.java 5052 2007-04-10 02:26:52Z gojomo $
4    *
5    * Copyright (C) 2010 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  
24  package org.archive.util;
25  
26  import junit.framework.Test;
27  import junit.framework.TestCase;
28  import junit.framework.TestSuite;
29  
30  /***
31   * JUnit test suite for UriUtils. 
32   * 
33   * Several of the tests for the 'legacy' (H1 through at least 1.14.4) 
34   * heuristics are disabled by renaming, because those heuristics have known 
35   * failures; however, until more experience with the new heuristics is 
36   * collected, H1 still uses them for consistency. 
37   * 
38   * @contributor gojomo
39   * @version $Id: ArchiveUtilsTest.java 5052 2007-04-10 02:26:52Z gojomo $
40   */
41  public class UriUtilsTest extends TestCase {
42  
43      public UriUtilsTest(final String testName) {
44          super(testName);
45      }
46  
47      /***
48       * run all the tests for ArchiveUtilsTest
49       * 
50       * @param argv
51       *            the command line arguments
52       */
53      public static void main(String argv[]) {
54          junit.textui.TestRunner.run(suite());
55      }
56  
57      public static Test suite() {
58          return new TestSuite(UriUtilsTest.class);
59      }
60  
61      /*** image URIs that should be considered likely URIs **/
62      static String[] urisRelativeImages = { 
63          "photo.jpg", 
64          "./photo.jpg",
65          "../photo.jpg", 
66          "images/photo.jpg", 
67          "../../images/photo.jpg" };
68  
69      /*** check that plausible relative image URIs return true with legacy tests */
70      public void xestLegacySimpleImageRelatives() {
71          legacyTryAll(urisRelativeImages, true);
72      }
73      
74      /*** check that plausible relative image URIs return true with new tests */
75      public void testNewSimpleImageRelatives() {
76          tryAll(urisRelativeImages,true); 
77      }
78  
79      /*** absolute URIs that should be considered likely URIs **/
80      static String[] urisAbsolute = { 
81          "http://example.com",
82          "http://example.com/", "http://www.example.com",
83          "http://www.example.com/", "http://www.example.com/about",
84          "http://www.example.com/about/",
85          "http://www.example.com/about/index.html", "https://example.com",
86          "https://example.com/", "https://www.example.com",
87          "https://www.example.com/", "https://www.example.com/about",
88          "https://www.example.com/about/",
89          "https://www.example.com/about/index.html",
90          "ftp://example.com/public/report.pdf",
91      // TODO: other schemes? mailto?
92  
93      };
94  
95      /*** check that absolute URIs return true with legacy tests */
96      public void testLegacyAbsolutes() {
97          legacyTryAll(urisAbsolute,true);
98      }
99      
100     /*** check that absolute URIs return true with new tests */
101     public void testAbsolutes() {
102         tryAll(urisAbsolute,true);
103     }
104 
105     /*** path-absolute images URIs that should be considered likely URIs **/
106     static String[] urisPathAbsoluteImages = { 
107         "/photo.jpg", 
108         "/images/photo.jpg", 
109     };
110     
111     /*** check that path-absolute image URIs return true with legacy tests*/
112     public void testLegacySimpleImagePathAbsolutes() {
113         legacyTryAll(urisPathAbsoluteImages, true); 
114     }
115     
116     /*** check that path-absolute image URIs return true with new tests*/
117     public void testSimpleImagePathAbsolutes() {
118         tryAll(urisPathAbsoluteImages, true); 
119     }
120     
121     /*** URI-like strings risking false positives that should NOT be likely URIs **/
122     static String[] notUrisNaiveFalsePositives = {
123         "0.99",
124         "3.14157",
125         "text/javascript"
126     };
127     
128     /*** check that typical false-positives of the naive test are not deemed URIs */
129     public void xestLegacyNaiveFalsePositives() {
130         legacyTryAll(notUrisNaiveFalsePositives, false); 
131     }
132     
133     /*** check that typical false-positives of the naive test are not deemed URIs */
134     public void testNaiveFalsePositives() {
135         tryAll(notUrisNaiveFalsePositives, false); 
136     }
137     
138     /*** strings that should not be considered likely URIs **/
139     static String[] notUrisNaive = {
140         "foo bar",
141         "<script>foo=bar</script>",
142         "item\t$0.99\tred",
143     };
144     
145     /*** check that strings that fail naive test are not deemed URIs legacy tests*/
146     public void testLegacyNaiveNotUris() {
147         legacyTryAll(notUrisNaive, false); 
148     }
149     
150     /*** check that strings that fail naive test are not deemed URIs new tests*/
151     public void testNaiveNotUris() {
152         tryAll(notUrisNaive, false); 
153     }
154     
155     
156     /***
157      * Test that all supplied candidates give the expected result, for each of 
158      * the 'legacy' (H1) likely-URI-tests
159      * 
160      * @param candidates String[] to test
161      * @param expected desired answer
162      */
163     protected void legacyTryAll(String[] candidates, boolean expected) {
164         for (String candidate : candidates) {
165             assertEquals("javascript context: " + candidate, 
166                     expected, 
167                     UriUtils.isLikelyUriJavascriptContextLegacy(candidate));
168             assertEquals("html context: " + candidate, 
169                     expected, 
170                     UriUtils.isLikelyUriHtmlContextLegacy(candidate));
171         }
172     }
173     
174 
175     
176     /***
177      * Test that all supplied candidates give the expected results, for 
178      * the 'new' heuristics now in this class. 
179      * @param candidates String[] to test
180      * @param expected desired answer
181      */
182     protected void tryAll(String[] candidates, boolean expected) {
183         for (String candidate : candidates) {
184             assertEquals("new: " + candidate, 
185                     expected, 
186                     UriUtils.isLikelyUri(candidate));
187             assertEquals("html context: " + candidate, 
188                     expected, 
189                     UriUtils.isLikelyUri(candidate));
190         }
191     }
192 }