1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.url;
24
25 import java.util.Iterator;
26 import java.util.logging.Logger;
27 import java.util.logging.Level;
28
29 import javax.management.AttributeNotFoundException;
30
31 import org.archive.crawler.datamodel.CrawlOrder;
32 import org.archive.crawler.settings.MapType;
33 import org.archive.net.UURI;
34
35 /***
36 * URL canonicalizer.
37 * @author stack
38 * @version $Date: 2006-09-26 20:38:48 +0000 (Tue, 26 Sep 2006) $, $Revision: 4667 $
39 */
40 public class Canonicalizer {
41 private static Logger logger =
42 Logger.getLogger(Canonicalizer.class.getName());
43
44 /***
45 * Constructor.
46 * This class can't be constructed.
47 * Shutdown.
48 */
49 private Canonicalizer() {
50 super();
51 }
52
53 /***
54 * Convenience method that is passed a settings object instance pulling
55 * from it what it needs to canonicalize.
56 * @param uuri UURI to canonicalize.
57 * @param order A crawlorder instance.
58 * @return Canonicalized string of uuri else uuri if an error.
59 */
60 public static String canonicalize(UURI uuri, CrawlOrder order) {
61 MapType rules = null;
62 String canonical = uuri.toString();
63 try {
64 rules = (MapType)order.getAttribute(uuri, CrawlOrder.ATTR_RULES);
65 canonical = Canonicalizer.canonicalize(uuri, rules.iterator(uuri));
66 } catch (AttributeNotFoundException e) {
67 logger.warning("Failed canonicalization of " + canonical +
68 ": " + e);
69 }
70 return canonical;
71 }
72
73 /***
74 * Run the passed uuri through the list of rules.
75 * @param uuri Url to canonicalize.
76 * @param rules Iterator of canonicalization rules to apply (Get one
77 * of these on the url-canonicalizer-rules element in order files or
78 * create a list externally). Rules must implement the Rule interface.
79 * @return Canonicalized URL.
80 */
81 public static String canonicalize(UURI uuri, Iterator rules) {
82 String before = uuri.toString();
83
84 String canonical = before;
85 for (; rules.hasNext();) {
86 CanonicalizationRule r = (CanonicalizationRule)rules.next();
87
88
89
90 if (!r.isEnabled(uuri)) {
91 if (logger.isLoggable(Level.FINER)) {
92 logger.finer("Rule " + r.getName() + " is disabled.");
93 }
94 continue;
95 }
96 canonical = r.canonicalize(canonical, uuri);
97 if (logger.isLoggable(Level.FINER)) {
98 logger.finer("Rule " + r.getName() + " " + before + " => " +
99 canonical);
100 }
101 }
102 if (logger.isLoggable(Level.INFO)) {
103 logger.fine(before + " => " + canonical);
104 }
105 return canonical;
106 }
107 }