View Javadoc

1   /* Canonicalizer
2    * 
3    * Created on Oct 7, 2004
4    *
5    * Copyright (C) 2004 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.url;
24  
25  import java.util.Iterator;
26  import java.util.logging.Logger;
27  import java.util.logging.Level;
28  
29  import javax.management.AttributeNotFoundException;
30  
31  import org.archive.crawler.datamodel.CrawlOrder;
32  import org.archive.crawler.settings.MapType;
33  import org.archive.net.UURI;
34  
35  /***
36   * URL canonicalizer.
37   * @author stack
38   * @version $Date: 2006-09-26 20:38:48 +0000 (Tue, 26 Sep 2006) $, $Revision: 4667 $
39   */
40  public class Canonicalizer {
41      private static Logger logger =
42          Logger.getLogger(Canonicalizer.class.getName());
43      
44      /***
45       * Constructor.
46       * This class can't be constructed.
47       * Shutdown.
48       */
49      private Canonicalizer() {
50          super();
51      }
52      
53      /***
54       * Convenience method that is passed a settings object instance pulling
55       * from it what it needs to canonicalize.
56       * @param uuri UURI to canonicalize.
57       * @param order A crawlorder instance.
58       * @return Canonicalized string of uuri else uuri if an error.
59       */
60      public static String canonicalize(UURI uuri, CrawlOrder order) {
61          MapType rules = null;
62          String canonical = uuri.toString();
63          try {
64              rules = (MapType)order.getAttribute(uuri, CrawlOrder.ATTR_RULES);
65              canonical = Canonicalizer.canonicalize(uuri, rules.iterator(uuri));
66          } catch (AttributeNotFoundException e) {
67              logger.warning("Failed canonicalization of " + canonical +
68                  ": " + e);
69          }
70          return canonical;
71      }
72  
73      /***
74       * Run the passed uuri through the list of rules.
75       * @param uuri Url to canonicalize.
76       * @param rules Iterator of canonicalization rules to apply (Get one
77       * of these on the url-canonicalizer-rules element in order files or
78       * create a list externally).  Rules must implement the Rule interface.
79       * @return Canonicalized URL.
80       */
81      public static String canonicalize(UURI uuri, Iterator rules) {
82          String before = uuri.toString();
83          //String beforeRule = null;
84          String canonical = before;
85          for (; rules.hasNext();) {
86              CanonicalizationRule r = (CanonicalizationRule)rules.next();
87              //if (logger.isLoggable(Level.FINER)) {
88              //    beforeRule = canonical;
89              //}
90              if (!r.isEnabled(uuri)) {
91                  if (logger.isLoggable(Level.FINER)) {
92                      logger.finer("Rule " + r.getName() + " is disabled.");
93                  }
94                  continue;
95              }
96              canonical = r.canonicalize(canonical, uuri);
97              if (logger.isLoggable(Level.FINER)) {
98                  logger.finer("Rule " + r.getName() + " " + before + " => " +
99                          canonical);
100             }
101         }
102         if (logger.isLoggable(Level.INFO)) {
103             logger.fine(before + " => " + canonical);
104         }
105         return canonical;
106     }
107 }