View Javadoc

1   /* IdenticalDigestDecideRule
2   *
3   * $Id: HopsPathMatchesRegExpDecideRule.java 4649 2006-09-25 17:16:55 +0000 (Mon, 25 Sep 2006) paul_jack $
4   *
5   * Created on Feb 17, 2007
6   *
7   * Copyright (C) 2007 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */
25  package org.archive.crawler.deciderules.recrawl;
26  
27  import org.archive.crawler.datamodel.CoreAttributeConstants;
28  import org.archive.crawler.datamodel.CrawlURI;
29  import org.archive.crawler.deciderules.PredicatedDecideRule;
30  import org.archive.crawler.settings.SimpleType;
31  import org.archive.crawler.settings.Type;
32  
33  import st.ata.util.AList;
34  
35  /***
36   * Rule applies configured decision to any CrawlURIs whose prior-history
37   * content-digest matches the latest fetch. 
38   *
39   * @author gojomo
40   */
41  public class IdenticalDigestDecideRule extends PredicatedDecideRule 
42  implements CoreAttributeConstants {
43      private static final long serialVersionUID = 4275993790856626949L;
44  
45      /***
46       * Usual constructor. 
47       * @param name
48       */
49      public IdenticalDigestDecideRule(String name) {
50          super(name);
51          setDescription("IdenticalDigestDecideRule. Applies configured " +
52                  "decision to any CrawlURIs whose prior-history " +
53                  "content-digest matches the latest fetch.");
54          // make default REJECT (overriding superclass)
55          Type type = addElementToDefinition(new SimpleType(ATTR_DECISION,
56                  "Decision to be applied", REJECT, ALLOWED_TYPES));
57      }
58  
59      /***
60       * Evaluate whether given CrawlURI's content-digest exactly 
61       * matches that of preceding fetch. 
62       *
63       * @param object should be CrawlURI
64       * @return true if current-fetch content-digest matches previous
65       */
66      protected boolean evaluate(Object object) {
67          CrawlURI curi = (CrawlURI)object;
68          return hasIdenticalDigest(curi);
69      }
70  
71      /***
72       * Utility method for testing if a CrawlURI's last two history 
73       * entiries (one being the most recent fetch) have identical 
74       * content-digest information. 
75       * 
76       * @param curi CrawlURI to test
77       * @return true if last two history entries have identical digests, 
78       * otherwise false
79       */
80      public static boolean hasIdenticalDigest(CrawlURI curi) {
81          if(curi.getAList().containsKey(A_FETCH_HISTORY)) {
82              AList[] history = curi.getAList().getAListArray(A_FETCH_HISTORY);
83              return history[0] != null 
84                     && history[0].containsKey(CoreAttributeConstants.A_CONTENT_DIGEST)
85                     && history[1] != null
86                     && history[1].containsKey(CoreAttributeConstants.A_CONTENT_DIGEST)
87                     && history[0].getString(CoreAttributeConstants.A_CONTENT_DIGEST).equals(
88                             history[1].getString(CoreAttributeConstants.A_CONTENT_DIGEST));
89          } else {
90              return false;
91          }
92      }
93  }