View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * TransclusionFilter.java
20   * Created on Oct 3, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.filter;
25  
26  import javax.management.AttributeNotFoundException;
27  
28  import org.archive.crawler.datamodel.CandidateURI;
29  import org.archive.crawler.deciderules.DecideRule;
30  import org.archive.crawler.deciderules.DecidingFilter;
31  import org.archive.crawler.extractor.Link;
32  import org.archive.crawler.framework.CrawlScope;
33  import org.archive.crawler.framework.Filter;
34  import org.archive.crawler.scope.ClassicScope;
35  import org.archive.crawler.settings.SimpleType;
36  
37  /***
38   * Filter which accepts CandidateURI/CrawlURI instances which contain more
39   * than zero but fewer than max-trans-hops entries at the end of their
40   * discovery path.
41   *
42   * @author Gordon Mohr
43   * @deprecated As of release 1.10.0.  Replaced by {@link DecidingFilter} and
44   * equivalent {@link DecideRule}.
45   */
46  public class TransclusionFilter extends Filter {
47  
48      private static final long serialVersionUID = 4251767672778714051L;
49  
50      private static final String ATTR_MAX_SPECULATIVE_HOPS =
51          "max-speculative-hops";
52      private static final String ATTR_MAX_REFERRAL_HOPS = "max-referral-hops";
53      private static final String ATTR_MAX_EMBED_HOPS = "max-embed-hops";
54      private static final int DEFAULT_MAX_TRANS_HOPS = 4;
55  
56      /***
57       * Default speculative hops.
58       *
59       * No more than 1
60       */
61      private static final int DEFAULT_MAX_SPECULATIVE_HOPS = 1;
62  
63      /***
64       * Default maximum referral hops.
65       *
66       * No limit beside the overall trans limit
67       */
68      private static final int DEFAULT_MAX_REFERRAL_HOPS = -1;
69  
70      /***
71       * Default embedded link hops.
72       *
73       * No limit beside the overall trans limit
74       */
75      private static final int DEFAULT_MAX_EMBED_HOPS = -1;
76  
77      int maxTransHops = DEFAULT_MAX_TRANS_HOPS;
78      int maxSpeculativeHops = DEFAULT_MAX_SPECULATIVE_HOPS;
79      int maxReferralHops = DEFAULT_MAX_REFERRAL_HOPS;
80      int maxEmbedHops = DEFAULT_MAX_EMBED_HOPS;
81  
82  //  // 1-3 trailing P(recondition)/R(eferral)/E(mbed)/X(speculative-embed) hops
83  //  private static final String TRANSCLUSION_PATH = ".*[PREX][PREX]?[PREX]?$";
84  
85      /***
86       * @param name
87       */
88      public TransclusionFilter(String name) {
89          super(name, "Transclusion filter *Deprecated* Use" +
90          		"DecidingFilter and equivalent DecideRule instead.");
91  
92          addElementToDefinition(
93              new SimpleType(
94                  ATTR_MAX_SPECULATIVE_HOPS,
95                  "Maximum number of consecutive speculative (i.e. URIs" +
96                  " extracted that we are not sure if they are embeds or" +
97                  " not) hops to allow.\nA value of -1 means no upper limit.",
98                  new Integer(DEFAULT_MAX_SPECULATIVE_HOPS)));
99          addElementToDefinition(
100             new SimpleType(
101                 ATTR_MAX_REFERRAL_HOPS,
102                 "Maximum number of consecutive referral hops to allow.\n" +
103                 "A value of -1 means no upper limit.",
104                 new Integer(DEFAULT_MAX_REFERRAL_HOPS)));
105         addElementToDefinition(
106             new SimpleType(
107                 ATTR_MAX_EMBED_HOPS,
108                 "Maximum number of consecutive embed hops to allow.\n" +
109                 "A value of -1 means no upper limit.",
110                 new Integer(DEFAULT_MAX_EMBED_HOPS)));
111     }
112 
113     /* (non-Javadoc)
114      * @see org.archive.crawler.framework.Filter#innerAccepts(java.lang.Object)
115      */
116     protected boolean innerAccepts(Object o) {
117         if(! (o instanceof CandidateURI)) {
118             return false;
119         }
120         String path = ((CandidateURI)o).getPathFromSeed();
121         int transCount = 0;
122         int specCount = 0;
123         int refCount = 0;
124         int embedCount = 0;
125         loop: for(int i=path.length()-1;i>=0;i--) {
126             // everything except 'L' is considered transitive
127             switch (path.charAt(i)) {
128                 case Link.NAVLINK_HOP: {
129                     break loop;
130                 }
131                 case Link.PREREQ_HOP: {
132                     if(transCount==0) {
133                         // always consider a trailing P as a 1-hop trans inclusion; disregard previous hops
134                         transCount++;
135                         break loop;
136                     }
137                     // otherwise, just count as another regular trans hop
138                     break;
139                 }
140                 case Link.SPECULATIVE_HOP: {
141                     specCount++;
142                     break;
143                 }
144                 case Link.REFER_HOP: {
145                     refCount++;
146                     break;
147                 }
148                 case Link.EMBED_HOP: {
149                     embedCount++;
150                     break;
151                 }
152                 // FIXME: what is 'D'?
153                 // 'D's get a free pass
154             }
155             transCount++;
156         }
157 
158         readMaxValues(o);
159 
160         // This is a case of possible transclusion
161         return (transCount > 0) 
162             // ...and the overall number of hops isn't too high
163             && (transCount <= this.maxTransHops) 
164             // ...and the number of spec-hops isn't too high
165             && (this.maxSpeculativeHops < 0 ||  specCount <= this.maxSpeculativeHops) 
166             // ...and the number of referral-hops isn't too high
167             && (this.maxReferralHops < 0 || refCount <= this.maxReferralHops)
168             // ...and the number of embed-hops isn't too high
169             && (this.maxEmbedHops < 0 || embedCount <= this.maxEmbedHops);
170     }
171 
172     public void readMaxValues(Object o) {
173         try {
174             CrawlScope scope =
175                 (CrawlScope) globalSettings().getModule(CrawlScope.ATTR_NAME);
176             this.maxTransHops = ((Integer) scope.getAttribute(o, ClassicScope.ATTR_MAX_TRANS_HOPS)).intValue();
177             this.maxSpeculativeHops = ((Integer) getAttribute(o, ATTR_MAX_SPECULATIVE_HOPS)).intValue();
178             this.maxReferralHops = ((Integer) getAttribute(o, ATTR_MAX_REFERRAL_HOPS)).intValue();
179             this.maxEmbedHops = ((Integer) getAttribute(o, ATTR_MAX_EMBED_HOPS)).intValue();
180         } catch (AttributeNotFoundException e) {
181             // TODO Auto-generated catch block
182             e.printStackTrace();
183         }
184     }
185 
186 }