View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * CandidateURI.java
20   * Created on Sep 30, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.datamodel;
25  
26  import java.io.IOException;
27  import java.io.ObjectInputStream;
28  import java.io.ObjectOutputStream;
29  import java.io.PrintWriter;
30  import java.io.Serializable;
31  import java.util.ArrayList;
32  import java.util.Iterator;
33  import java.util.List;
34  
35  import org.apache.commons.httpclient.URIException;
36  import org.archive.crawler.extractor.Link;
37  import org.archive.net.UURI;
38  import org.archive.net.UURIFactory;
39  import org.archive.util.ArchiveUtils;
40  import org.archive.util.Reporter;
41  
42  import st.ata.util.AList;
43  import st.ata.util.HashtableAList;
44  
45  /***
46   * A URI, discovered or passed-in, that may be scheduled.
47   * When scheduled, a CandidateURI becomes a {@link CrawlURI}
48   * made with the data contained herein. A CandidateURI
49   * contains just the fields necessary to perform quick in-scope analysis.
50   * 
51   * <p>Has a flexible attribute list that will be promoted into
52   * any {@link CrawlURI} created from this CandidateURI.  Use it
53   * to add custom data or state needed later doing custom processing.
54   * See accessors/setters {@link #putString(String, String)},
55   * {@link #getString(String)}, etc. 
56   *
57   * @author Gordon Mohr
58   */
59  public class CandidateURI
60  implements Serializable, Reporter, CoreAttributeConstants {
61      private static final long serialVersionUID = -7152937921526560388L;
62  
63      /*** Highest scheduling priority.
64       * Before any others of its class.
65       */
66      public static final int HIGHEST = 0;
67      
68      /*** High scheduling priority.
69       * After any {@link #HIGHEST}.
70       */
71      public static final int HIGH = 1;
72      
73      /*** Medium priority.
74       * After any {@link #HIGH}.
75       */
76      public static final int MEDIUM = 2;
77      
78      /*** Normal/low priority.
79       * Whenever/end of queue.
80       */
81      public static final int NORMAL = 3;
82      
83      private int schedulingDirective = NORMAL;
84      
85      /*** 
86       * Usuable URI under consideration. Transient to allow
87       * more efficient custom serialization 
88       */
89      private transient UURI uuri;
90      
91      /*** Seed status */
92      private boolean isSeed = false;
93  
94      private boolean forceRevisit = false; // even if already visited
95      
96      /*** String of letters indicating how this URI was reached from a seed.
97       * <pre>
98       * P precondition
99       * R redirection
100      * E embedded (as frame, src, link, codebase, etc.)
101      * X speculative embed (as from javascript, some alternate-format extractors
102      * L link</pre>
103      * For example LLLE (an embedded image on a page 3 links from seed).
104      */
105     private String pathFromSeed;
106     
107     /***
108      * Where this URI was (presently) discovered. . Transient to allow
109      * more efficient custom serialization
110      */
111     private transient UURI via;
112 
113     /***
114      * Context of URI's discovery, as per the 'context' in Link
115      */
116     private CharSequence viaContext;
117     
118     /***
119      * Flexible dynamic attributes list.
120      * <p>
121      * The attribute list is a flexible map of key/value pairs for storing
122      * status of this URI for use by other processors. By convention the
123      * attribute list is keyed by constants found in the
124      * {@link CoreAttributeConstants} interface.  Use this list to carry
125      * data or state produced by custom processors rather change the
126      * classes {@link CrawlURI} or this class, CandidateURI.
127      *
128      * Transient to allow more efficient custom serialization.
129      */
130     private transient AList alist;
131     
132     /***
133      * Cache of this candidate uuri as a string.
134      *
135      * Profiling shows us spending about 1-2% of total elapsed time in
136      * toString.
137      */
138     private String cachedCandidateURIString = null;
139     
140 
141     /***
142      * Frontier/Scheduler lifecycle info.
143      * This is an identifier set by the Frontier for its
144      * purposes. Usually its the name of the Frontier queue
145      * this URI gets queued to.  Values can be host + port
146      * or IP, etc.
147      */
148     private String classKey;
149 
150     /***
151      * Constructor.
152      * Protected access to block access to default constructor.
153      */
154     protected CandidateURI () {
155         super();
156     }
157     
158     /***
159      * @param u uuri instance this CandidateURI wraps.
160      */
161     public CandidateURI(UURI u) {
162         this.uuri = u;
163     }
164     
165     /***
166      * @param u uuri instance this CandidateURI wraps.
167      * @param pathFromSeed
168      * @param via
169      * @param viaContext
170      */
171     public CandidateURI(UURI u, String pathFromSeed, UURI via,
172             CharSequence viaContext) {
173         this.uuri = u;
174         this.pathFromSeed = pathFromSeed;
175         this.via = via;
176         this.viaContext = viaContext;
177     }
178 
179     /***
180      * Set the <tt>isSeed</tt> attribute of this URI.
181      * @param b Is this URI a seed, true or false.
182      */
183     public void setIsSeed(boolean b) {
184         this.isSeed = b;
185         if (this.isSeed) {
186             if(pathFromSeed==null) {
187                 this.pathFromSeed = "";
188             }
189 //          seeds created on redirect must have a via to be recognized; don't clear
190 //            setVia(null);
191         }
192     }
193 
194     /***
195      * @return UURI
196      */
197     public UURI getUURI() {
198         return this.uuri;
199     }
200 
201     /***
202      * @return Whether seeded.
203      */
204     public boolean isSeed() {
205         return this.isSeed;
206     }
207 
208     /***
209      * @return path (hop-types) from seed
210      */
211     public String getPathFromSeed() {
212         return this.pathFromSeed;
213     }
214 
215     /***
216      * @return URI via which this one was discovered
217      */
218     public UURI getVia() {
219         return this.via;
220     }
221 
222     /***
223      * @return CharSequence context in which this one was discovered
224      */
225     public CharSequence getViaContext() {
226         return this.viaContext;
227     }
228     
229     /***
230      * @param string
231      */
232     protected void setPathFromSeed(String string) {
233         pathFromSeed = string;
234     }
235     
236     /***
237      * Called when making a copy of another CandidateURI.
238      * @param alist AList to use.
239      */
240     protected void setAList(AList alist) {
241         this.alist = alist;
242     }
243 
244     public void setVia(UURI via) {
245         this.via = via;
246     }
247 
248     /***
249      * @return This candidate URI as a string wrapped with 'CandidateURI(' +
250      * ')'.
251      */
252     public synchronized String getCandidateURIString() {
253         if (this.cachedCandidateURIString == null) {
254             this.cachedCandidateURIString =
255                 "CandidateURI(" + toString() + ")";
256         }
257         return this.cachedCandidateURIString;
258     }
259 
260     /***
261      * Method returns string version of this URI's referral URI.
262      * @return String version of referral URI
263      */
264     public String flattenVia() {
265         return (via == null)? "": via.toString();
266     }
267     
268     /***
269      * @return The UURI this CandidateURI wraps as a string 
270      * (We used return what {@link #getCandidateURIString()}
271      * returns on a toString -- use that method if you still need
272      * this functionality).
273      * @see #getCandidateURIString()
274      */
275     public String toString() {
276         return getURIString();
277     }
278 
279     /***
280      * @return URI String
281      * @deprecated Use {@link #toString()}.
282      */
283     public String getURIString() {
284         return getUURI().toString();
285     }
286 
287     /***
288      * Compares the domain of this CandidateURI with that of another
289      * CandidateURI
290      *
291      * @param other The other CandidateURI
292      *
293      * @return True if both are in the same domain, false otherwise.
294      * @throws URIException
295      */
296     public boolean sameDomainAs(CandidateURI other) throws URIException {
297         String domain = getUURI().getHost();
298         if (domain == null) {
299             return false;
300         }
301         while(domain.lastIndexOf('.') > domain.indexOf('.')) {
302             // While has more than one dot, lop off first segment
303             domain = domain.substring(domain.indexOf('.') + 1);
304         }
305         if(other.getUURI().getHost() == null) {
306             return false;
307         }
308         return other.getUURI().getHost().endsWith(domain);
309     }
310 
311     /***
312      * If this method returns true, this URI should be fetched even though
313      * it already has been crawled. This also implies
314      * that this URI will be scheduled for crawl before any other waiting
315      * URIs for the same host.
316      *
317      * This value is used to refetch any expired robots.txt or dns-lookups.
318      *
319      * @return true if crawling of this URI should be forced
320      */
321     public boolean forceFetch() {
322         return forceRevisit;
323     }
324 
325    /***
326      * Method to signal that this URI should be fetched even though
327      * it already has been crawled. Setting this to true also implies
328      * that this URI will be scheduled for crawl before any other waiting
329      * URIs for the same host.
330      *
331      * This value is used to refetch any expired robots.txt or dns-lookups.
332      *
333      * @param b set to true to enforce the crawling of this URI
334      */
335     public void setForceFetch(boolean b) {
336         forceRevisit = b;
337     }
338 
339     /***
340      * @return Returns the schedulingDirective.
341      */
342     public int getSchedulingDirective() {
343         return schedulingDirective;
344     }
345     /*** 
346      * @param schedulingDirective The schedulingDirective to set.
347      */
348     public void setSchedulingDirective(int schedulingDirective) {
349         this.schedulingDirective = schedulingDirective;
350     }
351 
352 
353     /***
354      * @return True if needs immediate scheduling.
355      */
356     public boolean needsImmediateScheduling() {
357         return schedulingDirective == HIGH;
358     }
359 
360     /***
361      * @return True if needs soon but not top scheduling.
362      */
363     public boolean needsSoonScheduling() {
364         return schedulingDirective == MEDIUM;
365     }
366 
367     /***
368      * Tally up the number of transitive (non-simple-link) hops at
369      * the end of this CandidateURI's pathFromSeed.
370      * 
371      * In some cases, URIs with greater than zero but less than some
372      * threshold such hops are treated specially. 
373      * 
374      * <p>TODO: consider moving link-count in here as well, caching
375      * calculation, and refactoring CrawlScope.exceedsMaxHops() to use this. 
376      * 
377      * @return Transhop count.
378      */
379     public int getTransHops() {
380         String path = getPathFromSeed();
381         int transCount = 0;
382         for(int i=path.length()-1;i>=0;i--) {
383             if(path.charAt(i)==Link.NAVLINK_HOP) {
384                 break;
385             }
386             transCount++;
387         }
388         return transCount;
389     }
390 
391     /***
392      * Given a string containing a URI, then optional whitespace
393      * delimited hops-path and via info, create a CandidateURI 
394      * instance.
395      * 
396      * @param uriHopsViaString String with a URI.
397      * @return A CandidateURI made from passed <code>uriHopsViaString</code>.
398      * @throws URIException
399      */
400     public static CandidateURI fromString(String uriHopsViaString)
401             throws URIException {
402         String args[] = uriHopsViaString.split("//s+");
403         String pathFromSeeds = (args.length > 1 && !args[1].equals("-")) ?
404                 args[1]: "";
405         UURI via = (args.length > 2 && !args[2].equals("-")) ?
406                 UURIFactory.getInstance(args[2]) : null;
407         CharSequence viaContext = (args.length > 3 && !args[3].equals("-")) ?
408                 args[2]: null;
409         return new CandidateURI(UURIFactory.getInstance(args[0]),
410                 pathFromSeeds, via, viaContext);
411     }
412     
413     public static CandidateURI createSeedCandidateURI(UURI uuri) {
414         CandidateURI c = new CandidateURI(uuri);
415         c.setIsSeed(true);
416         return c;
417     }
418     
419     /***
420      * Utility method for creation of CandidateURIs found extracting
421      * links from this CrawlURI.
422      * @param baseUURI BaseUURI for <code>link</code>.
423      * @param link Link to wrap CandidateURI in.
424      * @return New candidateURI wrapper around <code>link</code>.
425      * @throws URIException
426      */
427     public CandidateURI createCandidateURI(UURI baseUURI, Link link)
428     throws URIException {
429         UURI u = (link.getDestination() instanceof UURI)?
430             (UURI)link.getDestination():
431             UURIFactory.getInstance(baseUURI,
432                 link.getDestination().toString());
433         CandidateURI newCaURI = new CandidateURI(u, getPathFromSeed() + link.getHopType(),
434                 getUURI(), link.getContext());
435         newCaURI.inheritFrom(this);
436         return newCaURI;
437     }
438 
439     /***
440      * Utility method for creation of CandidateURIs found extracting
441      * links from this CrawlURI.
442      * @param baseUURI BaseUURI for <code>link</code>.
443      * @param link Link to wrap CandidateURI in.
444      * @param scheduling How new CandidateURI should be scheduled.
445      * @param seed True if this CandidateURI is a seed.
446      * @return New candidateURI wrapper around <code>link</code>.
447      * @throws URIException
448      */
449     public CandidateURI createCandidateURI(UURI baseUURI, Link link,
450         int scheduling, boolean seed)
451     throws URIException {
452         final CandidateURI caURI = createCandidateURI(baseUURI, link);
453         caURI.setSchedulingDirective(scheduling);
454         caURI.setIsSeed(seed);
455         return caURI;
456     }
457     
458     /***
459      * Inherit (copy) the relevant keys-values from the ancestor. 
460      * 
461      * @param ancestor
462      */
463     protected void inheritFrom(CandidateURI ancestor) {
464         List heritableKeys = (List) ancestor.getObject(A_HERITABLE_KEYS);
465         if(heritableKeys!=null) {
466             getAList().copyKeysFrom(heritableKeys.iterator(),ancestor.getAList());
467         }
468     }
469     
470     /***
471      * Get the token (usually the hostname + port) which indicates
472      * what "class" this CrawlURI should be grouped with,
473      * for the purposes of ensuring only one item of the
474      * class is processed at once, all items of the class
475      * are held for a politeness period, etc.
476      *
477      * @return Token (usually the hostname) which indicates
478      * what "class" this CrawlURI should be grouped with.
479      */
480     public String getClassKey() {
481         return classKey;
482     }
483 
484     public void setClassKey(String key) {
485         classKey = key;
486     }
487     
488     /***
489      * Assumption is that only one thread at a time will ever be accessing
490      * a particular CandidateURI.
491      * 
492      * @return the attribute list.
493      */
494     public AList getAList() {
495         if (this.alist == null) {
496             this.alist = new HashtableAList();
497         }
498         return this.alist;
499     }
500     
501     protected void clearAList() {
502         this.alist = null;
503     }
504     
505     public void putObject(String key, Object value) {
506         getAList().putObject(key, value);
507     }
508     
509     public Object getObject(String key) {
510         return getAList().getObject(key);
511     }
512     
513     public String getString(String key) {
514         return getAList().getString(key);
515     }
516     
517     public void putString(String key, String value) {
518         getAList().putString(key, value);
519     }
520     
521     public long getLong(String key) {
522         return getAList().getLong(key);
523     }
524     
525     public void putLong(String key, long value) {
526         getAList().putLong(key, value);
527     }
528     
529     public int getInt(String key) {
530         return getAList().getInt(key);
531     }
532     
533     public void putInt(String key, int value) {
534         getAList().putInt(key, value);
535     }
536     
537     public boolean containsKey(String key) {
538         return getAList().containsKey(key);
539     }
540     
541     public void remove(String key) {
542         getAList().remove(key);
543     }
544     
545     public Iterator keys() {
546         return getAList().getKeys();
547     }
548     
549     /***
550      * @return True if this CandidateURI was result of a redirect:
551      * i.e. Its parent URI redirected to here, this URI was what was in 
552      * the 'Location:' or 'Content-Location:' HTTP Header.
553      */
554     public boolean isLocation() {
555         return this.pathFromSeed != null && this.pathFromSeed.length() > 0 &&
556             this.pathFromSeed.charAt(this.pathFromSeed.length() - 1) ==
557                 Link.REFER_HOP;
558     }
559 
560     /***
561      * Custom serialization writing 'uuri' and 'via' as Strings, rather
562      * than the bloated full serialization of their object classes, and 
563      * an empty alist as 'null'. Shrinks serialized form by 50% or more
564      * in short tests. 
565      * 
566      * @param stream
567      * @throws IOException
568      */
569     private void writeObject(ObjectOutputStream stream)
570         throws IOException {
571         stream.defaultWriteObject();
572         stream.writeUTF(uuri.toString());
573         stream.writeObject((via == null) ? null : via.getURI());
574         stream.writeObject((alist==null) ? null : alist);
575     }
576 
577     /***
578      * Custom deserialization to reconstruct UURI instances from more
579      * compact Strings. 
580      * 
581      * @param stream
582      * @throws IOException
583      * @throws ClassNotFoundException
584      */
585     private void readObject(ObjectInputStream stream)
586         throws IOException, ClassNotFoundException {
587         stream.defaultReadObject();
588         uuri = readUuri(stream.readUTF());
589         via = readUuri((String)stream.readObject());
590         alist = (AList) stream.readObject();
591     }
592 
593     /***
594      * Read a UURI from a String, handling a null or URIException
595      * 
596      * @param u String or null from which to create UURI
597      * @return the best UURI instance creatable
598      */
599     protected UURI readUuri(String u) {
600         if (u == null) {
601             return null;
602         }
603         try {
604             return UURIFactory.getInstance(u);
605         } catch (URIException ux) {
606             // simply continue to next try
607         }
608         try {
609             // try adding an junk scheme
610             return UURIFactory.getInstance("invalid:" + u);
611         } catch (URIException ux) {
612             ux.printStackTrace();
613             // ignored; method continues
614         }
615         try {
616             // return total junk
617             return UURIFactory.getInstance("invalid:");
618         } catch (URIException e) {
619             e.printStackTrace();
620             return null;
621         }
622     }
623     
624     //
625     // Reporter implementation
626     //
627 
628     public String singleLineReport() {
629         return ArchiveUtils.singleLineReport(this);
630     }
631     
632     public void singleLineReportTo(PrintWriter w) {
633         String className = this.getClass().getName();
634         className = className.substring(className.lastIndexOf(".")+1);
635         w.print(className);
636         w.print(" ");
637         w.print(getUURI().toString());
638         w.print(" ");
639         w.print(pathFromSeed);
640         w.print(" ");
641         w.print(flattenVia());
642     }
643 
644     /* (non-Javadoc)
645      * @see org.archive.util.Reporter#singleLineLegend()
646      */
647     public String singleLineLegend() {
648         return "className uri hopsPath viaUri";
649     }
650     
651     /* (non-Javadoc)
652      * @see org.archive.util.Reporter#getReports()
653      */
654     public String[] getReports() {
655         // none but default: empty options
656         return new String[] {};
657     }
658 
659     /* (non-Javadoc)
660      * @see org.archive.util.Reporter#reportTo(java.lang.String, java.io.Writer)
661      */
662     public void reportTo(String name, PrintWriter writer) {
663         singleLineReportTo(writer);
664         writer.print("\n");
665     }
666 
667     /* (non-Javadoc)
668      * @see org.archive.util.Reporter#reportTo(java.io.Writer)
669      */
670     public void reportTo(PrintWriter writer) throws IOException {
671         reportTo(null,writer);
672     }
673 
674     /*** Make the given key 'heritable', meaning its value will be 
675      * added to descendant CandidateURIs. Only keys with immutable
676      * values should be made heritable -- the value instance may 
677      * be shared until the AList is serialized/deserialized. 
678      * 
679      * @param key to make heritable
680      */
681     public void makeHeritable(String key) {
682         @SuppressWarnings("unchecked")
683         List<String> heritableKeys = (List<String>) getObject(A_HERITABLE_KEYS);
684         if(heritableKeys==null) {
685             heritableKeys = new ArrayList<String>();
686             heritableKeys.add(A_HERITABLE_KEYS);
687             putObject(A_HERITABLE_KEYS,heritableKeys);
688         }
689         heritableKeys.add(key);
690     }
691     
692     /*** Make the given key non-'heritable', meaning its value will 
693      * not be added to descendant CandidateURIs. Only meaningful if
694      * key was previously made heritable.  
695      * 
696      * @param key to make non-heritable
697      */
698     public void makeNonHeritable(String key) {
699         List heritableKeys = (List) getObject(A_HERITABLE_KEYS);
700         if(heritableKeys==null) {
701             return;
702         }
703         heritableKeys.remove(key);
704         if(heritableKeys.size()==1) {
705             // only remaining heritable key is itself; disable completely
706             remove(A_HERITABLE_KEYS);
707         }
708     }
709 }