1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.datamodel;
25
26 import java.io.IOException;
27 import java.io.ObjectInputStream;
28 import java.io.ObjectOutputStream;
29 import java.io.PrintWriter;
30 import java.io.Serializable;
31 import java.util.ArrayList;
32 import java.util.Iterator;
33 import java.util.List;
34
35 import org.apache.commons.httpclient.URIException;
36 import org.archive.crawler.extractor.Link;
37 import org.archive.net.UURI;
38 import org.archive.net.UURIFactory;
39 import org.archive.util.ArchiveUtils;
40 import org.archive.util.Reporter;
41
42 import st.ata.util.AList;
43 import st.ata.util.HashtableAList;
44
45 /***
46 * A URI, discovered or passed-in, that may be scheduled.
47 * When scheduled, a CandidateURI becomes a {@link CrawlURI}
48 * made with the data contained herein. A CandidateURI
49 * contains just the fields necessary to perform quick in-scope analysis.
50 *
51 * <p>Has a flexible attribute list that will be promoted into
52 * any {@link CrawlURI} created from this CandidateURI. Use it
53 * to add custom data or state needed later doing custom processing.
54 * See accessors/setters {@link #putString(String, String)},
55 * {@link #getString(String)}, etc.
56 *
57 * @author Gordon Mohr
58 */
59 public class CandidateURI
60 implements Serializable, Reporter, CoreAttributeConstants {
61 private static final long serialVersionUID = -7152937921526560388L;
62
63 /*** Highest scheduling priority.
64 * Before any others of its class.
65 */
66 public static final int HIGHEST = 0;
67
68 /*** High scheduling priority.
69 * After any {@link #HIGHEST}.
70 */
71 public static final int HIGH = 1;
72
73 /*** Medium priority.
74 * After any {@link #HIGH}.
75 */
76 public static final int MEDIUM = 2;
77
78 /*** Normal/low priority.
79 * Whenever/end of queue.
80 */
81 public static final int NORMAL = 3;
82
83 private int schedulingDirective = NORMAL;
84
85 /***
86 * Usuable URI under consideration. Transient to allow
87 * more efficient custom serialization
88 */
89 private transient UURI uuri;
90
91 /*** Seed status */
92 private boolean isSeed = false;
93
94 private boolean forceRevisit = false;
95
96 /*** String of letters indicating how this URI was reached from a seed.
97 * <pre>
98 * P precondition
99 * R redirection
100 * E embedded (as frame, src, link, codebase, etc.)
101 * X speculative embed (as from javascript, some alternate-format extractors
102 * L link</pre>
103 * For example LLLE (an embedded image on a page 3 links from seed).
104 */
105 private String pathFromSeed;
106
107 /***
108 * Where this URI was (presently) discovered. . Transient to allow
109 * more efficient custom serialization
110 */
111 private transient UURI via;
112
113 /***
114 * Context of URI's discovery, as per the 'context' in Link
115 */
116 private CharSequence viaContext;
117
118 /***
119 * Flexible dynamic attributes list.
120 * <p>
121 * The attribute list is a flexible map of key/value pairs for storing
122 * status of this URI for use by other processors. By convention the
123 * attribute list is keyed by constants found in the
124 * {@link CoreAttributeConstants} interface. Use this list to carry
125 * data or state produced by custom processors rather change the
126 * classes {@link CrawlURI} or this class, CandidateURI.
127 *
128 * Transient to allow more efficient custom serialization.
129 */
130 private transient AList alist;
131
132 /***
133 * Cache of this candidate uuri as a string.
134 *
135 * Profiling shows us spending about 1-2% of total elapsed time in
136 * toString.
137 */
138 private String cachedCandidateURIString = null;
139
140
141 /***
142 * Frontier/Scheduler lifecycle info.
143 * This is an identifier set by the Frontier for its
144 * purposes. Usually its the name of the Frontier queue
145 * this URI gets queued to. Values can be host + port
146 * or IP, etc.
147 */
148 private String classKey;
149
150 /***
151 * Constructor.
152 * Protected access to block access to default constructor.
153 */
154 protected CandidateURI () {
155 super();
156 }
157
158 /***
159 * @param u uuri instance this CandidateURI wraps.
160 */
161 public CandidateURI(UURI u) {
162 this.uuri = u;
163 }
164
165 /***
166 * @param u uuri instance this CandidateURI wraps.
167 * @param pathFromSeed
168 * @param via
169 * @param viaContext
170 */
171 public CandidateURI(UURI u, String pathFromSeed, UURI via,
172 CharSequence viaContext) {
173 this.uuri = u;
174 this.pathFromSeed = pathFromSeed;
175 this.via = via;
176 this.viaContext = viaContext;
177 }
178
179 /***
180 * Set the <tt>isSeed</tt> attribute of this URI.
181 * @param b Is this URI a seed, true or false.
182 */
183 public void setIsSeed(boolean b) {
184 this.isSeed = b;
185 if (this.isSeed) {
186 if(pathFromSeed==null) {
187 this.pathFromSeed = "";
188 }
189
190
191 }
192 }
193
194 /***
195 * @return UURI
196 */
197 public UURI getUURI() {
198 return this.uuri;
199 }
200
201 /***
202 * @return Whether seeded.
203 */
204 public boolean isSeed() {
205 return this.isSeed;
206 }
207
208 /***
209 * @return path (hop-types) from seed
210 */
211 public String getPathFromSeed() {
212 return this.pathFromSeed;
213 }
214
215 /***
216 * @return URI via which this one was discovered
217 */
218 public UURI getVia() {
219 return this.via;
220 }
221
222 /***
223 * @return CharSequence context in which this one was discovered
224 */
225 public CharSequence getViaContext() {
226 return this.viaContext;
227 }
228
229 /***
230 * @param string
231 */
232 protected void setPathFromSeed(String string) {
233 pathFromSeed = string;
234 }
235
236 /***
237 * Called when making a copy of another CandidateURI.
238 * @param alist AList to use.
239 */
240 protected void setAList(AList alist) {
241 this.alist = alist;
242 }
243
244 public void setVia(UURI via) {
245 this.via = via;
246 }
247
248 /***
249 * @return This candidate URI as a string wrapped with 'CandidateURI(' +
250 * ')'.
251 */
252 public synchronized String getCandidateURIString() {
253 if (this.cachedCandidateURIString == null) {
254 this.cachedCandidateURIString =
255 "CandidateURI(" + toString() + ")";
256 }
257 return this.cachedCandidateURIString;
258 }
259
260 /***
261 * Method returns string version of this URI's referral URI.
262 * @return String version of referral URI
263 */
264 public String flattenVia() {
265 return (via == null)? "": via.toString();
266 }
267
268 /***
269 * @return The UURI this CandidateURI wraps as a string
270 * (We used return what {@link #getCandidateURIString()}
271 * returns on a toString -- use that method if you still need
272 * this functionality).
273 * @see #getCandidateURIString()
274 */
275 public String toString() {
276 return getURIString();
277 }
278
279 /***
280 * @return URI String
281 * @deprecated Use {@link #toString()}.
282 */
283 public String getURIString() {
284 return getUURI().toString();
285 }
286
287 /***
288 * Compares the domain of this CandidateURI with that of another
289 * CandidateURI
290 *
291 * @param other The other CandidateURI
292 *
293 * @return True if both are in the same domain, false otherwise.
294 * @throws URIException
295 */
296 public boolean sameDomainAs(CandidateURI other) throws URIException {
297 String domain = getUURI().getHost();
298 if (domain == null) {
299 return false;
300 }
301 while(domain.lastIndexOf('.') > domain.indexOf('.')) {
302
303 domain = domain.substring(domain.indexOf('.') + 1);
304 }
305 if(other.getUURI().getHost() == null) {
306 return false;
307 }
308 return other.getUURI().getHost().endsWith(domain);
309 }
310
311 /***
312 * If this method returns true, this URI should be fetched even though
313 * it already has been crawled. This also implies
314 * that this URI will be scheduled for crawl before any other waiting
315 * URIs for the same host.
316 *
317 * This value is used to refetch any expired robots.txt or dns-lookups.
318 *
319 * @return true if crawling of this URI should be forced
320 */
321 public boolean forceFetch() {
322 return forceRevisit;
323 }
324
325 /***
326 * Method to signal that this URI should be fetched even though
327 * it already has been crawled. Setting this to true also implies
328 * that this URI will be scheduled for crawl before any other waiting
329 * URIs for the same host.
330 *
331 * This value is used to refetch any expired robots.txt or dns-lookups.
332 *
333 * @param b set to true to enforce the crawling of this URI
334 */
335 public void setForceFetch(boolean b) {
336 forceRevisit = b;
337 }
338
339 /***
340 * @return Returns the schedulingDirective.
341 */
342 public int getSchedulingDirective() {
343 return schedulingDirective;
344 }
345 /***
346 * @param schedulingDirective The schedulingDirective to set.
347 */
348 public void setSchedulingDirective(int schedulingDirective) {
349 this.schedulingDirective = schedulingDirective;
350 }
351
352
353 /***
354 * @return True if needs immediate scheduling.
355 */
356 public boolean needsImmediateScheduling() {
357 return schedulingDirective == HIGH;
358 }
359
360 /***
361 * @return True if needs soon but not top scheduling.
362 */
363 public boolean needsSoonScheduling() {
364 return schedulingDirective == MEDIUM;
365 }
366
367 /***
368 * Tally up the number of transitive (non-simple-link) hops at
369 * the end of this CandidateURI's pathFromSeed.
370 *
371 * In some cases, URIs with greater than zero but less than some
372 * threshold such hops are treated specially.
373 *
374 * <p>TODO: consider moving link-count in here as well, caching
375 * calculation, and refactoring CrawlScope.exceedsMaxHops() to use this.
376 *
377 * @return Transhop count.
378 */
379 public int getTransHops() {
380 String path = getPathFromSeed();
381 int transCount = 0;
382 for(int i=path.length()-1;i>=0;i--) {
383 if(path.charAt(i)==Link.NAVLINK_HOP) {
384 break;
385 }
386 transCount++;
387 }
388 return transCount;
389 }
390
391 /***
392 * Given a string containing a URI, then optional whitespace
393 * delimited hops-path and via info, create a CandidateURI
394 * instance.
395 *
396 * @param uriHopsViaString String with a URI.
397 * @return A CandidateURI made from passed <code>uriHopsViaString</code>.
398 * @throws URIException
399 */
400 public static CandidateURI fromString(String uriHopsViaString)
401 throws URIException {
402 String args[] = uriHopsViaString.split("//s+");
403 String pathFromSeeds = (args.length > 1 && !args[1].equals("-")) ?
404 args[1]: "";
405 UURI via = (args.length > 2 && !args[2].equals("-")) ?
406 UURIFactory.getInstance(args[2]) : null;
407 CharSequence viaContext = (args.length > 3 && !args[3].equals("-")) ?
408 args[2]: null;
409 return new CandidateURI(UURIFactory.getInstance(args[0]),
410 pathFromSeeds, via, viaContext);
411 }
412
413 public static CandidateURI createSeedCandidateURI(UURI uuri) {
414 CandidateURI c = new CandidateURI(uuri);
415 c.setIsSeed(true);
416 return c;
417 }
418
419 /***
420 * Utility method for creation of CandidateURIs found extracting
421 * links from this CrawlURI.
422 * @param baseUURI BaseUURI for <code>link</code>.
423 * @param link Link to wrap CandidateURI in.
424 * @return New candidateURI wrapper around <code>link</code>.
425 * @throws URIException
426 */
427 public CandidateURI createCandidateURI(UURI baseUURI, Link link)
428 throws URIException {
429 UURI u = (link.getDestination() instanceof UURI)?
430 (UURI)link.getDestination():
431 UURIFactory.getInstance(baseUURI,
432 link.getDestination().toString());
433 CandidateURI newCaURI = new CandidateURI(u, getPathFromSeed() + link.getHopType(),
434 getUURI(), link.getContext());
435 newCaURI.inheritFrom(this);
436 return newCaURI;
437 }
438
439 /***
440 * Utility method for creation of CandidateURIs found extracting
441 * links from this CrawlURI.
442 * @param baseUURI BaseUURI for <code>link</code>.
443 * @param link Link to wrap CandidateURI in.
444 * @param scheduling How new CandidateURI should be scheduled.
445 * @param seed True if this CandidateURI is a seed.
446 * @return New candidateURI wrapper around <code>link</code>.
447 * @throws URIException
448 */
449 public CandidateURI createCandidateURI(UURI baseUURI, Link link,
450 int scheduling, boolean seed)
451 throws URIException {
452 final CandidateURI caURI = createCandidateURI(baseUURI, link);
453 caURI.setSchedulingDirective(scheduling);
454 caURI.setIsSeed(seed);
455 return caURI;
456 }
457
458 /***
459 * Inherit (copy) the relevant keys-values from the ancestor.
460 *
461 * @param ancestor
462 */
463 protected void inheritFrom(CandidateURI ancestor) {
464 List heritableKeys = (List) ancestor.getObject(A_HERITABLE_KEYS);
465 if(heritableKeys!=null) {
466 getAList().copyKeysFrom(heritableKeys.iterator(),ancestor.getAList());
467 }
468 }
469
470 /***
471 * Get the token (usually the hostname + port) which indicates
472 * what "class" this CrawlURI should be grouped with,
473 * for the purposes of ensuring only one item of the
474 * class is processed at once, all items of the class
475 * are held for a politeness period, etc.
476 *
477 * @return Token (usually the hostname) which indicates
478 * what "class" this CrawlURI should be grouped with.
479 */
480 public String getClassKey() {
481 return classKey;
482 }
483
484 public void setClassKey(String key) {
485 classKey = key;
486 }
487
488 /***
489 * Assumption is that only one thread at a time will ever be accessing
490 * a particular CandidateURI.
491 *
492 * @return the attribute list.
493 */
494 public AList getAList() {
495 if (this.alist == null) {
496 this.alist = new HashtableAList();
497 }
498 return this.alist;
499 }
500
501 protected void clearAList() {
502 this.alist = null;
503 }
504
505 public void putObject(String key, Object value) {
506 getAList().putObject(key, value);
507 }
508
509 public Object getObject(String key) {
510 return getAList().getObject(key);
511 }
512
513 public String getString(String key) {
514 return getAList().getString(key);
515 }
516
517 public void putString(String key, String value) {
518 getAList().putString(key, value);
519 }
520
521 public long getLong(String key) {
522 return getAList().getLong(key);
523 }
524
525 public void putLong(String key, long value) {
526 getAList().putLong(key, value);
527 }
528
529 public int getInt(String key) {
530 return getAList().getInt(key);
531 }
532
533 public void putInt(String key, int value) {
534 getAList().putInt(key, value);
535 }
536
537 public boolean containsKey(String key) {
538 return getAList().containsKey(key);
539 }
540
541 public void remove(String key) {
542 getAList().remove(key);
543 }
544
545 public Iterator keys() {
546 return getAList().getKeys();
547 }
548
549 /***
550 * @return True if this CandidateURI was result of a redirect:
551 * i.e. Its parent URI redirected to here, this URI was what was in
552 * the 'Location:' or 'Content-Location:' HTTP Header.
553 */
554 public boolean isLocation() {
555 return this.pathFromSeed != null && this.pathFromSeed.length() > 0 &&
556 this.pathFromSeed.charAt(this.pathFromSeed.length() - 1) ==
557 Link.REFER_HOP;
558 }
559
560 /***
561 * Custom serialization writing 'uuri' and 'via' as Strings, rather
562 * than the bloated full serialization of their object classes, and
563 * an empty alist as 'null'. Shrinks serialized form by 50% or more
564 * in short tests.
565 *
566 * @param stream
567 * @throws IOException
568 */
569 private void writeObject(ObjectOutputStream stream)
570 throws IOException {
571 stream.defaultWriteObject();
572 stream.writeUTF(uuri.toString());
573 stream.writeObject((via == null) ? null : via.getURI());
574 stream.writeObject((alist==null) ? null : alist);
575 }
576
577 /***
578 * Custom deserialization to reconstruct UURI instances from more
579 * compact Strings.
580 *
581 * @param stream
582 * @throws IOException
583 * @throws ClassNotFoundException
584 */
585 private void readObject(ObjectInputStream stream)
586 throws IOException, ClassNotFoundException {
587 stream.defaultReadObject();
588 uuri = readUuri(stream.readUTF());
589 via = readUuri((String)stream.readObject());
590 alist = (AList) stream.readObject();
591 }
592
593 /***
594 * Read a UURI from a String, handling a null or URIException
595 *
596 * @param u String or null from which to create UURI
597 * @return the best UURI instance creatable
598 */
599 protected UURI readUuri(String u) {
600 if (u == null) {
601 return null;
602 }
603 try {
604 return UURIFactory.getInstance(u);
605 } catch (URIException ux) {
606
607 }
608 try {
609
610 return UURIFactory.getInstance("invalid:" + u);
611 } catch (URIException ux) {
612 ux.printStackTrace();
613
614 }
615 try {
616
617 return UURIFactory.getInstance("invalid:");
618 } catch (URIException e) {
619 e.printStackTrace();
620 return null;
621 }
622 }
623
624
625
626
627
628 public String singleLineReport() {
629 return ArchiveUtils.singleLineReport(this);
630 }
631
632 public void singleLineReportTo(PrintWriter w) {
633 String className = this.getClass().getName();
634 className = className.substring(className.lastIndexOf(".")+1);
635 w.print(className);
636 w.print(" ");
637 w.print(getUURI().toString());
638 w.print(" ");
639 w.print(pathFromSeed);
640 w.print(" ");
641 w.print(flattenVia());
642 }
643
644
645
646
647 public String singleLineLegend() {
648 return "className uri hopsPath viaUri";
649 }
650
651
652
653
654 public String[] getReports() {
655
656 return new String[] {};
657 }
658
659
660
661
662 public void reportTo(String name, PrintWriter writer) {
663 singleLineReportTo(writer);
664 writer.print("\n");
665 }
666
667
668
669
670 public void reportTo(PrintWriter writer) throws IOException {
671 reportTo(null,writer);
672 }
673
674 /*** Make the given key 'heritable', meaning its value will be
675 * added to descendant CandidateURIs. Only keys with immutable
676 * values should be made heritable -- the value instance may
677 * be shared until the AList is serialized/deserialized.
678 *
679 * @param key to make heritable
680 */
681 public void makeHeritable(String key) {
682 @SuppressWarnings("unchecked")
683 List<String> heritableKeys = (List<String>) getObject(A_HERITABLE_KEYS);
684 if(heritableKeys==null) {
685 heritableKeys = new ArrayList<String>();
686 heritableKeys.add(A_HERITABLE_KEYS);
687 putObject(A_HERITABLE_KEYS,heritableKeys);
688 }
689 heritableKeys.add(key);
690 }
691
692 /*** Make the given key non-'heritable', meaning its value will
693 * not be added to descendant CandidateURIs. Only meaningful if
694 * key was previously made heritable.
695 *
696 * @param key to make non-heritable
697 */
698 public void makeNonHeritable(String key) {
699 List heritableKeys = (List) getObject(A_HERITABLE_KEYS);
700 if(heritableKeys==null) {
701 return;
702 }
703 heritableKeys.remove(key);
704 if(heritableKeys.size()==1) {
705
706 remove(A_HERITABLE_KEYS);
707 }
708 }
709 }