View Javadoc

1   /* Link
2   *
3   * $Id: Link.java 6773 2010-02-18 01:52:41Z szznax $
4   *
5   * Created on Mar 7, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */ 
25  package org.archive.crawler.extractor;
26  
27  import java.io.Serializable;
28  
29  
/**
 * Link represents one discovered "edge" of the web graph: the source
 * URI, the destination URI, and the type of reference (represented by the
 * context in which it was found).
 *
 * As such, it is a suitably generic item to be returned from generic
 * link-extraction utility code.
 *
 * Instances are immutable once constructed: all fields are assigned in the
 * constructor and only exposed through getters.
 *
 * @author gojomo
 */
public class Link implements Serializable {

    private static final long serialVersionUID = 7660959085498739376L;

    /* contexts for when another syntax (XPath-like or header-based)
     * is unavailable */
    /** stand-in value for embeds without other context */
    public static final String EMBED_MISC = "=EMBED_MISC".intern();
    /** stand-in value for js-discovered urls without other context */
    public static final String JS_MISC = "=JS_MISC".intern();
    /** stand-in value for navlink urls without other context */
    public static final String NAVLINK_MISC = "=NAVLINK_MISC".intern();
    /** stand-in value for speculative/aggressively extracted urls without other context */
    public static final String SPECULATIVE_MISC = "=SPECULATIVE_MISC".intern();
    /** stand-in value for prerequisite without other context */
    public static final String PREREQ_MISC = "=PREREQ_MISC".intern();

    /* hop types */
    /** navigation links, like A/@HREF */
    public static final char NAVLINK_HOP = 'L'; // TODO: change to 'N' to avoid 'L'ink confusion?
    /** implied prerequisite links, like dns or robots */
    public static final char PREREQ_HOP = 'P';
    /** embedded links necessary to render the page, like IMG/@SRC */
    public static final char EMBED_HOP = 'E';
    /** speculative/aggressively extracted links, perhaps embed or nav, as in javascript */
    public static final char SPECULATIVE_HOP = 'X';
    /** referral/redirect links, like header 'Location:' on a 301/302 response */
    public static final char REFER_HOP = 'R';

    /** URI where this Link was discovered */
    private final CharSequence source;
    /** URI (absolute) where this Link points */
    private final CharSequence destination;
    /** context of discovery -- will be an XPath-like element[/@attribute]
     * fragment for HTML URIs, a header name with trailing ':' for header
     * values, or one of the stand-in constants when other context is
     * unavailable */
    private final CharSequence context;
    /** hop-type, as character abbreviation (see the *_HOP constants) */
    private final char hopType;

    /**
     * Create a Link with the given fields. The arguments are stored as
     * given; no validation or copying is performed.
     *
     * @param source URI where the link was discovered
     * @param destination URI the link points to
     * @param context context of discovery (see {@link #getContext()})
     * @param hopType hop-type character, e.g. {@link #NAVLINK_HOP}
     */
    public Link(CharSequence source, CharSequence destination,
            CharSequence context, char hopType) {
        this.source = source;
        this.destination = destination;
        this.context = context;
        this.hopType = hopType;
    }

    /**
     * @return Returns the context.
     */
    public CharSequence getContext() {
        return context;
    }

    /**
     * @return Returns the destination.
     */
    public CharSequence getDestination() {
        return destination;
    }

    /**
     * @return Returns the source.
     */
    public CharSequence getSource() {
        return source;
    }

    /**
     * @return char hopType
     */
    public char getHopType() {
        return hopType;
    }

    /**
     * Create a suitable XPath-like context from an element name and optional
     * attribute name.
     *
     * With an attribute the result is {@code element + "/@" + attribute};
     * without one (attribute is null) the result is the element name alone,
     * or the empty string if the element is null as well.
     *
     * @param element element name
     * @param attribute attribute name; may be null
     * @return CharSequence context
     */
    public static CharSequence elementContext(CharSequence element, CharSequence attribute) {
        if (attribute == null) {
            // The attribute is optional: keep the element name rather than
            // collapsing the whole context to "" and losing it.
            return element == null ? "" : element.toString();
        }
        return element + "/@" + attribute;
    }

    /**
     * @return the destination, hop-type character and context, separated by
     * single spaces
     */
    @Override
    public String toString() {
        return this.destination + " " + this.hopType + " " + this.context;
    }
}