1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.extractor;
26
27 import java.io.Serializable;
28
29
/**
 * Link represents one discovered "edge" of the web graph: the source
 * URI, the destination URI, and the type of reference (represented by the
 * context in which it was found).
 *
 * As such, it is a suitably generic item to be returned from generic
 * link-extraction utility code.
 *
 * Instances hold only the four values passed to the constructor; the
 * class exposes no setters.
 *
 * @author gojomo
 */
public class Link implements Serializable {

    private static final long serialVersionUID = 7660959085498739376L;

    // NOTE: the explicit intern() calls below are retained from the original
    // declarations; they keep these fields from being compile-time constant
    // expressions, so the way callers bind to them is unchanged.

    /** stand-in value for embeds without other context */
    public static final String EMBED_MISC = "=EMBED_MISC".intern();
    /** stand-in value for js-discovered urls without other context */
    public static final String JS_MISC = "=JS_MISC".intern();
    /** stand-in value for navlink urls without other context */
    public static final String NAVLINK_MISC = "=NAVLINK_MISC".intern();
    /** stand-in value for speculative/aggressively extracted urls without other context */
    public static final String SPECULATIVE_MISC = "=SPECULATIVE_MISC".intern();
    /** stand-in value for prerequisite without other context */
    public static final String PREREQ_MISC = "=PREREQ_MISC".intern();

    /** navigation links, like A/@HREF */
    public static final char NAVLINK_HOP = 'L';
    /** implied prerequisite links, like dns or robots */
    public static final char PREREQ_HOP = 'P';
    /** embedded links necessary to render the page, like IMG/@SRC */
    public static final char EMBED_HOP = 'E';
    /** speculative/aggressively extracted links, perhaps embed or nav, as in javascript */
    public static final char SPECULATIVE_HOP = 'X';
    /** referral/redirect links, like header 'Location:' on a 301/302 response */
    public static final char REFER_HOP = 'R';

    /** URI where this Link was discovered */
    private final CharSequence source;
    /** URI (absolute) where this Link points */
    private final CharSequence destination;
    /**
     * context of discovery -- will be an XPath-like element[/@attribute]
     * fragment for HTML URIs, a header name with trailing ':' for header
     * values, or one of the stand-in constants when other context is
     * unavailable
     */
    private final CharSequence context;
    /** hop-type, as character abbreviation (see the *_HOP constants) */
    private final char hopType;

    /**
     * Create a Link with the given fields. The arguments are stored as
     * given; no validation or copying is performed.
     *
     * @param source URI where the link was discovered
     * @param destination URI the link points to
     * @param context context of discovery
     * @param hopType hop-type character
     */
    public Link(CharSequence source, CharSequence destination,
            CharSequence context, char hopType) {
        this.source = source;
        this.destination = destination;
        this.context = context;
        this.hopType = hopType;
    }

    /**
     * @return Returns the context.
     */
    public CharSequence getContext() {
        return context;
    }

    /**
     * @return Returns the destination.
     */
    public CharSequence getDestination() {
        return destination;
    }

    /**
     * @return Returns the source.
     */
    public CharSequence getSource() {
        return source;
    }

    /**
     * @return char hopType
     */
    public char getHopType() {
        return hopType;
    }

    /**
     * Create a suitable XPath-like context from an element name and optional
     * attribute name.
     *
     * With an attribute the result is <code>element/@attribute</code>. When
     * the attribute is null the element name alone is returned, so the
     * element is not lost; the empty string is returned only when the
     * element is null as well.
     *
     * @param element element name
     * @param attribute attribute name; may be null
     * @return CharSequence context
     */
    public static CharSequence elementContext(CharSequence element, CharSequence attribute) {
        if (attribute == null) {
            // No attribute: fall back to the bare element name rather than
            // discarding the element entirely.
            return element == null ? "" : element.toString();
        }
        return element + "/@" + attribute;
    }

    /**
     * @return the destination, hop-type character and context, separated
     *         by single spaces
     */
    @Override
    public String toString() {
        return this.destination + " " + this.hopType + " " + this.context;
    }
}