|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object org.archive.crawler.datamodel.CandidateURI
public class CandidateURI
A URI, discovered or passed-in, that may be scheduled.
When scheduled, a CandidateURI becomes a CrawlURI
made with the data contained herein. A CandidateURI
contains just the fields necessary to perform quick in-scope analysis.
Has a flexible attribute list that will be promoted into
any CrawlURI
created from this CandidateURI. Use it
to add custom data or state needed later when doing custom processing.
See accessors/setters putString(String, String), getString(String), etc.
Field Summary | |
---|---|
static int |
HIGH
High scheduling priority. |
static int |
HIGHEST
Highest scheduling priority. |
static int |
MEDIUM
Medium priority. |
static int |
NORMAL
Normal/low priority. |
Constructor Summary | |
---|---|
protected |
CandidateURI()
Constructor. |
|
CandidateURI(UURI u)
|
|
CandidateURI(UURI u,
java.lang.String pathFromSeed,
UURI via,
java.lang.CharSequence viaContext)
|
Method Summary | |
---|---|
protected void |
clearAList()
|
boolean |
containsKey(java.lang.String key)
|
CandidateURI |
createCandidateURI(UURI baseUURI,
Link link)
Utility method for creation of CandidateURIs found extracting links from this CrawlURI. |
CandidateURI |
createCandidateURI(UURI baseUURI,
Link link,
int scheduling,
boolean seed)
Utility method for creation of CandidateURIs found extracting links from this CrawlURI. |
static CandidateURI |
createSeedCandidateURI(UURI uuri)
|
java.lang.String |
flattenVia()
Method returns string version of this URI's referral URI. |
boolean |
forceFetch()
If this method returns true, this URI should be fetched even though it already has been crawled. |
static CandidateURI |
fromString(java.lang.String uriHopsViaString)
Given a string containing a URI, then optional whitespace delimited hops-path and via info, create a CandidateURI instance. |
st.ata.util.AList |
getAList()
Assumption is that only one thread at a time will ever be accessing a particular CandidateURI. |
java.lang.String |
getCandidateURIString()
|
java.lang.String |
getClassKey()
Get the token (usually the hostname + port) which indicates what "class" this CrawlURI should be grouped with, for the purposes of ensuring only one item of the class is processed at once, all items of the class are held for a politeness period, etc. |
int |
getInt(java.lang.String key)
|
long |
getLong(java.lang.String key)
|
java.lang.Object |
getObject(java.lang.String key)
|
java.lang.String |
getPathFromSeed()
|
java.lang.String[] |
getReports()
Get an array of report names offered by this Reporter. |
int |
getSchedulingDirective()
|
java.lang.String |
getString(java.lang.String key)
|
int |
getTransHops()
Tally up the number of transitive (non-simple-link) hops at the end of this CandidateURI's pathFromSeed. |
java.lang.String |
getURIString()
Deprecated. Use toString(). |
UURI |
getUURI()
|
UURI |
getVia()
|
java.lang.CharSequence |
getViaContext()
|
protected void |
inheritFrom(CandidateURI ancestor)
Inherit (copy) the relevant keys-values from the ancestor. |
boolean |
isLocation()
|
boolean |
isSeed()
|
java.util.Iterator |
keys()
|
void |
makeHeritable(java.lang.String key)
Make the given key 'heritable', meaning its value will be added to descendant CandidateURIs. |
void |
makeNonHeritable(java.lang.String key)
Make the given key non-'heritable', meaning its value will not be added to descendant CandidateURIs. |
boolean |
needsImmediateScheduling()
|
boolean |
needsSoonScheduling()
|
void |
putInt(java.lang.String key,
int value)
|
void |
putLong(java.lang.String key,
long value)
|
void |
putObject(java.lang.String key,
java.lang.Object value)
|
void |
putString(java.lang.String key,
java.lang.String value)
|
protected UURI |
readUuri(java.lang.String u)
Read a UURI from a String, handling a null or URIException |
void |
remove(java.lang.String key)
|
void |
reportTo(java.io.PrintWriter writer)
Make a default report to the passed-in Writer. |
void |
reportTo(java.lang.String name,
java.io.PrintWriter writer)
Make a report of the given name to the passed-in Writer. If the name is null, give the default report. |
boolean |
sameDomainAs(CandidateURI other)
Compares the domain of this CandidateURI with that of another CandidateURI |
protected void |
setAList(st.ata.util.AList alist)
Called when making a copy of another CandidateURI. |
void |
setClassKey(java.lang.String key)
|
void |
setForceFetch(boolean b)
Method to signal that this URI should be fetched even though it already has been crawled. |
void |
setIsSeed(boolean b)
Set the isSeed attribute of this URI. |
protected void |
setPathFromSeed(java.lang.String string)
|
void |
setSchedulingDirective(int schedulingDirective)
|
void |
setVia(UURI via)
|
java.lang.String |
singleLineLegend()
Return a legend for the single-line summary report as a String. |
java.lang.String |
singleLineReport()
Return a short single-line summary report as a String. |
void |
singleLineReportTo(java.io.PrintWriter w)
Make a single-line summary report to the passed-in writer |
java.lang.String |
toString()
|
Methods inherited from class java.lang.Object |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait |
Field Detail |
---|
public static final int HIGHEST
public static final int HIGH
HIGHEST
.
public static final int MEDIUM
HIGH
.
public static final int NORMAL
Constructor Detail |
---|
protected CandidateURI()
public CandidateURI(UURI u)
u
- uuri instance this CandidateURI wraps.
public CandidateURI(UURI u, java.lang.String pathFromSeed, UURI via, java.lang.CharSequence viaContext)
u
- uuri instance this CandidateURI wraps.
pathFromSeed
-
via
-
viaContext
-
Method Detail |
---|
public void setIsSeed(boolean b)
b
- Is this URI a seed, true or false.
public UURI getUURI()
public boolean isSeed()
public java.lang.String getPathFromSeed()
public UURI getVia()
public java.lang.CharSequence getViaContext()
protected void setPathFromSeed(java.lang.String string)
string
-
protected void setAList(st.ata.util.AList alist)
alist
- AList to use.
public void setVia(UURI via)
public java.lang.String getCandidateURIString()
public java.lang.String flattenVia()
public java.lang.String toString()
toString
in class java.lang.Object
getCandidateURIString()
returns on a toString -- use that method if you still need
this functionality).
getCandidateURIString()
public java.lang.String getURIString()
toString()
.
public boolean sameDomainAs(CandidateURI other) throws org.apache.commons.httpclient.URIException
other
- The other CandidateURI
org.apache.commons.httpclient.URIException
public boolean forceFetch()
public void setForceFetch(boolean b)
b
- set to true to enforce the crawling of this URI.
public int getSchedulingDirective()
public void setSchedulingDirective(int schedulingDirective)
schedulingDirective
- The schedulingDirective to set.
public boolean needsImmediateScheduling()
public boolean needsSoonScheduling()
public int getTransHops()
TODO: consider moving link-count in here as well, caching calculation, and refactoring CrawlScope.exceedsMaxHops() to use this.
public static CandidateURI fromString(java.lang.String uriHopsViaString) throws org.apache.commons.httpclient.URIException
uriHopsViaString
- String with a URI.
uriHopsViaString
.
org.apache.commons.httpclient.URIException
public static CandidateURI createSeedCandidateURI(UURI uuri)
public CandidateURI createCandidateURI(UURI baseUURI, Link link) throws org.apache.commons.httpclient.URIException
baseUURI
- BaseUURI for link.
link
- Link to wrap CandidateURI in.
link
.
org.apache.commons.httpclient.URIException
public CandidateURI createCandidateURI(UURI baseUURI, Link link, int scheduling, boolean seed) throws org.apache.commons.httpclient.URIException
baseUURI
- BaseUURI for link.
link
- Link to wrap CandidateURI in.
scheduling
- How new CandidateURI should be scheduled.
seed
- True if this CandidateURI is a seed.
link
.
org.apache.commons.httpclient.URIException
protected void inheritFrom(CandidateURI ancestor)
ancestor
-
public java.lang.String getClassKey()
public void setClassKey(java.lang.String key)
public st.ata.util.AList getAList()
protected void clearAList()
public void putObject(java.lang.String key, java.lang.Object value)
public java.lang.Object getObject(java.lang.String key)
public java.lang.String getString(java.lang.String key)
public void putString(java.lang.String key, java.lang.String value)
public long getLong(java.lang.String key)
public void putLong(java.lang.String key, long value)
public int getInt(java.lang.String key)
public void putInt(java.lang.String key, int value)
public boolean containsKey(java.lang.String key)
public void remove(java.lang.String key)
public java.util.Iterator keys()
public boolean isLocation()
protected UURI readUuri(java.lang.String u)
u
- String or null from which to create UURI
public java.lang.String singleLineReport()
Reporter
singleLineReport
in interface Reporter
public void singleLineReportTo(java.io.PrintWriter w)
Reporter
singleLineReportTo
in interface Reporter
w
- to receive report
public java.lang.String singleLineLegend()
Reporter
singleLineLegend
in interface Reporter
public java.lang.String[] getReports()
Reporter
getReports
in interface Reporter
public void reportTo(java.lang.String name, java.io.PrintWriter writer)
Reporter
reportTo
in interface Reporter
writer
- to receive report
public void reportTo(java.io.PrintWriter writer) throws java.io.IOException
Reporter
reportTo
in interface Reporter
writer
- to receive report
java.io.IOException
public void makeHeritable(java.lang.String key)
key
- to make heritable
public void makeNonHeritable(java.lang.String key)
key
- to make non-heritable
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |