|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object javax.management.Attribute org.archive.crawler.settings.Type org.archive.crawler.settings.ComplexType org.archive.crawler.settings.ModuleType org.archive.crawler.frontier.AbstractFrontier
public abstract class AbstractFrontier
Shared facilities for Frontier implementations.
Nested Class Summary |
---|
Nested classes/interfaces inherited from class org.archive.crawler.settings.ComplexType |
---|
ComplexType.MBeanAttributeInfoIterator |
Nested classes/interfaces inherited from interface org.archive.crawler.framework.Frontier |
---|
Frontier.FrontierGroup |
Field Summary | |
---|---|
protected static java.lang.String |
ACCEPTABLE_FORCE_QUEUE
|
static java.lang.String |
ATTR_DELAY_FACTOR
how many multiples of last fetch elapsed time to wait before recontacting same server |
static java.lang.String |
ATTR_FORCE_QUEUE
queue assignment to force onto CrawlURIs; intended to be overridden |
static java.lang.String |
ATTR_MAX_DELAY
never wait more than this long, regardless of multiple |
static java.lang.String |
ATTR_MAX_HOST_BANDWIDTH_USAGE
maximum per-host bandwidth usage |
static java.lang.String |
ATTR_MAX_OVERALL_BANDWIDTH_USAGE
maximum overall bandwidth usage |
static java.lang.String |
ATTR_MAX_RETRIES
maximum times to emit a CrawlURI without final disposition |
static java.lang.String |
ATTR_MIN_DELAY
always wait this long after one completion before recontacting same server, regardless of multiple |
static java.lang.String |
ATTR_PAUSE_AT_FINISH
whether to pause, rather than finish, when crawl appears done |
static java.lang.String |
ATTR_PAUSE_AT_START
whether to pause at crawl start |
static java.lang.String |
ATTR_PREFERENCE_EMBED_HOPS
number of hops of embeds (ERX) to bump to front of host queue |
static java.lang.String |
ATTR_QUEUE_ASSIGNMENT_POLICY
|
protected static java.lang.String |
ATTR_RECOVERY_ENABLED
Recover log on or off attribute. |
static java.lang.String |
ATTR_RESPECT_CRAWL_DELAY_UP_TO_SECS
Whether to respect a 'Crawl-Delay' (in seconds) given in a site's robots.txt |
static java.lang.String |
ATTR_RETRY_DELAY
for retryable problems, seconds to wait before a retry |
static java.lang.String |
ATTR_SOURCE_TAG_SEEDS
whether to tag seeds with their own URI as a heritable 'source' String |
protected CrawlController |
controller
|
protected static java.lang.Boolean |
DEFAULT_ATTR_RECOVERY_ENABLED
|
protected static java.lang.Float |
DEFAULT_DELAY_FACTOR
|
protected static java.lang.String |
DEFAULT_FORCE_QUEUE
|
protected static java.lang.Integer |
DEFAULT_MAX_DELAY
|
protected static java.lang.Integer |
DEFAULT_MAX_HOST_BANDWIDTH_USAGE
|
protected static java.lang.Integer |
DEFAULT_MAX_OVERALL_BANDWIDTH_USAGE
|
protected static java.lang.Integer |
DEFAULT_MAX_RETRIES
|
protected static java.lang.Integer |
DEFAULT_MIN_DELAY
|
protected static java.lang.Boolean |
DEFAULT_PAUSE_AT_FINISH
|
protected static java.lang.Boolean |
DEFAULT_PAUSE_AT_START
|
protected static java.lang.Integer |
DEFAULT_PREFERENCE_EMBED_HOPS
|
protected static java.lang.Integer |
DEFAULT_RESPECT_CRAWL_DELAY_UP_TO_SECS
|
protected static java.lang.Long |
DEFAULT_RETRY_DELAY
|
protected static java.lang.Boolean |
DEFAULT_SOURCE_TAG_SEEDS
|
protected long |
disregardedUriCount
|
protected long |
failedFetchCount
|
static java.lang.String |
IGNORED_SEEDS_FILENAME
file collecting report of ignored seed-file entries (if any) |
protected int |
lastMaxBandwidthKB
|
protected java.util.concurrent.atomic.AtomicLong |
liveDisregardedUriCount
URIs that are disregarded (for example because of robots.txt rules) |
protected java.util.concurrent.atomic.AtomicLong |
liveFailedFetchCount
|
protected java.util.concurrent.atomic.AtomicLong |
liveQueuedUriCount
total URIs queued to be visited |
protected java.util.concurrent.atomic.AtomicLong |
liveSucceededFetchCount
|
protected java.util.concurrent.atomic.AtomicLong |
nextOrdinal
ordinal numbers to assign to created CrawlURIs |
protected long |
processedBytesAfterLastEmittedURI
|
protected long |
queuedUriCount
|
protected boolean |
shouldPause
should the frontier hold any threads asking for URIs? |
protected boolean |
shouldTerminate
should the frontier send an EndedException to any threads asking for URIs? |
protected long |
succeededFetchCount
|
protected long |
totalProcessedBytes
Used when bandwidth constraints are used. |
Fields inherited from class org.archive.crawler.settings.ComplexType |
---|
definition, definitionMap |
Fields inherited from interface org.archive.crawler.framework.Frontier |
---|
ATTR_NAME |
Constructor Summary | |
---|---|
AbstractFrontier(java.lang.String name,
java.lang.String description)
|
Method Summary | |
---|---|
protected void |
applySpecialHandling(CrawlURI curi)
Perform any special handling of the CrawlURI, such as promoting its URI to seed-status, or preferencing it because it is an embed. |
protected CrawlURI |
asCrawlUri(CandidateURI caUri)
|
protected java.lang.String |
canonicalize(CandidateURI cauri)
Canonicalize passed CandidateURI. |
protected java.lang.String |
canonicalize(UURI uuri)
Canonicalize passed uuri. |
void |
crawlCheckpoint(java.io.File checkpointDir)
Called by CrawlController when checkpointing. |
void |
crawlEnded(java.lang.String sExitMessage)
Called when a CrawlController has ended a crawl and is about to exit. |
void |
crawlEnding(java.lang.String sExitMessage)
Called when a CrawlController is ending a crawl (for any reason) |
void |
crawlPaused(java.lang.String statusMessage)
Called when a CrawlController is actually paused (all threads are idle). |
void |
crawlPausing(java.lang.String statusMessage)
Called when a CrawlController is going to be paused. |
void |
crawlResuming(java.lang.String statusMessage)
Called when a CrawlController is resuming a crawl that had been paused. |
void |
crawlStarted(java.lang.String message)
Called on crawl start. |
protected void |
decrementQueuedCount(long numberOfDeletes)
Note that a number of queued Uris have been deleted. |
long |
disregardedUriCount()
Number of URIs that were scheduled at one point but have been disregarded. |
protected void |
doJournalAdded(CrawlURI c)
|
protected void |
doJournalDisregarded(CrawlURI c)
|
protected void |
doJournalEmitted(CrawlURI c)
|
protected void |
doJournalFinishedFailure(CrawlURI c)
|
protected void |
doJournalFinishedSuccess(CrawlURI c)
|
protected void |
doJournalRescheduled(CrawlURI c)
|
long |
failedFetchCount()
(non-Javadoc) |
long |
finishedUriCount()
(non-Javadoc) |
java.lang.String |
getClassKey(CandidateURI cauri)
|
FrontierJournal |
getFrontierJournal()
|
protected QueueAssignmentPolicy |
getQueueAssignmentPolicy(CandidateURI cauri)
|
protected CrawlServer |
getServer(CrawlURI curi)
|
void |
importRecoverLog(java.lang.String pathToLog,
boolean retainFailures)
Recover earlier state by reading a recovery log. |
protected void |
incrementDisregardedUriCount()
Increment the running count of disregarded URIs. |
protected void |
incrementFailedFetchCount()
Increment the running count of failed URIs. |
protected void |
incrementQueuedUriCount()
Increment the running count of queued URIs. |
protected void |
incrementQueuedUriCount(long increment)
Increment the running count of queued URIs. |
protected void |
incrementSucceededFetchCount()
Increment the running count of successfully fetched URIs. |
void |
initialize(CrawlController c)
Initialize the Frontier. |
protected boolean |
isDisregarded(CrawlURI curi)
|
boolean |
isEmpty()
Frontier is empty only if all queues are empty and no URIs are in-process |
void |
kickUpdate()
Notify Frontier that it should consider updating configuration info that may have changed in external files. |
void |
loadSeeds()
Load up the seeds. |
protected void |
log(CrawlURI curi)
Log to the main crawl.log |
protected void |
logLocalizedErrors(CrawlURI curi)
Take note of any processor-local errors that have been entered into the CrawlURI. |
protected boolean |
needsRetrying(CrawlURI curi)
Checks if a recently completed CrawlURI that did not finish successfully needs to be retried (processed again after some time elapses) |
protected void |
noteAboutToEmit(CrawlURI curi,
WorkQueue q)
Perform fixups on a CrawlURI about to be returned via next(). |
protected boolean |
overMaxRetries(CrawlURI curi)
|
void |
pause()
Notify Frontier that it should not release any URIs, instead holding all threads, until instructed otherwise. |
protected long |
politenessDelayFor(CrawlURI curi)
Update any scheduling structures with the new information in this CrawlURI. |
protected void |
preNext(long now)
|
long |
queuedUriCount()
(non-Javadoc) |
void |
reportTo(java.io.PrintWriter writer)
Make a default report to the passed-in Writer. |
protected long |
retryDelayFor(CrawlURI curi)
Return a suitable value to wait before retrying the given URI. |
static void |
saveIgnoredItems(java.lang.String ignoredItems,
java.io.File dir)
Dump ignored seed items (if any) to disk; delete file otherwise. |
protected java.io.File |
scratchDirFor(java.lang.String key)
Utility method to return a scratch dir for the given key's temp files. |
java.lang.String |
singleLineReport()
Return a short single-line summary report as a String. |
void |
start()
Request that Frontier allow crawling to begin. |
long |
succeededFetchCount()
(non-Javadoc) |
protected void |
tally(CrawlURI curi,
CrawlSubstats.Stage stage)
Report CrawlURI to each of the three 'substats' accumulators (group/queue, server, host) for a given stage. |
void |
terminate()
Notify Frontier that it should end the crawl, giving any worker ToeThread that asks for a next() an EndedException. |
long |
totalBytesWritten()
Deprecated. misnomer; use StatisticsTracking figures instead |
void |
unpause()
Resumes the release of URIs to crawl, allowing worker ToeThreads to proceed. |
Methods inherited from class org.archive.crawler.settings.ModuleType |
---|
addElement, listUsedFiles |
Methods inherited from class org.archive.crawler.settings.Type |
---|
addConstraint, equals, getConstraints, getLegalValueType, isExpertSetting, isOverrideable, isTransient, setExpertSetting, setLegalValueType, setOverrideable, setTransient |
Methods inherited from class javax.management.Attribute |
---|
getName, hashCode |
Methods inherited from class java.lang.Object |
---|
clone, finalize, getClass, notify, notifyAll, wait, wait, wait |
Methods inherited from interface org.archive.crawler.framework.Frontier |
---|
averageDepth, congestionRatio, considerIncluded, deepestUri, deleted, deleteURIs, deleteURIs, discoveredUriCount, finalTasks, finished, getGroup, getInitialMarker, getURIsList, next, schedule |
Methods inherited from interface org.archive.util.Reporter |
---|
getReports, reportTo, singleLineLegend, singleLineReportTo |
Field Detail |
---|
protected transient CrawlController controller
protected java.util.concurrent.atomic.AtomicLong nextOrdinal
protected boolean shouldPause
protected transient boolean shouldTerminate
public static final java.lang.String ATTR_DELAY_FACTOR
protected static final java.lang.Float DEFAULT_DELAY_FACTOR
public static final java.lang.String ATTR_MIN_DELAY
protected static final java.lang.Integer DEFAULT_MIN_DELAY
public static final java.lang.String ATTR_RESPECT_CRAWL_DELAY_UP_TO_SECS
protected static final java.lang.Integer DEFAULT_RESPECT_CRAWL_DELAY_UP_TO_SECS
public static final java.lang.String ATTR_MAX_DELAY
protected static final java.lang.Integer DEFAULT_MAX_DELAY
public static final java.lang.String ATTR_PREFERENCE_EMBED_HOPS
protected static final java.lang.Integer DEFAULT_PREFERENCE_EMBED_HOPS
public static final java.lang.String ATTR_MAX_HOST_BANDWIDTH_USAGE
protected static final java.lang.Integer DEFAULT_MAX_HOST_BANDWIDTH_USAGE
public static final java.lang.String ATTR_MAX_OVERALL_BANDWIDTH_USAGE
protected static final java.lang.Integer DEFAULT_MAX_OVERALL_BANDWIDTH_USAGE
public static final java.lang.String ATTR_RETRY_DELAY
protected static final java.lang.Long DEFAULT_RETRY_DELAY
public static final java.lang.String ATTR_MAX_RETRIES
protected static final java.lang.Integer DEFAULT_MAX_RETRIES
public static final java.lang.String ATTR_QUEUE_ASSIGNMENT_POLICY
public static final java.lang.String ATTR_FORCE_QUEUE
protected static final java.lang.String DEFAULT_FORCE_QUEUE
protected static final java.lang.String ACCEPTABLE_FORCE_QUEUE
public static final java.lang.String ATTR_PAUSE_AT_FINISH
protected static final java.lang.Boolean DEFAULT_PAUSE_AT_FINISH
public static final java.lang.String ATTR_PAUSE_AT_START
protected static final java.lang.Boolean DEFAULT_PAUSE_AT_START
public static final java.lang.String ATTR_SOURCE_TAG_SEEDS
protected static final java.lang.Boolean DEFAULT_SOURCE_TAG_SEEDS
protected static final java.lang.String ATTR_RECOVERY_ENABLED
protected static final java.lang.Boolean DEFAULT_ATTR_RECOVERY_ENABLED
protected long queuedUriCount
protected long succeededFetchCount
protected long failedFetchCount
protected long disregardedUriCount
protected transient java.util.concurrent.atomic.AtomicLong liveQueuedUriCount
protected transient java.util.concurrent.atomic.AtomicLong liveSucceededFetchCount
protected transient java.util.concurrent.atomic.AtomicLong liveFailedFetchCount
protected transient java.util.concurrent.atomic.AtomicLong liveDisregardedUriCount
protected long totalProcessedBytes
protected long processedBytesAfterLastEmittedURI
protected int lastMaxBandwidthKB
public static final java.lang.String IGNORED_SEEDS_FILENAME
Constructor Detail |
---|
public AbstractFrontier(java.lang.String name, java.lang.String description)
name
- Name of this frontier.
description
- Description for this frontier.
Method Detail |
---|
public void start()
Frontier
start
in interface Frontier
public void pause()
Frontier
pause
in interface Frontier
public void unpause()
Frontier
unpause
in interface Frontier
public void initialize(CrawlController c) throws FatalConfigurationException, java.io.IOException
Frontier
This method is invoked by the CrawlController once it has created the Frontier. The constructor of the Frontier should only contain code for setting up its settings framework. This method should contain all other 'startup' code.
initialize
in interface Frontier
c
- The CrawlController that created the Frontier.
FatalConfigurationException
- If provided settings are illegal or
otherwise unusable.
java.io.IOException
- If there is a problem reading settings or seeds file
from disk.
public void terminate()
Frontier
terminate
in interface Frontier
protected void tally(CrawlURI curi, CrawlSubstats.Stage stage)
curi
- stage
- protected void doJournalFinishedSuccess(CrawlURI c)
protected void doJournalAdded(CrawlURI c)
protected void doJournalRescheduled(CrawlURI c)
protected void doJournalFinishedFailure(CrawlURI c)
protected void doJournalDisregarded(CrawlURI c)
protected void doJournalEmitted(CrawlURI c)
public boolean isEmpty()
isEmpty
in interface Frontier
protected void incrementQueuedUriCount()
protected void incrementQueuedUriCount(long increment)
increment
- amount to increment the queued count
protected void decrementQueuedCount(long numberOfDeletes)
numberOfDeletes
- public long queuedUriCount()
queuedUriCount
in interface Frontier
Frontier.queuedUriCount()
public long finishedUriCount()
finishedUriCount
in interface Frontier
Frontier.finishedUriCount()
protected void incrementSucceededFetchCount()
public long succeededFetchCount()
succeededFetchCount
in interface Frontier
Frontier.succeededFetchCount()
protected void incrementFailedFetchCount()
public long failedFetchCount()
failedFetchCount
in interface Frontier
Frontier.failedFetchCount()
protected void incrementDisregardedUriCount()
public long disregardedUriCount()
Frontier
Counts any URI that is scheduled only to be disregarded because it is determined to lie outside the scope of the crawl. Most commonly this will be due to robots.txt exclusions.
disregardedUriCount
in interface Frontier
public long totalBytesWritten()
Frontier
totalBytesWritten
in interface Frontier
public void loadSeeds()
loadSeeds
in interface Frontier
CrawlController.kickUpdate()
public static void saveIgnoredItems(java.lang.String ignoredItems, java.io.File dir)
ignoredItems
- dir
- protected CrawlURI asCrawlUri(CandidateURI caUri)
protected void preNext(long now) throws java.lang.InterruptedException, EndedException
now
-
java.lang.InterruptedException
EndedException
protected void applySpecialHandling(CrawlURI curi)
curi
- protected void noteAboutToEmit(CrawlURI curi, WorkQueue q)
curi
- CrawlURI about to be returned by next()
q
- the queue from which the CrawlURI came
protected CrawlServer getServer(CrawlURI curi)
curi
-
protected long retryDelayFor(CrawlURI curi)
curi
- CrawlURI to be retried
protected long politenessDelayFor(CrawlURI curi)
curi
- The CrawlURI
protected void logLocalizedErrors(CrawlURI curi)
curi
- protected java.io.File scratchDirFor(java.lang.String key)
key
-
protected boolean overMaxRetries(CrawlURI curi)
public void importRecoverLog(java.lang.String pathToLog, boolean retainFailures) throws java.io.IOException
Frontier
Some Frontiers are able to write detailed logs that can be loaded after a system crash to recover the state of the Frontier prior to the crash. This method is the one used to achieve this.
importRecoverLog
in interface Frontier
pathToLog
- The name (with full path) of the recover log.
retainFailures
- If true, failures in log should count as
having been included. (If false, failures will be ignored, meaning
the corresponding URIs will be retried in the recovered crawl.)
java.io.IOException
- If problems occur reading the recover log.
public void kickUpdate()
Frontier
kickUpdate
in interface Frontier
protected void log(CrawlURI curi)
curi
- protected boolean isDisregarded(CrawlURI curi)
protected boolean needsRetrying(CrawlURI curi)
curi
- The CrawlURI to check
protected java.lang.String canonicalize(UURI uuri)
uuri
- Candidate URI to canonicalize.
uuri
.protected java.lang.String canonicalize(CandidateURI cauri)
Canonicalize passed CandidateURI. This method differs from
canonicalize(UURI)
in that it takes a look at
the CandidateURI context possibly overriding any canonicalization effect if
it could make us miss content. If canonicalization produces an URL that
was 'alreadyseen', but the entry in the 'alreadyseen' database did
nothing but redirect to the current URL, we won't get the current URL;
we'll think we've already seen it. Examples would be archive.org
redirecting to www.archive.org or the inverse, www.netarkivet.net
redirecting to netarkivet.net (assuming stripWWW rule enabled).
Note, this method under certain circumstances sets the forceFetch flag.
cauri
- CandidateURI to examine.
cacuri
.public java.lang.String getClassKey(CandidateURI cauri)
getClassKey
in interface Frontier
cauri
- CrawlURI we're to get a key for.
protected QueueAssignmentPolicy getQueueAssignmentPolicy(CandidateURI cauri)
public FrontierJournal getFrontierJournal()
getFrontierJournal
in interface Frontier
public void crawlEnding(java.lang.String sExitMessage)
CrawlStatusListener
crawlEnding
in interface CrawlStatusListener
sExitMessage
- Type of exit. Should be one of the STATUS constants
defined in CrawlJob.
public void crawlEnded(java.lang.String sExitMessage)
CrawlStatusListener
crawlEnded
in interface CrawlStatusListener
sExitMessage
- Type of exit. Should be one of the STATUS constants
defined in CrawlJob.
public void crawlStarted(java.lang.String message)
CrawlStatusListener
crawlStarted
in interface CrawlStatusListener
message
- Start message.
public void crawlPausing(java.lang.String statusMessage)
CrawlStatusListener
crawlPausing
in interface CrawlStatusListener
statusMessage
- Should be
STATUS_WAITING_FOR_PAUSE
. Passed for convenience
public void crawlPaused(java.lang.String statusMessage)
CrawlStatusListener
crawlPaused
in interface CrawlStatusListener
statusMessage
- Should be
CrawlJob.STATUS_PAUSED
. Passed for
convenience
public void crawlResuming(java.lang.String statusMessage)
CrawlStatusListener
crawlResuming
in interface CrawlStatusListener
statusMessage
- Should be
CrawlJob.STATUS_RUNNING
. Passed for
convenience
public void crawlCheckpoint(java.io.File checkpointDir) throws java.lang.Exception
CrawlStatusListener
Called by CrawlController when checkpointing.
crawlCheckpoint
in interface CrawlStatusListener
checkpointDir
- Checkpoint dir. Write checkpoint state here.
java.lang.Exception
- A fatal exception. Any exceptions
that are let out of this checkpoint are assumed fatal
and terminate further checkpoint processing.
public java.lang.String singleLineReport()
Reporter
singleLineReport
in interface Reporter
public void reportTo(java.io.PrintWriter writer)
Reporter
reportTo
in interface Reporter
writer
- to receive report
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |