|
||||||||||
PREV NEXT | FRAMES NO FRAMES |
Packages that use CrawlURI | |
---|---|
org.archive.crawler.admin | Contains classes that the web UI uses to monitor and control crawls. |
org.archive.crawler.datamodel | |
org.archive.crawler.datamodel.credential | Contains html form login and basic and digest credentials used by Heritrix logging into sites. |
org.archive.crawler.deciderules | Provides classes for a simple decision rules framework. |
org.archive.crawler.deciderules.recrawl | |
org.archive.crawler.event | |
org.archive.crawler.extractor | |
org.archive.crawler.fetcher | |
org.archive.crawler.filter | |
org.archive.crawler.framework | |
org.archive.crawler.frontier | |
org.archive.crawler.postprocessor | |
org.archive.crawler.prefetch | |
org.archive.crawler.processor | |
org.archive.crawler.processor.recrawl | |
org.archive.crawler.scope | |
org.archive.crawler.selftest | Provides the client-side aspect of the heritrix integration self test. |
org.archive.crawler.settings | Provides classes for the settings framework. |
org.archive.crawler.util | |
org.archive.crawler.writer |
Uses of CrawlURI in org.archive.crawler.admin |
---|
Methods in org.archive.crawler.admin with parameters of type CrawlURI | |
---|---|
void |
StatisticsTracker.crawledURIDisregard(CrawlURI curi)
|
void |
StatisticsTracker.crawledURIFailure(CrawlURI curi)
|
void |
StatisticsTracker.crawledURINeedRetry(CrawlURI curi)
|
void |
StatisticsTracker.crawledURISuccessful(CrawlURI curi)
|
void |
SeedRecord.updateWith(CrawlURI curi,
java.lang.String disposition)
A later/repeat report of the same seed has arrived; update with latest. |
Constructors in org.archive.crawler.admin with parameters of type CrawlURI | |
---|---|
SeedRecord(CrawlURI curi,
java.lang.String disposition)
Create a record from the given CrawlURI and disposition string |
Uses of CrawlURI in org.archive.crawler.datamodel |
---|
Methods in org.archive.crawler.datamodel that return CrawlURI | |
---|---|
static CrawlURI |
CrawlURI.from(CandidateURI caUri,
long ordinal)
Make a CrawlURI from the passed CandidateURI . |
Methods in org.archive.crawler.datamodel with parameters of type CrawlURI | |
---|---|
boolean |
RobotsExclusionPolicy.disallows(CrawlURI curi,
java.lang.String userAgent)
|
java.lang.String |
CrawlOrder.getFrom(CrawlURI curi)
|
java.lang.String |
CrawlOrder.getUserAgent(CrawlURI curi)
|
boolean |
RobotsHonoringPolicy.shouldMasquerade(CrawlURI curi)
This method returns true if the crawler should masquerade as the user agent whose restrictions it opted to use. |
java.util.Set |
CredentialStore.subset(CrawlURI context,
java.lang.Class type)
Return set made up of all credentials of the passed type . |
java.util.Set<Credential> |
CredentialStore.subset(CrawlURI context,
java.lang.Class type,
java.lang.String rootUri)
Return set made up of all credentials of the passed type . |
void |
CrawlSubstats.tally(CrawlURI curi,
CrawlSubstats.Stage stage)
Examining the CrawlURI and based on its status and internal values, update tallies. |
void |
CrawlServer.updateRobots(CrawlURI curi)
Update the robots exclusion policy. |
Uses of CrawlURI in org.archive.crawler.datamodel.credential |
---|
Methods in org.archive.crawler.datamodel.credential with parameters of type CrawlURI | |
---|---|
void |
Credential.attach(CrawlURI curi)
Attach this credential's avatar to the passed curi . |
void |
Credential.attach(CrawlURI curi,
java.lang.String payload)
Attach this credential's avatar to the passed curi . |
boolean |
Credential.detach(CrawlURI curi)
Detach this credential from passed curi. |
boolean |
Credential.detachAll(CrawlURI curi)
Detach all credentials of this type from passed curi. |
static Rfc2617Credential |
Rfc2617Credential.getByRealm(java.util.Set rfc2617Credentials,
java.lang.String realm,
CrawlURI context)
Convenience method that does look up on passed set using realm for key. |
Credential |
CredentialAvatar.getCredential(SettingsHandler handler,
CrawlURI curi)
|
java.lang.String |
Credential.getCredentialDomain(CrawlURI context)
|
java.util.Map<java.lang.String,java.lang.Object> |
HtmlFormCredential.getFormItems(CrawlURI context)
|
java.lang.String |
HtmlFormCredential.getHttpMethod(CrawlURI context)
|
java.lang.String |
Rfc2617Credential.getKey(CrawlURI context)
|
java.lang.String |
HtmlFormCredential.getKey(CrawlURI curi)
|
abstract java.lang.String |
Credential.getKey(CrawlURI context)
|
java.lang.String |
Rfc2617Credential.getLogin(CrawlURI context)
|
java.lang.String |
HtmlFormCredential.getLoginUri(CrawlURI context)
|
java.lang.String |
Rfc2617Credential.getPassword(CrawlURI context)
|
java.lang.String |
Rfc2617Credential.getPrerequisite(CrawlURI curi)
|
java.lang.String |
HtmlFormCredential.getPrerequisite(CrawlURI curi)
|
abstract java.lang.String |
Credential.getPrerequisite(CrawlURI curi)
Return the authentication URI, either absolute or relative, that serves as prerequisite the passed curi . |
java.lang.String |
Rfc2617Credential.getRealm(CrawlURI context)
|
boolean |
Rfc2617Credential.hasPrerequisite(CrawlURI curi)
|
boolean |
HtmlFormCredential.hasPrerequisite(CrawlURI curi)
|
abstract boolean |
Credential.hasPrerequisite(CrawlURI curi)
|
boolean |
Rfc2617Credential.isPost(CrawlURI curi)
|
boolean |
HtmlFormCredential.isPost(CrawlURI curi)
|
abstract boolean |
Credential.isPost(CrawlURI curi)
|
boolean |
Rfc2617Credential.isPrerequisite(CrawlURI curi)
|
boolean |
HtmlFormCredential.isPrerequisite(CrawlURI curi)
|
abstract boolean |
Credential.isPrerequisite(CrawlURI curi)
|
boolean |
Rfc2617Credential.populate(CrawlURI curi,
org.apache.commons.httpclient.HttpClient http,
org.apache.commons.httpclient.HttpMethod method,
java.lang.String payload)
|
boolean |
HtmlFormCredential.populate(CrawlURI curi,
org.apache.commons.httpclient.HttpClient http,
org.apache.commons.httpclient.HttpMethod method,
java.lang.String payload)
|
abstract boolean |
Credential.populate(CrawlURI curi,
org.apache.commons.httpclient.HttpClient http,
org.apache.commons.httpclient.HttpMethod method,
java.lang.String payload)
|
boolean |
Credential.rootUriMatch(CrawlController controller,
CrawlURI curi)
Test passed curi matches this credential's rootUri. |
Uses of CrawlURI in org.archive.crawler.deciderules |
---|
Methods in org.archive.crawler.deciderules with parameters of type CrawlURI | |
---|---|
protected boolean |
FilterDecideRule.filtersAccept(CrawlURI curi)
Do all specified filters (if any) accept this CrawlURI? |
protected boolean |
FilterDecideRule.filtersAccept(MapType fs,
CrawlURI curi)
Do all specified filters (if any) accept this CrawlURI? |
Uses of CrawlURI in org.archive.crawler.deciderules.recrawl |
---|
Methods in org.archive.crawler.deciderules.recrawl with parameters of type CrawlURI | |
---|---|
static boolean |
IdenticalDigestDecideRule.hasIdenticalDigest(CrawlURI curi)
Utility method for testing if a CrawlURI's last two history entries (one being the most recent fetch) have identical content-digest information. |
Uses of CrawlURI in org.archive.crawler.event |
---|
Methods in org.archive.crawler.event with parameters of type CrawlURI | |
---|---|
void |
CrawlURIDispositionListener.crawledURIDisregard(CrawlURI curi)
Notification of a crawled URI that is to be disregarded. |
void |
CrawlURIDispositionListener.crawledURIFailure(CrawlURI curi)
Notification of a failed crawling of a URI. |
void |
CrawlURIDispositionListener.crawledURINeedRetry(CrawlURI curi)
Notification of a failed crawl of a URI that will be retried (failure due to possible transient problems). |
void |
CrawlURIDispositionListener.crawledURISuccessful(CrawlURI curi)
Notification of a successfully crawled URI |
Uses of CrawlURI in org.archive.crawler.extractor |
---|
Fields in org.archive.crawler.extractor declared as CrawlURI | |
---|---|
(package private) CrawlURI |
CrawlUriSWFAction.curi
|
Methods in org.archive.crawler.extractor that return CrawlURI | |
---|---|
protected CrawlURI |
ExtractorTool.getCrawlURI(ARCRecord record,
HttpRecorder hr)
|
Methods in org.archive.crawler.extractor with parameters of type CrawlURI | |
---|---|
protected void |
ExtractorHTTP.addHeaderLink(CrawlURI curi,
org.apache.commons.httpclient.Header loc)
|
protected void |
ExtractorHTML.addLinkFromString(CrawlURI curi,
java.lang.CharSequence uri,
java.lang.CharSequence context,
char hopType)
|
protected void |
ExtractorHTML.considerIfLikelyUri(CrawlURI curi,
java.lang.CharSequence candidate,
java.lang.CharSequence valueContext,
char hopType)
Consider whether a given string is URI-like. |
protected void |
ExtractorHTML.considerQueryStringValues(CrawlURI curi,
java.lang.CharSequence queryString,
java.lang.CharSequence valueContext,
char hopType)
Consider a query-string-like collections of key=value[&key=value] pairs for URI-like strings in the values. |
static long |
ExtractorJS.considerStrings(CrawlURI curi,
java.lang.CharSequence cs,
CrawlController controller,
boolean handlingJSFile)
|
void |
ExtractorCSS.extract(CrawlURI curi)
|
protected void |
ExtractorDOC.extract(CrawlURI curi)
Processes a word document and extracts any hyperlinks from it. |
protected void |
ExtractorSWF.extract(CrawlURI curi)
|
protected void |
TrapSuppressExtractor.extract(CrawlURI curi)
|
protected void |
ExtractorPDF.extract(CrawlURI curi)
|
protected abstract void |
Extractor.extract(CrawlURI curi)
|
void |
ExtractorHTML.extract(CrawlURI curi)
|
void |
ExtractorJS.extract(CrawlURI curi)
|
protected void |
ExtractorUniversal.extract(CrawlURI curi)
|
void |
ExtractorXML.extract(CrawlURI curi)
|
void |
ExtractorURI.extract(CrawlURI curi)
Perform usual extraction on a CrawlURI |
void |
ExtractorImpliedURI.extract(CrawlURI curi)
Perform usual extraction on a CrawlURI |
(package private) void |
ExtractorHTML.extract(CrawlURI curi,
java.lang.CharSequence cs)
Run extractor. |
(package private) void |
JerichoExtractorHTML.extract(CrawlURI curi,
java.lang.CharSequence cs)
Run extractor. |
protected void |
ExtractorURI.extractLink(CrawlURI curi,
Link wref)
Consider a single Link for internal URIs |
protected void |
ChangeEvaluator.innerProcess(CrawlURI curi)
|
void |
Extractor.innerProcess(CrawlURI curi)
|
protected void |
HTTPContentDigest.innerProcess(CrawlURI curi)
|
void |
ExtractorHTTP.innerProcess(CrawlURI curi)
|
protected boolean |
ExtractorHTML.isHtmlExpectedHere(CrawlURI curi)
Test whether this HTML is so unexpected (eg in place of a GIF URI) that it shouldn't be scanned for links. |
protected boolean |
Extractor.isHttpTransactionContentToProcess(CrawlURI curi)
|
protected void |
ExtractorTool.outlinks(CrawlURI curi)
|
protected void |
ExtractorHTML.processEmbed(CrawlURI curi,
java.lang.CharSequence value,
java.lang.CharSequence context)
|
protected void |
ExtractorHTML.processEmbed(CrawlURI curi,
java.lang.CharSequence value,
java.lang.CharSequence context,
char hopType)
|
protected void |
JerichoExtractorHTML.processForm(CrawlURI curi,
au.id.jericho.lib.html.Element element)
|
protected void |
ExtractorHTML.processGeneralTag(CrawlURI curi,
java.lang.CharSequence element,
java.lang.CharSequence cs)
|
protected void |
JerichoExtractorHTML.processGeneralTag(CrawlURI curi,
au.id.jericho.lib.html.Element element,
au.id.jericho.lib.html.Attributes attributes)
|
protected void |
ExtractorHTML.processLink(CrawlURI curi,
java.lang.CharSequence value,
java.lang.CharSequence context)
Handle generic HREF cases. |
protected boolean |
ExtractorHTML.processMeta(CrawlURI curi,
java.lang.CharSequence cs)
Process metadata tags. |
protected boolean |
JerichoExtractorHTML.processMeta(CrawlURI curi,
au.id.jericho.lib.html.Element element)
|
protected void |
AggressiveExtractorHTML.processScript(CrawlURI curi,
java.lang.CharSequence sequence,
int endOfOpenTag)
|
protected void |
ExtractorHTML.processScript(CrawlURI curi,
java.lang.CharSequence sequence,
int endOfOpenTag)
|
protected void |
JerichoExtractorHTML.processScript(CrawlURI curi,
au.id.jericho.lib.html.Element element)
|
protected void |
ExtractorHTML.processScriptCode(CrawlURI curi,
java.lang.CharSequence cs)
Extract the (java)script source in the given CharSequence. |
protected void |
ExtractorHTML.processStyle(CrawlURI curi,
java.lang.CharSequence sequence,
int endOfOpenTag)
Process style text. |
protected void |
JerichoExtractorHTML.processStyle(CrawlURI curi,
au.id.jericho.lib.html.Element element)
|
static long |
ExtractorCSS.processStyleCode(CrawlURI curi,
java.lang.CharSequence cs,
CrawlController controller)
|
static long |
ExtractorXML.processXml(CrawlURI curi,
java.lang.CharSequence cs,
CrawlController controller)
|
protected boolean |
ExtractorXML.shouldExtract(CrawlURI curi)
|
Constructors in org.archive.crawler.extractor with parameters of type CrawlURI | |
---|---|
CrawlUriSWFAction(CrawlURI curi,
CrawlController controller)
|
|
ExtractorSWF.ExtractorSWFActions(CrawlURI curi,
CrawlController controller)
|
Uses of CrawlURI in org.archive.crawler.fetcher |
---|
Methods in org.archive.crawler.fetcher with parameters of type CrawlURI | |
---|---|
protected void |
FetchHTTP.addResponseContent(org.apache.commons.httpclient.HttpMethod method,
CrawlURI curi)
This method populates curi with response status and
content type. |
protected boolean |
FetchHTTP.checkMidfetchAbort(CrawlURI curi,
HttpRecorderMethod method,
org.apache.commons.httpclient.HttpConnection conn)
|
protected org.apache.commons.httpclient.HostConfiguration |
FetchHTTP.configureMethod(CrawlURI curi,
org.apache.commons.httpclient.HttpMethod method)
Configure the HttpMethod setting options and headers. |
protected void |
FetchHTTP.doAbort(CrawlURI curi,
org.apache.commons.httpclient.HttpMethod method,
java.lang.String annotation)
|
protected java.lang.Object |
FetchHTTP.getAttributeEither(CrawlURI curi,
java.lang.String key)
Get a value either from inside the CrawlURI instance, or from settings (module attributes). |
protected org.apache.commons.httpclient.auth.AuthScheme |
FetchHTTP.getAuthScheme(org.apache.commons.httpclient.HttpMethod method,
CrawlURI curi)
|
boolean |
FetchFTP.getExtractFromDirs(CrawlURI curi)
Returns the extract.from.dirs attribute for this
FetchFTP and the given curi. |
boolean |
FetchFTP.getExtractParent(CrawlURI curi)
Returns the extract.parent attribute for this
FetchFTP and the given curi. |
int |
FetchFTP.getFetchBandwidth(CrawlURI curi)
Returns the fetch-bandwidth attribute for this
FetchFTP and the given curi. |
long |
FetchFTP.getMaxLength(CrawlURI curi)
Returns the max-length-bytes attribute for this
FetchFTP and the given curi. |
int |
FetchFTP.getTimeout(CrawlURI curi)
Returns the timeout-seconds attribute for this
FetchFTP and the given curi. |
protected void |
FetchHTTP.handle401(org.apache.commons.httpclient.HttpMethod method,
CrawlURI curi)
Server is looking for basic/digest auth credentials (RFC2617). |
void |
FetchFTP.innerProcess(CrawlURI curi)
Processes the given URI. |
protected void |
FetchHTTP.innerProcess(CrawlURI curi)
|
protected void |
FetchDNS.innerProcess(CrawlURI curi)
|
protected boolean |
FetchDNS.isQuadAddress(CrawlURI curi,
java.lang.String dnsName,
CrawlHost targetHost)
|
protected void |
FetchDNS.recordDNS(CrawlURI curi,
org.xbill.DNS.Record[] rrecordSet)
|
protected void |
FetchHTTP.setConditionalGetHeader(CrawlURI curi,
org.apache.commons.httpclient.HttpMethod method,
java.lang.String setting,
java.lang.String sourceHeader,
java.lang.String targetHeader)
Set the given conditional-GET header, if the setting is enabled and a suitable value is available in the URI history. |
protected void |
FetchHTTP.setSizes(CrawlURI curi,
HttpRecorder rec)
Update CrawlURI internal sizes based on current transaction (and in the case of 304s, history) |
protected void |
FetchDNS.setUnresolvable(CrawlURI curi,
CrawlHost host)
|
protected void |
FetchDNS.storeDNSRecord(CrawlURI curi,
java.lang.String dnsName,
CrawlHost targetHost,
org.xbill.DNS.Record[] rrecordSet)
|
Uses of CrawlURI in org.archive.crawler.filter |
---|
Methods in org.archive.crawler.filter with parameters of type CrawlURI | |
---|---|
protected boolean |
PathologicalPathFilter.getFilterOffPosition(CrawlURI curi)
Deprecated. |
protected boolean |
PathDepthFilter.getFilterOffPosition(CrawlURI curi)
Deprecated. |
protected boolean |
OrFilter.returnTrueIfMatches(CrawlURI curi)
Deprecated. |
protected boolean |
PathDepthFilter.returnTrueIfMatches(CrawlURI curi)
Deprecated. |
protected boolean |
URIRegExpFilter.returnTrueIfMatches(CrawlURI curi)
Deprecated. |
Uses of CrawlURI in org.archive.crawler.framework |
---|
Methods in org.archive.crawler.framework that return CrawlURI | |
---|---|
CrawlURI |
Frontier.next()
Get the next URI that should be processed. |
Methods in org.archive.crawler.framework with parameters of type CrawlURI | |
---|---|
void |
Frontier.deleted(CrawlURI curi)
Notify Frontier that a CrawlURI has been deleted outside of the normal next()/finished() lifecycle. |
void |
Frontier.finished(CrawlURI cURI)
Report a URI being processed as having finished processing. |
void |
CrawlController.fireCrawledURIDisregardEvent(CrawlURI curi)
Allows an external class to raise a CrawlURIDispostion crawledURIDisregard event that will be broadcast to all listeners that have registered with the CrawlController. |
void |
CrawlController.fireCrawledURIFailureEvent(CrawlURI curi)
Allows an external class to raise a CrawlURIDispostion crawledURIFailure event that will be broadcast to all listeners that have registered with the CrawlController. |
void |
CrawlController.fireCrawledURINeedRetryEvent(CrawlURI curi)
Allows an external class to raise a CrawlURIDispostion crawledURINeedRetry event that will be broadcast to all listeners that have registered with the CrawlController. |
void |
CrawlController.fireCrawledURISuccessfulEvent(CrawlURI curi)
Allows an external class to raise a CrawlURIDispostion crawledURISuccessful event that will be broadcast to all listeners that have registered with the CrawlController. |
Processor |
Processor.getDefaultNextProcessor(CrawlURI curi)
Returns the next processor for the given CrawlURI in the processor chain. |
protected boolean |
Filter.getFilterOffPosition(CrawlURI curi)
If the filter is disabled, the value returned by this method is what filters return as their disabled setting. |
Frontier.FrontierGroup |
Frontier.getGroup(CrawlURI curi)
Get the 'frontier group' (usually queue) for the given CrawlURI. |
protected java.lang.String |
WriterPoolProcessor.getHostAddress(CrawlURI curi)
Return IP address of given URI suitable for recording (as in a classic ARC 5-field header line). |
protected abstract void |
WriterPoolProcessor.innerProcess(CrawlURI curi)
Writes a CrawlURI and its associated data to store file. |
protected void |
Processor.innerProcess(CrawlURI curi)
Classes subclassing this one should override this method to perform their custom actions on the CrawlURI. |
protected void |
Processor.innerRejectProcess(CrawlURI curi)
|
protected boolean |
Processor.isContentToProcess(CrawlURI curi)
|
protected boolean |
Processor.isHttpTransactionContentToProcess(CrawlURI curi)
|
void |
Processor.process(CrawlURI curi)
Perform processing on the given CrawlURI. |
protected boolean |
Filter.returnTrueIfMatches(CrawlURI curi)
Checks to see if filter functionality should be inverted for this curi. |
protected boolean |
WriterPoolProcessor.shouldWrite(CrawlURI curi)
Whether the given CrawlURI should be written to archive files. |
Uses of CrawlURI in org.archive.crawler.frontier |
---|
Methods in org.archive.crawler.frontier that return CrawlURI | |
---|---|
protected CrawlURI |
WorkQueueFrontier.asCrawlUri(CandidateURI caUri)
|
protected CrawlURI |
AbstractFrontier.asCrawlUri(CandidateURI caUri)
|
CrawlURI |
BdbMultipleWorkQueues.get(com.sleepycat.je.DatabaseEntry headKey)
Get the next nearest item after the given key. |
protected CrawlURI |
AdaptiveRevisitHostQueue.getCrawlURI(java.lang.String uri)
Returns the CrawlURI associated with the specified URI (string) or null if no such CrawlURI is queued in this HQ. |
CrawlURI |
WorkQueueFrontier.next()
Return the next CrawlURI to be processed (and presumably visited/fetched) by a worker thread. |
CrawlURI |
AdaptiveRevisitHostQueue.next()
Returns the 'top' URI in the AdaptiveRevisitHostQueue. |
CrawlURI |
AdaptiveRevisitFrontier.next()
|
CrawlURI |
AdaptiveRevisitHostQueue.peek()
Returns the URI with the earliest time of next processing. |
CrawlURI |
WorkQueue.peek(WorkQueueFrontier frontier)
Return the topmost queue item -- and remember it, such that even later higher-priority inserts don't change it. |
protected abstract CrawlURI |
WorkQueue.peekItem(WorkQueueFrontier frontier)
Returns first item from queue (does not delete) |
protected CrawlURI |
BdbWorkQueue.peekItem(WorkQueueFrontier frontier)
|
Methods in org.archive.crawler.frontier with parameters of type CrawlURI | |
---|---|
boolean |
BdbMultipleWorkQueues.BdbFrontierMarker.accepts(CrawlURI curi)
|
void |
AdaptiveRevisitHostQueue.add(CrawlURI curi,
boolean overrideSetTimeOnDups)
Add a CrawlURI to this host queue. |
protected void |
AdaptiveRevisitHostQueue.addInProcessing(CrawlURI curi)
Adds a CrawlURI to the list of CrawlURIs belonging to this HQ and are being processed at the moment. |
protected void |
AbstractFrontier.applySpecialHandling(CrawlURI curi)
Perform any special handling of the CrawlURI, such as promoting its URI to seed-status, or preferencing it because it is an embed. |
(package private) static com.sleepycat.je.DatabaseEntry |
BdbMultipleWorkQueues.calculateInsertKey(CrawlURI curi)
Calculate the insertKey that places a CrawlURI in the desired spot. |
protected long |
AdaptiveRevisitFrontier.calculateSnoozeTime(CrawlURI curi)
Calculates how long a host queue needs to be snoozed following the crawling of a URI. |
abstract int |
CostAssignmentPolicy.costOf(CrawlURI curi)
|
int |
AntiCalendarCostAssignmentPolicy.costOf(CrawlURI curi)
|
int |
UnitCostAssignmentPolicy.costOf(CrawlURI curi)
|
int |
ZeroCostAssignmentPolicy.costOf(CrawlURI curi)
|
int |
WagCostAssignmentPolicy.costOf(CrawlURI curi)
Add constant penalties for certain features of URI (and its 'via') that make it more delayable/skippable. |
void |
DomainSensitiveFrontier.crawledURIDisregard(CrawlURI curi)
Deprecated. |
void |
DomainSensitiveFrontier.crawledURIFailure(CrawlURI curi)
Deprecated. |
void |
DomainSensitiveFrontier.crawledURINeedRetry(CrawlURI curi)
Deprecated. |
void |
DomainSensitiveFrontier.crawledURISuccessful(CrawlURI curi)
Deprecated. |
void |
BdbMultipleWorkQueues.delete(CrawlURI item)
Delete the given CrawlURI from persistent store. |
void |
WorkQueueFrontier.deleted(CrawlURI curi)
Force logging, etc. |
void |
AdaptiveRevisitFrontier.deleted(CrawlURI curi)
|
protected abstract void |
WorkQueue.deleteItem(WorkQueueFrontier frontier,
CrawlURI item)
Removes the given item from the queue. |
protected void |
BdbWorkQueue.deleteItem(WorkQueueFrontier frontier,
CrawlURI peekItem)
|
protected void |
AdaptiveRevisitFrontier.disregardDisposition(CrawlURI curi)
|
protected void |
AbstractFrontier.doJournalAdded(CrawlURI c)
|
protected void |
AbstractFrontier.doJournalDisregarded(CrawlURI c)
|
protected void |
AbstractFrontier.doJournalEmitted(CrawlURI c)
|
protected void |
AbstractFrontier.doJournalFinishedFailure(CrawlURI c)
|
protected void |
AbstractFrontier.doJournalFinishedSuccess(CrawlURI c)
|
protected void |
AbstractFrontier.doJournalRescheduled(CrawlURI c)
|
void |
WorkQueue.enqueue(WorkQueueFrontier frontier,
CrawlURI curi)
Add the given CrawlURI, noting its addition in running count. |
protected void |
AdaptiveRevisitFrontier.failureDisposition(CrawlURI curi)
The CrawlURI has encountered a problem, and will not be retried. |
void |
WorkQueueFrontier.finished(CrawlURI curi)
Note that the previously emitted CrawlURI has completed its processing (for now). |
void |
AdaptiveRevisitFrontier.finished(CrawlURI curi)
|
protected void |
WorkQueueFrontier.forget(CrawlURI curi)
Forget the given CrawlURI. |
Frontier.FrontierGroup |
WorkQueueFrontier.getGroup(CrawlURI curi)
|
Frontier.FrontierGroup |
AdaptiveRevisitFrontier.getGroup(CrawlURI curi)
|
protected AdaptiveRevisitHostQueue |
AdaptiveRevisitFrontier.getHQ(CrawlURI curi)
Get the AdaptiveRevisitHostQueue for the given CrawlURI, creating it if necessary. |
protected abstract WorkQueue |
WorkQueueFrontier.getQueueFor(CrawlURI curi)
Return the work queue for the given CrawlURI's classKey. |
protected WorkQueue |
BdbFrontier.getQueueFor(CrawlURI curi)
Return the work queue for the given CrawlURI's classKey. |
protected CrawlServer |
AbstractFrontier.getServer(CrawlURI curi)
|
protected CrawlServer |
AdaptiveRevisitFrontier.getServer(CrawlURI curi)
|
protected void |
DomainSensitiveFrontier.incrementHostCounters(CrawlURI curi)
Deprecated. |
protected void |
AdaptiveRevisitFrontier.innerFinished(CrawlURI curi)
|
protected abstract void |
WorkQueue.insertItem(WorkQueueFrontier frontier,
CrawlURI curi,
boolean expectedPresent)
Insert the given curi, whether it is already present or not. |
protected void |
BdbWorkQueue.insertItem(WorkQueueFrontier frontier,
CrawlURI curi,
boolean overwriteIfPresent)
|
protected boolean |
AbstractFrontier.isDisregarded(CrawlURI curi)
|
protected boolean |
AdaptiveRevisitFrontier.isDisregarded(CrawlURI curi)
|
protected void |
AbstractFrontier.log(CrawlURI curi)
Log to the main crawl.log |
protected void |
AbstractFrontier.logLocalizedErrors(CrawlURI curi)
Take note of any processor-local errors that have been entered into the CrawlURI. |
protected boolean |
AdaptiveRevisitFrontier.needsPromptRetry(CrawlURI curi)
Checks if a recently completed CrawlURI that did not finish successfully needs to be retried immediately (processed again as soon as politeness allows.) |
protected boolean |
AbstractFrontier.needsRetrying(CrawlURI curi)
Checks if a recently completed CrawlURI that did not finish successfully needs to be retried (processed again after some time elapses) |
protected boolean |
AdaptiveRevisitFrontier.needsRetrying(CrawlURI curi)
Checks if a recently completed CrawlURI that did not finish successfully needs to be retried (processed again after some time elapses) |
protected void |
AbstractFrontier.noteAboutToEmit(CrawlURI curi,
WorkQueue q)
Perform fixups on a CrawlURI about to be returned via next(). |
protected boolean |
AbstractFrontier.overMaxRetries(CrawlURI curi)
|
protected long |
AbstractFrontier.politenessDelayFor(CrawlURI curi)
Update any scheduling structures with the new information in this CrawlURI. |
void |
BdbMultipleWorkQueues.put(CrawlURI curi,
boolean overwriteIfPresent)
Put the given CrawlURI in at the appropriate place. |
protected void |
AdaptiveRevisitFrontier.reschedule(CrawlURI curi,
boolean errorWait)
Put near top of relevant hostQueue (but behind anything recently scheduled 'high').. |
protected long |
AbstractFrontier.retryDelayFor(CrawlURI curi)
Return a suitable value to wait before retrying the given URI. |
protected void |
WorkQueueFrontier.sendToQueue(CrawlURI curi)
Send a CrawlURI to the appropriate subqueue. |
protected boolean |
AdaptiveRevisitFrontier.shouldBeForgotten(CrawlURI curi)
Some URIs, if they recur, deserve another chance at consideration: they might not be too many hops away via another path, or the scope may have been updated to allow them passage. |
protected com.sleepycat.je.OperationStatus |
AdaptiveRevisitHostQueue.strictAdd(CrawlURI curi,
boolean overrideDuplicates)
An internal method for adding URIs to the queue. |
protected void |
AdaptiveRevisitFrontier.successDisposition(CrawlURI curi)
The CrawlURI has been successfully crawled. |
protected void |
AbstractFrontier.tally(CrawlURI curi,
CrawlSubstats.Stage stage)
Report CrawlURI to each of the three 'substats' accumulators (group/queue, server, host) for a given stage. |
void |
AdaptiveRevisitHostQueue.update(CrawlURI curi,
boolean needWait,
long wakeupTime)
Update CrawlURI that has completed processing. |
void |
AdaptiveRevisitHostQueue.update(CrawlURI curi,
boolean needWait,
long wakeupTime,
boolean forgetURI)
Update CrawlURI that has completed processing. |
void |
WorkQueue.update(WorkQueueFrontier frontier,
CrawlURI curi)
Update the given CrawlURI, which should already be present. |
Uses of CrawlURI in org.archive.crawler.postprocessor |
---|
Methods in org.archive.crawler.postprocessor with parameters of type CrawlURI | |
---|---|
protected int |
LinksScoper.getSchedulingFor(CrawlURI curi,
Link wref,
int preferenceDepthHops)
Determine scheduling for the curi . |
protected void |
LinksScoper.handlePrerequisite(CrawlURI curi)
The CrawlURI has a prerequisite; apply scoping and update Link to CandidateURI in manner analogous to outlink handling. |
protected void |
FrontierScheduler.handlePrerequisites(CrawlURI curi)
|
protected void |
WaitEvaluator.innerProcess(CrawlURI curi)
|
protected void |
ContentBasedWaitEvaluator.innerProcess(CrawlURI curi)
|
protected void |
SupplementaryLinksScoper.innerProcess(CrawlURI curi)
|
protected void |
LowDiskPauseProcessor.innerProcess(CrawlURI curi)
Notes a CrawlURI's content size in its running tally. |
protected void |
LinksScoper.innerProcess(CrawlURI curi)
|
protected void |
CrawlStateUpdater.innerProcess(CrawlURI curi)
|
protected void |
FrontierScheduler.innerProcess(CrawlURI curi)
|
protected void |
AcceptRevisitProcessor.innerProcess(CrawlURI curi)
|
protected void |
RejectRevisitProcessor.innerProcess(CrawlURI curi)
|
Uses of CrawlURI in org.archive.crawler.prefetch |
---|
Methods in org.archive.crawler.prefetch with parameters of type CrawlURI | |
---|---|
protected boolean |
QuotaEnforcer.applyQuota(CrawlURI curi,
java.lang.String quotaKey,
long actual)
Apply the quota specified by the given key against the actual value provided. |
protected boolean |
QuotaEnforcer.checkQuotas(CrawlURI curi,
CrawlSubstats.HasCrawlSubstats hasStats,
int CAT)
Check all quotas for the given substats and category (server, host, or group). |
long |
PreconditionEnforcer.getIPValidityDuration(CrawlURI curi)
Get the maximum time a dns-record is valid. |
long |
PreconditionEnforcer.getRobotsValidityDuration(CrawlURI curi)
Get the maximum time a robots.txt is valid. |
protected long |
RuntimeLimitEnforcer.getRuntime(CrawlURI curi)
Returns the amount of time to allow the crawl to run before this processor interrupts. |
protected void |
Preselector.innerProcess(CrawlURI curi)
|
protected void |
QuotaEnforcer.innerProcess(CrawlURI curi)
|
protected void |
PreconditionEnforcer.innerProcess(CrawlURI curi)
|
protected void |
RuntimeLimitEnforcer.innerProcess(CrawlURI curi)
|
boolean |
PreconditionEnforcer.isIpExpired(CrawlURI curi)
Return true if ip should be looked up. |
boolean |
PreconditionEnforcer.isRobotsExpired(CrawlURI curi)
Is the robots policy expired. |
Uses of CrawlURI in org.archive.crawler.processor |
---|
Methods in org.archive.crawler.processor with parameters of type CrawlURI | |
---|---|
protected void |
CrawlMapper.innerProcess(CrawlURI curi)
|
protected void |
BeanShellProcessor.innerProcess(CrawlURI curi)
|
Uses of CrawlURI in org.archive.crawler.processor.recrawl |
---|
Methods in org.archive.crawler.processor.recrawl with parameters of type CrawlURI | |
---|---|
protected void |
PersistStoreProcessor.innerProcess(CrawlURI curi)
|
protected void |
PersistLogProcessor.innerProcess(CrawlURI curi)
|
protected void |
FetchHistoryProcessor.innerProcess(CrawlURI curi)
|
protected void |
PersistLoadProcessor.innerProcess(CrawlURI curi)
|
java.lang.String |
PersistProcessor.persistKeyFor(CrawlURI curi)
Return a preferred String key for persisting the given CrawlURI's AList state. |
protected boolean |
PersistProcessor.shouldLoad(CrawlURI curi)
Whether the current CrawlURI's state should be loaded |
protected boolean |
PersistProcessor.shouldStore(CrawlURI curi)
Whether the current CrawlURI's state should be persisted (to log or direct to database). |
Uses of CrawlURI in org.archive.crawler.scope |
---|
Methods in org.archive.crawler.scope with parameters of type CrawlURI | |
---|---|
boolean |
SeedCachingScope.addSeed(CrawlURI curi)
|
Uses of CrawlURI in org.archive.crawler.selftest |
---|
Methods in org.archive.crawler.selftest with parameters of type CrawlURI | |
---|---|
void |
SelfTestCrawlJobHandler.crawledURIDisregard(CrawlURI curi)
|
void |
SelfTestCrawlJobHandler.crawledURIFailure(CrawlURI curi)
|
void |
SelfTestCrawlJobHandler.crawledURINeedRetry(CrawlURI curi)
|
void |
SelfTestCrawlJobHandler.crawledURISuccessful(CrawlURI curi)
|
Uses of CrawlURI in org.archive.crawler.settings |
---|
Methods in org.archive.crawler.settings that return CrawlURI | |
---|---|
CrawlURI |
SettingsFrameworkTestCase.getMatchDomainURI()
|
CrawlURI |
SettingsFrameworkTestCase.getMatchHostURI()
|
CrawlURI |
SettingsFrameworkTestCase.getUnMatchedURI()
|
Methods in org.archive.crawler.settings with parameters of type CrawlURI | |
---|---|
java.lang.Object |
ComplexType.getAttribute(java.lang.String name,
CrawlURI uri)
Obtain the value of a specific attribute that is valid for a specific CrawlURI. |
Uses of CrawlURI in org.archive.crawler.util |
---|
Methods in org.archive.crawler.util with parameters of type CrawlURI | |
---|---|
void |
CrawledBytesHistotable.accumulate(CrawlURI curi)
|
Uses of CrawlURI in org.archive.crawler.writer |
---|
Fields in org.archive.crawler.writer declared as CrawlURI | |
---|---|
protected CrawlURI |
MirrorWriterProcessor.PathSegment.curi
The URI, for logging and error reporting. |
Methods in org.archive.crawler.writer with parameters of type CrawlURI | |
---|---|
protected java.io.OutputStream |
Kw3WriterProcessor.initOutputStream(CrawlURI curi)
|
protected void |
MirrorWriterProcessor.innerProcess(CrawlURI curi)
|
protected void |
Kw3WriterProcessor.innerProcess(CrawlURI curi)
|
protected void |
ARCWriterProcessor.innerProcess(CrawlURI curi)
Writes a CrawlURI and its associated data to store file. |
protected void |
WARCWriterProcessor.innerProcess(CrawlURI curi)
Writes a CrawlURI and its associated data to store file. |
protected void |
ARCWriterProcessor.write(CrawlURI curi,
long recordLength,
java.io.InputStream in,
java.lang.String ip)
|
protected void |
WARCWriterProcessor.write(java.lang.String lowerCaseScheme,
CrawlURI curi)
|
protected void |
Kw3WriterProcessor.writeArchiveInfoPart(java.lang.String boundary,
CrawlURI curi,
ReplayInputStream ris,
java.io.OutputStream out)
|
protected void |
Kw3WriterProcessor.writeContentPart(java.lang.String boundary,
CrawlURI curi,
ReplayInputStream ris,
java.io.OutputStream out)
|
protected java.net.URI |
WARCWriterProcessor.writeFtpControlConversation(WARCWriter w,
java.lang.String timestamp,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord headers,
java.lang.String controlConversation)
|
protected java.net.URI |
WARCWriterProcessor.writeMetadata(WARCWriter w,
java.lang.String timestamp,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
|
protected void |
Kw3WriterProcessor.writeMimeFile(CrawlURI curi)
|
protected java.net.URI |
WARCWriterProcessor.writeRequest(WARCWriter w,
java.lang.String timestamp,
java.lang.String mimetype,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
|
protected java.net.URI |
WARCWriterProcessor.writeResource(WARCWriter w,
java.lang.String timestamp,
java.lang.String mimetype,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
|
protected java.net.URI |
WARCWriterProcessor.writeResponse(WARCWriter w,
java.lang.String timestamp,
java.lang.String mimetype,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
|
protected java.net.URI |
WARCWriterProcessor.writeRevisitDigest(WARCWriter w,
java.lang.String timestamp,
java.lang.String mimetype,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
|
protected java.net.URI |
WARCWriterProcessor.writeRevisitNotModified(WARCWriter w,
java.lang.String timestamp,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
|
Constructors in org.archive.crawler.writer with parameters of type CrawlURI | |
---|---|
MirrorWriterProcessor.DirSegment(java.lang.String uriPath,
int beginIndex,
int endIndex,
int maxSegLen,
boolean caseSensitive,
CrawlURI curi,
java.util.Map characterMap,
java.lang.String dotBegin,
java.lang.String dotEnd,
java.util.Set underscoreSet)
Creates a DirSegment. |
|
MirrorWriterProcessor.EndSegment(java.lang.String uriPath,
int beginIndex,
int endIndex,
int maxSegLen,
boolean caseSensitive,
CrawlURI curi,
java.util.Map characterMap,
java.lang.String dotBegin,
java.lang.String query,
java.lang.String suffix,
int maxPathLen,
boolean suffixAtEnd)
Creates an EndSegment. |
|
MirrorWriterProcessor.PathSegment(int maxSegLen,
boolean caseSensitive,
CrawlURI curi)
Creates a new PathSegment. |
|
||||||||||
PREV NEXT | FRAMES NO FRAMES |