PREV NEXT | FRAMES NO FRAMES |
org.archive.* |
---|
org.archive.crawler.Heritrix | ||
---|---|---|
public static final java.lang.String |
ADD_CRAWL_JOB_BASEDON_OPER |
"addJobBasedon" |
public static final java.lang.String |
ADD_CRAWL_JOB_OPER |
"addJob" |
public static final java.lang.String |
ALERT_OPER |
"alert" |
public static final java.lang.String |
ALERTCOUNT_ATTR |
"AlertCount" |
public static final java.lang.String |
ARCHIVE_PACKAGE |
"org.archive." |
public static final java.lang.String |
COMPLETED_JOBS_OPER |
"completedJobs" |
public static final java.lang.String |
CRAWLEND_REPORT_OPER |
"crawlendReport" |
public static final java.lang.String |
CURRENTJOB_ATTR |
"CurrentJob" |
public static final java.lang.String |
DEFAULT_ENCODING |
"ISO-8859-1" |
public static final java.lang.String |
DELETE_CRAWL_JOB_OPER |
"deleteJob" |
public static final java.lang.String |
DESTROY_OPER |
"destroy" |
public static final java.lang.String |
HERITRIX_PROPERTIES_PREFIX |
"heritrix." |
public static final java.lang.String |
INTERRUPT_OPER |
"interrupt" |
public static final java.lang.String |
ISCRAWLING_ATTR |
"IsCrawling" |
public static final java.lang.String |
ISRUNNING_ATTR |
"IsRunning" |
public static final java.lang.String |
JAR_SUFFIX |
".jar" |
public static final java.lang.String |
LOG_OPER |
"log" |
public static final java.lang.String |
NEWALERTCOUNT_ATTR |
"NewAlertCount" |
public static final java.lang.String |
PENDING_JOBS_OPER |
"pendingJobs" |
public static final java.lang.String |
PROPERTIES |
"heritrix.properties" |
public static final java.lang.String |
PROPERTIES_KEY |
"heritrix.properties" |
public static final java.lang.String |
REBIND_JNDI_OPER |
"rebindJNDI" |
public static final java.lang.String |
ROOT_CONTEXT |
"/" |
public static final java.lang.String |
SHUTDOWN_OPER |
"shutdown" |
public static final java.lang.String |
START_CRAWLING_OPER |
"startCrawling" |
public static final java.lang.String |
START_OPER |
"start" |
public static final java.lang.String |
STARTLOG |
"heritrix_dmesg.log" |
public static final java.lang.String |
STATUS_ATTR |
"Status" |
public static final java.lang.String |
STOP_CRAWLING_OPER |
"stopCrawling" |
public static final java.lang.String |
STOP_OPER |
"stop" |
public static final java.lang.String |
SYSTEM_PREFIX |
"system." |
public static final java.lang.String |
TERMINATE_CRAWL_JOB_OPER |
"terminateCurrentJob" |
public static final java.lang.String |
VERSION_ATTR |
"Version" |
org.archive.crawler.SimpleHttpServer | ||
---|---|---|
public static final int |
DEFAULT_PORT |
8080 |
org.archive.crawler.admin.CrawlJob | ||
---|---|---|
public static final java.lang.String |
CHECKPOINT_OPER |
"startCheckpoint" |
public static final java.lang.String |
CRAWL_LOG_STYLE |
"crawlLog" |
public static final java.lang.String |
CRAWL_TIME_ATTR |
"CrawlTime" |
public static final java.lang.String |
CRAWLJOB_JMXMBEAN_TYPE |
"CrawlService.Job" |
public static final java.lang.String |
CURRENT_DOC_RATE_ATTR |
"CurrentDocRate" |
public static final java.lang.String |
CURRENT_KB_RATE_ATTR |
"CurrentKbRate" |
public static final java.lang.String |
DISCOVERED_COUNT_ATTR |
"DiscoveredCount" |
public static final java.lang.String |
DOC_RATE_ATTR |
"DocRate" |
public static final java.lang.String |
DOWNLOAD_COUNT_ATTR |
"DownloadedCount" |
public static final java.lang.String |
DUMP_URIS_OPER |
"dumpUris" |
public static final java.lang.String |
FRONTIER_REPORT_OPER |
"frontierReport" |
public static final java.lang.String |
FRONTIER_SHORT_REPORT_ATTR |
"FrontierShortReport" |
public static final java.lang.String |
IMPORT_URI_OPER |
"importUri" |
public static final java.lang.String |
IMPORT_URIS_OPER |
"importUris" |
public static final java.lang.String |
KB_RATE_ATTR |
"KbRate" |
public static final java.lang.String |
NAME_ATTR |
"Name" |
public static final java.lang.String |
OP_DB_STAT |
"getDatabaseStats" |
public static final java.lang.String |
PAUSE_OPER |
"pause" |
public static final int |
PRIORITY_AVERAGE |
2 |
public static final int |
PRIORITY_CRITICAL |
4 |
public static final int |
PRIORITY_HIGH |
3 |
public static final int |
PRIORITY_LOW |
1 |
public static final int |
PRIORITY_MINIMAL |
0 |
public static final java.lang.String |
PROG_STATS |
"progressStatistics" |
public static final java.lang.String |
PROGRESS_STATISTICS_LEGEND_OPER |
"progressStatisticsLegend" |
public static final java.lang.String |
PROGRESS_STATISTICS_OPER |
"progressStatistics" |
public static final java.lang.String |
RECOVERY_JOURNAL_STYLE |
"recoveryJournal" |
public static final java.lang.String |
RESUME_OPER |
"resume" |
public static final java.lang.String |
SEEDS_REPORT_OPER |
"seedsReport" |
public static final java.lang.String |
STATUS_ABORTED |
"Finished - Ended by operator" |
public static final java.lang.String |
STATUS_ATTR |
"Status" |
public static final java.lang.String |
STATUS_CHECKPOINTING |
"Checkpointing" |
public static final java.lang.String |
STATUS_CREATED |
"Created" |
public static final java.lang.String |
STATUS_DELETED |
"Deleted" |
public static final java.lang.String |
STATUS_FINISHED |
"Finished" |
public static final java.lang.String |
STATUS_FINISHED_ABNORMAL |
"Finished - Abnormal exit from crawling" |
public static final java.lang.String |
STATUS_FINISHED_DATA_LIMIT |
"Finished - Maximum amount of data limit hit" |
public static final java.lang.String |
STATUS_FINISHED_DOCUMENT_LIMIT |
"Finished - Maximum number of documents limit hit" |
public static final java.lang.String |
STATUS_FINISHED_TIME_LIMIT |
"Finished - Timelimit hit" |
public static final java.lang.String |
STATUS_MISCONFIGURED |
"Could not launch job - Fatal InitializationException" |
public static final java.lang.String |
STATUS_PAUSED |
"Paused" |
public static final java.lang.String |
STATUS_PENDING |
"Pending" |
public static final java.lang.String |
STATUS_PREPARING |
"Preparing" |
public static final java.lang.String |
STATUS_PROFILE |
"Profile" |
public static final java.lang.String |
STATUS_RUNNING |
"Running" |
public static final java.lang.String |
STATUS_WAITING_FOR_PAUSE |
"Pausing - Waiting for threads to finish" |
public static final java.lang.String |
THREAD_COUNT_ATTR |
"ThreadCount" |
public static final java.lang.String |
THREADS_REPORT_OPER |
"threadsReport" |
public static final java.lang.String |
THREADS_SHORT_REPORT_ATTR |
"ThreadsShortReport" |
public static final java.lang.String |
TOTAL_DATA_ATTR |
"TotalData" |
public static final java.lang.String |
UID_ATTR |
"UID" |
org.archive.crawler.admin.CrawlJobHandler | ||
---|---|---|
public static final java.lang.String |
DEFAULT_PROFILE |
"default" |
public static final java.lang.String |
DEFAULT_PROFILE_NAME |
"heritrix.default.profile" |
public static final java.lang.String |
ORDER_FILE_NAME |
"order.xml" |
public static final java.lang.String |
PROFILES_DIR_NAME |
"profiles" |
public static final java.lang.String |
RECOVER_LOG |
"recover" |
org.archive.crawler.admin.ui.JobConfigureUtils | ||
---|---|---|
public static final java.lang.String |
ACTION |
"action" |
public static final java.lang.String |
FILTERS |
"filters" |
public static final java.lang.String |
SUBACTION |
"subaction" |
org.archive.crawler.datamodel.CandidateURI | ||
---|---|---|
public static final int |
HIGH |
1 |
public static final int |
HIGHEST |
0 |
public static final int |
MEDIUM |
2 |
public static final int |
NORMAL |
3 |
org.archive.crawler.datamodel.Checkpoint | ||
---|---|---|
public static final java.lang.String |
VALIDITY_STAMP_FILENAME |
"valid" |
org.archive.crawler.datamodel.CoreAttributeConstants | ||
---|---|---|
public static final java.lang.String |
A_ANNOTATIONS |
"annotations" |
public static final java.lang.String |
A_CONTENT_DIGEST |
"content-digest" |
public static final java.lang.String |
A_CONTENT_TYPE |
"content-type" |
public static final java.lang.String |
A_CREDENTIAL_AVATARS_KEY |
"credential-avatars" |
public static final java.lang.String |
A_DELAY_FACTOR |
"delay-factor" |
public static final java.lang.String |
A_DISTANCE_FROM_SEED |
"distance-from-seed" |
public static final java.lang.String |
A_DNS_FETCH_TIME |
"dns-fetch-time" |
public static final java.lang.String |
A_DNS_SERVER_IP_LABEL |
"dns-server-ip" |
public static final java.lang.String |
A_ETAG_HEADER |
"etag" |
public static final java.lang.String |
A_FETCH_BEGAN_TIME |
"fetch-began-time" |
public static final java.lang.String |
A_FETCH_COMPLETED_TIME |
"fetch-completed-time" |
public static final java.lang.String |
A_FETCH_HISTORY |
"fetch-history" |
public static final java.lang.String |
A_FORCE_RETIRE |
"force-retire" |
public static final java.lang.String |
A_FTP_CONTROL_CONVERSATION |
"ftp-control-conversation" |
public static final java.lang.String |
A_FTP_FETCH_STATUS |
"ftp-fetch-status" |
public static final java.lang.String |
A_HERITABLE_KEYS |
"heritable" |
public static final java.lang.String |
A_HTML_BASE |
"html-base-href" |
public static final java.lang.String |
A_HTTP_BIND_ADDRESS |
"http-bind-address" |
public static final java.lang.String |
A_HTTP_PROXY_HOST |
"http-proxy-host" |
public static final java.lang.String |
A_HTTP_PROXY_PORT |
"http-proxy-port" |
public static final java.lang.String |
A_HTTP_TRANSACTION |
"http-transaction" |
public static final java.lang.String |
A_LAST_MODIFIED_HEADER |
"last-modified" |
public static final java.lang.String |
A_LOCALIZED_ERRORS |
"localized-errors" |
public static final java.lang.String |
A_META_ROBOTS |
"meta-robots" |
public static final java.lang.String |
A_MINIMUM_DELAY |
"minimum-delay" |
public static final java.lang.String |
A_MIRROR_PATH |
"mirror-path" |
public static final java.lang.String |
A_PREREQUISITE_URI |
"prerequisite-uri" |
public static final java.lang.String |
A_REFERENCE_LENGTH |
"reference-length" |
public static final java.lang.String |
A_RETRY_DELAY |
"retry-delay" |
public static final java.lang.String |
A_RRECORD_SET_LABEL |
"dns-records" |
public static final java.lang.String |
A_RUNTIME_EXCEPTION |
"runtime-exception" |
public static final java.lang.String |
A_SOURCE_TAG |
"source" |
public static final java.lang.String |
A_STATUS |
"status" |
public static final java.lang.String |
A_WRITTEN_TO_WARC |
"written-to-warc" |
public static final java.lang.String |
HEADER_TRUNC |
"headerTrunc" |
public static final java.lang.String |
LENGTH_TRUNC |
"lenTrunc" |
public static final java.lang.String |
TIMER_TRUNC |
"timeTrunc" |
public static final java.lang.String |
TRUNC_SUFFIX |
"Trunc" |
org.archive.crawler.datamodel.CrawlHost | ||
---|---|---|
public static final long |
IP_NEVER_EXPIRES |
-1L |
public static final long |
IP_NEVER_LOOKED_UP |
-2L |
org.archive.crawler.datamodel.CrawlOrder | ||
---|---|---|
public static final java.lang.String |
ATTR_BDB_CACHE_PERCENT |
"bdb-cache-percent" |
public static final java.lang.String |
ATTR_CHECKPOINT_COPY_BDBJE_LOGS |
"checkpoint-copy-bdbje-logs" |
public static final java.lang.String |
ATTR_CHECKPOINTS_PATH |
"checkpoints-path" |
public static final java.lang.String |
ATTR_DISK_PATH |
"disk-path" |
public static final java.lang.String |
ATTR_EXTRACT_PROCESSORS |
"extract-processors" |
public static final java.lang.String |
ATTR_FETCH_PROCESSORS |
"fetch-processors" |
public static final java.lang.String |
ATTR_FROM |
"from" |
public static final java.lang.String |
ATTR_HTTP_HEADERS |
"http-headers" |
public static final java.lang.String |
ATTR_INDEPENDENT_EXTRACTORS |
"independent-extractors" |
public static final java.lang.String |
ATTR_LOGGERS |
"loggers" |
public static final java.lang.String |
ATTR_LOGS_PATH |
"logs-path" |
public static final java.lang.String |
ATTR_MAX_BYTES_DOWNLOAD |
"max-bytes-download" |
public static final java.lang.String |
ATTR_MAX_DOCUMENT_DOWNLOAD |
"max-document-download" |
public static final java.lang.String |
ATTR_MAX_TIME_SEC |
"max-time-sec" |
public static final java.lang.String |
ATTR_MAX_TOE_THREADS |
"max-toe-threads" |
public static final java.lang.String |
ATTR_NAME |
"crawl-order" |
public static final java.lang.String |
ATTR_POST_PROCESSORS |
"post-processors" |
public static final java.lang.String |
ATTR_PRE_FETCH_PROCESSORS |
"pre-fetch-processors" |
public static final java.lang.String |
ATTR_RECORDER_IN_BUFFER |
"recorder-in-buffer-bytes" |
public static final java.lang.String |
ATTR_RECORDER_OUT_BUFFER |
"recorder-out-buffer-bytes" |
public static final java.lang.String |
ATTR_RECOVER_PATH |
"recover-path" |
public static final java.lang.String |
ATTR_RECOVER_RETAIN_FAILURES |
"recover-retain-failures" |
public static final java.lang.String |
ATTR_RECOVER_SCOPE_ENQUEUES |
"recover-scope-enqueues" |
public static final java.lang.String |
ATTR_RECOVER_SCOPE_INCLUDES |
"recover-scope-includes" |
public static final java.lang.String |
ATTR_RULES |
"uri-canonicalization-rules" |
public static final java.lang.String |
ATTR_SCRATCH_PATH |
"scratch-path" |
public static final java.lang.String |
ATTR_SETTINGS_DIRECTORY |
"settings-directory" |
public static final java.lang.String |
ATTR_STATE_PATH |
"state-path" |
public static final java.lang.String |
ATTR_USER_AGENT |
"user-agent" |
public static final java.lang.String |
ATTR_WRITE_PROCESSORS |
"write-processors" |
org.archive.crawler.datamodel.CrawlServer | ||
---|---|---|
public static final long |
MIN_ROBOTS_RETRIES |
3L |
public static final long |
ROBOTS_NOT_FETCHED |
-1L |
org.archive.crawler.datamodel.CrawlURI | ||
---|---|---|
public static final int |
UNCALCULATED |
-1 |
org.archive.crawler.datamodel.CredentialStore | ||
---|---|---|
public static final java.lang.String |
ATTR_CREDENTIALS |
"credentials" |
public static final java.lang.String |
ATTR_NAME |
"credential-store" |
org.archive.crawler.datamodel.FetchStatusCodes | ||
---|---|---|
public static final int |
S_BLOCKED_BY_CUSTOM_PROCESSOR |
-5002 |
public static final int |
S_BLOCKED_BY_QUOTA |
-5003 |
public static final int |
S_BLOCKED_BY_RUNTIME_LIMIT |
-5004 |
public static final int |
S_BLOCKED_BY_USER |
-5001 |
public static final int |
S_CONNECT_FAILED |
-2 |
public static final int |
S_CONNECT_LOST |
-3 |
public static final int |
S_DEEMED_CHAFF |
-4000 |
public static final int |
S_DEEMED_NOT_FOUND |
-404 |
public static final int |
S_DEFERRED |
-50 |
public static final int |
S_DELETED_BY_USER |
-6000 |
public static final int |
S_DNS_SUCCESS |
1 |
public static final int |
S_DOMAIN_PREREQUISITE_FAILURE |
-6 |
public static final int |
S_DOMAIN_UNRESOLVABLE |
-1 |
public static final int |
S_GETBYNAME_SUCCESS |
1001 |
public static final int |
S_OTHER_PREREQUISITE_FAILURE |
-62 |
public static final int |
S_OUT_OF_SCOPE |
-5000 |
public static final int |
S_PREREQUISITE_UNSCHEDULABLE_FAILURE |
-63 |
public static final int |
S_PROCESSING_THREAD_KILLED |
-7000 |
public static final int |
S_ROBOTS_PRECLUDED |
-9998 |
public static final int |
S_ROBOTS_PREREQUISITE_FAILURE |
-61 |
public static final int |
S_RUNTIME_EXCEPTION |
-5 |
public static final int |
S_SERIOUS_ERROR |
-3000 |
public static final int |
S_TIMEOUT |
-4 |
public static final int |
S_TOO_MANY_EMBED_HOPS |
-4002 |
public static final int |
S_TOO_MANY_LINK_HOPS |
-4001 |
public static final int |
S_TOO_MANY_RETRIES |
-8 |
public static final int |
S_UNATTEMPTED |
0 |
public static final int |
S_UNFETCHABLE_URI |
-7 |
public static final int |
S_UNQUEUEABLE |
-60 |
org.archive.crawler.datamodel.RobotsHonoringPolicy | ||
---|---|---|
public static final java.lang.String |
ATTR_CUSTOM_ROBOTS |
"custom-robots" |
public static final java.lang.String |
ATTR_MASQUERADE |
"masquerade" |
public static final java.lang.String |
ATTR_NAME |
"robots-honoring-policy" |
public static final java.lang.String |
ATTR_TYPE |
"type" |
public static final java.lang.String |
ATTR_USER_AGENTS |
"user-agents" |
public static final int |
CLASSIC |
0 |
public static final int |
CUSTOM |
2 |
public static final int |
IGNORE |
1 |
public static final int |
MOST_FAVORED |
3 |
public static final int |
MOST_FAVORED_SET |
4 |
org.archive.crawler.datamodel.Robotstxt | ||
---|---|---|
static final long |
serialVersionUID |
7025386509301303890L |
org.archive.crawler.deciderules.BeanShellDecideRule | ||
---|---|---|
public static final java.lang.String |
ATTR_ISOLATE_THREADS |
"isolate-threads" |
public static final java.lang.String |
ATTR_SCRIPT_FILE |
"script-file" |
org.archive.crawler.deciderules.ConfiguredDecideRule | ||
---|---|---|
public static final java.lang.String |
ATTR_DECISION |
"decision" |
org.archive.crawler.deciderules.DecideRuleSequence | ||
---|---|---|
public static final java.lang.String |
ATTR_RULES |
"rules" |
org.archive.crawler.deciderules.DecidingFilter | ||
---|---|---|
public static final java.lang.String |
ATTR_DECIDE_RULES |
"decide-rules" |
org.archive.crawler.deciderules.DecidingScope | ||
---|---|---|
public static final java.lang.String |
ATTR_DECIDE_RULES |
"decide-rules" |
org.archive.crawler.deciderules.ExternalGeoLocationDecideRule | ||
---|---|---|
static final java.lang.String |
ATTR_COUNTRY_CODE |
"country-code" |
static final java.lang.String |
ATTR_IMPLEMENTATION |
"implementation-class" |
static final java.lang.String |
DEFAULT_COUNTRY_CODE |
"--" |
org.archive.crawler.deciderules.ExternalImplDecideRule | ||
---|---|---|
static final java.lang.String |
ATTR_IMPLEMENTATION |
"implementation-class" |
org.archive.crawler.deciderules.FetchStatusMatchesRegExpDecideRule | ||
---|---|---|
public static final java.lang.String |
ATTR_REGEXP |
"regexp" |
org.archive.crawler.deciderules.FilterDecideRule | ||
---|---|---|
public static final java.lang.String |
ATTR_FALSE_DECISION |
"false-decision" |
public static final java.lang.String |
ATTR_FILTERS |
"filters" |
public static final java.lang.String |
ATTR_TRUE_DECISION |
"true-decision" |
org.archive.crawler.deciderules.HopsPathMatchesRegExpDecideRule | ||
---|---|---|
public static final java.lang.String |
ATTR_REGEXP |
"regexp" |
org.archive.crawler.deciderules.MatchesFilePatternDecideRule | ||
---|---|---|
public static final java.lang.String |
ALL |
"All" |
public static final java.lang.String |
ALL_DEFAULT_PATTERNS |
".*(?i)(\\.(bmp|gif|jpe?g|png|svg|tiff?|aac|aiff?|m3u|m4a|midi?|mp2|mp3|mp4|mpa|ogg|ra|ram|wav|wma|asf|asx|avi|flv|mov|mp4|mpeg|mpg|qt|ram|rm|smil|wmv|doc|pdf|ppt|swf))$" |
public static final java.lang.String |
ATTR_USE_PRESET |
"use-preset-pattern" |
public static final java.lang.String |
AUDIO |
"Audio" |
public static final java.lang.String |
AUDIO_PATTERNS |
".*(?i)(\\.(aac|aiff?|m3u|m4a|midi?|mp2|mp3|mp4|mpa|ogg|ra|ram|wav|wma))$" |
public static final java.lang.String |
CUSTOM |
"Custom" |
public static final java.lang.String |
IMAGES |
"Images" |
public static final java.lang.String |
IMAGES_PATTERNS |
".*(?i)(\\.(bmp|gif|jpe?g|png|svg|tiff?))$" |
public static final java.lang.String |
MISC |
"Miscellaneous" |
public static final java.lang.String |
MISC_PATTERNS |
".*(?i)(\\.(doc|pdf|ppt|swf))$" |
public static final java.lang.String |
VIDEO |
"Video" |
public static final java.lang.String |
VIDEO_PATTERNS |
".*(?i)(\\.(asf|asx|avi|flv|mov|mp4|mpeg|mpg|qt|ram|rm|smil|wmv))$" |
org.archive.crawler.deciderules.MatchesListRegExpDecideRule | ||
---|---|---|
public static final java.lang.String |
ATTR_LIST_LOGIC |
"list-logic" |
public static final java.lang.String |
ATTR_REGEXP_LIST |
"regexp-list" |
public static final java.lang.String |
DEFAULT_LIST_LOGIC |
"OR" |
org.archive.crawler.deciderules.MatchesRegExpDecideRule | ||
---|---|---|
public static final java.lang.String |
ATTR_REGEXP |
"regexp" |
org.archive.crawler.deciderules.NotExceedsDocumentLengthTresholdDecideRule | ||
---|---|---|
public static final java.lang.String |
ATTR_CONTENT_LENGTH_TRESHOLD |
"content-length-treshold" |
public static final java.lang.String |
ATTR_USE_AS_MIDFETCH |
"use-as-midfetch-filter" |
public static final int |
HEADER_PREDICTS_MISSING |
-1 |
org.archive.crawler.deciderules.PathologicalPathDecideRule | ||
---|---|---|
public static final java.lang.String |
ATTR_REPETITIONS |
"max-repetitions" |
org.archive.crawler.deciderules.ScopePlusOneDecideRule | ||
---|---|---|
public static final java.lang.String |
ATTR_SCOPE |
"host-or-domain-scope" |
public static final java.lang.String |
DOMAIN |
"Domain" |
public static final java.lang.String |
HOST |
"Host" |
org.archive.crawler.deciderules.SurtPrefixedDecideRule | ||
---|---|---|
public static final java.lang.String |
ATTR_ALSO_CHECK_VIA |
"also-check-via" |
public static final java.lang.String |
ATTR_REBUILD_ON_RECONFIG |
"rebuild-on-reconfig" |
public static final java.lang.String |
ATTR_SEEDS_AS_SURT_PREFIXES |
"seeds-as-surt-prefixes" |
public static final java.lang.String |
ATTR_SURTS_DUMP_FILE |
"surts-dump-file" |
public static final java.lang.String |
ATTR_SURTS_SOURCE_FILE |
"surts-source-file" |
org.archive.crawler.deciderules.TooManyPathSegmentsDecideRule | ||
---|---|---|
public static final java.lang.String |
ATTR_MAX_PATH_DEPTH |
"max-path-depth" |
org.archive.crawler.extractor.CrawlUriSWFAction | ||
---|---|---|
static final java.lang.String |
JSSTRING |
"javascript:" |
org.archive.crawler.extractor.ExtractorCSS | ||
---|---|---|
static final java.lang.String |
CSS_BACKSLASH_ESCAPE |
"\\\\([,\'\"\\(\\)\\s])" |
static final java.lang.String |
CSS_URI_EXTRACTOR |
"(?i)(?:@import (?:url[(]|)|url[(])\\s*([\\\"\']?)([^\\\"\'].{0,2083}?)\\1\\s*[);]" |
org.archive.crawler.extractor.ExtractorHTML | ||
---|---|---|
static final java.lang.String |
APPLET |
"applet" |
public static final java.lang.String |
ATTR_EXTRACT_JAVASCRIPT |
"extract-javascript" |
public static final java.lang.String |
ATTR_EXTRACT_ONLY_FORM_GETS |
"extract-only-form-gets" |
public static final java.lang.String |
ATTR_IGNORE_FORM_ACTION_URLS |
"ignore-form-action-urls" |
public static final java.lang.String |
ATTR_IGNORE_UNEXPECTED_HTML |
"ignore-unexpected-html" |
public static final java.lang.String |
ATTR_TREAT_FRAMES_AS_EMBED_LINKS |
"treat-frames-as-embed-links" |
static final java.lang.String |
BASE |
"base" |
static final java.lang.String |
CLASSEXT |
".class" |
public static final java.lang.String |
EXTRACT_VALUE_ATTRIBUTES |
"extract-value-attributes" |
static final java.lang.String |
FRAME |
"frame" |
static final java.lang.String |
IFRAME |
"iframe" |
static final java.lang.String |
JAVASCRIPT |
"(?i)^javascript:.*" |
static final java.lang.String |
LINK |
"link" |
static final java.lang.String |
NON_HTML_PATH_EXTENSION |
"(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)" |
static final java.lang.String |
WHITESPACE |
"\\s" |
org.archive.crawler.extractor.ExtractorImpliedURI | ||
---|---|---|
public static final java.lang.String |
ATTR_BUILD_PATTERN |
"build-pattern" |
public static final java.lang.String |
ATTR_REMOVE_TRIGGER_URIS |
"remove-trigger-uris" |
public static final java.lang.String |
ATTR_TRIGGER_REGEXP |
"trigger-regexp" |
org.archive.crawler.extractor.ExtractorJS | ||
---|---|---|
static final java.lang.String |
JAVASCRIPT_STRING_EXTRACTOR |
"(\\\\{0,8}+(?:\"|\'))(\\S{0,2083}?)(?:\\1)" |
org.archive.crawler.extractor.ExtractorSWF.ExtractorSWFActions | ||
---|---|---|
static final java.lang.String |
JSSTRING |
"javascript:" |
org.archive.crawler.extractor.ExtractorUniversal | ||
---|---|---|
static final java.lang.String |
IP_ADDRESS |
"((http://)|(https://))(\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?)" |
public static final java.lang.String |
TLDs |
"(ac(/.*)?)|(ad(/.*)?)|(ae(/.*)?)|(af(/.*)?)|(ag(/.*)?)|(ai(/.*)?)|(al(/.*)?)|(am(/.*)?)|(an(/.*)?)|(ao(/.*)?)|(aero(/.*)?)|(aq(/.*)?)|(ar(/.*)?)|(as(/.*)?)|(at(/.*)?)|(au(/.*)?)|(aw(/.*)?)|(az(/.*)?)|(ba(/.*)?)|(bb(/.*)?)|(bd(/.*)?)|(be(/.*)?)|(bf(/.*)?)|(bg(/.*)?)|(bh(/.*)?)|(bi(/.*)?)|(biz(/.*)?)|(bj(/.*)?)|(bm(/.*)?)|(bn(/.*)?)|(bo(/.*)?)|(br(/.*)?)|(bs(/.*)?)|(bt(/.*)?)|(bv(/.*)?)|(bw(/.*)?)|(by(/.*)?)|(bz(/.*)?)|(ca(/.*)?)|(cc(/.*)?)|(cd(/.*)?)|(cf(/.*)?)|(cg(/.*)?)|(ch(/.*)?)|(ci(/.*)?)|(ck(/.*)?)|(cl(/.*)?)|(cm(/.*)?)|(cn(/.*)?)|(co(/.*)?)|(com(/.*)?)|(coop(/.*)?)|(cr(/.*)?)|(cs(/.*)?)|(cu(/.*)?)|(cv(/.*)?)|(cx(/.*)?)|(cy(/.*)?)|(cz(/.*)?)|(de(/.*)?)|(dj(/.*)?)|(dk(/.*)?)|(dm(/.*)?)|(do(/.*)?)|(dz(/.*)?)|(ec(/.*)?)|(edu(/.*)?)|(ee(/.*)?)|(eg(/.*)?)|(eh(/.*)?)|(er(/.*)?)|(es(/.*)?)|(et(/.*)?)|(fi(/.*)?)|(fj(/.*)?)|(fk(/.*)?)|(fm(/.*)?)|(fo(/.*)?)|(fr(/.*)?)|(ga(/.*)?)|(gd(/.*)?)|(ge(/.*)?)|(gf(/.*)?)|(gg(/.*)?)|(gh(/.*)?)|(gi(/.*)?)|(gl(/.*)?)|(gm(/.*)?)|(gn(/.*)?)|(gov(/.*)?)|(gp(/.*)?)|(gq(/.*)?)|(gr(/.*)?)|(gs(/.*)?)|(gt(/.*)?)|(gu(/.*)?)|(gw(/.*)?)|(gy(/.*)?)|(hk(/.*)?)|(hm(/.*)?)|(hn(/.*)?)|(hr(/.*)?)|(ht(/.*)?)|(hu(/.*)?)|(id(/.*)?)|(ie(/.*)?)|(il(/.*)?)|(im(/.*)?)|(in(/.*)?)|(info(/.*)?)|(int(/.*)?)|(io(/.*)?)|(iq(/.*)?)|(ir(/.*)?)|(is(/.*)?)|(it(/.*)?)|(je(/.*)?)|(jm(/.*)?)|(jo(/.*)?)|(jp(/.*)?)|(ke(/.*)?)|(kg(/.*)?)|(kh(/.*)?)|(ki(/.*)?)|(km(/.*)?)|(kn(/.*)?)|(kp(/.*)?)|(kr(/.*)?)|(kw(/.*)?)|(ky(/.*)?)|(kz(/.*)?)|(la(/.*)?)|(lb(/.*)?)|(lc(/.*)?)|(li(/.*)?)|(lk(/.*)?)|(lr(/.*)?)|(ls(/.*)?)|(lt(/.*)?)|(lu(/.*)?)|(lv(/.*)?)|(ly(/.*)?)|(ma(/.*)?)|(mc(/.*)?)|(md(/.*)?)|(mg(/.*)?)|(mh(/.*)?)|(mil(/.*)?)|(mk(/.*)?)|(ml(/.*)?)|(mm(/.*)?)|(mn(/.*)?)|(mo(/.*)?)|(mp(/.*)?)|(mq(/.*)?)|(mr(/.*)?)|(ms(/.*)?)|(mt(/.*)?)|(mu(/.*)?)|(museum(/.*)?)|(mv(/.*)?)|(mw(/.*)?)|(mx(/.*)?)|(my(/.*)?)|(mz(/.*)?)|(na(/.*)?)|(name(/.*)?)|(nc(/.*)?)|(ne(/.*)?)|(net(/.*)?)|(nf(/.*)?)|(ng(/.*)?)|(ni(/.*)?)|(nl(/.*)?)|(no(/.*)?)|(np(/.*)?)|(nr(/.*)?)|(nt(/.*)?)|(nu(/.*)?)|(nz(/.*)?)|(om(/.*)?)|(org(/.*)?)|(pa(/.*)?)|(pe(/.*)?)|(pf(/.*)?)|(pg(/.*)?)|(ph(/.*)?)|(pk(/.*)?)|(pl(/.*)?)|(pm(/.*)?)|(pn(/.*)?)|(pr(/.*)?)|(pro(/.*)?)|(ps(/.*)?)|(pt(/.*)?)|(pw(/.*)?)|(py(/.*)?)|(qa(/.*)?)|(re(/.*)?)|(ro(/.*)?)|(ru(/.*)?)|(rw(/.*)?)|(sa(/.*)?)|(sb(/.*)?)|(sc(/.*)?)|(sd(/.*)?)|(se(/.*)?)|(sg(/.*)?)|(sh(/.*)?)|(si(/.*)?)|(sj(/.*)?)|(sk(/.*)?)|(sl(/.*)?)|(sm(/.*)?)|(sn(/.*)?)|(so(/.*)?)|(sr(/.*)?)|(sv(/.*)?)|(st(/.*)?)|(sy(/.*)?)|(sz(/.*)?)|(tc(/.*)?)|(td(/.*)?)|(tf(/.*)?)|(tg(/.*)?)|(th(/.*)?)|(tj(/.*)?)|(tk(/.*)?)|(tm(/.*)?)|(tn(/.*)?)|(to(/.*)?)|(tp(/.*)?)|(tr(/.*)?)|(tt(/.*)?)|(tv(/.*)?)|(tw(/.*)?)|(tz(/.*)?)|(ua(/.*)?)|(ug(/.*)?)|(uk(/.*)?)|(um(/.*)?)|(us(/.*)?)|(uy(/.*)?)|(uz(/.*)?)|(va(/.*)?)|(vc(/.*)?)|(ve(/.*)?)|(vg(/.*)?)|(vi(/.*)?)|(vn(/.*)?)|(vu(/.*)?)|(wf(/.*)?)|(ws(/.*)?)|(ye(/.*)?)|(yt(/.*)?)|(yu(/.*)?)|(za(/.*)?)|(zm(/.*)?)|(zw(/.*)?)" |
org.archive.crawler.extractor.ExtractorURI | ||
---|---|---|
static final java.lang.String |
ABS_HTTP_URI_PATTERN |
"^https?://[^\\s<>]*$" |
org.archive.crawler.extractor.ExtractorXML | ||
---|---|---|
static final java.lang.String |
XML_URI_EXTRACTOR |
"(?i)[\"\'>]\\s*(https?:[^\\s\"\'<>]+)\\s*[\"\'<]" |
org.archive.crawler.extractor.HTTPContentDigest | ||
---|---|---|
public static final java.lang.String |
ATTR_MAX_SIZE_BYTES |
"max-size-bytes" |
public static final java.lang.String |
ATTR_STRIP_REG_EXPR |
"strip-reg-expr" |
protected static final java.lang.String |
DEFAULT_STRIP_REG_EXPR |
"" |
org.archive.crawler.extractor.Link | ||
---|---|---|
public static final char |
EMBED_HOP |
69 |
public static final char |
NAVLINK_HOP |
76 |
public static final char |
PREREQ_HOP |
80 |
public static final char |
REFER_HOP |
82 |
public static final char |
SPECULATIVE_HOP |
88 |
org.archive.crawler.fetcher.FetchFTP | ||
---|---|---|
public static final java.lang.String |
ATTR_BANDWIDTH |
"fetch-bandwidth" |
public static final java.lang.String |
ATTR_MAX_LENGTH |
"max-length-bytes" |
public static final java.lang.String |
ATTR_PASSWORD |
"password" |
public static final java.lang.String |
ATTR_TIMEOUT |
"timeout-seconds" |
public static final java.lang.String |
ATTR_USERNAME |
"username" |
org.archive.crawler.fetcher.FetchHTTP | ||
---|---|---|
public static final java.lang.String |
ATTR_ACCEPT_HEADERS |
"accept-headers" |
public static final java.lang.String |
ATTR_BDB_COOKIES |
"use-bdb-for-cookies" |
public static final java.lang.String |
ATTR_DEFAULT_ENCODING |
"default-encoding" |
public static final java.lang.String |
ATTR_DIGEST_ALGORITHM |
"digest-algorithm" |
public static final java.lang.String |
ATTR_DIGEST_CONTENT |
"digest-content" |
public static final java.lang.String |
ATTR_FETCH_BANDWIDTH_MAX |
"fetch-bandwidth" |
public static final java.lang.String |
ATTR_HTTP_BIND_ADDRESS |
"http-bind-address" |
public static final java.lang.String |
ATTR_HTTP_PROXY_HOST |
"http-proxy-host" |
public static final java.lang.String |
ATTR_HTTP_PROXY_PORT |
"http-proxy-port" |
public static final java.lang.String |
ATTR_IGNORE_COOKIES |
"ignore-cookies" |
public static final java.lang.String |
ATTR_LOAD_COOKIES |
"load-cookies-from-file" |
public static final java.lang.String |
ATTR_MAX_LENGTH_BYTES |
"max-length-bytes" |
public static final java.lang.String |
ATTR_MIDFETCH_DECIDE_RULES |
"midfetch-decide-rules" |
public static final java.lang.String |
ATTR_SAVE_COOKIES |
"save-cookies-to-file" |
public static final java.lang.String |
ATTR_SEND_CONNECTION_CLOSE |
"send-connection-close" |
public static final java.lang.String |
ATTR_SEND_IF_MODIFIED_SINCE |
"send-if-modified-since" |
public static final java.lang.String |
ATTR_SEND_IF_NONE_MATCH |
"send-if-none-match" |
public static final java.lang.String |
ATTR_SEND_RANGE |
"send-range" |
public static final java.lang.String |
ATTR_SEND_REFERER |
"send-referer" |
public static final java.lang.String |
ATTR_SOTIMEOUT_MS |
"sotimeout-ms" |
public static final java.lang.String |
ATTR_TIMEOUT_SECONDS |
"timeout-seconds" |
public static final java.lang.String |
ATTR_TRUST |
"trust-level" |
public static final java.lang.String |
COOKIEDB_NAME |
"http_cookies" |
public static final java.lang.String |
DEFAULT_DIGEST_ALGORITHM |
"sha1" |
public static final java.lang.String |
DESC_DIGEST_ALGORITHM |
"Which algorithm (for example MD5 or SHA-1) to use to perform an on-the-fly digest hash of retrieved content-bodies." |
public static final java.lang.String |
DESC_DIGEST_CONTENT |
"Whether or not to perform an on-the-fly digest hash of retrieved content-bodies." |
public static final java.lang.String |
HTTP_SCHEME |
"http" |
public static final java.lang.String |
HTTPS_SCHEME |
"https" |
public static final java.lang.String |
MD5 |
"md5" |
public static final java.lang.String |
RANGE |
"Range" |
public static final java.lang.String |
RANGE_PREFIX |
"bytes=0-" |
public static final java.lang.String |
REFERER |
"Referer" |
static final java.lang.String |
SERVER_CACHE_KEY |
"heritrix.server.cache" |
public static final java.lang.String |
SHA1 |
"sha1" |
static final java.lang.String |
SSL_FACTORY_KEY |
"heritrix.ssl.factory" |
org.archive.crawler.filter.FilePatternFilter | ||
---|---|---|
public static final java.lang.String |
ALL |
"All" |
public static final java.lang.String |
ALL_DEFAULT_PATTERNS |
".*(?i)(\\.(bmp|gif|jpe?g|png|tiff?|mid|mp2|mp3|mp4|wav|avi|mov|mpeg|ram|rm|smil|wmv|doc|pdf|ppt|swf))$" |
public static final java.lang.String |
ATTR_USE_DEFAULT |
"use-default-patterns" |
public static final java.lang.String |
AUDIO |
"Audio" |
public static final java.lang.String |
AUDIO_PATTERNS |
".*(?i)(\\.(mid|mp2|mp3|mp4|wav))$" |
public static final java.lang.String |
CUSTOM |
"Custom" |
public static final java.lang.String |
IMAGES |
"Images" |
public static final java.lang.String |
IMAGES_PATTERNS |
".*(?i)(\\.(bmp|gif|jpe?g|png|tiff?))$" |
public static final java.lang.String |
MISC |
"Miscellaneous" |
public static final java.lang.String |
MISC_PATTERNS |
".*(?i)(\\.(doc|pdf|ppt|swf))$" |
public static final java.lang.String |
VIDEO |
"Video" |
public static final java.lang.String |
VIDEO_PATTERNS |
".*(?i)(\\.(avi|mov|mpeg|ram|rm|smil|wmv))$" |
org.archive.crawler.filter.HTTPMidFetchUnchangedFilter | ||
---|---|---|
public static final int |
HEADER_PREDICTS_CHANGED |
1 |
public static final int |
HEADER_PREDICTS_MISSING |
-1 |
public static final int |
HEADER_PREDICTS_UNCHANGED |
0 |
org.archive.crawler.filter.OrFilter | ||
---|---|---|
public static final java.lang.String |
ATTR_FILTERS |
"filters" |
public static final java.lang.String |
ATTR_MATCH_RETURN_VALUE |
"if-matches-return" |
org.archive.crawler.filter.PathDepthFilter | ||
---|---|---|
public static final java.lang.String |
ATTR_MATCH_RETURN_VALUE |
"path-less-or-equal-return" |
public static final java.lang.String |
ATTR_MAX_PATH_DEPTH |
"max-path-depth" |
static final char |
slash |
47 |
org.archive.crawler.filter.PathologicalPathFilter | ||
---|---|---|
public static final java.lang.String |
ATTR_REPETITIONS |
"repetitions" |
org.archive.crawler.filter.SurtPrefixFilter | ||
---|---|---|
public static final java.lang.String |
ATTR_MATCH_RETURN_VALUE |
"if-match-return" |
public static final java.lang.String |
ATTR_SURTS_SOURCE_FILE |
"surts-source-file" |
org.archive.crawler.filter.URIListRegExpFilter | ||
---|---|---|
public static final java.lang.String |
ATTR_LIST_LOGIC |
"list-logic" |
public static final java.lang.String |
ATTR_MATCH_RETURN_VALUE |
"if-match-return" |
public static final java.lang.String |
ATTR_REGEXP_LIST |
"regexp-list" |
public static final java.lang.String |
DEFAULT_LIST_LOGIC |
"OR" |
org.archive.crawler.filter.URIRegExpFilter | ||
---|---|---|
public static final java.lang.String |
ATTR_MATCH_RETURN_VALUE |
"if-match-return" |
public static final java.lang.String |
ATTR_REGEXP |
"regexp" |
org.archive.crawler.framework.AbstractTracker | ||
---|---|---|
public static final java.lang.String |
ATTR_STATS_INTERVAL |
"interval-seconds" |
org.archive.crawler.framework.CrawlController | ||
---|---|---|
public static final java.lang.String |
CURRENT_LOG_SUFFIX |
".log" |
public static final java.lang.String |
LOGNAME_CRAWL |
"crawl" |
public static final java.lang.String |
LOGNAME_LOCAL_ERRORS |
"local-errors" |
public static final java.lang.String |
LOGNAME_PROGRESS_STATISTICS |
"progress-statistics" |
public static final java.lang.String |
LOGNAME_RUNTIME_ERRORS |
"runtime-errors" |
public static final java.lang.String |
LOGNAME_URI_ERRORS |
"uri-errors" |
public static final char |
MANIFEST_CONFIG_FILE |
67 |
public static final char |
MANIFEST_LOG_FILE |
76 |
public static final java.lang.String |
MANIFEST_REPORT |
"manifest" |
public static final char |
MANIFEST_REPORT_FILE |
82 |
public static final java.lang.String |
PROCESSORS_REPORT |
"processors" |
org.archive.crawler.framework.CrawlScope | ||
---|---|---|
public static final java.lang.String |
ATTR_NAME |
"scope" |
public static final java.lang.String |
ATTR_REREAD_SEEDS_ON_CONFIG |
"reread-seeds-on-config" |
public static final java.lang.String |
ATTR_SEEDS |
"seedsfile" |
org.archive.crawler.framework.Filter | ||
---|---|---|
public static final java.lang.String |
ATTR_ENABLED |
"enabled" |
org.archive.crawler.framework.Frontier | ||
---|---|---|
public static final java.lang.String |
ATTR_NAME |
"frontier" |
org.archive.crawler.framework.FrontierHostStatistics | ||
---|---|---|
public static final int |
HOST_DEFERRED |
3 |
public static final int |
HOST_INACTIVE |
4 |
public static final int |
HOST_INPROCESS |
2 |
public static final int |
HOST_READY |
1 |
public static final int |
HOST_UNKNOWN |
0 |
org.archive.crawler.framework.Processor | ||
---|---|---|
public static final java.lang.String |
ATTR_DECIDE_RULES |
"decide-rules" |
public static final java.lang.String |
ATTR_ENABLED |
"enabled" |
org.archive.crawler.framework.Scoper | ||
---|---|---|
protected static final java.lang.String |
ATTR_OVERRIDE_LOGGER_ENABLED |
"override-logger" |
org.archive.crawler.framework.StatisticsTracking | ||
---|---|---|
public static final java.lang.String |
SEED_DISPOSITION_DISREGARD |
"Seed was disregarded" |
public static final java.lang.String |
SEED_DISPOSITION_FAILURE |
"Failed to crawl seed" |
public static final java.lang.String |
SEED_DISPOSITION_NOT_PROCESSED |
"Seed has not been processed" |
public static final java.lang.String |
SEED_DISPOSITION_RETRY |
"Failed to crawl seed, will retry" |
public static final java.lang.String |
SEED_DISPOSITION_SUCCESS |
"Seed successfully crawled" |
org.archive.crawler.framework.WriterPoolProcessor | ||
---|---|---|
protected static final java.lang.String |
ANNOTATION_UNWRITTEN |
"unwritten" |
public static final java.lang.String |
ATTR_COMPRESS |
"compress" |
public static final java.lang.String |
ATTR_MAX_BYTES_WRITTEN |
"total-bytes-to-write" |
public static final java.lang.String |
ATTR_MAX_SIZE_BYTES |
"max-size-bytes" |
public static final java.lang.String |
ATTR_PATH |
"path" |
public static final java.lang.String |
ATTR_POOL_MAX_ACTIVE |
"pool-max-active" |
public static final java.lang.String |
ATTR_POOL_MAX_WAIT |
"pool-max-wait" |
public static final java.lang.String |
ATTR_PREFIX |
"prefix" |
public static final java.lang.String |
ATTR_SKIP_IDENTICAL_DIGESTS |
"skip-identical-digests" |
public static final java.lang.String |
ATTR_SUFFIX |
"suffix" |
public static final boolean |
DEFAULT_COMPRESS |
true |
org.archive.crawler.frontier.AbstractFrontier | ||
---|---|---|
protected static final java.lang.String |
ACCEPTABLE_FORCE_QUEUE |
"[-\\w\\.,:]*" |
public static final java.lang.String |
ATTR_DELAY_FACTOR |
"delay-factor" |
public static final java.lang.String |
ATTR_FORCE_QUEUE |
"force-queue-assignment" |
public static final java.lang.String |
ATTR_MAX_DELAY |
"max-delay-ms" |
public static final java.lang.String |
ATTR_MAX_HOST_BANDWIDTH_USAGE |
"max-per-host-bandwidth-usage-KB-sec" |
public static final java.lang.String |
ATTR_MAX_OVERALL_BANDWIDTH_USAGE |
"total-bandwidth-usage-KB-sec" |
public static final java.lang.String |
ATTR_MAX_RETRIES |
"max-retries" |
public static final java.lang.String |
ATTR_MIN_DELAY |
"min-delay-ms" |
public static final java.lang.String |
ATTR_PAUSE_AT_FINISH |
"pause-at-finish" |
public static final java.lang.String |
ATTR_PAUSE_AT_START |
"pause-at-start" |
public static final java.lang.String |
ATTR_PREFERENCE_EMBED_HOPS |
"preference-embed-hops" |
public static final java.lang.String |
ATTR_QUEUE_ASSIGNMENT_POLICY |
"queue-assignment-policy" |
protected static final java.lang.String |
ATTR_RECOVERY_ENABLED |
"recovery-log-enabled" |
public static final java.lang.String |
ATTR_RESPECT_CRAWL_DELAY_UP_TO_SECS |
"respect-crawl-delay-up-to-secs" |
public static final java.lang.String |
ATTR_RETRY_DELAY |
"retry-delay-seconds" |
public static final java.lang.String |
ATTR_SOURCE_TAG_SEEDS |
"source-tag-seeds" |
protected static final java.lang.String |
DEFAULT_FORCE_QUEUE |
"" |
public static final java.lang.String |
IGNORED_SEEDS_FILENAME |
"seeds.ignored" |
org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants | ||
---|---|---|
public static final java.lang.String |
A_CONTENT_STATE_KEY |
"ar-state" |
public static final java.lang.String |
A_DISCARD_REVISIT |
"no-revisit" |
public static final java.lang.String |
A_FETCH_OVERDUE |
"fetch-overdue" |
public static final java.lang.String |
A_LAST_CONTENT_DIGEST |
"last-content-digest" |
public static final java.lang.String |
A_LAST_DATESTAMP |
"last-datestamp" |
public static final java.lang.String |
A_LAST_ETAG |
"last-etag" |
public static final java.lang.String |
A_NUMBER_OF_VERSIONS |
"number-of-versions" |
public static final java.lang.String |
A_NUMBER_OF_VISITS |
"number-of-visits" |
public static final java.lang.String |
A_TIME_OF_NEXT_PROCESSING |
"time-of-next-processing" |
public static final java.lang.String |
A_WAIT_INTERVAL |
"wait-interval" |
public static final java.lang.String |
A_WAIT_REEVALUATED |
"wait-reevaluated" |
public static final int |
CONTENT_CHANGED |
1 |
public static final int |
CONTENT_UNCHANGED |
0 |
public static final int |
CONTENT_UNKNOWN |
-1 |
org.archive.crawler.frontier.AdaptiveRevisitFrontier | ||
---|---|---|
protected static final java.lang.String |
ACCEPTABLE_FORCE_QUEUE |
"[-\\w\\.,:]*" |
public static final java.lang.String |
ATTR_DELAY_FACTOR |
"delay-factor" |
public static final java.lang.String |
ATTR_FORCE_QUEUE |
"force-queue-assignment" |
public static final java.lang.String |
ATTR_HOST_VALENCE |
"host-valence" |
public static final java.lang.String |
ATTR_MAX_DELAY |
"max-delay-ms" |
public static final java.lang.String |
ATTR_MAX_RETRIES |
"max-retries" |
public static final java.lang.String |
ATTR_MIN_DELAY |
"min-delay-ms" |
public static final java.lang.String |
ATTR_PREFERENCE_EMBED_HOPS |
"preference-embed-hops" |
public static final java.lang.String |
ATTR_QUEUE_ASSIGNMENT_POLICY |
"queue-assignment-policy" |
public static final java.lang.String |
ATTR_QUEUE_IGNORE_WWW |
"queue-ignore-www" |
public static final java.lang.String |
ATTR_RETRY_DELAY |
"retry-delay-seconds" |
public static final java.lang.String |
ATTR_USE_URI_UNIQ_FILTER |
"use-uri-uniq-filter" |
protected static final java.lang.String |
DEFAULT_FORCE_QUEUE |
"" |
org.archive.crawler.frontier.AdaptiveRevisitHostQueue | ||
---|---|---|
public static final int |
HQSTATE_BUSY |
2 |
public static final int |
HQSTATE_EMPTY |
0 |
public static final int |
HQSTATE_READY |
1 |
public static final int |
HQSTATE_SNOOZED |
3 |
org.archive.crawler.frontier.BdbFrontier | ||
---|---|---|
public static final java.lang.String |
ATTR_DUMP_PENDING_AT_CLOSE |
"dump-pending-at-close" |
public static final java.lang.String |
ATTR_INCLUDED |
"uri-included-structure" |
org.archive.crawler.frontier.DomainSensitiveFrontier | ||
---|---|---|
public static final java.lang.String |
ATTR_COUNTER_MODE |
"counter-mode" |
public static final java.lang.String |
ATTR_MAX_DOCS |
"max-docs" |
public static final java.lang.String |
COUNT_DOMAIN |
"count-per-domain" |
public static final java.lang.String |
COUNT_HOST |
"count-per-host" |
public static final java.lang.String |
COUNT_OVERRIDE |
"count-per-override" |
public static final java.lang.String |
DEFAULT_MODE |
"count-per-override" |
org.archive.crawler.frontier.FrontierJournal | ||
---|---|---|
public static final java.lang.String |
LOGNAME_RECOVER |
"recover.gz" |
org.archive.crawler.frontier.RecoveryJournal | ||
---|---|---|
public static final java.lang.String |
F_ADD |
"F+ " |
public static final java.lang.String |
F_DISREGARD |
"Fd " |
public static final java.lang.String |
F_EMIT |
"Fe " |
public static final java.lang.String |
F_FAILURE |
"Ff " |
public static final java.lang.String |
F_RESCHEDULE |
"Fr " |
public static final java.lang.String |
F_SUCCESS |
"Fs " |
org.archive.crawler.frontier.WorkQueue | ||
---|---|---|
static final long |
serialVersionUID |
-1939168792663316048L |
org.archive.crawler.frontier.WorkQueueFrontier | ||
---|---|---|
public static final java.lang.String |
ATTR_BALANCE_REPLENISH_AMOUNT |
"balance-replenish-amount" |
public static final java.lang.String |
ATTR_COST_POLICY |
"cost-policy" |
public static final java.lang.String |
ATTR_ERROR_PENALTY_AMOUNT |
"error-penalty-amount" |
public static final java.lang.String |
ATTR_HOLD_QUEUES |
"hold-queues" |
public static final java.lang.String |
ATTR_QUEUE_TOTAL_BUDGET |
"queue-total-budget" |
public static final java.lang.String |
ATTR_SNOOZE_DEACTIVATE_MS |
"snooze-deactivate-ms" |
public static final java.lang.String |
ATTR_TARGET_READY_QUEUES_BACKLOG |
"target-ready-backlog" |
org.archive.crawler.io.CrawlerJournal | ||
---|---|---|
public static final java.lang.String |
GZIP_SUFFIX |
".gz" |
public static final java.lang.String |
LOG_ERROR |
"E " |
public static final java.lang.String |
LOG_TIMESTAMP |
"T " |
org.archive.crawler.postprocessor.ContentBasedWaitEvaluator | ||
---|---|---|
public static final java.lang.String |
ATTR_CONTENT_REGEXPR |
"content-regular-expression" |
protected static final java.lang.String |
DEFAULT_CONTENT_REGEXPR |
"^.*$" |
org.archive.crawler.postprocessor.ImageWaitEvaluator | ||
---|---|---|
protected static final java.lang.String |
DEFAULT_CONTENT_REGEXPR |
"^image/.*$" |
org.archive.crawler.postprocessor.LinksScoper | ||
---|---|---|
public static final java.lang.String |
ATTR_PREFERENCE_DEPTH_HOPS |
"preference-depth-hops" |
public static final java.lang.String |
ATTR_REJECTLOG_DECIDE_RULES |
"scope-rejected-url-rules" |
org.archive.crawler.postprocessor.LowDiskPauseProcessor | ||
---|---|---|
public static final java.lang.String |
ATTR_MONITOR_MOUNTS |
"monitor-mounts" |
public static final java.lang.String |
ATTR_PAUSE_THRESHOLD |
"pause-threshold-kb" |
public static final java.lang.String |
ATTR_RECHECK_THRESHOLD |
"recheck-threshold-kb" |
public static final java.lang.String |
DEFAULT_MONITOR_MOUNTS |
"" |
public static final int |
DEFAULT_PAUSE_THRESHOLD |
512000 |
public static final int |
DEFAULT_RECHECK_THRESHOLD |
204800 |
org.archive.crawler.postprocessor.SupplementaryLinksScoper | ||
---|---|---|
public static final java.lang.String |
ATTR_LINKS_DECIDE_RULES |
"link-rules" |
org.archive.crawler.postprocessor.TextWaitEvaluator | ||
---|---|---|
protected static final java.lang.String |
DEFAULT_CONTENT_REGEXPR |
"^text/.*$" |
org.archive.crawler.postprocessor.WaitEvaluator | ||
---|---|---|
public static final java.lang.String |
ATTR_CHANGED_FACTOR |
"changed-factor" |
public static final java.lang.String |
ATTR_DEFAULT_WAIT_INTERVAL |
"default-wait-interval-seconds" |
public static final java.lang.String |
ATTR_INITIAL_WAIT_INTERVAL |
"initial-wait-interval-seconds" |
public static final java.lang.String |
ATTR_MAX_WAIT_INTERVAL |
"max-wait-interval-seconds" |
public static final java.lang.String |
ATTR_MIN_WAIT_INTERVAL |
"min-wait-interval-seconds" |
public static final java.lang.String |
ATTR_UNCHANGED_FACTOR |
"unchanged-factor" |
public static final java.lang.String |
ATTR_USE_OVERDUE_TIME |
"use-overdue-time" |
org.archive.crawler.prefetch.PreconditionEnforcer | ||
---|---|---|
public static final java.lang.String |
ATTR_CALCULATE_ROBOTS_ONLY |
"calculate-robots-only" |
public static final java.lang.String |
ATTR_IP_VALIDITY_DURATION |
"ip-validity-duration-seconds" |
public static final java.lang.String |
ATTR_ROBOTS_VALIDITY_DURATION |
"robot-validity-duration-seconds" |
org.archive.crawler.prefetch.Preselector | ||
---|---|---|
public static final java.lang.String |
ATTR_ALLOW_BY_REGEXP |
"allow-by-regexp" |
public static final java.lang.String |
ATTR_BLOCK_ALL |
"block-all" |
public static final java.lang.String |
ATTR_BLOCK_BY_REGEXP |
"block-by-regexp" |
public static final java.lang.String |
ATTR_RECHECK_SCOPE |
"recheck-scope" |
org.archive.crawler.prefetch.QuotaEnforcer | ||
---|---|---|
protected static final java.lang.String |
ATTR_FORCE_RETIRE |
"force-retire" |
protected static final int |
GROUP |
2 |
protected static final int |
HOST |
1 |
protected static final int |
NAME |
0 |
protected static final int |
RESPONSE_KB |
4 |
protected static final int |
RESPONSES |
3 |
protected static final int |
SERVER |
0 |
protected static final int |
SUCCESS_KB |
2 |
protected static final int |
SUCCESSES |
1 |
org.archive.crawler.prefetch.RuntimeLimitEnforcer | ||
---|---|---|
protected static final long |
DEFAULT_RUNTIME_SECONDS |
86400L |
org.archive.crawler.processor.BeanShellProcessor | ||
---|---|---|
public static final java.lang.String |
ATTR_ISOLATE_THREADS |
"isolate-threads" |
public static final java.lang.String |
ATTR_SCRIPT_FILE |
"script-file" |
org.archive.crawler.processor.CrawlMapper | ||
---|---|---|
public static final java.lang.String |
ATTR_CHECK_OUTLINKS |
"check-outlinks" |
public static final java.lang.String |
ATTR_CHECK_URI |
"check-uri" |
public static final java.lang.String |
ATTR_DIVERSION_DIR |
"diversion-dir" |
public static final java.lang.String |
ATTR_LOCAL_NAME |
"local-name" |
public static final java.lang.String |
ATTR_MAP_OUTLINK_DECIDE_RULES |
"decide-rules" |
public static final java.lang.String |
ATTR_ROTATION_DIGITS |
"rotation-digits" |
public static final java.lang.String |
DEFAULT_DIVERSION_DIR |
"diversions" |
public static final java.lang.String |
DEFAULT_LOCAL_NAME |
"." |
org.archive.crawler.processor.HashCrawlMapper | ||
---|---|---|
public static final java.lang.String |
ATTR_CRAWLER_COUNT |
"crawler-count" |
public static final java.lang.String |
ATTR_REDUCE_PATTERN |
"reduce-prefix-pattern" |
public static final java.lang.String |
ATTR_USE_PUBLICSUFFIX_REDUCE |
"use_publicsuffix_reduction" |
public static final java.lang.String |
DEFAULT_REDUCE_PATTERN |
"" |
org.archive.crawler.processor.LexicalCrawlMapper | ||
---|---|---|
public static final java.lang.String |
ATTR_MAP_SOURCE |
"map-source" |
public static final java.lang.String |
DEFAULT_MAP_SOURCE |
"" |
org.archive.crawler.processor.recrawl.FetchHistoryProcessor | ||
---|---|---|
public static final java.lang.String |
ATTR_HISTORY_LENGTH |
"history-length" |
org.archive.crawler.processor.recrawl.PersistLoadProcessor | ||
---|---|---|
public static final java.lang.String |
ATTR_PRELOAD_SOURCE |
"preload-source" |
org.archive.crawler.processor.recrawl.PersistLogProcessor | ||
---|---|---|
public static final java.lang.String |
ATTR_LOG_FILENAME |
"log-filename" |
public static final java.lang.String |
DEFAULT_LOG_FILENAME |
"persistlog.txtser.gz" |
org.archive.crawler.processor.recrawl.PersistProcessor | ||
---|---|---|
public static final java.lang.String |
URI_HISTORY_DBNAME |
"uri_history" |
org.archive.crawler.scope.ClassicScope | ||
---|---|---|
public static final java.lang.String |
ATTR_EXCLUDE_FILTER |
"exclude-filter" |
public static final java.lang.String |
ATTR_MAX_LINK_HOPS |
"max-link-hops" |
public static final java.lang.String |
ATTR_MAX_TRANS_HOPS |
"max-trans-hops" |
org.archive.crawler.scope.DomainScope | ||
---|---|---|
public static final java.lang.String |
ATTR_ADDITIONAL_FOCUS_FILTER |
"additionalScopeFocus" |
public static final java.lang.String |
ATTR_TRANSITIVE_FILTER |
"transitiveFilter" |
public static final java.lang.String |
DOT |
"." |
org.archive.crawler.scope.HostScope | ||
---|---|---|
public static final java.lang.String |
ATTR_ADDITIONAL_FOCUS_FILTER |
"additionalScopeFocus" |
public static final java.lang.String |
ATTR_TRANSITIVE_FILTER |
"transitiveFilter" |
org.archive.crawler.scope.PathScope | ||
---|---|---|
public static final java.lang.String |
ATTR_ADDITIONAL_FOCUS_FILTER |
"additionalScopeFocus" |
public static final java.lang.String |
ATTR_TRANSITIVE_FILTER |
"transitiveFilter" |
org.archive.crawler.scope.RefinedScope | ||
---|---|---|
public static final java.lang.String |
ATTR_ADDITIONAL_FOCUS_FILTER |
"additionalScopeFocus" |
public static final java.lang.String |
ATTR_TRANSITIVE_FILTER |
"transitiveFilter" |
org.archive.crawler.scope.SurtPrefixScope | ||
---|---|---|
public static final java.lang.String |
ATTR_ALSO_CHECK_VIA |
"also-check-via" |
public static final java.lang.String |
ATTR_SEEDS_AS_SURT_PREFIXES |
"seeds-as-surt-prefixes" |
public static final java.lang.String |
ATTR_SURTS_DUMP_FILE |
"surts-dump-file" |
public static final java.lang.String |
ATTR_SURTS_SOURCE_FILE |
"surts-source-file" |
org.archive.crawler.selftest.SelfTestCase | ||
---|---|---|
protected static final java.lang.String |
SELFTEST |
"SelfTest" |
org.archive.crawler.settings.Constraint | ||
---|---|---|
static final long |
serialVersionUID |
-646814290764700497L |
org.archive.crawler.settings.SettingsHandler | ||
---|---|---|
static final java.lang.String |
BOOLEAN |
"boolean" |
static final java.lang.String |
DOUBLE |
"double" |
static final java.lang.String |
DOUBLE_LIST |
"doubleList" |
static final java.lang.String |
FLOAT |
"float" |
static final java.lang.String |
FLOAT_LIST |
"floatList" |
static final java.lang.String |
INTEGER |
"integer" |
static final java.lang.String |
INTEGER_LIST |
"integerList" |
static final java.lang.String |
LONG |
"long" |
static final java.lang.String |
LONG_LIST |
"longList" |
static final java.lang.String |
MAP |
"map" |
static final java.lang.String |
OBJECT |
"object" |
static final java.lang.String |
STRING |
"string" |
static final java.lang.String |
STRING_LIST |
"stringList" |
static final java.lang.String |
TEXT |
"text" |
static final java.lang.String |
TIMESTAMP |
"timestamp" |
org.archive.crawler.settings.XMLSettingsHandler | ||
---|---|---|
protected static final java.lang.String |
XML_ATTRIBUTE_CLASS |
"class" |
protected static final java.lang.String |
XML_ATTRIBUTE_FROM |
"from" |
protected static final java.lang.String |
XML_ATTRIBUTE_NAME |
"name" |
protected static final java.lang.String |
XML_ATTRIBUTE_TO |
"to" |
protected static final java.lang.String |
XML_ELEMENT_AUDIENCE |
"audience" |
protected static final java.lang.String |
XML_ELEMENT_CONTENTMATCHES |
"content-type-matches" |
protected static final java.lang.String |
XML_ELEMENT_CONTROLLER |
"controller" |
protected static final java.lang.String |
XML_ELEMENT_DATE |
"date" |
protected static final java.lang.String |
XML_ELEMENT_DESCRIPTION |
"description" |
protected static final java.lang.String |
XML_ELEMENT_LIMITS |
"limits" |
protected static final java.lang.String |
XML_ELEMENT_META |
"meta" |
protected static final java.lang.String |
XML_ELEMENT_NAME |
"name" |
protected static final java.lang.String |
XML_ELEMENT_NEW_OBJECT |
"newObject" |
protected static final java.lang.String |
XML_ELEMENT_OBJECT |
"object" |
protected static final java.lang.String |
XML_ELEMENT_OPERATOR |
"operator" |
protected static final java.lang.String |
XML_ELEMENT_ORGANIZATION |
"organization" |
protected static final java.lang.String |
XML_ELEMENT_PORTNUMBER |
"portnumber" |
protected static final java.lang.String |
XML_ELEMENT_REFERENCE |
"reference" |
protected static final java.lang.String |
XML_ELEMENT_REFINEMENT |
"refinement" |
protected static final java.lang.String |
XML_ELEMENT_REFINEMENTLIST |
"refinement-list" |
protected static final java.lang.String |
XML_ELEMENT_TIMESPAN |
"timespan" |
protected static final java.lang.String |
XML_ELEMENT_URIMATCHES |
"uri-matches" |
protected static final java.lang.String |
XML_ROOT_HOST_SETTINGS |
"crawl-settings" |
protected static final java.lang.String |
XML_ROOT_ORDER |
"crawl-order" |
protected static final java.lang.String |
XML_ROOT_REFINEMENT |
"crawl-refinement" |
protected static final java.lang.String |
XML_SCHEMA |
"heritrix_settings.xsd" |
org.archive.crawler.url.canonicalize.BaseRule | ||
---|---|---|
public static final java.lang.String |
ATTR_ENABLED |
"enabled" |
org.archive.crawler.util.BloomUriUniqFilter | ||
---|---|---|
protected static final java.lang.String |
EXPECTED_SIZE_KEY |
".expected-size" |
protected static final java.lang.String |
HASH_COUNT_KEY |
".hash-count" |
org.archive.crawler.util.CheckpointUtils | ||
---|---|---|
public static final java.lang.String |
SERIALIZED_CLASS_SUFFIX |
".serialized" |
org.archive.crawler.util.CrawledBytesHistotable | ||
---|---|---|
public static final java.lang.String |
DUPLICATE |
"dup-by-hash" |
public static final java.lang.String |
NOTMODIFIED |
"not-modified" |
public static final java.lang.String |
NOVEL |
"novel" |
org.archive.crawler.util.FPMergeUriUniqFilter | ||
---|---|---|
public static final int |
DEFAULT_MAX_PENDING |
10000 |
public static final long |
FLUSH_DELAY_FACTOR |
100L |
org.archive.crawler.writer.Kw3Constants | ||
---|---|---|
public static final java.lang.String |
ARCHIVE_TIME_KEY |
"HTTP-Archive-Time" |
public static final java.lang.String |
COLLECTION_KEY |
"HTTP-Collection" |
public static final java.lang.String |
CONTENT_LENGTH_KEY |
"HTTP-Content-Length" |
public static final java.lang.String |
CONTENT_MD5_KEY |
"HTTP-Content-MD5" |
public static final java.lang.String |
CONTENT_TYPE_KEY |
"Content-Type" |
public static final java.lang.String |
HARVESTER_KEY |
"HTTP-Harvester" |
public static final java.lang.String |
HEADER_LENGTH_KEY |
"HTTP-Header-Length" |
public static final java.lang.String |
HEADER_MD5_KEY |
"HTTP-Header-MD5" |
public static final java.lang.String |
IP_ADDRESS_KEY |
"HTTP-IP-Address" |
public static final java.lang.String |
STATUS_CODE_KEY |
"HTTP-Status-Code" |
public static final java.lang.String |
URL_KEY |
"HTTP-URL" |
org.archive.crawler.writer.Kw3WriterProcessor | ||
---|---|---|
public static final java.lang.String |
ATTR_CHMOD |
"chmod" |
public static final java.lang.String |
ATTR_CHMOD_VALUE |
"chmod-value" |
public static final java.lang.String |
ATTR_COLLECTION |
"collection" |
public static final java.lang.String |
ATTR_HARVESTER |
"harvester" |
public static final java.lang.String |
ATTR_MAX_BYTES_WRITTEN |
"total-bytes-to-write" |
public static final java.lang.String |
ATTR_MAX_SIZE_BYTES |
"max-size-bytes" |
public static final java.lang.String |
ATTR_PATH |
"path" |
public static final java.lang.String |
DEFAULT_CHMOD_VALUE |
"777" |
public static final java.lang.String |
DEFAULT_COLLECTION_VALUE |
"kw3" |
public static final java.lang.String |
DEFAULT_HARVESTER_VALUE |
"heritrix" |
public static final int |
DEFAULT_MAX_FILE_SIZE |
10000000 |
org.archive.crawler.writer.MirrorWriterProcessor | ||
---|---|---|
public static final java.lang.String |
ATTR_CASE_SENSITIVE |
"case-sensitive" |
public static final java.lang.String |
ATTR_CHAR_MAP |
"character-map" |
public static final java.lang.String |
ATTR_CONTENT_TYPE_MAP |
"content-type-map" |
public static final java.lang.String |
ATTR_DIRECTORY_FILE |
"directory-file" |
public static final java.lang.String |
ATTR_DOT_BEGIN |
"dot-begin" |
public static final java.lang.String |
ATTR_DOT_END |
"dot-end" |
public static final java.lang.String |
ATTR_HOST_DIRECTORY |
"host-directory" |
public static final java.lang.String |
ATTR_HOST_MAP |
"host-map" |
public static final java.lang.String |
ATTR_MAX_PATH_LEN |
"max-path-length" |
public static final java.lang.String |
ATTR_MAX_SEG_LEN |
"max-segment-length" |
public static final java.lang.String |
ATTR_PATH |
"path" |
public static final java.lang.String |
ATTR_PORT_DIRECTORY |
"port-directory" |
public static final java.lang.String |
ATTR_SUFFIX_AT_END |
"suffix-at-end" |
public static final java.lang.String |
ATTR_TOO_LONG_DIRECTORY |
"too-long-directory" |
public static final java.lang.String |
ATTR_UNDERSCORE_SET |
"underscore-set" |
org.archive.crawler.writer.MirrorWriterProcessor.PathSegment | ||
---|---|---|
protected static final int |
EXISTS_CASE_INSENSITIVE_MATCH |
3 |
protected static final int |
EXISTS_EXACT_MATCH |
2 |
protected static final int |
EXISTS_NOT |
1 |
org.archive.crawler.writer.WARCWriterProcessor | ||
---|---|---|
public static final java.lang.String |
ATTR_WRITE_METADATA |
"write-metadata" |
public static final java.lang.String |
ATTR_WRITE_REQUESTS |
"write-requests" |
public static final java.lang.String |
ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS |
"write-revisit-for-identical-digests" |
public static final java.lang.String |
ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED |
"write-revisit-for-not-modified" |
org.archive.extractor.RegexpCSSLinkExtractor | ||
---|---|---|
static final java.lang.String |
CSS_BACKSLASH_ESCAPE |
"\\\\([,\'\"\\(\\)\\s])" |
static final java.lang.String |
CSS_URI_EXTRACTOR |
"(?:@import (?:url[(]|)|url[(])\\s*([\\\"\']?)([^\\\"\'].*?)\\1\\s*[);]" |
org.archive.extractor.RegexpHTMLLinkExtractor | ||
---|---|---|
static final java.lang.String |
AMP |
"&" |
static final java.lang.String |
APPLET |
"applet" |
static final java.lang.String |
BASE |
"base" |
static final java.lang.String |
CLASSEXT |
".class" |
static final java.lang.String |
EACH_ATTRIBUTE_EXTRACTOR |
"(?is)\\s((href)|(action)|(on\\w*)|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)|(?:usemap)|(?:profile)|(?:datasrc)|(?:for))|(codebase)|((?:classid)|(?:data))|(archive)|(code)|(value)|([-\\w]+))\\s*=\\s*(?:(?:\"(.*?)(?:\"|$))|(?:\'(.*?)(?:\'|$))|(\\S+))" |
static final java.lang.String |
ESCAPED_AMP |
"&amp;" |
static final java.lang.String |
JAVASCRIPT |
"(?i)^javascript:.*" |
static final java.lang.String |
LIKELY_URI_PATH |
"(\\.{0,2}[^\\.\\n\\r\\s\"\']*(\\.[^\\.\\n\\r\\s\"\']+)+)" |
static final java.lang.String |
LINK |
"link" |
static final java.lang.String |
NON_HTML_PATH_EXTENSION |
"(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)" |
static final java.lang.String |
RELEVANT_TAG_EXTRACTOR |
"(?is)<(?:((script[^>]*+)>.*?</script)|((style[^>]*+)>[^<]*+</style)|(((meta)|(?:\\w+))\\s+[^>]*+)|(!--.*?--))>" |
static final java.lang.String |
WHITESPACE |
"\\s" |
org.archive.extractor.RegexpJSLinkExtractor | ||
---|---|---|
static final java.lang.String |
AMP |
"&" |
static final java.lang.String |
ESCAPED_AMP |
"&amp;" |
static final java.lang.String |
WHITESPACE |
"\\s" |
org.archive.httpclient.ConfigurableX509TrustManager | ||
---|---|---|
public static final java.lang.String |
DEFAULT |
"open" |
public static final java.lang.String |
LOOSE |
"loose" |
public static final java.lang.String |
NORMAL |
"normal" |
public static final java.lang.String |
OPEN |
"open" |
public static final java.lang.String |
STRICT |
"strict" |
org.archive.io.ArchiveFileConstants | ||
---|---|---|
public static final java.lang.String |
ABSOLUTE_OFFSET_KEY |
"absolute-offset" |
public static final java.lang.String |
CDX |
"cdx" |
public static final java.lang.String |
CDX_FILE |
"cdxfile" |
public static final int |
CDX_LINE_BUFFER_SIZE |
1224 |
public static final java.lang.String |
COMPRESSED_FILE_EXTENSION |
"gz" |
public static final java.lang.String |
CRLF |
"\r\n" |
public static final java.lang.String |
DATE_FIELD_KEY |
"creation-date" |
public static final java.lang.String |
DEFAULT_DIGEST_METHOD |
"SHA-1" |
public static final java.lang.String |
DOT_COMPRESSED_FILE_EXTENSION |
".gz" |
public static final java.lang.String |
DUMP |
"dump" |
public static final java.lang.String |
GZIP_DUMP |
"gzipdump" |
public static final java.lang.String |
HEADER |
"header" |
public static final java.lang.String |
INVALID_SUFFIX |
".invalid" |
public static final java.lang.String |
LENGTH_FIELD_KEY |
"length" |
public static final java.lang.String |
MIMETYPE_FIELD_KEY |
"content-type" |
public static final java.lang.String |
NOHEAD |
"nohead" |
public static final java.lang.String |
OCCUPIED_SUFFIX |
".open" |
public static final java.lang.String |
READER_IDENTIFIER_FIELD_KEY |
"reader-identifier" |
public static final java.lang.String |
RECORD_IDENTIFIER_FIELD_KEY |
"record-identifier" |
public static final char |
SINGLE_SPACE |
32 |
public static final java.lang.String |
TYPE_FIELD_KEY |
"type" |
public static final java.lang.String |
URL_FIELD_KEY |
"subject-uri" |
public static final java.lang.String |
VERSION_FIELD_KEY |
"version" |
org.archive.io.ArchiveReader | ||
---|---|---|
public static final int |
MAX_ALLOWED_RECOVERABLES |
10 |
org.archive.io.GzipHeader | ||
---|---|---|
public static final int |
MINIMAL_GZIP_HEADER_LENGTH |
10 |
org.archive.io.RecordingOutputStream | ||
---|---|---|
protected static final long |
MAX_HEADER_MATERIAL |
1048576L |
org.archive.io.RecyclingFastBufferedOutputStream | ||
---|---|---|
public static final int |
DEFAULT_BUFFER_SIZE |
16384 |
org.archive.io.UTF8Bytes | ||
---|---|---|
public static final java.lang.String |
UTF8 |
"UTF-8" |
org.archive.io.WriterPool | ||
---|---|---|
public static final int |
DEFAULT_MAX_ACTIVE |
1 |
public static final int |
DEFAULT_MAXIMUM_WAIT |
300000 |
protected static final int |
NO_MAX_IDLE |
-1 |
org.archive.io.WriterPoolMember | ||
---|---|---|
public static final java.lang.String |
DEFAULT_PREFIX |
"IAH" |
public static final java.lang.String |
DEFAULT_SUFFIX |
"${HOSTNAME}" |
public static final java.lang.String |
HOSTNAME_ADMINPORT_VARIABLE |
"${HOSTNAME_ADMINPORT}" |
public static final java.lang.String |
HOSTNAME_VARIABLE |
"${HOSTNAME}" |
public static final java.lang.String |
UTF8 |
"UTF-8" |
org.archive.io.arc.ARC2WCDX | ||
---|---|---|
public static final java.lang.String |
WCDX_VERSION |
"0.1" |
org.archive.io.arc.ARCConstants | ||
---|---|---|
public static final java.lang.String |
ARC_FILE_EXTENSION |
"arc" |
public static final java.lang.String |
ARC_MAGIC_NUMBER |
"filedesc://" |
public static final java.lang.String |
CHECKSUM_FIELD_KEY |
"checksum" |
public static final java.lang.String |
CHECKSUM_HEADER_FIELD_KEY |
"checksum" |
public static final java.lang.String |
CODE_HEADER_FIELD_KEY |
"result-code" |
public static final java.lang.String |
COMPRESSED_ARC_FILE_EXTENSION |
"arc.gz" |
public static final java.lang.String |
DEFAULT_ENCODING |
"ISO-8859-1" |
public static final int |
DEFAULT_GZIP_HEADER_LENGTH |
10 |
public static final long |
DEFAULT_MAX_ARC_FILE_SIZE |
100000000L |
public static final java.lang.String |
DOT_ARC_FILE_EXTENSION |
".arc" |
public static final java.lang.String |
DOT_COMPRESSED_ARC_FILE_EXTENSION |
".arc.gz" |
public static final java.lang.String |
DOT_COMPRESSED_FILE_EXTENSION |
".gz" |
public static final java.lang.String |
FILENAME_FIELD_KEY |
"filename" |
public static final java.lang.String |
FILENAME_HEADER_FIELD_KEY |
"filename" |
public static final char |
HEADER_FIELD_SEPARATOR |
32 |
public static final java.lang.String |
IP_HEADER_FIELD_KEY |
"ip-address" |
public static final char |
LINE_SEPARATOR |
10 |
public static final java.lang.String |
LOCATION_HEADER_FIELD_KEY |
"location" |
public static final int |
MAX_METADATA_LINE_LENGTH |
4096 |
public static final java.lang.String |
OFFSET_FIELD_KEY |
"offset" |
public static final java.lang.String |
OFFSET_HEADER_FIELD_KEY |
"offset" |
public static final java.lang.String |
STATUSCODE_FIELD_KEY |
"statuscode" |
public static final java.lang.String |
TOKENIZED_PREFIX |
"tokenized_" |
org.archive.io.warc.WARCConstants | ||
---|---|---|
public static final java.lang.String |
COLON_SPACE |
": " |
public static final java.lang.String |
COMPRESSED_WARC_FILE_EXTENSION |
"warc.gz" |
public static final java.lang.String |
CONTENT_DESCRIPTION |
"Content-Description" |
public static final java.lang.String |
CONTENT_LENGTH |
"Content-Length" |
public static final java.lang.String |
CONTENT_TYPE |
"Content-Type" |
public static final java.lang.String |
CONTINUATION |
"continuation" |
public static final int |
CONTINUATION_INDEX |
7 |
public static final java.lang.String |
CONVERSION |
"conversion" |
public static final int |
CONVERSION_INDEX |
6 |
public static final java.lang.String |
DEFAULT_ENCODING |
"UTF-8" |
public static final int |
DEFAULT_MAX_WARC_FILE_SIZE |
1073741824 |
public static final java.lang.String |
DOT_COMPRESSED_FILE_EXTENSION |
".gz" |
public static final java.lang.String |
DOT_COMPRESSED_WARC_FILE_EXTENSION |
".warc.gz" |
public static final java.lang.String |
DOT_WARC_FILE_EXTENSION |
".warc" |
public static final java.lang.String |
FTP_CONTROL_CONVERSATION_MIMETYPE |
"text/x-ftp-control-conversation" |
public static final char |
HEADER_FIELD_SEPARATOR |
32 |
public static final java.lang.String |
HEADER_KEY_BLOCK_DIGEST |
"WARC-Block-Digest" |
public static final java.lang.String |
HEADER_KEY_CONCURRENT_TO |
"WARC-Concurrent-To" |
public static final java.lang.String |
HEADER_KEY_DATE |
"WARC-Date" |
public static final java.lang.String |
HEADER_KEY_ETAG |
"WARC-Etag" |
public static final java.lang.String |
HEADER_KEY_FILENAME |
"WARC-Filename" |
public static final java.lang.String |
HEADER_KEY_ID |
"WARC-Record-ID" |
public static final java.lang.String |
HEADER_KEY_IP |
"WARC-IP-Address" |
public static final java.lang.String |
HEADER_KEY_LAST_MODIFIED |
"WARC-Last-Modified" |
public static final java.lang.String |
HEADER_KEY_PAYLOAD_DIGEST |
"WARC-Payload-Digest" |
public static final java.lang.String |
HEADER_KEY_PROFILE |
"WARC-Profile" |
public static final java.lang.String |
HEADER_KEY_TRUNCATED |
"WARC-Truncated" |
public static final java.lang.String |
HEADER_KEY_TYPE |
"WARC-Type" |
public static final java.lang.String |
HEADER_KEY_URI |
"WARC-Target-URI" |
public static final java.lang.String |
HEADER_LINE_ENCODING |
"UTF-8" |
public static final java.lang.String |
HTTP_REQUEST_MIMETYPE |
"application/http; msgtype=request" |
public static final java.lang.String |
HTTP_RESPONSE_MIMETYPE |
"application/http; msgtype=response" |
public static final int |
MAX_LINE_LENGTH |
102400 |
public static final int |
MAX_WARC_HEADER_LINE_LENGTH |
102400 |
public static final java.lang.String |
METADATA |
"metadata" |
public static final int |
METADATA_INDEX |
4 |
public static final java.lang.String |
NAMED_FIELD_CHECKSUM_LABEL |
"Checksum" |
public static final java.lang.String |
NAMED_FIELD_DESCRIPTION |
"Description" |
public static final java.lang.String |
NAMED_FIELD_FILEDESC |
"ARC-FileDesc" |
public static final java.lang.String |
NAMED_FIELD_IP_LABEL |
"IP-Address" |
public static final java.lang.String |
NAMED_FIELD_RELATED_LABEL |
"References" |
public static final java.lang.String |
NAMED_FIELD_TRUNCATED |
"Truncated" |
public static final java.lang.String |
NAMED_FIELD_TRUNCATED_VALUE_HEAD |
"long-headers" |
public static final java.lang.String |
NAMED_FIELD_TRUNCATED_VALUE_LENGTH |
"length" |
public static final java.lang.String |
NAMED_FIELD_TRUNCATED_VALUE_TIME |
"time" |
public static final java.lang.String |
NAMED_FIELD_WARCFILENAME |
"Filename" |
public static final java.lang.String |
PLACEHOLDER_RECORD_LENGTH_STRING |
"000000000000" |
public static final java.lang.String |
PROFILE_REVISIT_IDENTICAL_DIGEST |
"http://netpreserve.org/warc/1.0/revisit/identical-payload-digest" |
public static final java.lang.String |
PROFILE_REVISIT_NOT_MODIFIED |
"http://netpreserve.org/warc/1.0/revisit/server-not-modified" |
public static final java.lang.String |
REQUEST |
"request" |
public static final int |
REQUEST_INDEX |
3 |
public static final java.lang.String |
RESOURCE |
"resource" |
public static final int |
RESOURCE_INDEX |
2 |
public static final java.lang.String |
RESPONSE |
"response" |
public static final int |
RESPONSE_INDEX |
1 |
public static final java.lang.String |
REVISIT |
"revisit" |
public static final int |
REVISIT_INDEX |
5 |
public static final java.lang.String |
TRUNCATED_VALUE_UNSPECIFIED |
"unspecified" |
public static final java.lang.String |
TYPE |
"type" |
public static final java.lang.String |
WARC_010_ID |
"WARC/0.10" |
public static final java.lang.String |
WARC_010_MAGIC |
"WARC/" |
public static final java.lang.String |
WARC_FILE_EXTENSION |
"warc" |
public static final java.lang.String |
WARC_HEADER_ENCODING |
"UTF-8" |
public static final java.lang.String |
WARC_ID |
"WARC/1.0" |
public static final java.lang.String |
WARC_MAGIC |
"WARC/" |
public static final java.lang.String |
WARC_VERSION |
"1.0" |
public static final java.lang.String |
WARCINFO |
"warcinfo" |
public static final int |
WARCINFO_INDEX |
0 |
org.archive.io.warc.WARCWriter | ||
---|---|---|
public static final java.lang.String |
CONTENT_BYTES |
"contentBytes" |
public static final java.lang.String |
NUM_RECORDS |
"numRecords" |
public static final java.lang.String |
SIZE_ON_DISK |
"sizeOnDisk" |
public static final java.lang.String |
TOTAL_BYTES |
"totalBytes" |
public static final java.lang.String |
TOTALS |
"totals" |
org.archive.net.UURI | ||
---|---|---|
public static final java.lang.String |
MASSAGEHOST_PATTERN |
"^www\\d*\\." |
public static final int |
MAX_URL_LENGTH |
2083 |
org.archive.net.UURIFactory | ||
---|---|---|
static final java.lang.String |
ACCEPTABLE_ASCII_DOMAIN |
"^(?:[a-zA-Z0-9_-]++(?:\\.)?)++$" |
public static final java.lang.String |
APOSTROPH |
"`" |
public static final java.lang.String |
BACKSLASH |
"\\" |
public static final java.lang.String |
BACKSLASH_PATTERN |
"\\\\" |
public static final java.lang.String |
CIRCUMFLEX |
"^" |
public static final java.lang.String |
CIRCUMFLEX_PATTERN |
"\\^" |
public static final char |
COLON |
58 |
public static final java.lang.String |
COMMERCIAL_AT |
"@" |
public static final java.lang.String |
DOT |
"." |
public static final java.lang.String |
EMPTY_STRING |
"" |
public static final java.lang.String |
ESCAPED_APOSTROPH |
"%60" |
public static final java.lang.String |
ESCAPED_BACKSLASH |
"%5C" |
public static final java.lang.String |
ESCAPED_CIRCUMFLEX |
"%5E" |
public static final java.lang.String |
ESCAPED_LCURBRACKET |
"%7B" |
public static final java.lang.String |
ESCAPED_LSQRBRACKET |
"%5B" |
public static final java.lang.String |
ESCAPED_PIPE |
"%7C" |
public static final java.lang.String |
ESCAPED_QUOT |
"%22" |
public static final java.lang.String |
ESCAPED_RCURBRACKET |
"%7D" |
public static final java.lang.String |
ESCAPED_RSQRBRACKET |
"%5D" |
public static final java.lang.String |
ESCAPED_SPACE |
"%20" |
public static final java.lang.String |
ESCAPED_SQUOT |
"%27" |
public static final java.lang.String |
HTTP |
"http" |
public static final java.lang.String |
HTTP_PORT |
":80" |
public static final java.lang.String |
HTTPS |
"https" |
public static final java.lang.String |
HTTPS_PORT |
":443" |
public static final int |
IGNORED_SCHEME |
9999999 |
public static final java.lang.String |
IMPROPERESC |
"%((?:[^\\p{XDigit}])|(?:.[^\\p{XDigit}])|(?:\\z))" |
public static final java.lang.String |
IMPROPERESC_REPLACE |
"%25$1" |
public static final java.lang.String |
LCURBRACKET |
"{" |
public static final java.lang.String |
LCURBRACKET_PATTERN |
"\\{" |
public static final java.lang.String |
LSQRBRACKET |
"[" |
public static final java.lang.String |
LSQRBRACKET_PATTERN |
"\\[" |
public static final java.lang.String |
NBSP |
"\u00a0" |
public static final char |
PERCENT_SIGN |
37 |
public static final java.lang.String |
PIPE |
"|" |
public static final java.lang.String |
PIPE_PATTERN |
"\\|" |
public static final java.lang.String |
QUOT |
"\"" |
public static final java.lang.String |
RCURBRACKET |
"}" |
public static final java.lang.String |
RCURBRACKET_PATTERN |
"\\}" |
public static final java.lang.String |
RSQRBRACKET |
"]" |
public static final java.lang.String |
RSQRBRACKET_PATTERN |
"\\]" |
public static final java.lang.String |
SLASH |
"/" |
public static final java.lang.String |
SLASHDOTDOTSLASH |
"^(/\\.\\./)+" |
public static final java.lang.String |
SPACE |
" " |
public static final java.lang.String |
SQUOT |
"\'" |
public static final java.lang.String |
STRAY_SPACING |
"[\n\r\t]+" |
public static final java.lang.String |
TRAILING_ESCAPED_SPACE |
"^(.*)(%20)+$" |
public static final java.lang.String |
URI_HEX_ENCODING |
"^[^%]*%[\\p{XDigit}][\\p{XDigit}].*" |
org.archive.util.BloomFilter64bit | ||
---|---|---|
protected static final long |
ADDRESS_BITS_PER_UNIT |
6L |
protected static final long |
BIT_INDEX_MASK |
63L |
static final boolean |
DEBUG |
false |
static final int |
NUMBER_OF_WEIGHTS |
2083 |
protected static final int |
SUBARRAY_LENGTH_IN_LONGS |
67108864 |
protected static final int |
SUBARRAY_MASK |
67108863 |
protected static final int |
SUBARRAY_POWER_OF_TWO |
26 |
org.archive.util.JEApplicationMBean | ||
---|---|---|
public static final java.lang.String |
OP_CLOSE |
"closeJE" |
public static final java.lang.String |
OP_OPEN |
"openJE" |
org.archive.util.JEMBeanHelper | ||
---|---|---|
public static final java.lang.String |
ATT_CACHE_PERCENT |
"cachePercent" |
public static final java.lang.String |
ATT_CACHE_SIZE |
"cacheSize" |
public static final java.lang.String |
ATT_ENV_HOME |
"environmentHome" |
public static final java.lang.String |
ATT_IS_READ_ONLY |
"isReadOnly" |
public static final java.lang.String |
ATT_IS_SERIALIZABLE |
"isSerializableIsolation" |
public static final java.lang.String |
ATT_IS_TRANSACTIONAL |
"isTransactional" |
public static final java.lang.String |
ATT_LOCK_TIMEOUT |
"lockTimeout" |
public static final java.lang.String |
ATT_OPEN |
"isOpen" |
public static final java.lang.String |
ATT_SET_READ_ONLY |
"openReadOnly" |
public static final java.lang.String |
ATT_SET_SERIALIZABLE |
"openSerializableIsolation" |
public static final java.lang.String |
ATT_SET_TRANSACTIONAL |
"openTransactional" |
public static final java.lang.String |
ATT_TXN_TIMEOUT |
"transactionTimeout" |
static final java.lang.String |
OP_CHECKPOINT |
"checkpoint" |
static final java.lang.String |
OP_CLEAN |
"cleanLog" |
static final java.lang.String |
OP_DB_NAMES |
"getDatabaseNames" |
static final java.lang.String |
OP_DB_STAT |
"getDatabaseStats" |
static final java.lang.String |
OP_ENV_STAT |
"getEnvironmentStats" |
static final java.lang.String |
OP_ENV_STAT_STR |
"getEnvironmentStatsToString" |
static final java.lang.String |
OP_EVICT |
"evictMemory" |
static final java.lang.String |
OP_LOCK_STAT |
"getLockStats" |
static final java.lang.String |
OP_LOCK_STAT_STR |
"getLockStatsToString" |
static final java.lang.String |
OP_SYNC |
"sync" |
static final java.lang.String |
OP_TXN_STAT |
"getTxnStats" |
org.archive.util.JmxUtils | ||
---|---|---|
public static final java.lang.String |
GUI_PORT |
"guiport" |
public static final java.lang.String |
HOST |
"host" |
public static final java.lang.String |
JMX_PORT |
"jmxport" |
public static final java.lang.String |
JOB |
"CrawlService.Job" |
public static final java.lang.String |
KEY |
"key" |
public static final java.lang.String |
MOTHER |
"mother" |
public static final java.lang.String |
NAME |
"name" |
public static final java.lang.String |
SERVICE |
"CrawlService" |
public static final java.lang.String |
TYPE |
"type" |
org.archive.util.MimetypeUtils | ||
---|---|---|
public static final java.lang.String |
NO_TYPE_MIMETYPE |
"no-type" |
org.archive.util.UriUtils | ||
---|---|---|
static final java.lang.String |
LIKELY_URI_PATH |
"(\\.{0,2}[^\\.\\n\\r\\s\"\']*(\\.[^\\.\\n\\r\\s\"\']+)+)" |
static final java.lang.String |
NAIVE_LIKELY_URI_PATTERN |
"[^<>\\s]*[\\./][^<>\\s]*(?<!\\.)" |
static final java.lang.String |
STRING_URI_DETECTOR |
"(?:\\w|[\\.]{0,2}/)[\\S&&[^<>]]*(?:\\.|/)[\\S&&[^<>]]*(?:\\w|/)" |
org.archive.util.anvl.ANVLRecord | ||
---|---|---|
static final java.lang.String |
CRLF |
"\r\n" |
static final java.lang.String |
FOLD_PREFIX |
"\r\n " |
public static final long |
MAXIMUM_SIZE |
10240L |
public static final java.lang.String |
MIMETYPE |
"application/warc-fields" |
org.archive.util.anvl.Label | ||
---|---|---|
public static final char |
COLON |
58 |
org.archive.util.fingerprint.ArrayLongFPCache | ||
---|---|---|
public static final int |
DEFAULT_CAPACITY |
1048576 |
public static final int |
DEFAULT_SMEAR |
5 |
org.archive.util.iterator.RegexpLineIterator | ||
---|---|---|
public static final java.lang.String |
COMMENT_LINE |
"\\s*(#.*)?" |
public static final java.lang.String |
ENTRY |
"$1" |
public static final java.lang.String |
NONWHITESPACE_ENTRY_TRAILING_COMMENT |
"^[\\s\ufeff]*(\\S+)\\s*(#.*)?$" |
public static final java.lang.String |
TRIMMED_ENTRY_TRAILING_COMMENT |
"^\\s*([^#]+?)\\s*(#.*)?$" |
org.archive.util.ms.BlockFileSystem | ||
---|---|---|
public static final int |
BLOCK_SIZE |
512 |
org.archive.util.ms.PieceTable | ||
---|---|---|
static final int |
CP1252_INDICATOR |
1073741824 |
static final int |
CP1252_MASK |
1073741823 |
|
||||||||||
PREV NEXT | FRAMES NO FRAMES |