|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object javax.management.Attribute org.archive.crawler.settings.Type org.archive.crawler.settings.ComplexType org.archive.crawler.settings.ModuleType org.archive.crawler.framework.Processor org.archive.crawler.framework.WriterPoolProcessor org.archive.crawler.writer.WARCWriterProcessor
public class WARCWriterProcessor
WARCWriterProcessor. Writes against version 0.18 of the WARC specification (which is functionally identical to 0.17 except for the protocol identifier string). See http://archive-access.sourceforge.net/warc/
TODO: Remove ANVLRecord. Rename NameValue or use RFC822 (commons-httpclient?) or find something else.
Nested Class Summary |
---|
Nested classes/interfaces inherited from class org.archive.crawler.settings.ComplexType |
---|
ComplexType.MBeanAttributeInfoIterator |
Field Summary | |
---|---|
static java.lang.String |
ATTR_WRITE_METADATA
Key for whether to write 'metadata' type records where possible |
static java.lang.String |
ATTR_WRITE_REQUESTS
Key for whether to write 'request' type records where possible |
static java.lang.String |
ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS
Key for whether to write 'revisit' type records when consecutive identical digests are encountered |
static java.lang.String |
ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED
Key for whether to write 'revisit' type records for server "304 not modified" responses |
Fields inherited from class org.archive.crawler.framework.WriterPoolProcessor |
---|
ANNOTATION_UNWRITTEN, ATTR_COMPRESS, ATTR_MAX_BYTES_WRITTEN, ATTR_MAX_SIZE_BYTES, ATTR_PATH, ATTR_POOL_MAX_ACTIVE, ATTR_POOL_MAX_WAIT, ATTR_PREFIX, ATTR_SKIP_IDENTICAL_DIGESTS, ATTR_SUFFIX, DEFAULT_COMPRESS |
Fields inherited from class org.archive.crawler.framework.Processor |
---|
ATTR_DECIDE_RULES, ATTR_ENABLED, attrDecideRules |
Fields inherited from class org.archive.crawler.settings.ComplexType |
---|
definition, definitionMap |
Fields inherited from interface org.archive.io.ArchiveFileConstants |
---|
ABSOLUTE_OFFSET_KEY, CDX, CDX_FILE, CDX_LINE_BUFFER_SIZE, COMPRESSED_FILE_EXTENSION, CRLF, DATE_FIELD_KEY, DEFAULT_DIGEST_METHOD, DUMP, GZIP_DUMP, HEADER, INVALID_SUFFIX, LENGTH_FIELD_KEY, MIMETYPE_FIELD_KEY, NOHEAD, OCCUPIED_SUFFIX, READER_IDENTIFIER_FIELD_KEY, RECORD_IDENTIFIER_FIELD_KEY, SINGLE_SPACE, TYPE_FIELD_KEY, URL_FIELD_KEY, VERSION_FIELD_KEY |
Constructor Summary | |
---|---|
WARCWriterProcessor(java.lang.String name)
|
Method Summary | |
---|---|
protected void |
addIfNotBlank(ANVLRecord record,
java.lang.String label,
java.lang.String value)
|
protected void |
addStats(java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Long>> statz)
|
long |
getDefaultMaxFileSize()
Default maximum file size. |
protected java.lang.String[] |
getDefaultPath()
|
protected java.lang.String |
getFirstrecordBody(java.io.File orderFile)
Return relevant values as header-like fields (here ANVLRecord, but spec-defined "application/warc-fields" type when written). |
protected java.lang.String |
getFirstrecordStylesheet()
|
protected java.net.URI |
getRecordID()
|
protected void |
innerProcess(CrawlURI curi)
Writes a CrawlURI and its associated data to store file. |
protected java.net.URI |
qualifyRecordID(java.net.URI base,
java.lang.String key,
java.lang.String value)
|
java.lang.String |
report()
Compiles and returns a report (in human readable form) about the status of the processor. |
protected void |
saveHeader(java.lang.String origName,
org.apache.commons.httpclient.HttpMethodBase method,
ANVLRecord headers,
java.lang.String newName)
Save a header from the given HTTP operation into the provided headers under a new name |
protected void |
setupPool(java.util.concurrent.atomic.AtomicInteger serialNo)
Set up pool of files. |
protected void |
write(java.lang.String lowerCaseScheme,
CrawlURI curi)
|
protected java.net.URI |
writeFtpControlConversation(WARCWriter w,
java.lang.String timestamp,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord headers,
java.lang.String controlConversation)
|
protected java.net.URI |
writeMetadata(WARCWriter w,
java.lang.String timestamp,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
|
protected java.net.URI |
writeRequest(WARCWriter w,
java.lang.String timestamp,
java.lang.String mimetype,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
|
protected java.net.URI |
writeResource(WARCWriter w,
java.lang.String timestamp,
java.lang.String mimetype,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
|
protected java.net.URI |
writeResponse(WARCWriter w,
java.lang.String timestamp,
java.lang.String mimetype,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
|
protected java.net.URI |
writeRevisitDigest(WARCWriter w,
java.lang.String timestamp,
java.lang.String mimetype,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
|
protected java.net.URI |
writeRevisitNotModified(WARCWriter w,
java.lang.String timestamp,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
|
Methods inherited from class org.archive.crawler.framework.WriterPoolProcessor |
---|
cacheMetadata, checkBytesWritten, checkpointRecover, crawlCheckpoint, crawlEnded, crawlEnding, crawlPaused, crawlPausing, crawlResuming, crawlStarted, getAttributeUnchecked, getCheckpointStateFile, getHostAddress, getMaxSize, getMaxToWrite, getMetadata, getOutputDirs, getPool, getPoolMaximumActive, getPoolMaximumWait, getPrefix, getSerialNo, getSuffix, getTotalBytesWritten, initialTasks, isCompressed, loadCheckpointSerialNumber, saveCheckpointSerialNumber, setPool, setTotalBytesWritten, shouldWrite |
Methods inherited from class org.archive.crawler.framework.Processor |
---|
checkForInterrupt, finalTasks, getController, getDecideRule, getDefaultNextProcessor, innerRejectProcess, isContentToProcess, isEnabled, isExpectedMimeType, isHttpTransactionContentToProcess, kickUpdate, process, rulesAccept, rulesAccept, setDefaultNextProcessor, spawn |
Methods inherited from class org.archive.crawler.settings.ModuleType |
---|
addElement, listUsedFiles |
Methods inherited from class org.archive.crawler.settings.Type |
---|
addConstraint, equals, getConstraints, getLegalValueType, isExpertSetting, isOverrideable, isTransient, setExpertSetting, setLegalValueType, setOverrideable, setTransient |
Methods inherited from class javax.management.Attribute |
---|
getName, hashCode |
Methods inherited from class java.lang.Object |
---|
clone, finalize, getClass, notify, notifyAll, wait, wait, wait |
Methods inherited from interface org.archive.crawler.event.CrawlStatusListener |
---|
crawlCheckpoint, crawlEnded, crawlEnding, crawlPaused, crawlPausing, crawlResuming, crawlStarted |
Methods inherited from interface org.archive.io.WriterPoolSettings |
---|
getMaxSize, getMetadata, getOutputDirs, getPrefix, getSuffix, isCompressed |
Field Detail |
---|
public static final java.lang.String ATTR_WRITE_REQUESTS
public static final java.lang.String ATTR_WRITE_METADATA
public static final java.lang.String ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS
public static final java.lang.String ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED
Constructor Detail |
---|
public WARCWriterProcessor(java.lang.String name)
name
- Name of this writer.
Method Detail |
---|
public long getDefaultMaxFileSize()
WriterPoolProcessor
getDefaultMaxFileSize
in class WriterPoolProcessor
protected java.lang.String[] getDefaultPath()
getDefaultPath
in class WriterPoolProcessor
protected void setupPool(java.util.concurrent.atomic.AtomicInteger serialNo)
WriterPoolProcessor
setupPool
in class WriterPoolProcessor
protected void innerProcess(CrawlURI curi)
innerProcess
in class WriterPoolProcessor
curi
- CrawlURI to process.
protected void write(java.lang.String lowerCaseScheme, CrawlURI curi) throws java.io.IOException
java.io.IOException
protected void addStats(java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Long>> statz)
protected java.net.URI writeFtpControlConversation(WARCWriter w, java.lang.String timestamp, java.net.URI baseid, CrawlURI curi, ANVLRecord headers, java.lang.String controlConversation) throws java.io.IOException
java.io.IOException
protected java.net.URI writeRequest(WARCWriter w, java.lang.String timestamp, java.lang.String mimetype, java.net.URI baseid, CrawlURI curi, ANVLRecord namedFields) throws java.io.IOException
java.io.IOException
protected java.net.URI writeResponse(WARCWriter w, java.lang.String timestamp, java.lang.String mimetype, java.net.URI baseid, CrawlURI curi, ANVLRecord namedFields) throws java.io.IOException
java.io.IOException
protected java.net.URI writeResource(WARCWriter w, java.lang.String timestamp, java.lang.String mimetype, java.net.URI baseid, CrawlURI curi, ANVLRecord namedFields) throws java.io.IOException
java.io.IOException
protected java.net.URI writeRevisitDigest(WARCWriter w, java.lang.String timestamp, java.lang.String mimetype, java.net.URI baseid, CrawlURI curi, ANVLRecord namedFields) throws java.io.IOException
java.io.IOException
protected java.net.URI writeRevisitNotModified(WARCWriter w, java.lang.String timestamp, java.net.URI baseid, CrawlURI curi, ANVLRecord namedFields) throws java.io.IOException
java.io.IOException
protected void saveHeader(java.lang.String origName, org.apache.commons.httpclient.HttpMethodBase method, ANVLRecord headers, java.lang.String newName)
origName
- header name to get if present
method
- HTTP operation containing headers
protected java.net.URI writeMetadata(WARCWriter w, java.lang.String timestamp, java.net.URI baseid, CrawlURI curi, ANVLRecord namedFields) throws java.io.IOException
java.io.IOException
protected java.net.URI getRecordID() throws java.io.IOException
java.io.IOException
protected java.net.URI qualifyRecordID(java.net.URI base, java.lang.String key, java.lang.String value) throws java.io.IOException
java.io.IOException
protected java.lang.String getFirstrecordStylesheet()
getFirstrecordStylesheet
in class WriterPoolProcessor
protected java.lang.String getFirstrecordBody(java.io.File orderFile)
getFirstrecordBody
in class WriterPoolProcessor
orderFile
- Order file.
WriterPoolProcessor.getFirstrecordBody(java.io.File)
protected void addIfNotBlank(ANVLRecord record, java.lang.String label, java.lang.String value)
public java.lang.String report()
Processor
Examples of stats declared would include:
* Number of CrawlURIs handled.
* Number of links extracted (for link extractors)
etc.
report
in class Processor
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |