org.archive.io.warc
Class WARCWriter

java.lang.Object
  extended by org.archive.io.WriterPoolMember
      extended by org.archive.io.warc.WARCWriter
All Implemented Interfaces:
ArchiveFileConstants, WARCConstants

public class WARCWriter
extends WriterPoolMember
implements WARCConstants

WARC implementation.

Assumption is that the caller is managing access to this WARCWriter, ensuring that only one thread accesses this WARCWriter instance at any one time.

While being written, WARCs have a '.open' suffix appended.

Version:
$Revision: 4604 $ $Date: 2006-09-05 22:38:18 -0700 (Tue, 05 Sep 2006) $
Author:
stack

Field Summary
static java.lang.String CONTENT_BYTES
           
static byte[] CRLF_BYTES
          NEWLINE as bytes.
static java.lang.String NUM_RECORDS
           
static java.lang.String SIZE_ON_DISK
           
static java.lang.String TOTAL_BYTES
           
static java.lang.String TOTALS
           
 
Fields inherited from class org.archive.io.WriterPoolMember
DEFAULT_PREFIX, DEFAULT_SUFFIX, HOSTNAME_ADMINPORT_VARIABLE, HOSTNAME_VARIABLE, UTF8
 
Fields inherited from interface org.archive.io.warc.WARCConstants
COLON_SPACE, COMPRESSED_WARC_FILE_EXTENSION, CONTENT_DESCRIPTION, CONTENT_LENGTH, CONTENT_TYPE, CONTINUATION, CONTINUATION_INDEX, CONVERSION, CONVERSION_INDEX, DEFAULT_ENCODING, DEFAULT_MAX_WARC_FILE_SIZE, DOT_COMPRESSED_FILE_EXTENSION, DOT_COMPRESSED_WARC_FILE_EXTENSION, DOT_WARC_FILE_EXTENSION, FTP_CONTROL_CONVERSATION_MIMETYPE, HEADER_FIELD_KEYS, HEADER_FIELD_SEPARATOR, HEADER_KEY_BLOCK_DIGEST, HEADER_KEY_CONCURRENT_TO, HEADER_KEY_DATE, HEADER_KEY_ETAG, HEADER_KEY_FILENAME, HEADER_KEY_ID, HEADER_KEY_IP, HEADER_KEY_LAST_MODIFIED, HEADER_KEY_PAYLOAD_DIGEST, HEADER_KEY_PROFILE, HEADER_KEY_TRUNCATED, HEADER_KEY_TYPE, HEADER_KEY_URI, HEADER_LINE_ENCODING, HTTP_REQUEST_MIMETYPE, HTTP_RESPONSE_MIMETYPE, MAX_LINE_LENGTH, MAX_WARC_HEADER_LINE_LENGTH, METADATA, METADATA_INDEX, NAMED_FIELD_CHECKSUM_LABEL, NAMED_FIELD_DESCRIPTION, NAMED_FIELD_FILEDESC, NAMED_FIELD_IP_LABEL, NAMED_FIELD_RELATED_LABEL, NAMED_FIELD_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_HEAD, NAMED_FIELD_TRUNCATED_VALUE_LENGTH, NAMED_FIELD_TRUNCATED_VALUE_TIME, NAMED_FIELD_TRUNCATED_VALUE_UNSPECIFIED, NAMED_FIELD_WARCFILENAME, PLACEHOLDER_RECORD_LENGTH_STRING, PROFILE_REVISIT_IDENTICAL_DIGEST, PROFILE_REVISIT_NOT_MODIFIED, REQUEST, REQUEST_INDEX, RESOURCE, RESOURCE_INDEX, RESPONSE, RESPONSE_INDEX, REVISIT, REVISIT_INDEX, TRUNCATED_VALUE_UNSPECIFIED, TYPE, TYPES, TYPES_LIST, WARC_010_ID, WARC_010_MAGIC, WARC_FILE_EXTENSION, WARC_HEADER_ENCODING, WARC_ID, WARC_MAGIC, WARC_VERSION, WARCINFO, WARCINFO_INDEX, WSP
 
Fields inherited from interface org.archive.io.ArchiveFileConstants
ABSOLUTE_OFFSET_KEY, CDX, CDX_FILE, CDX_LINE_BUFFER_SIZE, COMPRESSED_FILE_EXTENSION, CRLF, DATE_FIELD_KEY, DEFAULT_DIGEST_METHOD, DUMP, GZIP_DUMP, HEADER, INVALID_SUFFIX, LENGTH_FIELD_KEY, MIMETYPE_FIELD_KEY, NOHEAD, OCCUPIED_SUFFIX, READER_IDENTIFIER_FIELD_KEY, RECORD_IDENTIFIER_FIELD_KEY, SINGLE_SPACE, TYPE_FIELD_KEY, URL_FIELD_KEY, VERSION_FIELD_KEY
 
Constructor Summary
WARCWriter()
          Shutdown Constructor. Has default access so that an instance can be made to test utility methods.
WARCWriter(java.util.concurrent.atomic.AtomicInteger serialNo, java.util.List<java.io.File> dirs, java.lang.String prefix, java.lang.String suffix, boolean cmprs, long maxSize, java.util.List<java.lang.String> warcinfoData)
          Constructor.
WARCWriter(java.util.concurrent.atomic.AtomicInteger serialNo, java.io.OutputStream out, java.io.File f, boolean cmprs, java.lang.String a14DigitDate, java.util.List<java.lang.String> warcinfoData)
          Constructor.
 
Method Summary
protected  void baseCharacterCheck(char c, java.lang.String parameter)
           
protected  java.lang.String checkHeaderLineMimetypeParameter(java.lang.String parameter)
           
protected  java.lang.String checkHeaderValue(java.lang.String value)
           
protected  java.lang.String createFile(java.io.File file)
           
protected  java.lang.String createRecordHeader(java.lang.String type, java.lang.String url, java.lang.String create14DigitDate, java.lang.String mimetype, java.net.URI recordId, ANVLRecord xtraHeaders, long contentLength)
           
protected  java.net.URI generateRecordId(java.util.Map<java.lang.String,java.lang.String> qualifiers)
           
protected  java.net.URI generateRecordId(java.lang.String key, java.lang.String value)
           
static java.net.URI getRecordID()
          Convenience method for getting Record-Ids.
static long getStat(java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Long>> statz, java.lang.String key, java.lang.String subkey)
           
 java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Long>> getStats()
           
 void resetStats()
           
protected  void subtally(java.util.Map<java.lang.String,java.lang.Long> substats, long contentBytes, long totalBytes, long sizeOnDisk)
           
protected  void tally(java.lang.String recordType, long contentBytes, long totalBytes, long sizeOnDisk)
           
 void writeMetadataRecord(java.lang.String url, java.lang.String create14DigitDate, java.lang.String mimetype, java.net.URI recordId, ANVLRecord namedFields, java.io.InputStream metadata, long metadataLength)
           
protected  void writeRecord(java.lang.String type, java.lang.String url, java.lang.String create14DigitDate, java.lang.String mimetype, java.net.URI recordId, ANVLRecord xtraHeaders, java.io.InputStream contentStream, long contentLength)
          Deprecated. Use writeRecord(String,String,String,String,URI,ANVLRecord,InputStream,long,boolean) instead
protected  void writeRecord(java.lang.String type, java.lang.String url, java.lang.String create14DigitDate, java.lang.String mimetype, java.net.URI recordId, ANVLRecord xtraHeaders, java.io.InputStream contentStream, long contentLength, boolean enforceLength)
           
 void writeRequestRecord(java.lang.String url, java.lang.String create14DigitDate, java.lang.String mimetype, java.net.URI recordId, ANVLRecord namedFields, java.io.InputStream request, long requestLength)
           
 void writeResourceRecord(java.lang.String url, java.lang.String create14DigitDate, java.lang.String mimetype, ANVLRecord namedFields, java.io.InputStream response, long responseLength)
           
 void writeResourceRecord(java.lang.String url, java.lang.String create14DigitDate, java.lang.String mimetype, java.net.URI recordId, ANVLRecord namedFields, java.io.InputStream response, long responseLength)
           
 void writeResponseRecord(java.lang.String url, java.lang.String create14DigitDate, java.lang.String mimetype, java.net.URI recordId, ANVLRecord namedFields, java.io.InputStream response, long responseLength)
           
 void writeRevisitRecord(java.lang.String url, java.lang.String create14DigitDate, java.lang.String mimetype, java.net.URI recordId, ANVLRecord namedFields, java.io.InputStream response, long responseLength)
           
 java.net.URI writeWarcinfoRecord(java.lang.String filename)
           
 java.net.URI writeWarcinfoRecord(java.lang.String mimetype, ANVLRecord namedFields, java.io.InputStream fileMetadata, long fileMetadataLength)
          Write a warcinfo to current file.
 java.net.URI writeWarcinfoRecord(java.lang.String filename, java.lang.String description)
           
 void writeWarcinfoRecord(java.lang.String create14DigitDate, java.lang.String mimetype, java.net.URI recordId, ANVLRecord namedFields, java.io.InputStream fileMetadata, long fileMetadataLength)
          Write a warcinfo to current file.
 
Methods inherited from class org.archive.io.WriterPoolMember
checkSize, checkWriteable, close, copyFrom, createFile, flush, getBaseFilename, getCreateTimestamp, getFile, getNextDirectory, getOutputStream, getPosition, getTimestampSerialNo, getTimestampSerialNo, isCompressed, postWriteRecordTasks, preWriteRecordTasks, readFullyFrom, readToLimitFrom, write, write, write
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

TOTALS

public static final java.lang.String TOTALS
See Also:
Constant Field Values

SIZE_ON_DISK

public static final java.lang.String SIZE_ON_DISK
See Also:
Constant Field Values

TOTAL_BYTES

public static final java.lang.String TOTAL_BYTES
See Also:
Constant Field Values

CONTENT_BYTES

public static final java.lang.String CONTENT_BYTES
See Also:
Constant Field Values

NUM_RECORDS

public static final java.lang.String NUM_RECORDS
See Also:
Constant Field Values

CRLF_BYTES

public static byte[] CRLF_BYTES
NEWLINE as bytes.

Constructor Detail

WARCWriter

WARCWriter()
Shutdown Constructor. Has default access so that an instance can be made to test utility methods.


WARCWriter

public WARCWriter(java.util.concurrent.atomic.AtomicInteger serialNo,
                  java.io.OutputStream out,
                  java.io.File f,
                  boolean cmprs,
                  java.lang.String a14DigitDate,
                  java.util.List<java.lang.String> warcinfoData)
           throws java.io.IOException
Constructor. Takes a stream. Use with caution: there is no upper-bound check on size, so the writer will just keep writing. Only pass streams that are bounded.

Parameters:
serialNo - used to generate unique file name sequences
out - Where to write.
f - File the out is connected to.
cmprs - Compress the content written.
a14DigitDate - If null, we'll write current time.
warcinfoData - File metadata for warcinfo record.
Throws:
java.io.IOException

WARCWriter

public WARCWriter(java.util.concurrent.atomic.AtomicInteger serialNo,
                  java.util.List<java.io.File> dirs,
                  java.lang.String prefix,
                  java.lang.String suffix,
                  boolean cmprs,
                  long maxSize,
                  java.util.List<java.lang.String> warcinfoData)
Constructor.

Parameters:
dirs - Where to drop files.
prefix - File prefix to use.
cmprs - Compress the records written.
maxSize - Maximum size for WARC files written.
suffix - File tail to use. If null, unused.
warcinfoData - File metadata for warcinfo record.
Method Detail

createFile

protected java.lang.String createFile(java.io.File file)
                               throws java.io.IOException
Overrides:
createFile in class WriterPoolMember
Throws:
java.io.IOException

baseCharacterCheck

protected void baseCharacterCheck(char c,
                                  java.lang.String parameter)
                           throws java.lang.IllegalArgumentException
Throws:
java.lang.IllegalArgumentException

checkHeaderValue

protected java.lang.String checkHeaderValue(java.lang.String value)
                                     throws java.lang.IllegalArgumentException
Throws:
java.lang.IllegalArgumentException

checkHeaderLineMimetypeParameter

protected java.lang.String checkHeaderLineMimetypeParameter(java.lang.String parameter)
                                                     throws java.io.IOException
Throws:
java.io.IOException

createRecordHeader

protected java.lang.String createRecordHeader(java.lang.String type,
                                              java.lang.String url,
                                              java.lang.String create14DigitDate,
                                              java.lang.String mimetype,
                                              java.net.URI recordId,
                                              ANVLRecord xtraHeaders,
                                              long contentLength)
                                       throws java.io.IOException
Throws:
java.io.IOException

writeRecord

protected void writeRecord(java.lang.String type,
                           java.lang.String url,
                           java.lang.String create14DigitDate,
                           java.lang.String mimetype,
                           java.net.URI recordId,
                           ANVLRecord xtraHeaders,
                           java.io.InputStream contentStream,
                           long contentLength)
                    throws java.io.IOException
Deprecated. Use writeRecord(String,String,String,String,URI,ANVLRecord,InputStream,long,boolean) instead

Throws:
java.io.IOException

writeRecord

protected void writeRecord(java.lang.String type,
                           java.lang.String url,
                           java.lang.String create14DigitDate,
                           java.lang.String mimetype,
                           java.net.URI recordId,
                           ANVLRecord xtraHeaders,
                           java.io.InputStream contentStream,
                           long contentLength,
                           boolean enforceLength)
                    throws java.io.IOException
Throws:
java.io.IOException

tally

protected void tally(java.lang.String recordType,
                     long contentBytes,
                     long totalBytes,
                     long sizeOnDisk)

subtally

protected void subtally(java.util.Map<java.lang.String,java.lang.Long> substats,
                        long contentBytes,
                        long totalBytes,
                        long sizeOnDisk)

generateRecordId

protected java.net.URI generateRecordId(java.util.Map<java.lang.String,java.lang.String> qualifiers)
                                 throws java.io.IOException
Throws:
java.io.IOException

generateRecordId

protected java.net.URI generateRecordId(java.lang.String key,
                                        java.lang.String value)
                                 throws java.io.IOException
Throws:
java.io.IOException

writeWarcinfoRecord

public java.net.URI writeWarcinfoRecord(java.lang.String filename)
                                 throws java.io.IOException
Throws:
java.io.IOException

writeWarcinfoRecord

public java.net.URI writeWarcinfoRecord(java.lang.String filename,
                                        java.lang.String description)
                                 throws java.io.IOException
Throws:
java.io.IOException

writeWarcinfoRecord

public java.net.URI writeWarcinfoRecord(java.lang.String mimetype,
                                        ANVLRecord namedFields,
                                        java.io.InputStream fileMetadata,
                                        long fileMetadataLength)
                                 throws java.io.IOException
Write a warcinfo to current file. TODO: Write crawl metadata or pointers to crawl description.

Parameters:
mimetype - Mimetype of the fileMetadata block.
namedFields - Named fields. Pass null if none.
fileMetadata - Metadata about this WARC as RDF, ANVL, etc.
fileMetadataLength - Length of fileMetadata.
Returns:
Generated record-id made with data: scheme and the current filename.
Throws:
java.io.IOException

writeWarcinfoRecord

public void writeWarcinfoRecord(java.lang.String create14DigitDate,
                                java.lang.String mimetype,
                                java.net.URI recordId,
                                ANVLRecord namedFields,
                                java.io.InputStream fileMetadata,
                                long fileMetadataLength)
                         throws java.io.IOException
Write a warcinfo to current file. The warcinfo type uses its recordId as its URL.

Parameters:
recordId - URI to use for this warcinfo.
create14DigitDate - Record creation date as 14 digit date.
mimetype - Mimetype of the fileMetadata.
namedFields - Named fields.
fileMetadata - Metadata about this WARC as RDF, ANVL, etc.
fileMetadataLength - Length of fileMetadata.
Throws:
java.io.IOException

writeRequestRecord

public void writeRequestRecord(java.lang.String url,
                               java.lang.String create14DigitDate,
                               java.lang.String mimetype,
                               java.net.URI recordId,
                               ANVLRecord namedFields,
                               java.io.InputStream request,
                               long requestLength)
                        throws java.io.IOException
Throws:
java.io.IOException

writeResourceRecord

public void writeResourceRecord(java.lang.String url,
                                java.lang.String create14DigitDate,
                                java.lang.String mimetype,
                                ANVLRecord namedFields,
                                java.io.InputStream response,
                                long responseLength)
                         throws java.io.IOException
Throws:
java.io.IOException

writeResourceRecord

public void writeResourceRecord(java.lang.String url,
                                java.lang.String create14DigitDate,
                                java.lang.String mimetype,
                                java.net.URI recordId,
                                ANVLRecord namedFields,
                                java.io.InputStream response,
                                long responseLength)
                         throws java.io.IOException
Throws:
java.io.IOException

writeResponseRecord

public void writeResponseRecord(java.lang.String url,
                                java.lang.String create14DigitDate,
                                java.lang.String mimetype,
                                java.net.URI recordId,
                                ANVLRecord namedFields,
                                java.io.InputStream response,
                                long responseLength)
                         throws java.io.IOException
Throws:
java.io.IOException

writeRevisitRecord

public void writeRevisitRecord(java.lang.String url,
                               java.lang.String create14DigitDate,
                               java.lang.String mimetype,
                               java.net.URI recordId,
                               ANVLRecord namedFields,
                               java.io.InputStream response,
                               long responseLength)
                        throws java.io.IOException
Throws:
java.io.IOException

writeMetadataRecord

public void writeMetadataRecord(java.lang.String url,
                                java.lang.String create14DigitDate,
                                java.lang.String mimetype,
                                java.net.URI recordId,
                                ANVLRecord namedFields,
                                java.io.InputStream metadata,
                                long metadataLength)
                         throws java.io.IOException
Throws:
java.io.IOException

getRecordID

public static java.net.URI getRecordID()
                                throws java.io.IOException
Convenience method for getting Record-Ids.

Returns:
A record ID.
Throws:
java.io.IOException

resetStats

public void resetStats()

getStats

public java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Long>> getStats()

getStat

public static long getStat(java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Long>> statz,
                           java.lang.String key,
                           java.lang.String subkey)


Copyright © 2003-2011 Internet Archive. All Rights Reserved.