edu.umd.cloud9.collection.clue
Class ClueWarcRecord

java.lang.Object
  extended by edu.umd.cloud9.collection.Indexable
      extended by edu.umd.cloud9.collection.clue.ClueWarcRecord
All Implemented Interfaces:
Writable

public class ClueWarcRecord
extends Indexable


Nested Class Summary
 class ClueWarcRecord.WarcHeader
          Warc header class
 
Field Summary
static String WARC_VERSION
           
static String WARC_VERSION_LINE
           
 
Constructor Summary
ClueWarcRecord()
          Default Constructor
ClueWarcRecord(ClueWarcRecord o)
          Copy Constructor
 
Method Summary
 void addHeaderMetadata(String key, String value)
          Adds a key/value pair to a WARC header.
 void clearHeaderMetadata()
          Clears all metadata items from a header
 byte[] getByteContent()
          Retrieves the byte content for this record
 String getContent()
          Returns the content of the document.
 String getContentUTF8()
          Retrieves the byte content as a UTF-8 string
 String getDisplayContentType()
           
 String getDocid()
          Returns the globally-unique String identifier of the document within the collection.
 Set<Map.Entry<String,String>> getHeaderMetadata()
          Gets the set of metadata items from the header
 String getHeaderMetadataItem(String key)
          Gets a value for a specific header metadata key
 String getHeaderRecordType()
          Gets the header record type string
 String getHeaderString()
          Gets the WARC header as a string
 int getTotalRecordLength()
          Retrieves the total record length (header and content)
 String getWarcFilePath()
          Gets the WARC file path associated with this record (if set)
 void readFields(DataInput in)
          Serialization input
static ClueWarcRecord readNextWarcRecord(DataInputStream in)
          Reads in a WARC record from a data input stream
 void set(ClueWarcRecord o)
          Sets the record content (copy)
 void setContent(byte[] content)
          Sets the byte content for this record
 void setContent(String content)
          Sets the byte content for this record from a string
 void setWarcContentType(String contentType)
          Sets the content type string
 void setWarcDate(String dateString)
          Sets the WARC header date string
 void setWarcFilePath(String path)
          Sets the WARC file path (optional; for use with getWarcFilePath)
 void setWarcRecordType(String recordType)
          Sets the record type string
 void setWarcUUID(String UUID)
          Sets the WARC UUID string
 String toString()
           
 void write(DataOutput out)
          Serialization output
 
Methods inherited from class edu.umd.cloud9.collection.Indexable
getDisplayContent
 
Methods inherited from class java.lang.Object
equals, getClass, hashCode, notify, notifyAll, wait, wait, wait
 

Field Detail

WARC_VERSION

public static String WARC_VERSION

WARC_VERSION_LINE

public static String WARC_VERSION_LINE
Constructor Detail

ClueWarcRecord

public ClueWarcRecord()
Default Constructor


ClueWarcRecord

public ClueWarcRecord(ClueWarcRecord o)
Copy Constructor

Parameters:
o - record to copy from
Method Detail

readNextWarcRecord

public static ClueWarcRecord readNextWarcRecord(DataInputStream in)
                                         throws IOException
Reads in a WARC record from a data input stream

Parameters:
in - the input stream
Returns:
a WARC record (or null if eof)
Throws:
IOException

getTotalRecordLength

public int getTotalRecordLength()
Retrieves the total record length (header and content)

Returns:
total record length

set

public void set(ClueWarcRecord o)
Sets the record content (copy)

Parameters:
o - record to copy from

getWarcFilePath

public String getWarcFilePath()
Gets the WARC file path associated with this record (if set)


setWarcFilePath

public void setWarcFilePath(String path)
Sets the WARC file path (optional; for use with getWarcFilePath)

Parameters:
path - the WARC file path to associate with this record

setWarcRecordType

public void setWarcRecordType(String recordType)
Sets the record type string

Parameters:
recordType -

setWarcContentType

public void setWarcContentType(String contentType)
Sets the content type string

Parameters:
contentType -

setWarcDate

public void setWarcDate(String dateString)
Sets the WARC header date string

Parameters:
dateString -

setWarcUUID

public void setWarcUUID(String UUID)
Sets the WARC UUID string

Parameters:
UUID - the UUID string to store in the WARC header

addHeaderMetadata

public void addHeaderMetadata(String key,
                              String value)
Adds a key/value pair to a WARC header. This is needed to filter out known keys

Parameters:
key -
value -

clearHeaderMetadata

public void clearHeaderMetadata()
Clears all metadata items from a header


getHeaderMetadata

public Set<Map.Entry<String,String>> getHeaderMetadata()
Gets the set of metadata items from the header


getHeaderMetadataItem

public String getHeaderMetadataItem(String key)
Gets a value for a specific header metadata key

Parameters:
key -

setContent

public void setContent(byte[] content)
Sets the byte content for this record

Parameters:
content -

setContent

public void setContent(String content)
Sets the byte content for this record from a string

Parameters:
content - the string to use as this record's content

getByteContent

public byte[] getByteContent()
Retrieves the byte content for this record


getContentUTF8

public String getContentUTF8()
Retrieves the byte content as a UTF-8 string


getHeaderRecordType

public String getHeaderRecordType()
Gets the header record type string


toString

public String toString()
Overrides:
toString in class Object

getHeaderString

public String getHeaderString()
Gets the WARC header as a string


write

public void write(DataOutput out)
           throws IOException
Serialization output

Parameters:
out -
Throws:
IOException

readFields

public void readFields(DataInput in)
                throws IOException
Serialization input

Parameters:
in -
Throws:
IOException

getDocid

public String getDocid()
Description copied from class: Indexable
Returns the globally-unique String identifier of the document within the collection.

Specified by:
getDocid in class Indexable

getContent

public String getContent()
Description copied from class: Indexable
Returns the content of the document.

Specified by:
getContent in class Indexable

getDisplayContentType

public String getDisplayContentType()
Overrides:
getDisplayContentType in class Indexable