1 package org.archive.crawler.util; 2 3 import org.apache.commons.httpclient.HttpStatus; 4 import org.archive.crawler.datamodel.CoreAttributeConstants; 5 import org.archive.crawler.datamodel.CrawlURI; 6 import org.archive.crawler.deciderules.recrawl.IdenticalDigestDecideRule; 7 import org.archive.util.Accumulator; 8 import org.archive.util.ArchiveUtils; 9 import org.archive.util.Histotable; 10 11 public class CrawledBytesHistotable extends Histotable<String> 12 implements Accumulator<CrawlURI>, CoreAttributeConstants { 13 private static final long serialVersionUID = 7923431123239026213L; 14 15 public static final String NOTMODIFIED = "not-modified"; 16 public static final String DUPLICATE = "dup-by-hash"; 17 public static final String NOVEL = "novel"; 18 19 20 public CrawledBytesHistotable() { 21 super(); 22 tally(NOVEL,0); 23 } 24 25 public void accumulate(CrawlURI curi) { 26 if(curi.getFetchStatus()==HttpStatus.SC_NOT_MODIFIED) { 27 tally(NOTMODIFIED, curi.getContentSize()); 28 } else if (IdenticalDigestDecideRule.hasIdenticalDigest(curi)) { 29 tally(DUPLICATE,curi.getContentSize()); 30 } else { 31 tally(NOVEL,curi.getContentSize()); 32 } 33 } 34 35 public String summary() { 36 StringBuilder sb = new StringBuilder(); 37 sb.append(ArchiveUtils.formatBytesForDisplay(getTotal())); 38 sb.append(" crawled ("); 39 sb.append(ArchiveUtils.formatBytesForDisplay(get(NOVEL))); 40 sb.append(" novel"); 41 if(get(DUPLICATE)!=null) { 42 sb.append(", "); 43 sb.append(ArchiveUtils.formatBytesForDisplay(get(DUPLICATE))); 44 sb.append(" "); 45 sb.append(DUPLICATE); 46 } 47 if(get(NOTMODIFIED)!=null) { 48 sb.append(", "); 49 sb.append(ArchiveUtils.formatBytesForDisplay(get(NOTMODIFIED))); 50 sb.append(" "); 51 sb.append(NOTMODIFIED); 52 } 53 sb.append(")"); 54 return sb.toString(); 55 } 56 }