View Javadoc

1   /* SeedRecord
2    * 
3    * $Id: SeedRecord.java 6600 2009-10-16 01:31:38Z gojomo $
4    *
5    * Created on June 12, 2005
6    * 
7    * Copyright (C) 2005 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   *
25   */
26  package org.archive.crawler.admin;
27  
28  import java.io.Serializable;
29  import java.util.logging.Logger;
30  
31  import org.archive.crawler.datamodel.CandidateURI;
32  import org.archive.crawler.datamodel.CoreAttributeConstants;
33  import org.archive.crawler.datamodel.CrawlURI;
34  
35  /***
36   * Record of all interesting info about the most-recent
37   * processing of a specific seed.
38   * 
39   * @author gojomo
40   */
41  public class SeedRecord implements CoreAttributeConstants, Serializable {
42      private static final long serialVersionUID = -8455358640509744478L;
43      private static Logger logger =
44          Logger.getLogger(SeedRecord.class.getName());
45      private final String uri;
46      private int statusCode;
47      private String disposition;
48      private String redirectUri;
49      
50      /***
51       * Create a record from the given CrawlURI and disposition string
52       * 
53       * @param curi CrawlURI, already processed as reported to StatisticsTracker
54       * @param disposition descriptive disposition string
55       * 
56       */
57      public SeedRecord(CrawlURI curi, String disposition) {
58          super();
59          this.uri = curi.toString();
60          this.statusCode = curi.getFetchStatus();
61          this.disposition = disposition;
62          if (statusCode==301 || statusCode == 302) {
63              for (CandidateURI cauri: curi.getOutCandidates()) {
64                  if("location:".equalsIgnoreCase(cauri.getViaContext().
65                  		toString())) {
66                      redirectUri = cauri.toString();
67                  }
68              }
69          }
70      }
71      
72      /***
73       * Constructor for when a CrawlURI is unavailable; such
74       * as when considering seeds not yet passed through as
75       * CrawlURIs. 
76       * 
77       * @param uri
78       * @param disposition
79       */
80      public SeedRecord(String uri, String disposition) {
81      	this(uri, disposition, -1, null);
82      }
83  
84      /***
85       * Create a record from the given URI, disposition, HTTP status code,
86       * and redirect URI.
87       * @param uri
88       * @param disposition
89       * @param statusCode
90       * @param redirectUri
91       */
92      public SeedRecord(String uri, String disposition, int statusCode,
93      		String redirectUri) {
94          super();
95          this.uri = uri;
96          this.statusCode = statusCode;
97          this.disposition = disposition;
98          this.redirectUri = redirectUri;        
99      }
100 
101     
102     /***
103      * A later/repeat report of the same seed has arrived; update with
104      * latest. Should be rare/never?
105      * 
106      * @param curi
107      */
108     public void updateWith(CrawlURI curi,String disposition) {
109         if(!this.uri.equals(curi.toString())) {
110             logger.warning("SeedRecord URI changed: "+uri+"->"+curi.toString());
111         }
112         this.statusCode = curi.getFetchStatus();
113         this.disposition = disposition;
114         if (statusCode==301 || statusCode == 302) {
115             for (CandidateURI cauri: curi.getOutCandidates()) {
116                 if("location:".equalsIgnoreCase(cauri.getViaContext().
117                         toString())) {
118                     redirectUri = cauri.toString();
119                 }
120             }
121         } else {
122             redirectUri = null; 
123         }
124     }
125     
126     /***
127      * @return Returns the disposition.
128      */
129     public String getDisposition() {
130         return disposition;
131     }
132     /***
133      * @return Returns the redirectUri.
134      */
135     public String getRedirectUri() {
136         return redirectUri;
137     }
138     /***
139      * @return Returns the statusCode.
140      */
141     public int getStatusCode() {
142         return statusCode;
143     }
144     /***
145      * @return Returns the uri.
146      */
147     public String getUri() {
148         return uri;
149     }
150 }