View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * Created on Jul 7, 2003
20   *
21   */
22  package org.archive.crawler.extractor;
23  
24  import java.io.IOException;
25  import java.util.logging.Logger;
26  import java.util.regex.Matcher;
27  import java.util.regex.Pattern;
28  
29  import org.apache.commons.httpclient.URIException;
30  import org.archive.crawler.datamodel.CoreAttributeConstants;
31  import org.archive.crawler.datamodel.CrawlURI;
32  import org.archive.io.ReplayInputStream;
33  import org.archive.io.SeekReader;
34  import org.archive.io.SeekReaderCharSequence;
35  import org.archive.util.ms.Doc;
36  
37  /***
38   *  This class extracts href-style hyperlinks from Word 97-format Word documents.
39   *
40   * @author Parker Thompson
41   *
42   */
43  public class ExtractorDOC extends Extractor implements CoreAttributeConstants {
44  
45      private static final long serialVersionUID = 1896822554981116303L;
46      
47      private static Pattern PATTERN = Pattern.compile("HYPERLINK.*?\"(.*?)\"");
48  
49      private static Logger logger =
50          Logger.getLogger("org.archive.crawler.extractor.ExtractorDOC");
51      private long numberOfCURIsHandled = 0;
52      private long numberOfLinksExtracted = 0;
53  
54      /***
55       * @param name
56       */
57      public ExtractorDOC(String name) {
58          super(name, "MS-Word document Extractor. Extracts links from MS-Word" +
59                  " '.doc' documents.");
60      }
61  
62      /***
63       *  Processes a word document and extracts any hyperlinks from it.
64       *  This only extracts href style links, and does not examine the actual
65       *  text for valid URIs.
66       * @param curi CrawlURI to process.
67       */
68      protected void extract(CrawlURI curi){
69          // Assumes docs will be coming in through http.
70          // TODO make this more general (currently we're only fetching via http
71          // so it doesn't matter)
72          if (!isHttpTransactionContentToProcess(curi) ||
73                  !isExpectedMimeType(curi.getContentType(),
74                      "application/msword")) {
75              return;
76          }
77  
78          int links = 0;
79          ReplayInputStream documentStream = null;
80          SeekReader docReader = null;
81          
82          numberOfCURIsHandled++;
83  
84          // Get the doc as a repositionable reader
85          try
86          {
87              documentStream = curi.getHttpRecorder().getRecordedInput().
88                  getContentReplayInputStream();
89  
90              if (documentStream==null) {
91                  // TODO: note problem
92                  return;
93              }
94              
95              docReader = Doc.getText(documentStream);
96          }catch(Exception e){
97              curi.addLocalizedError(getName(),e,"ExtractorDOC Exception");
98              return;
99          } finally {
100             try {
101                 documentStream.close();
102             } catch (IOException ignored) {
103 
104             }
105         }
106 
107         CharSequence cs = new SeekReaderCharSequence(docReader, 0);
108         Matcher m = PATTERN.matcher(cs);
109         while (m.find()) {
110             links++;
111             addLink(curi, m.group(1));
112         }
113         
114         curi.linkExtractorFinished(); // Set flag to indicate that link extraction is completed.
115         logger.fine(curi + " has " + links + " links.");
116     }
117     
118     
119     private void addLink(CrawlURI curi, String hyperlink) {
120         try {
121             curi.createAndAddLink(hyperlink,Link.NAVLINK_MISC,Link.NAVLINK_HOP);
122         } catch (URIException e1) {
123             getController().logUriError(e1, curi.getUURI(), hyperlink);
124             if (getController() != null) {
125                 // Controller can be null: e.g. when running
126                 // ExtractorTool.
127                 getController().logUriError(e1, curi.getUURI(), hyperlink);
128             } else {
129                 logger.info(curi + ", " + hyperlink + ": "
130                         + e1.getMessage());
131             }
132         }
133         numberOfLinksExtracted++;        
134     }
135 
136     /* (non-Javadoc)
137      * @see org.archive.crawler.framework.Processor#report()
138      */
139     public String report() {
140         StringBuffer ret = new StringBuffer();
141         ret.append("Processor: org.archive.crawler.extractor.ExtractorDOC\n");
142         ret.append("  Function:          Link extraction on MS Word documents (.doc)\n");
143         ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
144         ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");
145 
146         return ret.toString();
147     }
148 }