1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package org.archive.crawler.extractor;
23
24 import java.io.IOException;
25 import java.util.logging.Logger;
26 import java.util.regex.Matcher;
27 import java.util.regex.Pattern;
28
29 import org.apache.commons.httpclient.URIException;
30 import org.archive.crawler.datamodel.CoreAttributeConstants;
31 import org.archive.crawler.datamodel.CrawlURI;
32 import org.archive.io.ReplayInputStream;
33 import org.archive.io.SeekReader;
34 import org.archive.io.SeekReaderCharSequence;
35 import org.archive.util.ms.Doc;
36
37 /***
38 * This class allows the caller to extract href style links from word97-format word documents.
39 *
40 * @author Parker Thompson
41 *
42 */
43 public class ExtractorDOC extends Extractor implements CoreAttributeConstants {
44
45 private static final long serialVersionUID = 1896822554981116303L;
46
47 private static Pattern PATTERN = Pattern.compile("HYPERLINK.*?\"(.*?)\"");
48
49 private static Logger logger =
50 Logger.getLogger("org.archive.crawler.extractor.ExtractorDOC");
51 private long numberOfCURIsHandled = 0;
52 private long numberOfLinksExtracted = 0;
53
54 /***
55 * @param name
56 */
57 public ExtractorDOC(String name) {
58 super(name, "MS-Word document Extractor. Extracts links from MS-Word" +
59 " '.doc' documents.");
60 }
61
62 /***
63 * Processes a word document and extracts any hyperlinks from it.
64 * This only extracts href style links, and does not examine the actual
65 * text for valid URIs.
66 * @param curi CrawlURI to process.
67 */
68 protected void extract(CrawlURI curi){
69
70
71
72 if (!isHttpTransactionContentToProcess(curi) ||
73 !isExpectedMimeType(curi.getContentType(),
74 "application/msword")) {
75 return;
76 }
77
78 int links = 0;
79 ReplayInputStream documentStream = null;
80 SeekReader docReader = null;
81
82 numberOfCURIsHandled++;
83
84
85 try
86 {
87 documentStream = curi.getHttpRecorder().getRecordedInput().
88 getContentReplayInputStream();
89
90 if (documentStream==null) {
91
92 return;
93 }
94
95 docReader = Doc.getText(documentStream);
96 }catch(Exception e){
97 curi.addLocalizedError(getName(),e,"ExtractorDOC Exception");
98 return;
99 } finally {
100 try {
101 documentStream.close();
102 } catch (IOException ignored) {
103
104 }
105 }
106
107 CharSequence cs = new SeekReaderCharSequence(docReader, 0);
108 Matcher m = PATTERN.matcher(cs);
109 while (m.find()) {
110 links++;
111 addLink(curi, m.group(1));
112 }
113
114 curi.linkExtractorFinished();
115 logger.fine(curi + " has " + links + " links.");
116 }
117
118
119 private void addLink(CrawlURI curi, String hyperlink) {
120 try {
121 curi.createAndAddLink(hyperlink,Link.NAVLINK_MISC,Link.NAVLINK_HOP);
122 } catch (URIException e1) {
123 getController().logUriError(e1, curi.getUURI(), hyperlink);
124 if (getController() != null) {
125
126
127 getController().logUriError(e1, curi.getUURI(), hyperlink);
128 } else {
129 logger.info(curi + ", " + hyperlink + ": "
130 + e1.getMessage());
131 }
132 }
133 numberOfLinksExtracted++;
134 }
135
136
137
138
139 public String report() {
140 StringBuffer ret = new StringBuffer();
141 ret.append("Processor: org.archive.crawler.extractor.ExtractorDOC\n");
142 ret.append(" Function: Link extraction on MS Word documents (.doc)\n");
143 ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
144 ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
145
146 return ret.toString();
147 }
148 }