1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.util;
26
27 import java.util.regex.Matcher;
28 import java.util.regex.Pattern;
29
/**
 * Class of mimetype utilities.
 *
 * @author stack
 */
public class MimetypeUtils {
    /**
     * The 'no-type' content-type.
     *
     * Defined in the ARC file spec at
     * http://www.archive.org/web/researcher/ArcFileFormat.php.
     */
    public static final String NO_TYPE_MIMETYPE = "no-type";

    /**
     * Truncation regex.
     *
     * <p>Group 1 captures the leading run of characters that are neither
     * whitespace nor one of the delimiters ';' and ','. Everything after
     * that run is consumed and discarded. {@link Pattern#DOTALL} is set so
     * that a trailing remainder containing line terminators does not cause
     * the whole match to fail.
     */
    final static Pattern TRUNCATION_REGEX =
        Pattern.compile("^([^\\s;,]+).*", Pattern.DOTALL);

    /**
     * Truncate passed mimetype.
     *
     * Ensure no spaces. Strip encoding. Truncation required by
     * ARC files.
     *
     * <p>Truncate at delimiters [;, ].
     * Truncate multi-part content type header at ';'.
     * Apache httpclient collapses values of multiple instances of the
     * header into one comma-separated value, therefore truncate at ','.
     * Current ia_tools that work with arc files expect 5-column
     * space-separated meta-lines, therefore truncate at whitespace.
     *
     * @param contentType Raw content-type. May be null.
     *
     * @return The leading token of the passed content-type, or
     * {@link #NO_TYPE_MIMETYPE} if the passed value is null, empty, or
     * begins with whitespace or a delimiter (i.e. has no leading token).
     */
    public static String truncate(String contentType) {
        if (contentType == null) {
            return NO_TYPE_MIMETYPE;
        }
        Matcher matcher = TRUNCATION_REGEX.matcher(contentType);
        // No match means there is no usable leading token at all.
        return matcher.matches() ? matcher.group(1) : NO_TYPE_MIMETYPE;
    }
}