View Javadoc

1   /* MimetypeUtils
2    * 
3    * $Id: MimetypeUtils.java 3119 2005-02-17 20:39:21Z stack-sf $
4    * 
5    * Created on Sep 22, 2004
6    *
7    * Copyright (C) 2004 Internet Archive.
8    * 
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   * 
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   * 
16   * Heritrix is distributed in the hope that it will be useful, 
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   * 
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.util;
26  
27  import java.util.regex.Matcher;
28  import java.util.regex.Pattern;
29  
/**
 * Static helpers for normalizing mimetypes (HTTP content-type values).
 *
 * @author stack
 */
public class MimetypeUtils {
    /**
     * The 'no-type' content-type.
     *
     * Defined in the ARC file spec at
     * http://www.archive.org/web/researcher/ArcFileFormat.php.
     */
    public static final String NO_TYPE_MIMETYPE = "no-type";

    /**
     * Truncation regex.
     *
     * Group 1 captures the leading run of characters up to, but not
     * including, the first whitespace character, ';' or ','.  The class
     * must be written <code>\\s</code> in Java source: a forward slash here
     * would exclude '/' and 's' from the capture and cut "text/html" down
     * to "text".
     */
    final static Pattern TRUNCATION_REGEX = Pattern.compile("^([^\\s;,]+).*");

    /**
     * Truncate passed mimetype.
     *
     * Ensure no spaces.  Strip encoding.  Truncation required by
     * ARC files.
     *
     * <p>Truncate at delimiters [;, ] (any whitespace counts as a
     * delimiter).
     * Truncate multi-part content type header at ';'.
     * Apache httpclient collapses values of multiple instances of the
     * header into one comma-separated value, therefore truncate at ','.
     * Current ia_tools that work with arc files expect 5-column
     * space-separated meta-lines, therefore truncate at ' '.
     *
     * <p>Returns {@link #NO_TYPE_MIMETYPE} when the passed value is
     * <code>null</code>, empty, or begins with a delimiter (no leading
     * token to keep).
     *
     * @param contentType Raw content-type.  May be null.
     *
     * @return Computed content-type made from passed content-type after
     * running it through a set of rules.  Never null.
     */
    public static String truncate(String contentType) {
        if (contentType == null) {
            return NO_TYPE_MIMETYPE;
        }

        Matcher matcher = TRUNCATION_REGEX.matcher(contentType);
        // lookingAt() only requires a match at the start of the input.
        // matches() would also need the trailing '.*' to consume the rest,
        // which fails when the remainder contains a line terminator (e.g.
        // a folded header value), discarding an otherwise usable type.
        if (matcher.lookingAt()) {
            return matcher.group(1);
        }
        return NO_TYPE_MIMETYPE;
    }
}