...

Source file src/net/http/sniff.go

Documentation: net/http

		 1  // Copyright 2011 The Go Authors. All rights reserved.
		 2  // Use of this source code is governed by a BSD-style
		 3  // license that can be found in the LICENSE file.
		 4  
		 5  package http
		 6  
		 7  import (
		 8  	"bytes"
		 9  	"encoding/binary"
		10  )
		11  
// sniffLen is the maximum number of leading bytes the sniffing
// algorithm uses to make its decision.
const sniffLen = 512
		14  
		15  // DetectContentType implements the algorithm described
		16  // at https://mimesniff.spec.whatwg.org/ to determine the
		17  // Content-Type of the given data. It considers at most the
		18  // first 512 bytes of data. DetectContentType always returns
		19  // a valid MIME type: if it cannot determine a more specific one, it
		20  // returns "application/octet-stream".
		21  func DetectContentType(data []byte) string {
		22  	if len(data) > sniffLen {
		23  		data = data[:sniffLen]
		24  	}
		25  
		26  	// Index of the first non-whitespace byte in data.
		27  	firstNonWS := 0
		28  	for ; firstNonWS < len(data) && isWS(data[firstNonWS]); firstNonWS++ {
		29  	}
		30  
		31  	for _, sig := range sniffSignatures {
		32  		if ct := sig.match(data, firstNonWS); ct != "" {
		33  			return ct
		34  		}
		35  	}
		36  
		37  	return "application/octet-stream" // fallback
		38  }
		39  
		40  // isWS reports whether the provided byte is a whitespace byte (0xWS)
		41  // as defined in https://mimesniff.spec.whatwg.org/#terminology.
		42  func isWS(b byte) bool {
		43  	switch b {
		44  	case '\t', '\n', '\x0c', '\r', ' ':
		45  		return true
		46  	}
		47  	return false
		48  }
		49  
		50  // isTT reports whether the provided byte is a tag-terminating byte (0xTT)
		51  // as defined in https://mimesniff.spec.whatwg.org/#terminology.
		52  func isTT(b byte) bool {
		53  	switch b {
		54  	case ' ', '>':
		55  		return true
		56  	}
		57  	return false
		58  }
		59  
// sniffSig is implemented by every entry of sniffSignatures; each
// implementation recognizes one content signature.
type sniffSig interface {
	// match returns the MIME type of the data, or "" if unknown.
	// firstNonWS is the index of the first non-whitespace byte in data,
	// as computed by DetectContentType.
	match(data []byte, firstNonWS int) string
}
		64  
// sniffSignatures is the ordered list of signatures tried by
// DetectContentType; the first entry whose match method returns a
// non-empty MIME type determines the result, so order is significant.
//
// Data matching the table in section 6.
var sniffSignatures = []sniffSig{
	htmlSig("<!DOCTYPE HTML"),
	htmlSig("<HTML"),
	htmlSig("<HEAD"),
	htmlSig("<SCRIPT"),
	htmlSig("<IFRAME"),
	htmlSig("<H1"),
	htmlSig("<DIV"),
	htmlSig("<FONT"),
	htmlSig("<TABLE"),
	htmlSig("<A"),
	htmlSig("<STYLE"),
	htmlSig("<TITLE"),
	htmlSig("<B"),
	htmlSig("<BODY"),
	htmlSig("<BR"),
	htmlSig("<P"),
	htmlSig("<!--"),
	&maskedSig{
		mask:   []byte("\xFF\xFF\xFF\xFF\xFF"),
		pat:    []byte("<?xml"),
		skipWS: true,
		ct:     "text/xml; charset=utf-8"},
	&exactSig{[]byte("%PDF-"), "application/pdf"},
	&exactSig{[]byte("%!PS-Adobe-"), "application/postscript"},

	// UTF BOMs.
	// The trailing bytes are masked out (\x00) and therefore match any
	// value, but maskedSig.match still requires the data to be at least
	// as long as the pattern.
	&maskedSig{
		mask: []byte("\xFF\xFF\x00\x00"),
		pat:  []byte("\xFE\xFF\x00\x00"),
		ct:   "text/plain; charset=utf-16be",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\x00\x00"),
		pat:  []byte("\xFF\xFE\x00\x00"),
		ct:   "text/plain; charset=utf-16le",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\x00"),
		pat:  []byte("\xEF\xBB\xBF\x00"),
		ct:   "text/plain; charset=utf-8",
	},

	// Image types
	// For posterity, we originally returned "image/vnd.microsoft.icon" from
	// https://tools.ietf.org/html/draft-ietf-websec-mime-sniff-03#section-7
	// https://codereview.appspot.com/4746042
	// but that has since been replaced with "image/x-icon" in Section 6.2
	// of https://mimesniff.spec.whatwg.org/#matching-an-image-type-pattern
	&exactSig{[]byte("\x00\x00\x01\x00"), "image/x-icon"},
	&exactSig{[]byte("\x00\x00\x02\x00"), "image/x-icon"},
	&exactSig{[]byte("BM"), "image/bmp"},
	&exactSig{[]byte("GIF87a"), "image/gif"},
	&exactSig{[]byte("GIF89a"), "image/gif"},
	&maskedSig{
		// Bytes 4-7 are masked out, so any value is accepted there.
		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF"),
		pat:  []byte("RIFF\x00\x00\x00\x00WEBPVP"),
		ct:   "image/webp",
	},
	&exactSig{[]byte("\x89PNG\x0D\x0A\x1A\x0A"), "image/png"},
	&exactSig{[]byte("\xFF\xD8\xFF"), "image/jpeg"},

	// Audio and Video types
	// Enforce the pattern match ordering as prescribed in
	// https://mimesniff.spec.whatwg.org/#matching-an-audio-or-video-type-pattern
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF"),
		pat:  []byte(".snd"),
		ct:   "audio/basic",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"),
		pat:  []byte("FORM\x00\x00\x00\x00AIFF"),
		ct:   "audio/aiff",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF"),
		pat:  []byte("ID3"),
		ct:   "audio/mpeg",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\xFF"),
		pat:  []byte("OggS\x00"),
		ct:   "application/ogg",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"),
		pat:  []byte("MThd\x00\x00\x00\x06"),
		ct:   "audio/midi",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"),
		pat:  []byte("RIFF\x00\x00\x00\x00AVI "),
		ct:   "video/avi",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"),
		pat:  []byte("RIFF\x00\x00\x00\x00WAVE"),
		ct:   "audio/wave",
	},
	// 6.2.0.2. video/mp4
	mp4Sig{},
	// 6.2.0.3. video/webm
	&exactSig{[]byte("\x1A\x45\xDF\xA3"), "video/webm"},

	// Font types
	&maskedSig{
		// 34 NULL bytes followed by the string "LP"
		pat: []byte("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00LP"),
		// 34 NULL bytes followed by \xFF\xFF, so only the final "LP"
		// is actually compared.
		mask: []byte("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF"),
		ct:   "application/vnd.ms-fontobject",
	},
	&exactSig{[]byte("\x00\x01\x00\x00"), "font/ttf"},
	&exactSig{[]byte("OTTO"), "font/otf"},
	&exactSig{[]byte("ttcf"), "font/collection"},
	&exactSig{[]byte("wOFF"), "font/woff"},
	&exactSig{[]byte("wOF2"), "font/woff2"},

	// Archive types
	&exactSig{[]byte("\x1F\x8B\x08"), "application/x-gzip"},
	&exactSig{[]byte("PK\x03\x04"), "application/zip"},
	// RAR's signatures are incorrectly defined by the MIME spec as per
	//	https://github.com/whatwg/mimesniff/issues/63
	// However, RAR Labs correctly defines it at:
	//	https://www.rarlab.com/technote.htm#rarsign
	// so we use the definition from RAR Labs.
	// TODO: do whatever the spec ends up doing.
	&exactSig{[]byte("Rar!\x1A\x07\x00"), "application/x-rar-compressed"},     // RAR v1.5-v4.0
	&exactSig{[]byte("Rar!\x1A\x07\x01\x00"), "application/x-rar-compressed"}, // RAR v5+

	&exactSig{[]byte("\x00\x61\x73\x6D"), "application/wasm"},

	textSig{}, // should be last
}
	 201  
	 202  type exactSig struct {
	 203  	sig []byte
	 204  	ct	string
	 205  }
	 206  
	 207  func (e *exactSig) match(data []byte, firstNonWS int) string {
	 208  	if bytes.HasPrefix(data, e.sig) {
	 209  		return e.ct
	 210  	}
	 211  	return ""
	 212  }
	 213  
	 214  type maskedSig struct {
	 215  	mask, pat []byte
	 216  	skipWS		bool
	 217  	ct				string
	 218  }
	 219  
	 220  func (m *maskedSig) match(data []byte, firstNonWS int) string {
	 221  	// pattern matching algorithm section 6
	 222  	// https://mimesniff.spec.whatwg.org/#pattern-matching-algorithm
	 223  
	 224  	if m.skipWS {
	 225  		data = data[firstNonWS:]
	 226  	}
	 227  	if len(m.pat) != len(m.mask) {
	 228  		return ""
	 229  	}
	 230  	if len(data) < len(m.pat) {
	 231  		return ""
	 232  	}
	 233  	for i, pb := range m.pat {
	 234  		maskedData := data[i] & m.mask[i]
	 235  		if maskedData != pb {
	 236  			return ""
	 237  		}
	 238  	}
	 239  	return m.ct
	 240  }
	 241  
	 242  type htmlSig []byte
	 243  
	 244  func (h htmlSig) match(data []byte, firstNonWS int) string {
	 245  	data = data[firstNonWS:]
	 246  	if len(data) < len(h)+1 {
	 247  		return ""
	 248  	}
	 249  	for i, b := range h {
	 250  		db := data[i]
	 251  		if 'A' <= b && b <= 'Z' {
	 252  			db &= 0xDF
	 253  		}
	 254  		if b != db {
	 255  			return ""
	 256  		}
	 257  	}
	 258  	// Next byte must be a tag-terminating byte(0xTT).
	 259  	if !isTT(data[len(h)]) {
	 260  		return ""
	 261  	}
	 262  	return "text/html; charset=utf-8"
	 263  }
	 264  
	 265  var mp4ftype = []byte("ftyp")
	 266  var mp4 = []byte("mp4")
	 267  
	 268  type mp4Sig struct{}
	 269  
	 270  func (mp4Sig) match(data []byte, firstNonWS int) string {
	 271  	// https://mimesniff.spec.whatwg.org/#signature-for-mp4
	 272  	// c.f. section 6.2.1
	 273  	if len(data) < 12 {
	 274  		return ""
	 275  	}
	 276  	boxSize := int(binary.BigEndian.Uint32(data[:4]))
	 277  	if len(data) < boxSize || boxSize%4 != 0 {
	 278  		return ""
	 279  	}
	 280  	if !bytes.Equal(data[4:8], mp4ftype) {
	 281  		return ""
	 282  	}
	 283  	for st := 8; st < boxSize; st += 4 {
	 284  		if st == 12 {
	 285  			// Ignores the four bytes that correspond to the version number of the "major brand".
	 286  			continue
	 287  		}
	 288  		if bytes.Equal(data[st:st+3], mp4) {
	 289  			return "video/mp4"
	 290  		}
	 291  	}
	 292  	return ""
	 293  }
	 294  
	 295  type textSig struct{}
	 296  
	 297  func (textSig) match(data []byte, firstNonWS int) string {
	 298  	// c.f. section 5, step 4.
	 299  	for _, b := range data[firstNonWS:] {
	 300  		switch {
	 301  		case b <= 0x08,
	 302  			b == 0x0B,
	 303  			0x0E <= b && b <= 0x1A,
	 304  			0x1C <= b && b <= 0x1F:
	 305  			return ""
	 306  		}
	 307  	}
	 308  	return "text/plain; charset=utf-8"
	 309  }
	 310  

View as plain text