encodedword.go

Documentation: mime

		 1  // Copyright 2015 The Go Authors. All rights reserved.
		 2  // Use of this source code is governed by a BSD-style
		 3  // license that can be found in the LICENSE file.
		 4  
		 5  package mime
		 6  
		 7  import (
		 8  	"bytes"
		 9  	"encoding/base64"
		10  	"errors"
		11  	"fmt"
		12  	"io"
		13  	"strings"
		14  	"unicode"
		15  	"unicode/utf8"
		16  )
		17  
		18  // A WordEncoder is an RFC 2047 encoded-word encoder.
		19  type WordEncoder byte
		20  
		21  const (
		22  	// BEncoding represents Base64 encoding scheme as defined by RFC 2045.
		23  	BEncoding = WordEncoder('b')
		24  	// QEncoding represents the Q-encoding scheme as defined by RFC 2047.
		25  	QEncoding = WordEncoder('q')
		26  )
		27  
		28  var (
		29  	errInvalidWord = errors.New("mime: invalid RFC 2047 encoded-word")
		30  )
		31  
		32  // Encode returns the encoded-word form of s. If s is ASCII without special
		33  // characters, it is returned unchanged. The provided charset is the IANA
		34  // charset name of s. It is case insensitive.
		35  func (e WordEncoder) Encode(charset, s string) string {
		36  	if !needsEncoding(s) {
		37  		return s
		38  	}
		39  	return e.encodeWord(charset, s)
		40  }
		41  
		42  func needsEncoding(s string) bool {
		43  	for _, b := range s {
		44  		if (b < ' ' || b > '~') && b != '\t' {
		45  			return true
		46  		}
		47  	}
		48  	return false
		49  }
		50  
		51  // encodeWord encodes a string into an encoded-word.
		52  func (e WordEncoder) encodeWord(charset, s string) string {
		53  	var buf strings.Builder
		54  	// Could use a hint like len(s)*3, but that's not enough for cases
		55  	// with word splits and too much for simpler inputs.
		56  	// 48 is close to maxEncodedWordLen/2, but adjusted to allocator size class.
		57  	buf.Grow(48)
		58  
		59  	e.openWord(&buf, charset)
		60  	if e == BEncoding {
		61  		e.bEncode(&buf, charset, s)
		62  	} else {
		63  		e.qEncode(&buf, charset, s)
		64  	}
		65  	closeWord(&buf)
		66  
		67  	return buf.String()
		68  }
		69  
		70  const (
		71  	// The maximum length of an encoded-word is 75 characters.
		72  	// See RFC 2047, section 2.
		73  	maxEncodedWordLen = 75
		74  	// maxContentLen is how much content can be encoded, ignoring the header and
		75  	// 2-byte footer.
		76  	maxContentLen = maxEncodedWordLen - len("=?UTF-8?q?") - len("?=")
		77  )
		78  
		79  var maxBase64Len = base64.StdEncoding.DecodedLen(maxContentLen)
		80  
		81  // bEncode encodes s using base64 encoding and writes it to buf.
		82  func (e WordEncoder) bEncode(buf *strings.Builder, charset, s string) {
		83  	w := base64.NewEncoder(base64.StdEncoding, buf)
		84  	// If the charset is not UTF-8 or if the content is short, do not bother
		85  	// splitting the encoded-word.
		86  	if !isUTF8(charset) || base64.StdEncoding.EncodedLen(len(s)) <= maxContentLen {
		87  		io.WriteString(w, s)
		88  		w.Close()
		89  		return
		90  	}
		91  
		92  	var currentLen, last, runeLen int
		93  	for i := 0; i < len(s); i += runeLen {
		94  		// Multi-byte characters must not be split across encoded-words.
		95  		// See RFC 2047, section 5.3.
		96  		_, runeLen = utf8.DecodeRuneInString(s[i:])
		97  
		98  		if currentLen+runeLen <= maxBase64Len {
		99  			currentLen += runeLen
	 100  		} else {
	 101  			io.WriteString(w, s[last:i])
	 102  			w.Close()
	 103  			e.splitWord(buf, charset)
	 104  			last = i
	 105  			currentLen = runeLen
	 106  		}
	 107  	}
	 108  	io.WriteString(w, s[last:])
	 109  	w.Close()
	 110  }
	 111  
	 112  // qEncode encodes s using Q encoding and writes it to buf. It splits the
	 113  // encoded-words when necessary.
	 114  func (e WordEncoder) qEncode(buf *strings.Builder, charset, s string) {
	 115  	// We only split encoded-words when the charset is UTF-8.
	 116  	if !isUTF8(charset) {
	 117  		writeQString(buf, s)
	 118  		return
	 119  	}
	 120  
	 121  	var currentLen, runeLen int
	 122  	for i := 0; i < len(s); i += runeLen {
	 123  		b := s[i]
	 124  		// Multi-byte characters must not be split across encoded-words.
	 125  		// See RFC 2047, section 5.3.
	 126  		var encLen int
	 127  		if b >= ' ' && b <= '~' && b != '=' && b != '?' && b != '_' {
	 128  			runeLen, encLen = 1, 1
	 129  		} else {
	 130  			_, runeLen = utf8.DecodeRuneInString(s[i:])
	 131  			encLen = 3 * runeLen
	 132  		}
	 133  
	 134  		if currentLen+encLen > maxContentLen {
	 135  			e.splitWord(buf, charset)
	 136  			currentLen = 0
	 137  		}
	 138  		writeQString(buf, s[i:i+runeLen])
	 139  		currentLen += encLen
	 140  	}
	 141  }
	 142  
	 143  // writeQString encodes s using Q encoding and writes it to buf.
	 144  func writeQString(buf *strings.Builder, s string) {
	 145  	for i := 0; i < len(s); i++ {
	 146  		switch b := s[i]; {
	 147  		case b == ' ':
	 148  			buf.WriteByte('_')
	 149  		case b >= '!' && b <= '~' && b != '=' && b != '?' && b != '_':
	 150  			buf.WriteByte(b)
	 151  		default:
	 152  			buf.WriteByte('=')
	 153  			buf.WriteByte(upperhex[b>>4])
	 154  			buf.WriteByte(upperhex[b&0x0f])
	 155  		}
	 156  	}
	 157  }
	 158  
	 159  // openWord writes the beginning of an encoded-word into buf.
	 160  func (e WordEncoder) openWord(buf *strings.Builder, charset string) {
	 161  	buf.WriteString("=?")
	 162  	buf.WriteString(charset)
	 163  	buf.WriteByte('?')
	 164  	buf.WriteByte(byte(e))
	 165  	buf.WriteByte('?')
	 166  }
	 167  
	 168  // closeWord writes the end of an encoded-word into buf.
	 169  func closeWord(buf *strings.Builder) {
	 170  	buf.WriteString("?=")
	 171  }
	 172  
	 173  // splitWord closes the current encoded-word and opens a new one.
	 174  func (e WordEncoder) splitWord(buf *strings.Builder, charset string) {
	 175  	closeWord(buf)
	 176  	buf.WriteByte(' ')
	 177  	e.openWord(buf, charset)
	 178  }
	 179  
	 180  func isUTF8(charset string) bool {
	 181  	return strings.EqualFold(charset, "UTF-8")
	 182  }
	 183  
// upperhex maps a 4-bit value (0-15) to its uppercase hexadecimal digit.
// writeQString indexes it with the high and low nibble of each escaped byte.
const upperhex = "0123456789ABCDEF"
	 185  
// A WordDecoder decodes MIME headers containing RFC 2047 encoded-words.
type WordDecoder struct {
	// CharsetReader, if non-nil, defines a function to generate
	// charset-conversion readers, converting from the provided
	// charset into UTF-8.
	// Charsets are always lower-case. utf-8, iso-8859-1 and us-ascii charsets
	// are handled by default, so CharsetReader is only consulted for other
	// charsets; when it is nil, those charsets cause decoding to fail with
	// an "unhandled charset" error.
	// One of the CharsetReader's result values must be non-nil.
	CharsetReader func(charset string, input io.Reader) (io.Reader, error)
}
	 196  
	 197  // Decode decodes an RFC 2047 encoded-word.
	 198  func (d *WordDecoder) Decode(word string) (string, error) {
	 199  	// See https://tools.ietf.org/html/rfc2047#section-2 for details.
	 200  	// Our decoder is permissive, we accept empty encoded-text.
	 201  	if len(word) < 8 || !strings.HasPrefix(word, "=?") || !strings.HasSuffix(word, "?=") || strings.Count(word, "?") != 4 {
	 202  		return "", errInvalidWord
	 203  	}
	 204  	word = word[2 : len(word)-2]
	 205  
	 206  	// split delimits the first 2 fields
	 207  	split := strings.IndexByte(word, '?')
	 208  
	 209  	// split word "UTF-8?q?ascii" into "UTF-8", 'q', and "ascii"
	 210  	charset := word[:split]
	 211  	if len(charset) == 0 {
	 212  		return "", errInvalidWord
	 213  	}
	 214  	if len(word) < split+3 {
	 215  		return "", errInvalidWord
	 216  	}
	 217  	encoding := word[split+1]
	 218  	// the field after split must only be one byte
	 219  	if word[split+2] != '?' {
	 220  		return "", errInvalidWord
	 221  	}
	 222  	text := word[split+3:]
	 223  
	 224  	content, err := decode(encoding, text)
	 225  	if err != nil {
	 226  		return "", err
	 227  	}
	 228  
	 229  	var buf strings.Builder
	 230  
	 231  	if err := d.convert(&buf, charset, content); err != nil {
	 232  		return "", err
	 233  	}
	 234  
	 235  	return buf.String(), nil
	 236  }
	 237  
	 238  // DecodeHeader decodes all encoded-words of the given string. It returns an
	 239  // error if and only if CharsetReader of d returns an error.
	 240  func (d *WordDecoder) DecodeHeader(header string) (string, error) {
	 241  	// If there is no encoded-word, returns before creating a buffer.
	 242  	i := strings.Index(header, "=?")
	 243  	if i == -1 {
	 244  		return header, nil
	 245  	}
	 246  
	 247  	var buf strings.Builder
	 248  
	 249  	buf.WriteString(header[:i])
	 250  	header = header[i:]
	 251  
	 252  	betweenWords := false
	 253  	for {
	 254  		start := strings.Index(header, "=?")
	 255  		if start == -1 {
	 256  			break
	 257  		}
	 258  		cur := start + len("=?")
	 259  
	 260  		i := strings.Index(header[cur:], "?")
	 261  		if i == -1 {
	 262  			break
	 263  		}
	 264  		charset := header[cur : cur+i]
	 265  		cur += i + len("?")
	 266  
	 267  		if len(header) < cur+len("Q??=") {
	 268  			break
	 269  		}
	 270  		encoding := header[cur]
	 271  		cur++
	 272  
	 273  		if header[cur] != '?' {
	 274  			break
	 275  		}
	 276  		cur++
	 277  
	 278  		j := strings.Index(header[cur:], "?=")
	 279  		if j == -1 {
	 280  			break
	 281  		}
	 282  		text := header[cur : cur+j]
	 283  		end := cur + j + len("?=")
	 284  
	 285  		content, err := decode(encoding, text)
	 286  		if err != nil {
	 287  			betweenWords = false
	 288  			buf.WriteString(header[:start+2])
	 289  			header = header[start+2:]
	 290  			continue
	 291  		}
	 292  
	 293  		// Write characters before the encoded-word. White-space and newline
	 294  		// characters separating two encoded-words must be deleted.
	 295  		if start > 0 && (!betweenWords || hasNonWhitespace(header[:start])) {
	 296  			buf.WriteString(header[:start])
	 297  		}
	 298  
	 299  		if err := d.convert(&buf, charset, content); err != nil {
	 300  			return "", err
	 301  		}
	 302  
	 303  		header = header[end:]
	 304  		betweenWords = true
	 305  	}
	 306  
	 307  	if len(header) > 0 {
	 308  		buf.WriteString(header)
	 309  	}
	 310  
	 311  	return buf.String(), nil
	 312  }
	 313  
	 314  func decode(encoding byte, text string) ([]byte, error) {
	 315  	switch encoding {
	 316  	case 'B', 'b':
	 317  		return base64.StdEncoding.DecodeString(text)
	 318  	case 'Q', 'q':
	 319  		return qDecode(text)
	 320  	default:
	 321  		return nil, errInvalidWord
	 322  	}
	 323  }
	 324  
	 325  func (d *WordDecoder) convert(buf *strings.Builder, charset string, content []byte) error {
	 326  	switch {
	 327  	case strings.EqualFold("utf-8", charset):
	 328  		buf.Write(content)
	 329  	case strings.EqualFold("iso-8859-1", charset):
	 330  		for _, c := range content {
	 331  			buf.WriteRune(rune(c))
	 332  		}
	 333  	case strings.EqualFold("us-ascii", charset):
	 334  		for _, c := range content {
	 335  			if c >= utf8.RuneSelf {
	 336  				buf.WriteRune(unicode.ReplacementChar)
	 337  			} else {
	 338  				buf.WriteByte(c)
	 339  			}
	 340  		}
	 341  	default:
	 342  		if d.CharsetReader == nil {
	 343  			return fmt.Errorf("mime: unhandled charset %q", charset)
	 344  		}
	 345  		r, err := d.CharsetReader(strings.ToLower(charset), bytes.NewReader(content))
	 346  		if err != nil {
	 347  			return err
	 348  		}
	 349  		if _, err = io.Copy(buf, r); err != nil {
	 350  			return err
	 351  		}
	 352  	}
	 353  	return nil
	 354  }
	 355  
	 356  // hasNonWhitespace reports whether s (assumed to be ASCII) contains at least
	 357  // one byte of non-whitespace.
	 358  func hasNonWhitespace(s string) bool {
	 359  	for _, b := range s {
	 360  		switch b {
	 361  		// Encoded-words can only be separated by linear white spaces which does
	 362  		// not include vertical tabs (\v).
	 363  		case ' ', '\t', '\n', '\r':
	 364  		default:
	 365  			return true
	 366  		}
	 367  	}
	 368  	return false
	 369  }
	 370  
	 371  // qDecode decodes a Q encoded string.
	 372  func qDecode(s string) ([]byte, error) {
	 373  	dec := make([]byte, len(s))
	 374  	n := 0
	 375  	for i := 0; i < len(s); i++ {
	 376  		switch c := s[i]; {
	 377  		case c == '_':
	 378  			dec[n] = ' '
	 379  		case c == '=':
	 380  			if i+2 >= len(s) {
	 381  				return nil, errInvalidWord
	 382  			}
	 383  			b, err := readHexByte(s[i+1], s[i+2])
	 384  			if err != nil {
	 385  				return nil, err
	 386  			}
	 387  			dec[n] = b
	 388  			i += 2
	 389  		case (c <= '~' && c >= ' ') || c == '\n' || c == '\r' || c == '\t':
	 390  			dec[n] = c
	 391  		default:
	 392  			return nil, errInvalidWord
	 393  		}
	 394  		n++
	 395  	}
	 396  
	 397  	return dec[:n], nil
	 398  }
	 399  
	 400  // readHexByte returns the byte from its quoted-printable representation.
	 401  func readHexByte(a, b byte) (byte, error) {
	 402  	var hb, lb byte
	 403  	var err error
	 404  	if hb, err = fromHex(a); err != nil {
	 405  		return 0, err
	 406  	}
	 407  	if lb, err = fromHex(b); err != nil {
	 408  		return 0, err
	 409  	}
	 410  	return hb<<4 | lb, nil
	 411  }
	 412  
	 413  func fromHex(b byte) (byte, error) {
	 414  	switch {
	 415  	case b >= '0' && b <= '9':
	 416  		return b - '0', nil
	 417  	case b >= 'A' && b <= 'F':
	 418  		return b - 'A' + 10, nil
	 419  	// Accept badly encoded bytes.
	 420  	case b >= 'a' && b <= 'f':
	 421  		return b - 'a' + 10, nil
	 422  	}
	 423  	return 0, fmt.Errorf("mime: invalid hex byte %#02x", b)
	 424  }
	 425
View as plain text