escape.go

Documentation: html

		 1  // Copyright 2010 The Go Authors. All rights reserved.
		 2  // Use of this source code is governed by a BSD-style
		 3  // license that can be found in the LICENSE file.
		 4  
		 5  // Package html provides functions for escaping and unescaping HTML text.
		 6  package html
		 7  
		 8  import (
		 9  	"strings"
		10  	"unicode/utf8"
		11  )
		12  
		13  // These replacements permit compatibility with old numeric entities that
		14  // assumed Windows-1252 encoding.
		15  // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
		16  var replacementTable = [...]rune{
		17  	'\u20AC', // First entry is what 0x80 should be replaced with.
		18  	'\u0081',
		19  	'\u201A',
		20  	'\u0192',
		21  	'\u201E',
		22  	'\u2026',
		23  	'\u2020',
		24  	'\u2021',
		25  	'\u02C6',
		26  	'\u2030',
		27  	'\u0160',
		28  	'\u2039',
		29  	'\u0152',
		30  	'\u008D',
		31  	'\u017D',
		32  	'\u008F',
		33  	'\u0090',
		34  	'\u2018',
		35  	'\u2019',
		36  	'\u201C',
		37  	'\u201D',
		38  	'\u2022',
		39  	'\u2013',
		40  	'\u2014',
		41  	'\u02DC',
		42  	'\u2122',
		43  	'\u0161',
		44  	'\u203A',
		45  	'\u0153',
		46  	'\u009D',
		47  	'\u017E',
		48  	'\u0178', // Last entry is 0x9F.
		49  	// 0x00->'\uFFFD' is handled programmatically.
		50  	// 0x0D->'\u000D' is a no-op.
		51  }
		52  
		53  // unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
		54  // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
		55  // Precondition: b[src] == '&' && dst <= src.
		56  func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
		57  	const attribute = false
		58  
		59  	// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
		60  
		61  	// i starts at 1 because we already know that s[0] == '&'.
		62  	i, s := 1, b[src:]
		63  
		64  	if len(s) <= 1 {
		65  		b[dst] = b[src]
		66  		return dst + 1, src + 1
		67  	}
		68  
		69  	if s[i] == '#' {
		70  		if len(s) <= 3 { // We need to have at least "&#.".
		71  			b[dst] = b[src]
		72  			return dst + 1, src + 1
		73  		}
		74  		i++
		75  		c := s[i]
		76  		hex := false
		77  		if c == 'x' || c == 'X' {
		78  			hex = true
		79  			i++
		80  		}
		81  
		82  		x := '\x00'
		83  		for i < len(s) {
		84  			c = s[i]
		85  			i++
		86  			if hex {
		87  				if '0' <= c && c <= '9' {
		88  					x = 16*x + rune(c) - '0'
		89  					continue
		90  				} else if 'a' <= c && c <= 'f' {
		91  					x = 16*x + rune(c) - 'a' + 10
		92  					continue
		93  				} else if 'A' <= c && c <= 'F' {
		94  					x = 16*x + rune(c) - 'A' + 10
		95  					continue
		96  				}
		97  			} else if '0' <= c && c <= '9' {
		98  				x = 10*x + rune(c) - '0'
		99  				continue
	 100  			}
	 101  			if c != ';' {
	 102  				i--
	 103  			}
	 104  			break
	 105  		}
	 106  
	 107  		if i <= 3 { // No characters matched.
	 108  			b[dst] = b[src]
	 109  			return dst + 1, src + 1
	 110  		}
	 111  
	 112  		if 0x80 <= x && x <= 0x9F {
	 113  			// Replace characters from Windows-1252 with UTF-8 equivalents.
	 114  			x = replacementTable[x-0x80]
	 115  		} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
	 116  			// Replace invalid characters with the replacement character.
	 117  			x = '\uFFFD'
	 118  		}
	 119  
	 120  		return dst + utf8.EncodeRune(b[dst:], x), src + i
	 121  	}
	 122  
	 123  	// Consume the maximum number of characters possible, with the
	 124  	// consumed characters matching one of the named references.
	 125  
	 126  	for i < len(s) {
	 127  		c := s[i]
	 128  		i++
	 129  		// Lower-cased characters are more common in entities, so we check for them first.
	 130  		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
	 131  			continue
	 132  		}
	 133  		if c != ';' {
	 134  			i--
	 135  		}
	 136  		break
	 137  	}
	 138  
	 139  	entityName := s[1:i]
	 140  	if len(entityName) == 0 {
	 141  		// No-op.
	 142  	} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
	 143  		// No-op.
	 144  	} else if x := entity[string(entityName)]; x != 0 {
	 145  		return dst + utf8.EncodeRune(b[dst:], x), src + i
	 146  	} else if x := entity2[string(entityName)]; x[0] != 0 {
	 147  		dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
	 148  		return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
	 149  	} else if !attribute {
	 150  		maxLen := len(entityName) - 1
	 151  		if maxLen > longestEntityWithoutSemicolon {
	 152  			maxLen = longestEntityWithoutSemicolon
	 153  		}
	 154  		for j := maxLen; j > 1; j-- {
	 155  			if x := entity[string(entityName[:j])]; x != 0 {
	 156  				return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
	 157  			}
	 158  		}
	 159  	}
	 160  
	 161  	dst1, src1 = dst+i, src+i
	 162  	copy(b[dst:dst1], b[src:src1])
	 163  	return dst1, src1
	 164  }
	 165  
	 166  var htmlEscaper = strings.NewReplacer(
	 167  	`&`, "&amp;",
	 168  	`'`, "&#39;", // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
	 169  	`<`, "&lt;",
	 170  	`>`, "&gt;",
	 171  	`"`, "&#34;", // "&#34;" is shorter than "&quot;".
	 172  )
	 173  
	 174  // EscapeString escapes special characters like "<" to become "&lt;". It
	 175  // escapes only five such characters: <, >, &, ' and ".
	 176  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
	 177  // always true.
	 178  func EscapeString(s string) string {
	 179  	return htmlEscaper.Replace(s)
	 180  }
	 181  
	 182  // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
	 183  // larger range of entities than EscapeString escapes. For example, "&aacute;"
	 184  // unescapes to "á", as does "&#225;" and "&#xE1;".
	 185  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
	 186  // always true.
	 187  func UnescapeString(s string) string {
	 188  	populateMapsOnce.Do(populateMaps)
	 189  	i := strings.IndexByte(s, '&')
	 190  
	 191  	if i < 0 {
	 192  		return s
	 193  	}
	 194  
	 195  	b := []byte(s)
	 196  	dst, src := unescapeEntity(b, i, i)
	 197  	for len(s[src:]) > 0 {
	 198  		if s[src] == '&' {
	 199  			i = 0
	 200  		} else {
	 201  			i = strings.IndexByte(s[src:], '&')
	 202  		}
	 203  		if i < 0 {
	 204  			dst += copy(b[dst:], s[src:])
	 205  			break
	 206  		}
	 207  
	 208  		if i > 0 {
	 209  			copy(b[dst:], s[src:src+i])
	 210  		}
	 211  		dst, src = unescapeEntity(b, dst+i, src+i)
	 212  	}
	 213  	return string(b[:dst])
	 214  }
	 215
View as plain text