go-ethereum/vendor/golang.org/x/net/html/charset/charset.go

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package charset provides common text encodings for HTML documents.
//
// The mapping from encoding labels to encodings is defined at
// https://encoding.spec.whatwg.org/.
package charset // import "golang.org/x/net/html/charset"

import (
	"bytes"
	"fmt"
	"io"
	"mime"
	"strings"
	"unicode/utf8"

	"golang.org/x/net/html"
	"golang.org/x/text/encoding"
	"golang.org/x/text/encoding/charmap"
	"golang.org/x/text/encoding/htmlindex"
	"golang.org/x/text/transform"
)

// Lookup resolves label to an encoding and reports that encoding's canonical
// name. When label does not name one of the standard HTML encodings it
// returns nil and the empty string. Matching is case-insensitive and ignores
// leading and trailing whitespace. The returned encoding is wrapped so that
// its encoders emit HTML escape sequences for runes the character set cannot
// represent.
func Lookup(label string) (e encoding.Encoding, name string) {
	base, err := htmlindex.Get(label)
	if err != nil {
		return nil, ""
	}
	// The error from Name is intentionally ignored: base was just obtained
	// from htmlindex.Get, so it is presumably always known to the index.
	canonical, _ := htmlindex.Name(base)
	return &htmlEncoding{base}, canonical
}

// htmlEncoding embeds an encoding.Encoding and overrides NewEncoder so that
// encoding never fails on unsupported runes.
type htmlEncoding struct{ encoding.Encoding }

// NewEncoder returns the embedded encoding's encoder wrapped with
// encoding.HTMLEscapeUnsupported.
func (h *htmlEncoding) NewEncoder() *encoding.Encoder {
	// HTML calls for a legacy encoder that does not stop on code points the
	// target charset lacks; such code points are replaced by HTML escapes.
	enc := h.Encoding.NewEncoder()
	return encoding.HTMLEscapeUnsupported(enc)
}

// DetermineEncoding works out the encoding of an HTML document from at most
// the first 1024 bytes of content together with the declared Content-Type.
//
// Sources are consulted in this order, and the first that yields an encoding
// wins:
//
//  1. a byte order mark at the start of content (certain = true);
//  2. the charset parameter of contentType (certain = true);
//  3. a <meta> declaration found by prescanning content (certain = false);
//  4. content containing non-ASCII bytes that form valid UTF-8
//     (certain = false);
//  5. the windows-1252 default (certain = false).
//
// See https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
func DetermineEncoding(content []byte, contentType string) (e encoding.Encoding, name string, certain bool) {
	const sniffLimit = 1024
	if len(content) > sniffLimit {
		content = content[:sniffLimit]
	}

	// A byte order mark is authoritative.
	for i := range boms {
		if !bytes.HasPrefix(content, boms[i].bom) {
			continue
		}
		e, name = Lookup(boms[i].enc)
		return e, name, true
	}

	// Next comes the transport-level declaration. An unparsable media type
	// or an unknown charset label simply falls through to the next step.
	_, params, err := mime.ParseMediaType(contentType)
	if err == nil {
		if label, ok := params["charset"]; ok {
			e, name = Lookup(label)
			if e != nil {
				return e, name, true
			}
		}
	}

	// Then look for a <meta> declaration inside the document itself.
	if len(content) != 0 {
		if e, name = prescan(content); e != nil {
			return e, name, false
		}
	}

	// Try to detect UTF-8.
	// Inspect at most the last three bytes and drop a possibly truncated
	// multi-byte sequence from the end, so that it cannot make the validity
	// check below fail.
	total := len(content)
	for i := total - 1; i >= 0 && i >= total-3; i-- {
		c := content[i]
		if c < utf8.RuneSelf {
			// The content ends in ASCII; nothing to trim.
			break
		}
		if utf8.RuneStart(c) {
			content = content[:i]
			break
		}
	}

	// Pure ASCII is not treated as evidence of UTF-8.
	nonASCII := false
	for i := 0; i < len(content) && !nonASCII; i++ {
		nonASCII = content[i] >= utf8.RuneSelf
	}
	if nonASCII && utf8.Valid(content) {
		return encoding.Nop, "utf-8", false
	}

	// TODO: change default depending on user's locale?
	return charmap.Windows1252, "windows-1252", false
}

// NewReader wraps r in an io.Reader that yields the content of r converted
// to UTF-8. Up to 1024 bytes are read from r up front and handed to
// DetermineEncoding, together with contentType, to pick the source encoding.
//
// Any read error other than a short read is returned as is. This includes
// io.EOF, which io.ReadFull reports when r yields no bytes at all.
func NewReader(r io.Reader, contentType string) (io.Reader, error) {
	head := make([]byte, 1024)
	n, err := io.ReadFull(r, head)
	if err != nil && err != io.ErrUnexpectedEOF {
		return nil, err
	}

	if err == io.ErrUnexpectedEOF {
		// The whole input fit in the preview buffer; serve it from memory.
		head = head[:n]
		r = bytes.NewReader(head)
	} else {
		// Replay the sniffed bytes before the remainder of the input.
		r = io.MultiReader(bytes.NewReader(head), r)
	}

	enc, _, _ := DetermineEncoding(head, contentType)
	if enc == encoding.Nop {
		// Nothing to convert.
		return r, nil
	}
	return transform.NewReader(r, enc.NewDecoder()), nil
}

// NewReaderLabel returns a reader that decodes input from the charset named
// by label into UTF-8. The label is resolved with Lookup; if Lookup does not
// recognise it, an "unsupported charset" error is returned instead. The
// function has the signature expected by encoding/xml.Decoder's CharsetReader
// field.
func NewReaderLabel(label string, input io.Reader) (io.Reader, error) {
	if enc, _ := Lookup(label); enc != nil {
		return transform.NewReader(input, enc.NewDecoder()), nil
	}
	return nil, fmt.Errorf("unsupported charset: %q", label)
}

// prescan tokenizes content as HTML and returns the encoding declared by the
// first <meta> element that carries a usable charset declaration, together
// with its canonical name. It returns nil and the empty string if the
// tokenizer reaches an error token (including end of input) before such an
// element is found.
//
// Two forms of declaration are recognised:
//
//	<meta charset="...">
//	<meta http-equiv="content-type" content="...; charset=...">
//
// Attribute values are lowercased in place before they are examined.
func prescan(content []byte) (e encoding.Encoding, name string) {
	z := html.NewTokenizer(bytes.NewReader(content))
	for {
		switch z.Next() {
		case html.ErrorToken:
			return nil, ""

		case html.StartTagToken, html.SelfClosingTagToken:
			tagName, hasAttr := z.TagName()
			if !bytes.Equal(tagName, []byte("meta")) {
				continue
			}
			// attrList records attribute names already seen on this tag so
			// that only the first occurrence of each is considered.
			attrList := make(map[string]bool)
			// gotPragma is set once http-equiv="content-type" has been seen.
			gotPragma := false

			// needPragma tracks where the candidate encoding came from:
			// nowhere yet (dontKnow), a content attribute, which only counts
			// alongside the pragma (doNeedPragma), or a charset attribute,
			// which stands on its own (doNotNeedPragma).
			const (
				dontKnow = iota
				doNeedPragma
				doNotNeedPragma
			)
			needPragma := dontKnow

			// Reset the results so nothing leaks over from an earlier,
			// rejected <meta> element.
			name = ""
			e = nil
			for hasAttr {
				var key, val []byte
				key, val, hasAttr = z.TagAttr()
				ks := string(key)
				if attrList[ks] {
					continue
				}
				attrList[ks] = true
				// ASCII-lowercase the value in place.
				for i, c := range val {
					if 'A' <= c && c <= 'Z' {
						val[i] = c + 0x20
					}
				}

				switch ks {
				case "http-equiv":
					if bytes.Equal(val, []byte("content-type")) {
						gotPragma = true
					}

				case "content":
					// Only consulted when no encoding has been found on this
					// tag yet (for example from an earlier charset attribute).
					if e == nil {
						name = fromMetaElement(string(val))
						if name != "" {
							e, name = Lookup(name)
							if e != nil {
								needPragma = doNeedPragma
							}
						}
					}

				case "charset":
					// A charset attribute overrides anything taken from
					// content, even when its label turns out to be unknown
					// (e is then nil and the tag is rejected below).
					e, name = Lookup(string(val))
					needPragma = doNotNeedPragma
				}
			}

			// && binds tighter than ||: skip this tag if it carried no
			// charset information at all, or if the information came from a
			// content attribute without http-equiv="content-type".
			if needPragma == dontKnow || needPragma == doNeedPragma && !gotPragma {
				continue
			}

			// A UTF-16 declaration found by this byte-oriented scan is
			// replaced with UTF-8 (this appears to follow the WHATWG prescan
			// rules).
			if strings.HasPrefix(name, "utf-16") {
				name = "utf-8"
				e = encoding.Nop
			}

			if e != nil {
				return e, name
			}
		}
	}
}

// fromMetaElement extracts the charset label from the value of a <meta>
// element's content attribute, e.g. "text/html; charset=utf-8" yields
// "utf-8". It returns the empty string when no label can be extracted.
//
// The search is for the literal, lowercase word "charset", optionally
// followed by whitespace, then "=", then optional whitespace and the value.
// An occurrence of "charset" that is not followed by "=" is skipped and the
// search resumes after it. A value opening with a single or double quote runs
// to the matching closing quote (a missing closing quote yields ""); an
// unquoted value runs to the next ';' or whitespace, or to the end of s.
func fromMetaElement(s string) string {
	const (
		keyword    = "charset"
		whitespace = " \t\n\f\r"
	)

	for len(s) > 0 {
		at := strings.Index(s, keyword)
		if at < 0 {
			return ""
		}

		rest := strings.TrimLeft(s[at+len(keyword):], whitespace)
		if len(rest) == 0 || rest[0] != '=' {
			// Not an assignment; keep looking further along.
			s = rest
			continue
		}

		value := strings.TrimLeft(rest[1:], whitespace)
		if value == "" {
			return ""
		}

		switch quote := value[0]; quote {
		case '"', '\'':
			value = value[1:]
			closing := strings.IndexByte(value, quote)
			if closing < 0 {
				return ""
			}
			return value[:closing]
		}

		if stop := strings.IndexAny(value, "; \t\n\f\r"); stop >= 0 {
			return value[:stop]
		}
		return value
	}
	return ""
}

// boms lists the byte order marks that DetermineEncoding checks, in order,
// against the start of a document, each paired with the label of the
// encoding it indicates.
var boms = []struct {
	bom []byte
	enc string
}{
	{bom: []byte{0xFE, 0xFF}, enc: "utf-16be"},
	{bom: []byte{0xFF, 0xFE}, enc: "utf-16le"},
	{bom: []byte{0xEF, 0xBB, 0xBF}, enc: "utf-8"},
}
Godeps: bump github.com/huin/goupnp to c57ae84 2015-05-14 12:29:41 +02:00			`// Copyright 2013 The Go Authors. All rights reserved.`
			`// Use of this source code is governed by a BSD-style`
			`// license that can be found in the LICENSE file.`

			`// Package charset provides common text encodings for HTML documents.`
			`//`
			`// The mapping from encoding labels to encodings is defined at`
			`// https://encoding.spec.whatwg.org/.`
Godeps, vendor: convert dependency management to trash (#3198) This commit converts the dependency management from Godeps to the vendor folder, also switching the tool from godep to trash. Since the upstream tool lacks a few features proposed via a few PRs, until those PRs are merged in (if), use github.com/karalabe/trash. You can update dependencies via trash --update. All dependencies have been updated to their latest version. Parts of the build system are reworked to drop old notions of Godeps and invocation of the go vet command so that it doesn't run against the vendor folder, as that will just blow up during vetting. The conversion drops OpenCL (and hence GPU mining support) from ethash and our codebase. The short reasoning is that there's noone to maintain and having opencl libs in our deps messes up builds as go install ./... tries to build them, failing with unsatisfied link errors for the C OpenCL deps. golang.org/x/net/context is not vendored in. We expect it to be fetched by the user (i.e. using go get). To keep ci.go builds reproducible the package is "vendored" in build/_vendor. 2016-10-28 20:05:01 +03:00			`package charset // import "golang.org/x/net/html/charset"`
Godeps: bump github.com/huin/goupnp to c57ae84 2015-05-14 12:29:41 +02:00
			`import (`
			`"bytes"`
			`"fmt"`
			`"io"`
			`"mime"`
			`"strings"`
			`"unicode/utf8"`

			`"golang.org/x/net/html"`
			`"golang.org/x/text/encoding"`
			`"golang.org/x/text/encoding/charmap"`
Godeps: update all dependencies to latest code 2016-02-11 16:16:52 +02:00			`"golang.org/x/text/encoding/htmlindex"`
Godeps: bump github.com/huin/goupnp to c57ae84 2015-05-14 12:29:41 +02:00			`"golang.org/x/text/transform"`
			`)`

			`// Lookup returns the encoding with the specified label, and its canonical`
			`// name. It returns nil and the empty string if label is not one of the`
			`// standard encodings for HTML. Matching is case-insensitive and ignores`
Godeps: update all dependencies to latest code 2016-02-11 16:16:52 +02:00			`// leading and trailing whitespace. Encoders will use HTML escape sequences for`
			`// runes that are not supported by the character set.`
Godeps: bump github.com/huin/goupnp to c57ae84 2015-05-14 12:29:41 +02:00			`func Lookup(label string) (e encoding.Encoding, name string) {`
Godeps: update all dependencies to latest code 2016-02-11 16:16:52 +02:00			`e, err := htmlindex.Get(label)`
			`if err != nil {`
			`return nil, ""`
			`}`
			`name, _ = htmlindex.Name(e)`
			`return &htmlEncoding{e}, name`
			`}`

			`type htmlEncoding struct{ encoding.Encoding }`

			`func (h htmlEncoding) NewEncoder() encoding.Encoder {`
			`// HTML requires a non-terminating legacy encoder. We use HTML escapes to`
			`// substitute unsupported code points.`
			`return encoding.HTMLEscapeUnsupported(h.Encoding.NewEncoder())`
Godeps: bump github.com/huin/goupnp to c57ae84 2015-05-14 12:29:41 +02:00			`}`

			`// DetermineEncoding determines the encoding of an HTML document by examining`
			`// up to the first 1024 bytes of content and the declared Content-Type.`
			`//`
			`// See http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#determining-the-character-encoding`
			`func DetermineEncoding(content []byte, contentType string) (e encoding.Encoding, name string, certain bool) {`
			`if len(content) > 1024 {`
			`content = content[:1024]`
			`}`

			`for _, b := range boms {`
			`if bytes.HasPrefix(content, b.bom) {`
			`e, name = Lookup(b.enc)`
			`return e, name, true`
			`}`
			`}`

			`if _, params, err := mime.ParseMediaType(contentType); err == nil {`
			`if cs, ok := params["charset"]; ok {`
			`if e, name = Lookup(cs); e != nil {`
			`return e, name, true`
			`}`
			`}`
			`}`

			`if len(content) > 0 {`
			`e, name = prescan(content)`
			`if e != nil {`
			`return e, name, false`
			`}`
			`}`

			`// Try to detect UTF-8.`
			`// First eliminate any partial rune at the end.`
			`for i := len(content) - 1; i >= 0 && i > len(content)-4; i-- {`
			`b := content[i]`
			`if b < 0x80 {`
			`break`
			`}`
			`if utf8.RuneStart(b) {`
			`content = content[:i]`
			`break`
			`}`
			`}`
			`hasHighBit := false`
			`for _, c := range content {`
			`if c >= 0x80 {`
			`hasHighBit = true`
			`break`
			`}`
			`}`
			`if hasHighBit && utf8.Valid(content) {`
			`return encoding.Nop, "utf-8", false`
			`}`

			`// TODO: change default depending on user's locale?`
			`return charmap.Windows1252, "windows-1252", false`
			`}`

			`// NewReader returns an io.Reader that converts the content of r to UTF-8.`
			`// It calls DetermineEncoding to find out what r's encoding is.`
			`func NewReader(r io.Reader, contentType string) (io.Reader, error) {`
			`preview := make([]byte, 1024)`
			`n, err := io.ReadFull(r, preview)`
			`switch {`
			`case err == io.ErrUnexpectedEOF:`
			`preview = preview[:n]`
			`r = bytes.NewReader(preview)`
			`case err != nil:`
			`return nil, err`
			`default:`
			`r = io.MultiReader(bytes.NewReader(preview), r)`
			`}`

			`if e, _, _ := DetermineEncoding(preview, contentType); e != encoding.Nop {`
			`r = transform.NewReader(r, e.NewDecoder())`
			`}`
			`return r, nil`
			`}`

			`// NewReaderLabel returns a reader that converts from the specified charset to`
			`// UTF-8. It uses Lookup to find the encoding that corresponds to label, and`
			`// returns an error if Lookup returns nil. It is suitable for use as`
			`// encoding/xml.Decoder's CharsetReader function.`
			`func NewReaderLabel(label string, input io.Reader) (io.Reader, error) {`
			`e, _ := Lookup(label)`
			`if e == nil {`
			`return nil, fmt.Errorf("unsupported charset: %q", label)`
			`}`
			`return transform.NewReader(input, e.NewDecoder()), nil`
			`}`

			`func prescan(content []byte) (e encoding.Encoding, name string) {`
			`z := html.NewTokenizer(bytes.NewReader(content))`
			`for {`
			`switch z.Next() {`
			`case html.ErrorToken:`
			`return nil, ""`

			`case html.StartTagToken, html.SelfClosingTagToken:`
			`tagName, hasAttr := z.TagName()`
			`if !bytes.Equal(tagName, []byte("meta")) {`
			`continue`
			`}`
			`attrList := make(map[string]bool)`
			`gotPragma := false`

			`const (`
			`dontKnow = iota`
			`doNeedPragma`
			`doNotNeedPragma`
			`)`
			`needPragma := dontKnow`

			`name = ""`
			`e = nil`
			`for hasAttr {`
			`var key, val []byte`
			`key, val, hasAttr = z.TagAttr()`
			`ks := string(key)`
			`if attrList[ks] {`
			`continue`
			`}`
			`attrList[ks] = true`
			`for i, c := range val {`
			`if 'A' <= c && c <= 'Z' {`
			`val[i] = c + 0x20`
			`}`
			`}`

			`switch ks {`
			`case "http-equiv":`
			`if bytes.Equal(val, []byte("content-type")) {`
			`gotPragma = true`
			`}`

			`case "content":`
			`if e == nil {`
			`name = fromMetaElement(string(val))`
			`if name != "" {`
			`e, name = Lookup(name)`
			`if e != nil {`
			`needPragma = doNeedPragma`
			`}`
			`}`
			`}`

			`case "charset":`
			`e, name = Lookup(string(val))`
			`needPragma = doNotNeedPragma`
			`}`
			`}`

			`if needPragma == dontKnow \|\| needPragma == doNeedPragma && !gotPragma {`
			`continue`
			`}`

			`if strings.HasPrefix(name, "utf-16") {`
			`name = "utf-8"`
			`e = encoding.Nop`
			`}`

			`if e != nil {`
			`return e, name`
			`}`
			`}`
			`}`
			`}`

			`func fromMetaElement(s string) string {`
			`for s != "" {`
			`csLoc := strings.Index(s, "charset")`
			`if csLoc == -1 {`
			`return ""`
			`}`
			`s = s[csLoc+len("charset"):]`
			`s = strings.TrimLeft(s, " \t\n\f\r")`
			`if !strings.HasPrefix(s, "=") {`
			`continue`
			`}`
			`s = s[1:]`
			`s = strings.TrimLeft(s, " \t\n\f\r")`
			`if s == "" {`
			`return ""`
			`}`
			`if q := s[0]; q == '"' \|\| q == '\'' {`
			`s = s[1:]`
			`closeQuote := strings.IndexRune(s, rune(q))`
			`if closeQuote == -1 {`
			`return ""`
			`}`
			`return s[:closeQuote]`
			`}`

			`end := strings.IndexAny(s, "; \t\n\f\r")`
			`if end == -1 {`
			`end = len(s)`
			`}`
			`return s[:end]`
			`}`
			`return ""`
			`}`

			`var boms = []struct {`
			`bom []byte`
			`enc string`
			`}{`
			`{[]byte{0xfe, 0xff}, "utf-16be"},`
			`{[]byte{0xff, 0xfe}, "utf-16le"},`
			`{[]byte{0xef, 0xbb, 0xbf}, "utf-8"},`
			`}`