| 
									
										
										
										
											2015-05-14 12:29:41 +02:00
										 |  |  | // Copyright 2013 The Go Authors. All rights reserved. | 
					
						
							|  |  |  | // Use of this source code is governed by a BSD-style | 
					
						
							|  |  |  | // license that can be found in the LICENSE file. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // Package encoding defines an interface for character encodings, such as Shift | 
					
						
							|  |  |  | // JIS and Windows 1252, that can convert to and from UTF-8. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // Encoding implementations are provided in other packages, such as | 
					
						
							|  |  |  | // golang.org/x/text/encoding/charmap and | 
					
						
							|  |  |  | // golang.org/x/text/encoding/japanese. | 
					
						
							| 
									
										
										
										
											2016-10-28 20:05:01 +03:00
										 |  |  | package encoding // import "golang.org/x/text/encoding" | 
					
						
							| 
									
										
										
										
											2015-05-14 12:29:41 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | import ( | 
					
						
							|  |  |  | 	"errors" | 
					
						
							| 
									
										
										
										
											2016-02-11 16:16:52 +02:00
										 |  |  | 	"io" | 
					
						
							|  |  |  | 	"strconv" | 
					
						
							| 
									
										
										
										
											2015-05-14 12:29:41 +02:00
										 |  |  | 	"unicode/utf8" | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-02-11 16:16:52 +02:00
										 |  |  | 	"golang.org/x/text/encoding/internal/identifier" | 
					
						
							| 
									
										
										
										
											2015-05-14 12:29:41 +02:00
										 |  |  | 	"golang.org/x/text/transform" | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-02-11 16:16:52 +02:00
										 |  |  | // TODO: | 
					
						
							|  |  |  | // - There seems to be some inconsistency in when decoders return errors | 
					
						
							|  |  |  | //   and when not. Also documentation seems to suggest they shouldn't return | 
					
						
							|  |  |  | //   errors at all (except for UTF-16). | 
					
						
							|  |  |  | // - Encoders seem to rely on or at least benefit from the input being in NFC | 
					
						
							|  |  |  | //   normal form. Perhaps add an example how users could prepare their output. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-05-14 12:29:41 +02:00
										 // Encoding is a character set encoding that can be transformed to and from
// UTF-8.
//
// An Encoding is a factory for the two directions of conversion: NewDecoder
// yields the converter to UTF-8 and NewEncoder yields the converter from
// UTF-8.
type Encoding interface {
	// NewDecoder returns a Decoder.
	NewDecoder() *Decoder

	// NewEncoder returns an Encoder.
	NewEncoder() *Encoder
}
					
						
							|  |  |  | 
 | 
					
						
							// A Decoder converts bytes to UTF-8. It implements transform.Transformer.
//
// Transforming source bytes that are not of that encoding will not result in an
// error per se. Each byte that cannot be transcoded will be represented in the
// output by the UTF-8 encoding of '\uFFFD', the replacement rune.
//
// The Bytes, String and Reader methods are conveniences built on top of the
// embedded Transformer.
type Decoder struct {
	transform.Transformer

	// This forces external creators of Decoders to use names in struct
	// initializers, allowing for future extensibility without having to break
	// code.
	_ struct{}
}
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // Bytes converts the given encoded bytes to UTF-8. It returns the converted | 
					
						
							| 
									
										
										
										
											2016-08-30 12:20:34 +02:00
										 |  |  | // bytes or nil, err if any error occurred. | 
					
						
							| 
									
										
										
										
											2016-02-11 16:16:52 +02:00
										 |  |  | func (d *Decoder) Bytes(b []byte) ([]byte, error) { | 
					
						
							|  |  |  | 	b, _, err := transform.Bytes(d, b) | 
					
						
							|  |  |  | 	if err != nil { | 
					
						
							|  |  |  | 		return nil, err | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return b, nil | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // String converts the given encoded string to UTF-8. It returns the converted | 
					
						
							| 
									
										
										
										
											2016-08-30 12:20:34 +02:00
										 |  |  | // string or "", err if any error occurred. | 
					
						
							| 
									
										
										
										
											2016-02-11 16:16:52 +02:00
										 |  |  | func (d *Decoder) String(s string) (string, error) { | 
					
						
							|  |  |  | 	s, _, err := transform.String(d, s) | 
					
						
							|  |  |  | 	if err != nil { | 
					
						
							|  |  |  | 		return "", err | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return s, nil | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // Reader wraps another Reader to decode its bytes. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // The Decoder may not be used for any other operation as long as the returned | 
					
						
							|  |  |  | // Reader is in use. | 
					
						
							|  |  |  | func (d *Decoder) Reader(r io.Reader) io.Reader { | 
					
						
							|  |  |  | 	return transform.NewReader(r, d) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							// An Encoder converts bytes from UTF-8. It implements transform.Transformer.
//
// Each rune that cannot be transcoded will result in an error. In this case,
// the transform will consume all source bytes up to, but not including, the
// offending rune. Source bytes that are not valid UTF-8 will be replaced by
// `\uFFFD`. To return early with an error instead, use transform.Chain to
// preprocess the data with a UTF8Validator.
type Encoder struct {
	transform.Transformer

	// This forces external creators of Encoders to use names in struct
	// initializers, allowing for future extensibility without having to break
	// code.
	_ struct{}
}
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-08-30 12:20:34 +02:00
										 |  |  | // Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if | 
					
						
							| 
									
										
										
										
											2016-02-11 16:16:52 +02:00
										 |  |  | // any error occurred. | 
					
						
							|  |  |  | func (e *Encoder) Bytes(b []byte) ([]byte, error) { | 
					
						
							|  |  |  | 	b, _, err := transform.Bytes(e, b) | 
					
						
							|  |  |  | 	if err != nil { | 
					
						
							|  |  |  | 		return nil, err | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return b, nil | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // String converts a string from UTF-8. It returns the converted string or | 
					
						
							| 
									
										
										
										
											2016-08-30 12:20:34 +02:00
										 |  |  | // "", err if any error occurred. | 
					
						
							| 
									
										
										
										
											2016-02-11 16:16:52 +02:00
										 |  |  | func (e *Encoder) String(s string) (string, error) { | 
					
						
							|  |  |  | 	s, _, err := transform.String(e, s) | 
					
						
							|  |  |  | 	if err != nil { | 
					
						
							|  |  |  | 		return "", err | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return s, nil | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // Writer wraps another Writer to encode its UTF-8 output. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // The Encoder may not be used for any other operation as long as the returned | 
					
						
							|  |  |  | // Writer is in use. | 
					
						
							|  |  |  | func (e *Encoder) Writer(w io.Writer) io.Writer { | 
					
						
							|  |  |  | 	return transform.NewWriter(w, e) | 
					
						
							| 
									
										
										
										
											2015-05-14 12:29:41 +02:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							// ASCIISub is the ASCII substitute character (byte value 0x1A), as
// recommended by http://unicode.org/reports/tr36/#Text_Comparison
const ASCIISub = '\x1a'
					
						
							|  |  |  | 
 | 
					
						
							// Nop is the nop encoding. Its transformed bytes are the same as the source
// bytes; it does not replace invalid UTF-8 sequences.
//
// Both its Decoder and its Encoder are backed by transform.Nop.
var Nop Encoding = nop{}
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | type nop struct{} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-02-11 16:16:52 +02:00
										 |  |  | func (nop) NewDecoder() *Decoder { | 
					
						
							|  |  |  | 	return &Decoder{Transformer: transform.Nop} | 
					
						
							| 
									
										
										
										
											2015-05-14 12:29:41 +02:00
										 |  |  | } | 
					
						
							| 
									
										
										
										
											2016-02-11 16:16:52 +02:00
										 |  |  | func (nop) NewEncoder() *Encoder { | 
					
						
							|  |  |  | 	return &Encoder{Transformer: transform.Nop} | 
					
						
							| 
									
										
										
										
											2015-05-14 12:29:41 +02:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							// Replacement is the replacement encoding. Decoding from the replacement
// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
// the replacement encoding yields the same as the source bytes except that
// invalid UTF-8 is converted to '\uFFFD'.
//
// It is defined at http://encoding.spec.whatwg.org/#replacement
var Replacement Encoding = replacement{}
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | type replacement struct{} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-02-11 16:16:52 +02:00
										 |  |  | func (replacement) NewDecoder() *Decoder { | 
					
						
							|  |  |  | 	return &Decoder{Transformer: replacementDecoder{}} | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (replacement) NewEncoder() *Encoder { | 
					
						
							|  |  |  | 	return &Encoder{Transformer: replacementEncoder{}} | 
					
						
							| 
									
										
										
										
											2015-05-14 12:29:41 +02:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-02-11 16:16:52 +02:00
										 |  |  | func (replacement) ID() (mib identifier.MIB, other string) { | 
					
						
							|  |  |  | 	return identifier.Replacement, "" | 
					
						
							| 
									
										
										
										
											2015-05-14 12:29:41 +02:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | type replacementDecoder struct{ transform.NopResetter } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { | 
					
						
							|  |  |  | 	if len(dst) < 3 { | 
					
						
							|  |  |  | 		return 0, 0, transform.ErrShortDst | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	if atEOF { | 
					
						
							|  |  |  | 		const fffd = "\ufffd" | 
					
						
							|  |  |  | 		dst[0] = fffd[0] | 
					
						
							|  |  |  | 		dst[1] = fffd[1] | 
					
						
							|  |  |  | 		dst[2] = fffd[2] | 
					
						
							|  |  |  | 		nDst = 3 | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return nDst, len(src), nil | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | type replacementEncoder struct{ transform.NopResetter } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { | 
					
						
							|  |  |  | 	r, size := rune(0), 0 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	for ; nSrc < len(src); nSrc += size { | 
					
						
							|  |  |  | 		r = rune(src[nSrc]) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		// Decode a 1-byte rune. | 
					
						
							|  |  |  | 		if r < utf8.RuneSelf { | 
					
						
							|  |  |  | 			size = 1 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		} else { | 
					
						
							|  |  |  | 			// Decode a multi-byte rune. | 
					
						
							|  |  |  | 			r, size = utf8.DecodeRune(src[nSrc:]) | 
					
						
							|  |  |  | 			if size == 1 { | 
					
						
							|  |  |  | 				// All valid runes of size 1 (those below utf8.RuneSelf) were | 
					
						
							|  |  |  | 				// handled above. We have invalid UTF-8 or we haven't seen the | 
					
						
							|  |  |  | 				// full character yet. | 
					
						
							|  |  |  | 				if !atEOF && !utf8.FullRune(src[nSrc:]) { | 
					
						
							|  |  |  | 					err = transform.ErrShortSrc | 
					
						
							|  |  |  | 					break | 
					
						
							|  |  |  | 				} | 
					
						
							|  |  |  | 				r = '\ufffd' | 
					
						
							|  |  |  | 			} | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		if nDst+utf8.RuneLen(r) > len(dst) { | 
					
						
							|  |  |  | 			err = transform.ErrShortDst | 
					
						
							|  |  |  | 			break | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		nDst += utf8.EncodeRune(dst[nDst:], r) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return nDst, nSrc, err | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-02-11 16:16:52 +02:00
										 |  |  | // HTMLEscapeUnsupported wraps encoders to replace source runes outside the | 
					
						
							|  |  |  | // repertoire of the destination encoding with HTML escape sequences. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // This wrapper exists to comply to URL and HTML forms requiring a | 
					
						
							|  |  |  | // non-terminating legacy encoder. The produced sequences may lead to data | 
					
						
							|  |  |  | // loss as they are indistinguishable from legitimate input. To avoid this | 
					
						
							|  |  |  | // issue, use UTF-8 encodings whenever possible. | 
					
						
							|  |  |  | func HTMLEscapeUnsupported(e *Encoder) *Encoder { | 
					
						
							|  |  |  | 	return &Encoder{Transformer: &errorHandler{e, errorToHTML}} | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // ReplaceUnsupported wraps encoders to replace source runes outside the | 
					
						
							|  |  |  | // repertoire of the destination encoding with an encoding-specific | 
					
						
							|  |  |  | // replacement. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // This wrapper is only provided for backwards compatibility and legacy | 
					
						
							|  |  |  | // handling. Its use is strongly discouraged. Use UTF-8 whenever possible. | 
					
						
							|  |  |  | func ReplaceUnsupported(e *Encoder) *Encoder { | 
					
						
							|  |  |  | 	return &Encoder{Transformer: &errorHandler{e, errorToReplacement}} | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							// errorHandler is the Transformer installed by HTMLEscapeUnsupported and
// ReplaceUnsupported. It embeds the wrapped Encoder and pairs it with a
// handler that its Transform method calls whenever the Encoder fails with a
// repertoireError.
type errorHandler struct {
	*Encoder
	// handler writes a substitute for r to dst. It returns the number of
	// bytes written and whether dst was large enough; Transform turns a false
	// result into transform.ErrShortDst.
	handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)
}
					
						
							|  |  |  | 
 | 
					
						
							// repertoireError is the error interface that errorHandler.Transform
// recognizes as recoverable. Replacement supplies the single byte that
// errorToReplacement writes in place of the rejected rune.
//
// TODO: consider making this error public in some form.
type repertoireError interface {
	Replacement() byte
}
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { | 
					
						
							|  |  |  | 	nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF) | 
					
						
							|  |  |  | 	for err != nil { | 
					
						
							|  |  |  | 		rerr, ok := err.(repertoireError) | 
					
						
							|  |  |  | 		if !ok { | 
					
						
							|  |  |  | 			return nDst, nSrc, err | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		r, sz := utf8.DecodeRune(src[nSrc:]) | 
					
						
							|  |  |  | 		n, ok := h.handler(dst[nDst:], r, rerr) | 
					
						
							|  |  |  | 		if !ok { | 
					
						
							|  |  |  | 			return nDst, nSrc, transform.ErrShortDst | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		err = nil | 
					
						
							|  |  |  | 		nDst += n | 
					
						
							|  |  |  | 		if nSrc += sz; nSrc < len(src) { | 
					
						
							|  |  |  | 			var dn, sn int | 
					
						
							|  |  |  | 			dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF) | 
					
						
							|  |  |  | 			nDst += dn | 
					
						
							|  |  |  | 			nSrc += sn | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return nDst, nSrc, err | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) { | 
					
						
							|  |  |  | 	buf := [8]byte{} | 
					
						
							|  |  |  | 	b := strconv.AppendUint(buf[:0], uint64(r), 10) | 
					
						
							|  |  |  | 	if n = len(b) + len("&#;"); n >= len(dst) { | 
					
						
							|  |  |  | 		return 0, false | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	dst[0] = '&' | 
					
						
							|  |  |  | 	dst[1] = '#' | 
					
						
							|  |  |  | 	dst[copy(dst[2:], b)+2] = ';' | 
					
						
							|  |  |  | 	return n, true | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) { | 
					
						
							|  |  |  | 	if len(dst) == 0 { | 
					
						
							|  |  |  | 		return 0, false | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	dst[0] = err.Replacement() | 
					
						
							|  |  |  | 	return 1, true | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-05-14 12:29:41 +02:00
										 // ErrInvalidUTF8 means that a transformer encountered invalid UTF-8. It is the
// error returned by UTF8Validator.
var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")
					
						
							|  |  |  | 
 | 
					
						
							// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
// input byte that is not valid UTF-8. Valid input is copied to the output
// unchanged.
var UTF8Validator transform.Transformer = utf8Validator{}
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | type utf8Validator struct{ transform.NopResetter } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { | 
					
						
							|  |  |  | 	n := len(src) | 
					
						
							|  |  |  | 	if n > len(dst) { | 
					
						
							|  |  |  | 		n = len(dst) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	for i := 0; i < n; { | 
					
						
							|  |  |  | 		if c := src[i]; c < utf8.RuneSelf { | 
					
						
							|  |  |  | 			dst[i] = c | 
					
						
							|  |  |  | 			i++ | 
					
						
							|  |  |  | 			continue | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		_, size := utf8.DecodeRune(src[i:]) | 
					
						
							|  |  |  | 		if size == 1 { | 
					
						
							|  |  |  | 			// All valid runes of size 1 (those below utf8.RuneSelf) were | 
					
						
							|  |  |  | 			// handled above. We have invalid UTF-8 or we haven't seen the | 
					
						
							|  |  |  | 			// full character yet. | 
					
						
							|  |  |  | 			err = ErrInvalidUTF8 | 
					
						
							|  |  |  | 			if !atEOF && !utf8.FullRune(src[i:]) { | 
					
						
							|  |  |  | 				err = transform.ErrShortSrc | 
					
						
							|  |  |  | 			} | 
					
						
							|  |  |  | 			return i, i, err | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		if i+size > len(dst) { | 
					
						
							|  |  |  | 			return i, i, transform.ErrShortDst | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		for ; size > 0; size-- { | 
					
						
							|  |  |  | 			dst[i] = src[i] | 
					
						
							|  |  |  | 			i++ | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	if len(src) > len(dst) { | 
					
						
							|  |  |  | 		err = transform.ErrShortDst | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return n, n, err | 
					
						
							|  |  |  | } |