add: string source code

This commit is contained in:
Inanc Gumus
2019-02-23 15:06:24 +03:00
parent 1dab45cd57
commit b403c239d2
18 changed files with 113 additions and 744 deletions

View File

@ -14,15 +14,7 @@ import (
"strings" "strings"
) )
const (
asciiStart = 0x41 // 65
asciiStop = 0x5a // 90
cols = 1
)
func main() { func main() {
// DETERMINE START - STOP POSITIONS
var start, stop int var start, stop int
if args := os.Args[1:]; len(args) == 2 { if args := os.Args[1:]; len(args) == 2 {
@ -31,32 +23,20 @@ func main() {
} }
if start == 0 || stop == 0 { if start == 0 || stop == 0 {
start, stop = asciiStart, asciiStop start, stop = 'A', 'Z'
} }
// PRINT HEADER fmt.Printf("%-10s %-10s %-10s %-12s\n%s\n",
for i := 0; i < cols; i++ { "literal", "dec", "hex", "encoded",
fmt.Printf("%-10s %-12s %-12s %-14s", strings.Repeat("-", 45))
"literal", "decimal", "codepoint", "bytes")
}
fmt.Print("\n", strings.Repeat("-", 50*cols), "\n")
// PRINT TABLE for n := start; n <= stop; n++ {
for n, l := start, 0; n <= stop; n++ { fmt.Printf("%-10c %-10[1]d %-10[1]x % -12x\n", n, string(n))
// draw the line
fmt.Printf("%-10q %-12d %-12U % -14x", n, n, n, string(n))
// go to next line if columns are consumed
if l++; l%cols == 0 {
fmt.Println()
continue
} }
} }
fmt.Println()
}
/* /*
EXAMPLE BLOCKS EXAMPLE UNICODE BLOCKS
1 byte 1 byte
------------------------------------------------------------ ------------------------------------------------------------
@ -72,7 +52,7 @@ lowerCaseStop = '\u007a' -> 122
2 bytes 2 bytes
------------------------------------------------------------ ------------------------------------------------------------
latin1Start = '\u0080' -> 128 latin1Start = '\u0080' -> 161
latin1Stop = '\u00ff' -> 255 latin1Stop = '\u00ff' -> 255
@ -86,8 +66,4 @@ dingbatStop = '\u27bf' -> 10175
------------------------------------------------------------ ------------------------------------------------------------
emojiStart = '\U0001f600' -> 128512 emojiStart = '\U0001f600' -> 128512
emojiStop = '\U0001f64f' -> 128591 emojiStop = '\U0001f64f' -> 128591
transportStart = '\U0001F680' -> 128640
transportStop = '\U0001f6ff' -> 128767
BIG THANK YOU! -> https://unicode-table.com/
*/ */

View File

@ -0,0 +1,61 @@
// For more tutorials: https://blog.learngoprogramming.com
//
// Copyright © 2018 Inanc Gumus
// Learn Go Programming Course
// License: https://creativecommons.org/licenses/by-nc-sa/4.0/
//
package main
import (
"fmt"
"unicode/utf8"
"unsafe"
)
// main shows the same text through three lenses: as a string, as its raw
// bytes ([]byte) and as its codepoints ([]rune), printing the size of each.
func main() {
	str := "Yūgen ☯ 💀"

	// a string cannot be modified in place, these would not compile:
	// str[0] = 'N'
	// str[1] = 'o'

	// a byte slice made from the string can be modified
	raw := []byte(str)

	// raw[0] = 'N'
	// raw[1] = 'o'

	// ...and turned back into a string afterwards
	str = string(raw)

	// stats prints a byte count and a rune count, indented under a heading
	stats := func(nbytes, nrunes int) {
		fmt.Printf("\t%d bytes\n", nbytes)
		fmt.Printf("\t%d runes\n", nrunes)
	}

	fmt.Printf("%s\n", str)
	stats(len(str), utf8.RuneCountInString(str))

	fmt.Printf("% x\n", raw)
	stats(len(raw), utf8.RuneCount(raw))

	// to see where each rune starts, try:
	// for i, r := range str {
	//     fmt.Printf("str[%2d] = % -12x = %q\n", i, string(r), r)
	// }

	// indexing yields a single byte, slicing yields a string
	fmt.Println()
	fmt.Printf("1st byte : %c\n", str[0])    // ok: a one-byte rune
	fmt.Printf("2nd byte : %c\n", str[1])    // not ok: only part of a rune
	fmt.Printf("2nd rune : %s\n", str[1:3])  // ok: both bytes of the rune
	fmt.Printf("last rune : %s\n", str[11:]) // ok: all bytes of the rune

	// a rune slice is easy to index, but every element costs 4 bytes
	codepoints := []rune(str)

	fmt.Println()
	fmt.Printf("%s\n", str)
	stats(int(unsafe.Sizeof(codepoints[0]))*len(codepoints), len(codepoints))

	fmt.Printf("1st rune : %c\n", codepoints[0])
	fmt.Printf("2nd rune : %c\n", codepoints[1])
	fmt.Printf("first five : %c\n", codepoints[:5])
}

View File

@ -7,11 +7,11 @@
/* /*
#1- Get and check the input #1- Get and check the input
#2- Create a buffer with a sufficient size #2- Create a byte buffer and use it as the output
#3- Write input to the buffer as it is and print it #3- Write input to the buffer as it is and print it
#4- Detect the link #4- Detect the link
#5- Mask the link #5- Mask the link
#6- Detect white spaces and disable the masking #6- Stop masking when a whitespace is detected
#7- Write http:// to the buffer, just before the link #7- Write http:// to the buffer, just before the link
*/ */
@ -24,6 +24,7 @@ import (
const ( const (
link = "http://" link = "http://"
nlink = len(link)
mask = '*' mask = '*'
) )
@ -37,30 +38,23 @@ func main() {
var ( var (
text = args[0] text = args[0]
size = len(text) size = len(text)
// create a sufficient buffer for the output
//
// and adjust its slice pointer to the first element
// of the backing array! -> make(..., 0, ...)
buf = make([]byte, 0, size) buf = make([]byte, 0, size)
in bool in bool
) )
for i := 0; i < size; i++ { for i := 0; i < size; i++ {
nlink := len(link)
// slice the input and look for the link pattern // slice the input and look for the link pattern
// do not slice it when it goes beyond the input text's capacity // do not slice it when it goes beyond the input text's capacity
if len(text[i:]) >= nlink && text[i:i+nlink] == link { if len(text[i:]) >= nlink && text[i:i+nlink] == link {
// jump to the next character after "http://"
i += nlink
// set the flag: we're in a link! -> "http://....." // set the flag: we're in a link! -> "http://....."
in = true in = true
// add the "http://" manually // add the "http://" manually
buf = append(buf, link...) buf = append(buf, link...)
// jump to the next character after "http://"
i += nlink
} }
// get the current byte from the input // get the current byte from the input
@ -78,8 +72,6 @@ func main() {
if in { if in {
c = mask c = mask
} }
// add the current character to the buffer
buf = append(buf, c) buf = append(buf, c)
} }

View File

@ -10,11 +10,8 @@ package main
import ( import (
"fmt" "fmt"
"unicode" "unicode"
"unicode/utf8"
) )
const lineWidth = 40
func main() { func main() {
text := `Galaksinin Batı Sarmal Kolu'nun bir ucunda, haritası bile çıkarılmamış ücra bir köşede, gözlerden uzak, küçük ve sarı bir güneş vardır. text := `Galaksinin Batı Sarmal Kolu'nun bir ucunda, haritası bile çıkarılmamış ücra bir köşede, gözlerden uzak, küçük ve sarı bir güneş vardır.
@ -22,13 +19,15 @@ Bu güneşin yörüngesinde, kabaca yüz kırksekiz milyon kilometre uzağında,
Gezegenin maymun soyundan gelen canlıları öyle ilkeldir ki dijital kol saatinin hâlâ çok etkileyici bir buluş olduğunu düşünürler.` Gezegenin maymun soyundan gelen canlıları öyle ilkeldir ki dijital kol saatinin hâlâ çok etkileyici bir buluş olduğunu düşünürler.`
const maxWidth = 40
var lw int // line width var lw int // line width
for _, r := range text { for _, r := range text {
fmt.Printf("%c", r) fmt.Printf("%c", r)
switch lw++; { switch lw++; {
case lw > lineWidth && r != '\n' && unicode.IsSpace(r): case lw > maxWidth && r != '\n' && unicode.IsSpace(r):
fmt.Println() fmt.Println()
fallthrough fallthrough
case r == '\n': case r == '\n':
@ -37,30 +36,3 @@ Gezegenin maymun soyundan gelen canlıları öyle ilkeldir ki dijital kol saatin
} }
fmt.Println() fmt.Println()
} }
// runeHandler prints text one rune at a time, decoding the UTF-8 by hand
// instead of relying on `for range`.
//
// Bytes below utf8.RuneSelf are complete one-byte runes and are printed
// directly; every other byte is handed to utf8.DecodeRuneInString, which
// yields utf8.RuneError with a size of 1 for invalid encodings.
//
// call it like: runeHandler(text)
func runeHandler(text string) {
	for i := 0; i < len(text); {
		r, size := rune(text[i]), 1

		// >= (not >): a byte equal to utf8.RuneSelf (0x80) is not a
		// one-byte rune either, so it has to be decoded as well.
		if r >= utf8.RuneSelf {
			r, size = utf8.DecodeRuneInString(text[i:])

			// check out the other functions as well, play with them!
			//
			// for example (type these into the command-line):
			//   go doc utf8
			//   go doc utf8 EncodeRune
		}

		i += size
		fmt.Printf("%c", r)
	}
}
// byteHandler prints text one byte at a time, formatting each byte with %c.
// Runes that span several bytes therefore come out as one character per byte.
//
// call it like: byteHandler(text)
func byteHandler(text string) {
	for _, b := range []byte(text) {
		fmt.Printf("%c", b)
	}
}

View File

@ -0,0 +1,5 @@
Galaksinin Batı Sarmal Kolu'nun bir ucunda, haritası bile çıkarılmamış ücra bir köşede, gözlerden uzak, küçük ve sarı bir güneş vardır.
Bu güneşin yörüngesinde, kabaca yüz kırksekiz milyon kilometre uzağında, tamamıyla önemsiz ve mavi-yeşil renkli, küçük bir gezegen döner.
Gezegenin maymun soyundan gelen canlıları öyle ilkeldir ki dijital kol saatinin hâlâ çok etkileyici bir buluş olduğunu düşünürler.

View File

@ -0,0 +1,9 @@
TODO
* Write a program that dumps the bytes of the given argument
* Get the first unicode char ([]rune)
* Get the last unicode char ([]rune)
* In the masker program:
* Use copy instead of append when appending the "http://" manually
*

View File

@ -0,0 +1,18 @@
// For more tutorials: https://blog.learngoprogramming.com
//
// Copyright © 2018 Inanc Gumus
// Learn Go Programming Course
// License: https://creativecommons.org/licenses/by-nc-sa/4.0/
//
// + Masker : Use copy instead of append when appending the "http://" manually
// + Wrapper: Accept the width from the cmdline
// args, maxWidth := os.Args[1:], 40
// if len(args) == 1 {
// maxWidth, _ = strconv.Atoi(args[0])
// }
package main
// main does nothing.
// NOTE(review): this file appears to be a placeholder for the exercise
// ideas listed in its header comment — confirm before adding code here.
func main() {
}

View File

@ -1,45 +0,0 @@
// For more tutorials: https://blog.learngoprogramming.com
//
// Copyright © 2018 Inanc Gumus
// Learn Go Programming Course
// License: https://creativecommons.org/licenses/by-nc-sa/4.0/
//
package main
import "fmt"
// main assigns the same two codepoints ('g' and 'o') through several
// equivalent literal notations, prints them in different bases, then shifts
// both to upper case with plain arithmetic and prints them again.
func main() {
	var g, o rune

	// every line below stores exactly the same two values
	g, o = 'g', 'o'                   // rune literals
	g, o = 103, 111                   // decimal
	g, o = 0x67, 0x6f                 // hexadecimal
	g, o = '\U00000067', '\U0000006f' // 8-digit unicode escape
	g, o = '\u0067', '\u006f'         // 4-digit unicode escape
	g, o = '\x67', '\x6f'             // 2-digit hex escape

	// report prints the current values of g and o in each notation
	report := func() {
		fmt.Println("codepoints")
		fmt.Printf(" dec : %d %d\n", g, o)
		fmt.Printf(" hex : %x %x\n", g, o)
		fmt.Printf(" unicode : %U %U\n", g, o)
		fmt.Printf(" chars : %c %c\n", g, o)
	}
	report()

	// other things to try:
	// g++
	// o -= 6

	// the distance between a lower case letter and its upper case form
	const caseGap = 'a' - 'A'
	g -= caseGap
	o -= caseGap
	report()

	// string representations to experiment with:
	// fmt.Print("string() : ", string(g), string(o), "\n")
	// fmt.Print("hex 1 byte : \x67\x6f \n")
	// fmt.Print("hex 2 bytes : \u0067\u006f \n")
	// fmt.Print("hex 4 bytes : \U00000067\U0000006f \n")
}

View File

@ -1,28 +0,0 @@
// For more tutorials: https://blog.learngoprogramming.com
//
// Copyright © 2018 Inanc Gumus
// Learn Go Programming Course
// License: https://creativecommons.org/licenses/by-nc-sa/4.0/
//
package main
import "fmt"
func main() {
msg := "WONDERFUL!"
bytes := []byte(msg)
fmt.Println("msg :", msg)
fmt.Println("bytes :", bytes)
fmt.Println("string(bytes) :", string(bytes))
fmt.Println("string(87) :", string(87))
fmt.Println()
for i, v := range msg {
fmt.Printf(
"msg[%d] : %d = %[2]q\n",
i, v)
}
}

View File

@ -1,57 +0,0 @@
// For more tutorials: https://blog.learngoprogramming.com
//
// Copyright © 2018 Inanc Gumus
// Learn Go Programming Course
// License: https://creativecommons.org/licenses/by-nc-sa/4.0/
//
package main
import (
"fmt"
)
// main demonstrates three things about strings:
//
//  1. a string value is immutable
//  2. indexing returns a byte whereas slicing returns a string
//  3. a byte slice is the tool for editing the contents of a string
func main() {
	mood := "wonder"

	// 1- a string is read-only, this would not compile:
	// mood[1] = 'a'

	// 2- indexing vs slicing
	//
	//    "wonder"  ->  "wandering"  =  "w" + "a" + "nder" + "ing"
	//
	// mood[0] is a byte, so it cannot be concatenated with strings:
	// wandering := mood[0] + "a" + mood[2:] + "ing"
	//
	// fmt.Printf("mood[0] : %T - %[1]v\n", mood[0]) // byte
	// fmt.Printf("mood[0:1] : %T - %[1]v\n", mood[0:1]) // string
	//
	// mood[:1] is a string, so this works:
	// wandering := mood[:1] + "a" + mood[2:] + "ing"
	fmt.Println(mood)
	// fmt.Println(wandering)

	// 3- the conversion allocates a new byte slice; a string can be appended
	//    to it directly with the ... syntax. These would not compile or are
	//    more verbose:
	// buf = append(buf, 'i', 'n', 'g')
	// buf = append(buf, []byte{'i', 'n', 'g'})
	buf := append([]byte(mood), "ing"...)
	buf[1] = 'a'

	// overwrite the leading bytes, starting from the first element
	const head = "listen"
	copy(buf, head)

	// overwrite the rest, starting from the 7th element
	copy(buf[len(head):], "ed.")

	fmt.Println(string(buf))
}

View File

@ -1,144 +0,0 @@
// For more tutorials: https://blog.learngoprogramming.com
//
// Copyright © 2018 Inanc Gumus
// Learn Go Programming Course
// License: https://creativecommons.org/licenses/by-nc-sa/4.0/
//
package main
import (
"fmt"
"unicode/utf8"
"unsafe"
)
// Please run this code and experiment with it
// Observe the results
// USELESS-NOTE : "Öykü" means "Story" in Turkish!
func main() {
fmt.Println("-----------------------------------")
fmt.Println("ASCII Codepoints")
fmt.Println("-----------------------------------")
var (
a, z byte = 'a', 'z'
A, Z byte = 'A', 'Z'
d0, d9 byte = '0', '9'
)
for _, c := range []byte{a, z, A, Z, d0, d9} {
fmt.Printf("%c - 1 byte - %[1]U - %[1]d\n", c)
}
fmt.Println("\n-----------------------------------")
fmt.Println("Unicode Codepoints")
fmt.Println("-----------------------------------")
var (
Ö = 'Ö'
= '栗'
monkey = '🙉'
)
for _, c := range []rune{rune(A), Ö, , monkey} {
fmt.Printf("%c - %d bytes - %[1]U - %[1]d\n", c, cptb(c))
}
fmt.Println("\n-----------------------------------")
fmt.Println("UTF-8 Encoded")
fmt.Println("-----------------------------------")
// utf8.RuneLen finds the number of bytes necessary for
// encoding a codepoint to utf8
for _, c := range []rune{rune(A), Ö, , monkey} {
fmt.Printf("%c - %d bytes - %[1]U - %[1]d\n", c,
utf8.RuneLen(c))
}
fmt.Println("\n-----------------------------------")
fmt.Println("Example: Unicode Codepoints")
fmt.Println("-----------------------------------")
var (
ö = 'ö'
y = 'y'
k = 'k'
ü = 'ü'
)
var (
oykuRunes = []rune{ö, y, k, ü}
total int
)
for _, c := range oykuRunes {
fmt.Printf("%c - %d bytes - %[1]U - %[1]d\n", c, cptb(c))
// unsafe.Sizeof finds the memory size of simple values
// don't use it in production-level code -> it's unsafe!
total += int(unsafe.Sizeof(c))
}
fmt.Printf("TOTAL: %d bytes.\n", total)
fmt.Println("\n-----------------------------------")
fmt.Println("Example: Indexing")
fmt.Println("-----------------------------------")
fmt.Printf("%c%c%c%c\n",
oykuRunes[0], oykuRunes[1], oykuRunes[2],
oykuRunes[len(oykuRunes)-1])
// string to []rune
oykuRunes = []rune("öykü")
fmt.Printf("%c%c%c%c\n",
oykuRunes[0], oykuRunes[1], oykuRunes[2],
oykuRunes[len(oykuRunes)-1])
fmt.Println("\n-----------------------------------")
fmt.Println("Example: UTF-8 Encoding")
fmt.Println("-----------------------------------")
// this is also ok
// oykuString := string(oykuRunes)
oykuString := "öykü"
fmt.Printf("TOTAL bytes in oykuRunes : %d\n", total)
fmt.Printf("TOTAL bytes in oykuString: %d\n", len(oykuString))
fmt.Printf("TOTAL runes in oykuString: %d\n",
utf8.RuneCountInString(oykuString))
fmt.Printf("Runes of oykuString : %s\n", oykuString)
fmt.Printf("Bytes of oykuString : % x\n", oykuString)
fmt.Println()
for i := 0; i < len(oykuString); i++ {
fmt.Printf("oykuString[%d]: %c\n", i, oykuString[i])
}
// slicing returns a slice with the type of the sliced value
// so, the sliced value is a string, then a string is returned
//
// example:
// oykuString[0:2] is a string
fmt.Println()
fmt.Printf("oykuString[0:2]: %q\n", oykuString[0:2])
fmt.Printf("oykuString[4:6]: %q\n", oykuString[4:6])
}
// -------------------------------------------------------------------
// cptb finds how many bytes are necessary to represent a codepoint
// cptb means codepoint to bytes
//
// It measures the width of the codepoint's numeric value as a plain
// integer, not the length of its UTF-8 encoding (see utf8.RuneLen for
// that). Values up to 0xFF need 1 byte, up to 0xFFFF 2 bytes, up to
// 0xFFFFFF 3 bytes and anything larger 4 bytes.
func cptb(r rune) int {
	switch {
	case r <= 0xFF: // 255
		return 1
	case r <= 0xFFFF: // 65,535
		return 2
	case r <= 0xFFFFFF: // 16,777,215 (was 0xFFFFF, which is only 1,048,575)
		return 3
	}
	return 4
}

View File

@ -1,60 +0,0 @@
// For more tutorials: https://blog.learngoprogramming.com
//
// Copyright © 2018 Inanc Gumus
// Learn Go Programming Course
// License: https://creativecommons.org/licenses/by-nc-sa/4.0/
//
package main
import (
"fmt"
"unicode/utf8"
"unsafe"
)
// main prints one codepoint before and after UTF-8 encoding, then compares
// the size and indexing behaviour of a UTF-8 string with the equivalent
// rune slice.
func main() {
	// try yourself: try other runes!
	// you can find more here: https://unicode-table.com
	// codepoint := '🙉'
	// codepoint := '\u011e'
	codepoint := 'Ğ'

	// the bare codepoint is only a number
	fmt.Printf("before encoding: %d\n", codepoint)
	fmt.Printf(" bits : %016b\n", codepoint)
	fmt.Printf(" bytes: % x\n", codepoint)

	// converting the rune to a string encodes it as utf-8
	text := string(codepoint)
	wire := []byte(text)

	fmt.Println()
	fmt.Printf("after encoding: %q\n", text)
	fmt.Printf(" bits : %8b\n", wire)
	fmt.Printf(" bytes: % x\n", wire)

	// a utf-8 string is efficient to store and transmit,
	// but it is harder to use.
	//
	// a rune slice is inefficient,
	// but it is easy to use.
	fmt.Println()
	fmt.Println("string (utf-8) vs []rune (unicode)")

	sentence := "hava çok güzel 😳"
	fmt.Printf("%q\n", sentence)
	fmt.Printf(" size : %d bytes\n", len(sentence))
	fmt.Printf(" len : %d chars\n", utf8.RuneCountInString(sentence))
	fmt.Printf(" s[5] : %q\n", sentence[5])
	fmt.Printf(" s[5:7] : %q\n", sentence[5:7])

	// every element of a rune slice has the same fixed size
	const runeSize = int(unsafe.Sizeof(rune(0)))
	points := []rune(sentence)

	fmt.Printf("\n%q\n", points)
	fmt.Printf(" size : %d bytes\n", runeSize*len(points))
	fmt.Printf(" len : %d chars\n", len(points))
	fmt.Printf(" runes[5] : %q\n", points[5])
}

View File

@ -1,45 +0,0 @@
// For more tutorials: https://blog.learngoprogramming.com
//
// Copyright © 2018 Inanc Gumus
// Learn Go Programming Course
// License: https://creativecommons.org/licenses/by-nc-sa/4.0/
//
package main
import (
"fmt"
"unsafe"
)
// main passes a series of string values to dump, which prints each value
// together with its string header, so the headers can be compared in the
// output.
func main() {
	// empty := ""
	// dump(empty)

	// a variable, the identical literal, and a different literal
	hello := "hello"
	dump(hello)
	dump("hello")
	dump("hello!")

	// every one-byte substring of hello
	for i := range hello {
		dump(hello[i : i+1])
	}

	// strings rebuilt from a byte slice (twice) and from a rune slice
	// NOTE(review): presumably here to show that each conversion gets its
	// own backing data — confirm by running the program
	dump(string([]byte(hello)))
	dump(string([]byte(hello)))
	dump(string([]rune(hello)))
}
// StringHeader mirrors the in-memory layout of a string value so that dump
// can reinterpret a string through unsafe.Pointer and print its fields.
// In practice, you should use: reflect.StringHeader
type StringHeader struct {
	// points to a backing array's item
	pointer uintptr // where the string's bytes start
	length  int     // how many bytes the string spans from there
}
// dump prints s in quoted form followed by the fields of its string header.
// The header is obtained by viewing the memory of s as a StringHeader.
func dump(s string) {
	hdr := (*StringHeader)(unsafe.Pointer(&s))
	fmt.Printf("%q: %+v\n", s, *hdr)
}

View File

@ -1,52 +0,0 @@
package main
import (
"fmt"
"strings"
)
// ---------------------------------------------------------
// EXERCISE: Assign the Arrays
//
// 1. Create an array named books
// 2. Add book titles to the array
// 3. Create two more copies of the array named: upper and lower
// 4. Change the book titles to uppercase in the upper array only
// 5. Change the book titles to lowercase in the lower array only
// 6. Print all the arrays
//
// NOTE
// Check out the strings package, it has function to convert cases to
// upper and lower cases.
//
// BONUS
// 1. Invent your own arrays with different types other than string,
// and do some manipulations on them.
//
// 👉 THISSSS--------------------------------------------------------
// 2. Find some Turkish book titles and do the same upper, lowercase conversion
// for them.
//
// Here are some books: https://www.goodreads.com/group/bookshelf/417154-bilimkurgu-kul-b?shelf=read
//
// Note: You'd need to use special functions to convert the Turkish letters.
// They're in the strings package as well.
//
// EXPECTED OUTPUT
// ?
// ---------------------------------------------------------
// main builds an array of book titles, derives an upper-case and a
// lower-case copy of it, and prints all three. Assigning an array copies
// its elements, so changing the copies leaves books untouched.
func main() {
	books := [...]string{"Kafka's Revenge", "Stay Golden", "Everythingship"}

	// both start out as independent copies of books
	upper := books
	lower := books

	for i, title := range books {
		upper[i] = strings.ToUpper(title)
		lower[i] = strings.ToLower(title)
	}

	fmt.Printf("books: %q\n", books)
	fmt.Printf("upper: %q\n", upper)
	fmt.Printf("lower: %q\n", lower)
}

View File

@ -1,233 +0,0 @@
# Strings Revisited
## Bytes
* ASCII
* Immutable
## Runes
* Unicode
* vs ASCII
* UTF-8
* Made up of bytes
## Slicing
* String: Read-Only Byte Slice
* Slicing -> String
* Index -> Byte
---
## Read-only byte slice
* A string is a read-only slice
* You can't change its data
* Indexable: Returns you a byte
* Sliceable: Returns you a string
## Slicing
* Strings can be sliced just like a slice
* After slicing Go returns you a new string slice
* WARNING: Indexing expression returns you a byte
* s := "hey"
* s[0] + s[1] + s[2] != "hey"
* s[0:3] == "hey"
## Underlying array
* The underlying array is a byte array
* There's no capacity this time: Only length and pointer.
* Sliced string will refer to that array
* String slicing is cheap — They share the same array
## Unicode
* At the beginning there was only ASCII code standard
* It was using 7-bits to represents 128 characters
* Only English characters
* Each code corresponding to a character
* After the Internet, nothing could stay the same
* There was a need to introduce more languages
* 128 characters aren't enough for the entire world
* So: Unicode is born
* It collects all of the characters in world's languages
* Unicode can represent every character in every imaginable language system.
* Assigns each character to a codepoint or a rune (in Go)
* Unicode assigns each character a unique number, or code point.
* A codepoint is a number which represents a character in general
* U+2700 -> hex
* Unicode defines codepoints for 1m+ characters
* It includes the ASCII codes too
A chinese character: 汉
Its unicode value: U+6C49
convert 6C49 to binary: 01101100 01001001
embed 6C49 as UTF-8: 11100110 10110001 10001001
## Unicode and Runes
* Rune is a 4-bytes type for storing unicode codepoints
* Rune data type and rune codepoints are different things!
* There's UTF-32 standard which assigns 4 bytes to each codepoint
* But, that's inefficient, so, instead Go uses a variable encoding standard called UTF-8. It assigns different number of bytes to codepoints.
* UTF-8 was invented by Rob Pike and Ken Thompson (two of the creators of Go)
* So, a UTF-8 encoded rune takes 1-4 bytes (the rune type itself is always 4 bytes). It uses 1 byte for ASCII (English).
* 2-3 bytes for most of the characters.
* A string can contain runes
* Each rune can span to multiple bytes
* WARNING: Getting one byte of a string may give you corrupt data
* If you're getting one part of a rune inside the string!
* In a string with runes, you can't easily index the characters
* You need to use unicode and utf8 packages
* Or you need to convert the string into a rune slice
* unicode: letters vs nums, to uppercase, ...
* utf8 : working w/bytes and runes
* RuneCountInString(s) == len([]rune(s))
* DecodeRuneInString(s) returns the first rune
## Ranging over strings
* You can range over a string like a slice
* It will jump over the runes inside the string
* The index variable will be the starting position of each rune
* And the value will be the rune itself
## Representing bytes
* Unicode characters can be hard to type in code
* So, we can use \x and \u in a string to represent bytes and runes
* A string literal is always utf-8 but a string value is not
## Convenience
* It's easy to work with runes in code: []rune
* However, it will consume more memory: Each char is 4 bytes
* "inanç"[4] = gibberish
* r := []rune("inanç") -> five elements rune slice
* r[4] = 'ç'
* string(r)
* // inanç: automatically concatenates the runes to form a string
* string(105) // i -> interprets 105 as a rune value; 'i' not 105
* string(351) // ş -> ""
* printf: %q -> 'ç' %c -> ç %d -> 231
## Bytes
* major libs:
* strings, bytes (have corresponding funcs)
* strconv, unicode
* bytes.Buffer
* []byte can be modified whereas string is immutable
* if you do a lot of string manipulations you can use []byte
* []byte <-> string convertible
* but, each conversion copies the data
* compiler optimizes it mostly
* however, do not blindly convert; use bytes pkg
* it's like the string pkg
* s := "inanc"
* b := []byte(s)
* s := string(b)
## Sprintf
* Just like printf but instead of printing it returns a string
## Builders
* bytes.Buffer
* strings.Builder
* Use WriteRune when adding rune
## Terminology:
Summary: Unicode is a large table mapping characters to numbers and the different UTF encodings specify how these numbers are encoded as bits.
* **ASCII** First character set that maps characters to codepoints or character codes. In terms of alphabets, it only supports the basic Latin alphabet: English. 2^7=128 characters.
* The center of the computer industry was in the USA at that time. As a consequence, they didn't need to support accents or other marks such as á, ü, ç, ñ, etc.
* Once upon a time, computer memory and storage was very expensive. And all of the computers in the world (for practical purposes) were in the hands of English-speaking countries.
* Single byte encoding only using the bottom 7 bits. Basic Latin. (Unicode code points 0-127.) No accents etc.
* **Unicode** is a coded character set. A set of characters and a mapping between the characters and integer code points representing them. Unicode is a superset of ASCII.
* You cannot save text to your hard drive as "Unicode". Unicode is an abstract representation of the text. You need to "encode" this abstract representation. That's where an encoding comes into play.
* Unicode first and foremost defines a table of code points for characters. That's a fancy way of saying "65 stands for A, 66 stands for B and 9,731 stands for ☃" (seriously, it does). How these code points are actually encoded into bits is a different topic.
* **UTF-8** is a character encoding - a way of converting from sequences of bytes to sequences of characters and vice versa. It covers the whole of the Unicode character set.
* UTF-8 uses the ASCII set for the first 128 characters. That's handy because it means ASCII text is also valid in UTF-8.
* **Character Set:** A character set is a list of characters with unique numbers (these numbers are sometimes referred to as “code points”). For example, in the Unicode character set, the number for A is 65 (hexadecimal 41, written U+0041).
* **Codepoint:** Characters are referred to by their "Unicode code point".
* Written in hexadecimal (to keep the numbers shorter).
* Preceded by a "U+" (that's just what they do, it has no other meaning than "this is a Unicode code point").
* Unicode itself is a mapping, it defines codepoints and a codepoint is a number, associated with usually a character.
* Code: a system of words, letters, figures, or other symbols substituted for other words, letters, etc.
* **Encoding:** Converting data into a coded form. An encoding on the other hand, is an algorithm that translates a list of numbers to binary so it can be stored on disk. For example UTF-8 would translate the number sequence 1, 2, 3, 4 like this: `00000001 00000010 00000011 00000100`. Our data is now translated into binary and can now be saved to disk.
* To encode means to use something to represent something else. An encoding is the set of rules with which to convert something from one representation to another.
* To represent 1,114,112 different values, two bytes aren't enough. Three bytes are, but three bytes are often awkward to work with, so four bytes would be the comfortable minimum. But, unless you're actually using Chinese or some of the other characters with big numbers that take a lot of bits to encode, you're never going to use a huge chunk of those four bytes.
* If the letter "A" was always encoded to 00000000 00000000 00000000 01000001, "B" always to 00000000 00000000 00000000 01000010 and so on, any document would bloat to four times the necessary size.
* To optimize this, there are several ways to encode Unicode code points into bits. UTF-8 is one of them.
character encoding bits
A UTF-8 01000001
A UTF-16 00000000 01000001
A UTF-32 00000000 00000000 00000000 01000001
U+0000 to U+007F are (correctly) encoded with one byte
U+0080 to U+07FF are encoded with 2 bytes
U+0800 to U+FFFF are encoded with 3 bytes
U+010000 to U+10FFFF are encoded with 4 bytes
* There is NO string or text, without an accompanying encoding standard.
## REFS:
https://unicode-table.com/en/
What's the difference between ASCII and Unicode?
https://stackoverflow.com/a/41198513/115363
https://stackoverflow.com/questions/643694/what-is-the-difference-between-utf-8-and-unicode
https://stackoverflow.com/questions/3951722/whats-the-difference-between-unicode-and-utf-8
https://stackoverflow.com/questions/1543613/how-does-utf-8-variable-width-encoding-work
http://kunststube.net/encoding/
(detailed and simple)
http://www.joelonsoftware.com/articles/Unicode.html
Unicode codepoint to UTF-8 encoding answer: https://stackoverflow.com/a/27939161/115363
http://www.polylab.dk/utf8-vs-unicode.html
Characters, Symbols and the Unicode Miracle - Computerphile
https://www.youtube.com/watch?v=MijmeoH9LT4
The history of UTF-8 as told by Rob Pike
http://doc.cat-v.org/bell_labs/utf-8_history