add: string source code
This commit is contained in:
@ -14,15 +14,7 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
|
||||||
asciiStart = 0x41 // 65
|
|
||||||
asciiStop = 0x5a // 90
|
|
||||||
|
|
||||||
cols = 1
|
|
||||||
)
|
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
// DETERMINE START - STOP POSITIONS
|
|
||||||
var start, stop int
|
var start, stop int
|
||||||
|
|
||||||
if args := os.Args[1:]; len(args) == 2 {
|
if args := os.Args[1:]; len(args) == 2 {
|
||||||
@ -31,32 +23,20 @@ func main() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if start == 0 || stop == 0 {
|
if start == 0 || stop == 0 {
|
||||||
start, stop = asciiStart, asciiStop
|
start, stop = 'A', 'Z'
|
||||||
}
|
}
|
||||||
|
|
||||||
// PRINT HEADER
|
fmt.Printf("%-10s %-10s %-10s %-12s\n%s\n",
|
||||||
for i := 0; i < cols; i++ {
|
"literal", "dec", "hex", "encoded",
|
||||||
fmt.Printf("%-10s %-12s %-12s %-14s",
|
strings.Repeat("-", 45))
|
||||||
"literal", "decimal", "codepoint", "bytes")
|
|
||||||
}
|
|
||||||
fmt.Print("\n", strings.Repeat("-", 50*cols), "\n")
|
|
||||||
|
|
||||||
// PRINT TABLE
|
for n := start; n <= stop; n++ {
|
||||||
for n, l := start, 0; n <= stop; n++ {
|
fmt.Printf("%-10c %-10[1]d %-10[1]x % -12x\n", n, string(n))
|
||||||
// draw the line
|
|
||||||
fmt.Printf("%-10q %-12d %-12U % -14x", n, n, n, string(n))
|
|
||||||
|
|
||||||
// go to next line if columns are consumed
|
|
||||||
if l++; l%cols == 0 {
|
|
||||||
fmt.Println()
|
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
fmt.Println()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
EXAMPLE BLOCKS
|
EXAMPLE UNICODE BLOCKS
|
||||||
|
|
||||||
1 byte
|
1 byte
|
||||||
------------------------------------------------------------
|
------------------------------------------------------------
|
||||||
@ -72,7 +52,7 @@ lowerCaseStop = '\u007a' -> 122
|
|||||||
|
|
||||||
2 bytes
|
2 bytes
|
||||||
------------------------------------------------------------
|
------------------------------------------------------------
|
||||||
latin1Start = '\u0080' -> 128
|
latin1Start = '\u00a1' -> 161
|
||||||
latin1Stop = '\u00ff' -> 255
|
latin1Stop = '\u00ff' -> 255
|
||||||
|
|
||||||
|
|
||||||
@ -86,8 +66,4 @@ dingbatStop = '\u27bf' -> 10175
|
|||||||
------------------------------------------------------------
|
------------------------------------------------------------
|
||||||
emojiStart = '\U0001f600' -> 128512
|
emojiStart = '\U0001f600' -> 128512
|
||||||
emojiStop = '\U0001f64f' -> 128591
|
emojiStop = '\U0001f64f' -> 128591
|
||||||
transportStart = '\U0001F680' -> 128640
|
|
||||||
transportStop = '\U0001f6ff' -> 128767
|
|
||||||
|
|
||||||
BIG THANK YOU! -> https://unicode-table.com/
|
|
||||||
*/
|
*/
|
61
18-strings/01-bytes-runes-strings/02-example/main.go
Normal file
61
18-strings/01-bytes-runes-strings/02-example/main.go
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
// For more tutorials: https://blog.learngoprogramming.com
|
||||||
|
//
|
||||||
|
// Copyright © 2018 Inanc Gumus
|
||||||
|
// Learn Go Programming Course
|
||||||
|
// License: https://creativecommons.org/licenses/by-nc-sa/4.0/
|
||||||
|
//
|
||||||
|
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"unicode/utf8"
|
||||||
|
"unsafe"
|
||||||
|
)
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
str := "Yūgen ☯ 💀"
|
||||||
|
|
||||||
|
// can't change a string
|
||||||
|
// a string is a read-only byte-slice
|
||||||
|
// str[0] = 'N'
|
||||||
|
// str[1] = 'o'
|
||||||
|
|
||||||
|
bytes := []byte(str)
|
||||||
|
|
||||||
|
// can change a byte slice
|
||||||
|
// bytes[0] = 'N'
|
||||||
|
// bytes[1] = 'o'
|
||||||
|
|
||||||
|
str = string(bytes)
|
||||||
|
|
||||||
|
fmt.Printf("%s\n", str)
|
||||||
|
fmt.Printf("\t%d bytes\n", len(str))
|
||||||
|
fmt.Printf("\t%d runes\n", utf8.RuneCountInString(str))
|
||||||
|
fmt.Printf("% x\n", bytes)
|
||||||
|
fmt.Printf("\t%d bytes\n", len(bytes))
|
||||||
|
fmt.Printf("\t%d runes\n", utf8.RuneCount(bytes))
|
||||||
|
|
||||||
|
// fmt.Println()
|
||||||
|
// for i, r := range str {
|
||||||
|
// fmt.Printf("str[%2d] = % -12x = %q\n", i, string(r), r)
|
||||||
|
// }
|
||||||
|
|
||||||
|
fmt.Println()
|
||||||
|
fmt.Printf("1st byte : %c\n", str[0]) // ok
|
||||||
|
fmt.Printf("2nd byte : %c\n", str[1]) // not ok
|
||||||
|
fmt.Printf("2nd rune : %s\n", str[1:3]) // ok
|
||||||
|
fmt.Printf("last rune : %s\n", str[11:]) // ok
|
||||||
|
|
||||||
|
// disadvantage: each one is 4 bytes
|
||||||
|
runes := []rune(str)
|
||||||
|
|
||||||
|
fmt.Println()
|
||||||
|
fmt.Printf("%s\n", str)
|
||||||
|
fmt.Printf("\t%d bytes\n", int(unsafe.Sizeof(runes[0]))*len(runes))
|
||||||
|
fmt.Printf("\t%d runes\n", len(runes))
|
||||||
|
|
||||||
|
fmt.Printf("1st rune : %c\n", runes[0])
|
||||||
|
fmt.Printf("2nd rune : %c\n", runes[1])
|
||||||
|
fmt.Printf("first five : %c\n", runes[:5])
|
||||||
|
}
|
@ -7,11 +7,11 @@
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
#1- Get and check the input
|
#1- Get and check the input
|
||||||
#2- Create a buffer with a sufficient size
|
#2- Create a byte buffer and use it as the output
|
||||||
#3- Write input to the buffer as it is and print it
|
#3- Write input to the buffer as it is and print it
|
||||||
#4- Detect the link
|
#4- Detect the link
|
||||||
#5- Mask the link
|
#5- Mask the link
|
||||||
#6- Detect white spaces and disable the masking
|
#6- Stop masking when a whitespace is detected
|
||||||
#7- Write http:// to the buffer, just before the link
|
#7- Write http:// to the buffer, just before the link
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@ -24,6 +24,7 @@ import (
|
|||||||
|
|
||||||
const (
|
const (
|
||||||
link = "http://"
|
link = "http://"
|
||||||
|
nlink = len(link)
|
||||||
mask = '*'
|
mask = '*'
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -37,30 +38,23 @@ func main() {
|
|||||||
var (
|
var (
|
||||||
text = args[0]
|
text = args[0]
|
||||||
size = len(text)
|
size = len(text)
|
||||||
|
|
||||||
// create a sufficient buffer for the output
|
|
||||||
//
|
|
||||||
// and adjust its slice pointer to the first element
|
|
||||||
// of the backing array! -> make(..., 0, ...)
|
|
||||||
buf = make([]byte, 0, size)
|
buf = make([]byte, 0, size)
|
||||||
|
|
||||||
in bool
|
in bool
|
||||||
)
|
)
|
||||||
|
|
||||||
for i := 0; i < size; i++ {
|
for i := 0; i < size; i++ {
|
||||||
nlink := len(link)
|
|
||||||
|
|
||||||
// slice the input and look for the link pattern
|
// slice the input and look for the link pattern
|
||||||
// do not slice it when it goes beyond the input text's capacity
|
// do not slice it when it goes beyond the input text's capacity
|
||||||
if len(text[i:]) >= nlink && text[i:i+nlink] == link {
|
if len(text[i:]) >= nlink && text[i:i+nlink] == link {
|
||||||
// jump to the next character after "http://"
|
|
||||||
i += nlink
|
|
||||||
|
|
||||||
// set the flag: we're in a link! -> "http://....."
|
// set the flag: we're in a link! -> "http://....."
|
||||||
in = true
|
in = true
|
||||||
|
|
||||||
// add the "http://" manually
|
// add the "http://" manually
|
||||||
buf = append(buf, link...)
|
buf = append(buf, link...)
|
||||||
|
|
||||||
|
// jump to the next character after "http://"
|
||||||
|
i += nlink
|
||||||
}
|
}
|
||||||
|
|
||||||
// get the current byte from the input
|
// get the current byte from the input
|
||||||
@ -78,8 +72,6 @@ func main() {
|
|||||||
if in {
|
if in {
|
||||||
c = mask
|
c = mask
|
||||||
}
|
}
|
||||||
|
|
||||||
// add the current character to the buffer
|
|
||||||
buf = append(buf, c)
|
buf = append(buf, c)
|
||||||
}
|
}
|
||||||
|
|
@ -10,11 +10,8 @@ package main
|
|||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"unicode"
|
"unicode"
|
||||||
"unicode/utf8"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
const lineWidth = 40
|
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
text := `Galaksinin Batı Sarmal Kolu'nun bir ucunda, haritası bile çıkarılmamış ücra bir köşede, gözlerden uzak, küçük ve sarı bir güneş vardır.
|
text := `Galaksinin Batı Sarmal Kolu'nun bir ucunda, haritası bile çıkarılmamış ücra bir köşede, gözlerden uzak, küçük ve sarı bir güneş vardır.
|
||||||
|
|
||||||
@ -22,13 +19,15 @@ Bu güneşin yörüngesinde, kabaca yüz kırksekiz milyon kilometre uzağında,
|
|||||||
|
|
||||||
Gezegenin maymun soyundan gelen canlıları öyle ilkeldir ki dijital kol saatinin hâlâ çok etkileyici bir buluş olduğunu düşünürler.`
|
Gezegenin maymun soyundan gelen canlıları öyle ilkeldir ki dijital kol saatinin hâlâ çok etkileyici bir buluş olduğunu düşünürler.`
|
||||||
|
|
||||||
|
const maxWidth = 40
|
||||||
|
|
||||||
var lw int // line width
|
var lw int // line width
|
||||||
|
|
||||||
for _, r := range text {
|
for _, r := range text {
|
||||||
fmt.Printf("%c", r)
|
fmt.Printf("%c", r)
|
||||||
|
|
||||||
switch lw++; {
|
switch lw++; {
|
||||||
case lw > lineWidth && r != '\n' && unicode.IsSpace(r):
|
case lw > maxWidth && r != '\n' && unicode.IsSpace(r):
|
||||||
fmt.Println()
|
fmt.Println()
|
||||||
fallthrough
|
fallthrough
|
||||||
case r == '\n':
|
case r == '\n':
|
||||||
@ -37,30 +36,3 @@ Gezegenin maymun soyundan gelen canlıları öyle ilkeldir ki dijital kol saatin
|
|||||||
}
|
}
|
||||||
fmt.Println()
|
fmt.Println()
|
||||||
}
|
}
|
||||||
|
|
||||||
// call it like: runeHandler(text)
|
|
||||||
func runeHandler(text string) {
|
|
||||||
for i := 0; i < len(text); {
|
|
||||||
r := rune(text[i])
|
|
||||||
|
|
||||||
size := 1
|
|
||||||
if r > utf8.RuneSelf {
|
|
||||||
r, size = utf8.DecodeRuneInString(text[i:])
|
|
||||||
// check out the other functions as well, play with them!
|
|
||||||
//
|
|
||||||
// for example (type these into the command-line):
|
|
||||||
// go doc utf8
|
|
||||||
// go doc utf8 EncodeRune
|
|
||||||
}
|
|
||||||
i += size
|
|
||||||
|
|
||||||
fmt.Printf("%c", r)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// call it like: byteHandler(text)
|
|
||||||
func byteHandler(text string) {
|
|
||||||
for i := 0; i < len(text); i++ {
|
|
||||||
fmt.Printf("%c", text[i])
|
|
||||||
}
|
|
||||||
}
|
|
5
18-strings/08-wrapper-example/story.txt
Normal file
5
18-strings/08-wrapper-example/story.txt
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
Galaksinin Batı Sarmal Kolu'nun bir ucunda, haritası bile çıkarılmamış ücra bir köşede, gözlerden uzak, küçük ve sarı bir güneş vardır.
|
||||||
|
|
||||||
|
Bu güneşin yörüngesinde, kabaca yüz kırksekiz milyon kilometre uzağında, tamamıyla önemsiz ve mavi-yeşil renkli, küçük bir gezegen döner.
|
||||||
|
|
||||||
|
Gezegenin maymun soyundan gelen canlıları öyle ilkeldir ki dijital kol saatinin hâlâ çok etkileyici bir buluş olduğunu düşünürler.
|
9
18-strings/exercises/README.md
Normal file
9
18-strings/exercises/README.md
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
TODO
|
||||||
|
|
||||||
|
* Write a program that dumps the bytes of the given argument
|
||||||
|
* Get the first unicode char ([]rune)
|
||||||
|
* Get the last unicode char ([]rune)
|
||||||
|
|
||||||
|
* In the masker program:
|
||||||
|
* Use copy instead of append when appending the "http://" manually
|
||||||
|
*
|
18
18-strings/exercises/main.go
Normal file
18
18-strings/exercises/main.go
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
// For more tutorials: https://blog.learngoprogramming.com
|
||||||
|
//
|
||||||
|
// Copyright © 2018 Inanc Gumus
|
||||||
|
// Learn Go Programming Course
|
||||||
|
// License: https://creativecommons.org/licenses/by-nc-sa/4.0/
|
||||||
|
//
|
||||||
|
|
||||||
|
// + Masker : Use copy instead of append when appending the "http://" manually
|
||||||
|
// + Wrapper: Accept the width from the cmdline
|
||||||
|
// args, maxWidth := os.Args[1:], 40
|
||||||
|
// if len(args) == 1 {
|
||||||
|
// maxWidth, _ = strconv.Atoi(args[0])
|
||||||
|
// }
|
||||||
|
|
||||||
|
package main
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
}
|
@ -1,45 +0,0 @@
|
|||||||
// For more tutorials: https://blog.learngoprogramming.com
|
|
||||||
//
|
|
||||||
// Copyright © 2018 Inanc Gumus
|
|
||||||
// Learn Go Programming Course
|
|
||||||
// License: https://creativecommons.org/licenses/by-nc-sa/4.0/
|
|
||||||
//
|
|
||||||
|
|
||||||
package main
|
|
||||||
|
|
||||||
import "fmt"
|
|
||||||
|
|
||||||
func main() {
|
|
||||||
var g, o rune
|
|
||||||
|
|
||||||
g, o = 'g', 'o'
|
|
||||||
g, o = 103, 111
|
|
||||||
g, o = 0x67, 0x6f
|
|
||||||
g, o = '\U00000067', '\U0000006f'
|
|
||||||
g, o = '\u0067', '\u006f'
|
|
||||||
g, o = '\x67', '\x6f'
|
|
||||||
|
|
||||||
fmt.Println("codepoints")
|
|
||||||
fmt.Printf(" dec : %d %d\n", g, o)
|
|
||||||
fmt.Printf(" hex : %x %x\n", g, o)
|
|
||||||
fmt.Printf(" unicode : %U %U\n", g, o)
|
|
||||||
fmt.Printf(" chars : %c %c\n", g, o)
|
|
||||||
|
|
||||||
// g++
|
|
||||||
// o -= 6
|
|
||||||
|
|
||||||
g -= 'a' - 'A'
|
|
||||||
o -= 'a' - 'A'
|
|
||||||
|
|
||||||
fmt.Println("codepoints")
|
|
||||||
fmt.Printf(" dec : %d %d\n", g, o)
|
|
||||||
fmt.Printf(" hex : %x %x\n", g, o)
|
|
||||||
fmt.Printf(" unicode : %U %U\n", g, o)
|
|
||||||
fmt.Printf(" chars : %c %c\n", g, o)
|
|
||||||
|
|
||||||
// string representations
|
|
||||||
// fmt.Print("string() : ", string(g), string(o), "\n")
|
|
||||||
// fmt.Print("hex 1 byte : \x67\x6f \n")
|
|
||||||
// fmt.Print("hex 2 bytes : \u0067\u006f \n")
|
|
||||||
// fmt.Print("hex 4 bytes : \U00000067\U0000006f \n")
|
|
||||||
}
|
|
@ -1,28 +0,0 @@
|
|||||||
// For more tutorials: https://blog.learngoprogramming.com
|
|
||||||
//
|
|
||||||
// Copyright © 2018 Inanc Gumus
|
|
||||||
// Learn Go Programming Course
|
|
||||||
// License: https://creativecommons.org/licenses/by-nc-sa/4.0/
|
|
||||||
//
|
|
||||||
|
|
||||||
package main
|
|
||||||
|
|
||||||
import "fmt"
|
|
||||||
|
|
||||||
func main() {
|
|
||||||
msg := "WONDERFUL!"
|
|
||||||
bytes := []byte(msg)
|
|
||||||
|
|
||||||
fmt.Println("msg :", msg)
|
|
||||||
fmt.Println("bytes :", bytes)
|
|
||||||
fmt.Println("string(bytes) :", string(bytes))
|
|
||||||
fmt.Println("string(87) :", string(87))
|
|
||||||
|
|
||||||
fmt.Println()
|
|
||||||
|
|
||||||
for i, v := range msg {
|
|
||||||
fmt.Printf(
|
|
||||||
"msg[%d] : %d = %[2]q\n",
|
|
||||||
i, v)
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,57 +0,0 @@
|
|||||||
// For more tutorials: https://blog.learngoprogramming.com
|
|
||||||
//
|
|
||||||
// Copyright © 2018 Inanc Gumus
|
|
||||||
// Learn Go Programming Course
|
|
||||||
// License: https://creativecommons.org/licenses/by-nc-sa/4.0/
|
|
||||||
//
|
|
||||||
|
|
||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
)
|
|
||||||
|
|
||||||
func main() {
|
|
||||||
// GOALS:
|
|
||||||
// 1- String value is immutable
|
|
||||||
// 2- Indexing vs Slicing
|
|
||||||
// 3- Using bytes for manipulating strings
|
|
||||||
|
|
||||||
mood := "wonder"
|
|
||||||
|
|
||||||
// 1- a string value is immutable (read-only)
|
|
||||||
// mood[1] = 'a'
|
|
||||||
|
|
||||||
// 2- Indexing vs Slicing
|
|
||||||
|
|
||||||
// "wonder"
|
|
||||||
// ^ ^^^^
|
|
||||||
// | ||||
|
|
||||||
// "wandering"
|
|
||||||
|
|
||||||
// "w" + "a" + "nder" + "ing"
|
|
||||||
|
|
||||||
// wandering := mood[0] + "a" + mood[2:] + "ing"
|
|
||||||
// fmt.Printf("mood[0] : %T - %[1]v\n", mood[0]) // byte
|
|
||||||
// fmt.Printf("mood[0:1] : %T - %[1]v\n", mood[0:1]) // string
|
|
||||||
|
|
||||||
// wandering := mood[:1] + "a" + mood[2:] + "ing"
|
|
||||||
fmt.Println(mood)
|
|
||||||
// fmt.Println(wandering)
|
|
||||||
|
|
||||||
// 3- converting creates a new byte slice (allocation)
|
|
||||||
b := []byte(mood)
|
|
||||||
b[1] = 'a'
|
|
||||||
|
|
||||||
// b = append(b, 'i', 'n', 'g')
|
|
||||||
// b = append(b, []byte{'i', 'n', 'g'})
|
|
||||||
b = append(b, "ing"...)
|
|
||||||
|
|
||||||
// starts copying from the first element
|
|
||||||
copy(b, "listen")
|
|
||||||
|
|
||||||
// starts copying from the "7th" element
|
|
||||||
copy(b[6:], "ed.")
|
|
||||||
|
|
||||||
fmt.Println(string(b))
|
|
||||||
}
|
|
@ -1,144 +0,0 @@
|
|||||||
// For more tutorials: https://blog.learngoprogramming.com
|
|
||||||
//
|
|
||||||
// Copyright © 2018 Inanc Gumus
|
|
||||||
// Learn Go Programming Course
|
|
||||||
// License: https://creativecommons.org/licenses/by-nc-sa/4.0/
|
|
||||||
//
|
|
||||||
|
|
||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"unicode/utf8"
|
|
||||||
"unsafe"
|
|
||||||
)
|
|
||||||
|
|
||||||
// Please run this code and experiment with it
|
|
||||||
// Observe the results
|
|
||||||
|
|
||||||
// USELESS-NOTE : "Öykü" means "Story" in Turkish!
|
|
||||||
|
|
||||||
func main() {
|
|
||||||
fmt.Println("-----------------------------------")
|
|
||||||
fmt.Println("ASCII Codepoints")
|
|
||||||
fmt.Println("-----------------------------------")
|
|
||||||
|
|
||||||
var (
|
|
||||||
a, z byte = 'a', 'z'
|
|
||||||
A, Z byte = 'A', 'Z'
|
|
||||||
d0, d9 byte = '0', '9'
|
|
||||||
)
|
|
||||||
|
|
||||||
for _, c := range []byte{a, z, A, Z, d0, d9} {
|
|
||||||
fmt.Printf("%c - 1 byte - %[1]U - %[1]d\n", c)
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Println("\n-----------------------------------")
|
|
||||||
fmt.Println("Unicode Codepoints")
|
|
||||||
fmt.Println("-----------------------------------")
|
|
||||||
|
|
||||||
var (
|
|
||||||
Ö = 'Ö'
|
|
||||||
栗 = '栗'
|
|
||||||
monkey = '🙉'
|
|
||||||
)
|
|
||||||
for _, c := range []rune{rune(A), Ö, 栗, monkey} {
|
|
||||||
fmt.Printf("%c - %d bytes - %[1]U - %[1]d\n", c, cptb(c))
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Println("\n-----------------------------------")
|
|
||||||
fmt.Println("UTF-8 Encoded")
|
|
||||||
fmt.Println("-----------------------------------")
|
|
||||||
|
|
||||||
// utf8.RuneLen finds the number of bytes necessary for
|
|
||||||
// encoding a codepoint to utf8
|
|
||||||
for _, c := range []rune{rune(A), Ö, 栗, monkey} {
|
|
||||||
fmt.Printf("%c - %d bytes - %[1]U - %[1]d\n", c,
|
|
||||||
utf8.RuneLen(c))
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Println("\n-----------------------------------")
|
|
||||||
fmt.Println("Example: Unicode Codepoints")
|
|
||||||
fmt.Println("-----------------------------------")
|
|
||||||
|
|
||||||
var (
|
|
||||||
ö = 'ö'
|
|
||||||
y = 'y'
|
|
||||||
k = 'k'
|
|
||||||
ü = 'ü'
|
|
||||||
)
|
|
||||||
|
|
||||||
var (
|
|
||||||
oykuRunes = []rune{ö, y, k, ü}
|
|
||||||
total int
|
|
||||||
)
|
|
||||||
|
|
||||||
for _, c := range oykuRunes {
|
|
||||||
fmt.Printf("%c - %d bytes - %[1]U - %[1]d\n", c, cptb(c))
|
|
||||||
|
|
||||||
// unsafe.Sizeof finds the memory size of simple values
|
|
||||||
// don't use it in production-level code -> it's unsafe!
|
|
||||||
total += int(unsafe.Sizeof(c))
|
|
||||||
}
|
|
||||||
fmt.Printf("TOTAL: %d bytes.\n", total)
|
|
||||||
|
|
||||||
fmt.Println("\n-----------------------------------")
|
|
||||||
fmt.Println("Example: Indexing")
|
|
||||||
fmt.Println("-----------------------------------")
|
|
||||||
|
|
||||||
fmt.Printf("%c%c%c%c\n",
|
|
||||||
oykuRunes[0], oykuRunes[1], oykuRunes[2],
|
|
||||||
oykuRunes[len(oykuRunes)-1])
|
|
||||||
|
|
||||||
// string to []rune
|
|
||||||
oykuRunes = []rune("öykü")
|
|
||||||
fmt.Printf("%c%c%c%c\n",
|
|
||||||
oykuRunes[0], oykuRunes[1], oykuRunes[2],
|
|
||||||
oykuRunes[len(oykuRunes)-1])
|
|
||||||
|
|
||||||
fmt.Println("\n-----------------------------------")
|
|
||||||
fmt.Println("Example: UTF-8 Encoding")
|
|
||||||
fmt.Println("-----------------------------------")
|
|
||||||
|
|
||||||
// this is also ok
|
|
||||||
// oykuString := string(oykuRunes)
|
|
||||||
|
|
||||||
oykuString := "öykü"
|
|
||||||
|
|
||||||
fmt.Printf("TOTAL bytes in oykuRunes : %d\n", total)
|
|
||||||
fmt.Printf("TOTAL bytes in oykuString: %d\n", len(oykuString))
|
|
||||||
fmt.Printf("TOTAL runes in oykuString: %d\n",
|
|
||||||
utf8.RuneCountInString(oykuString))
|
|
||||||
|
|
||||||
fmt.Printf("Runes of oykuString : %s\n", oykuString)
|
|
||||||
fmt.Printf("Bytes of oykuString : % x\n", oykuString)
|
|
||||||
|
|
||||||
fmt.Println()
|
|
||||||
for i := 0; i < len(oykuString); i++ {
|
|
||||||
fmt.Printf("oykuString[%d]: %c\n", i, oykuString[i])
|
|
||||||
}
|
|
||||||
|
|
||||||
// slicing returns a slice with the type of the sliced value
|
|
||||||
// so, if the sliced value is a string, then a string is returned
|
|
||||||
//
|
|
||||||
// example:
|
|
||||||
// oykuString[0:2] is a string
|
|
||||||
fmt.Println()
|
|
||||||
fmt.Printf("oykuString[0:2]: %q\n", oykuString[0:2])
|
|
||||||
fmt.Printf("oykuString[4:6]: %q\n", oykuString[4:6])
|
|
||||||
}
|
|
||||||
|
|
||||||
// -------------------------------------------------------------------
|
|
||||||
// cptb finds how many bytes are necessary to represent a codepoint
|
|
||||||
// cptb means codepoint to bytes
|
|
||||||
func cptb(r rune) int {
|
|
||||||
switch {
|
|
||||||
case r <= 0xFF: // 255
|
|
||||||
return 1
|
|
||||||
case r <= 0xFFFF: // 65,535
|
|
||||||
return 2
|
|
||||||
case r <= 0xFFFFF: // 16,777,215
|
|
||||||
return 3
|
|
||||||
}
|
|
||||||
return 4
|
|
||||||
}
|
|
@ -1,60 +0,0 @@
|
|||||||
// For more tutorials: https://blog.learngoprogramming.com
|
|
||||||
//
|
|
||||||
// Copyright © 2018 Inanc Gumus
|
|
||||||
// Learn Go Programming Course
|
|
||||||
// License: https://creativecommons.org/licenses/by-nc-sa/4.0/
|
|
||||||
//
|
|
||||||
|
|
||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"unicode/utf8"
|
|
||||||
"unsafe"
|
|
||||||
)
|
|
||||||
|
|
||||||
func main() {
|
|
||||||
// try yourself: try other runes!
|
|
||||||
// you can find more here: https://unicode-table.com
|
|
||||||
// r := '🙉'
|
|
||||||
|
|
||||||
// r := '\u011e'
|
|
||||||
r := 'Ğ'
|
|
||||||
|
|
||||||
// only codepoint (can't be printed)
|
|
||||||
fmt.Printf("before encoding: %d\n", r)
|
|
||||||
fmt.Printf(" bits : %016b\n", r)
|
|
||||||
fmt.Printf(" bytes: % x\n", r)
|
|
||||||
|
|
||||||
// utf-8 encoded string
|
|
||||||
encoded := string(r)
|
|
||||||
encodedBytes := []byte(encoded)
|
|
||||||
|
|
||||||
fmt.Println()
|
|
||||||
fmt.Printf("after encoding: %q\n", encoded)
|
|
||||||
fmt.Printf(" bits : %8b\n", encodedBytes)
|
|
||||||
fmt.Printf(" bytes: % x\n", encodedBytes)
|
|
||||||
|
|
||||||
// a utf-8 string is efficient to store and transmit
|
|
||||||
// but, it's harder to use.
|
|
||||||
//
|
|
||||||
// rune slice is inefficient.
|
|
||||||
// but, it's easy to use.
|
|
||||||
fmt.Println()
|
|
||||||
fmt.Println("string (utf-8) vs []rune (unicode)")
|
|
||||||
|
|
||||||
s := "hava çok güzel 😳"
|
|
||||||
fmt.Printf("%q\n", s)
|
|
||||||
fmt.Printf(" size : %d bytes\n", len(s))
|
|
||||||
fmt.Printf(" len : %d chars\n", utf8.RuneCountInString(s))
|
|
||||||
fmt.Printf(" s[5] : %q\n", s[5])
|
|
||||||
fmt.Printf(" s[5:7] : %q\n", s[5:7])
|
|
||||||
|
|
||||||
runes := []rune(s)
|
|
||||||
size := int(unsafe.Sizeof(runes[0])) * len(runes)
|
|
||||||
|
|
||||||
fmt.Printf("\n%q\n", runes)
|
|
||||||
fmt.Printf(" size : %d bytes\n", size)
|
|
||||||
fmt.Printf(" len : %d chars\n", len(runes))
|
|
||||||
fmt.Printf(" runes[5] : %q\n", runes[5])
|
|
||||||
}
|
|
@ -1,45 +0,0 @@
|
|||||||
// For more tutorials: https://blog.learngoprogramming.com
|
|
||||||
//
|
|
||||||
// Copyright © 2018 Inanc Gumus
|
|
||||||
// Learn Go Programming Course
|
|
||||||
// License: https://creativecommons.org/licenses/by-nc-sa/4.0/
|
|
||||||
//
|
|
||||||
|
|
||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"unsafe"
|
|
||||||
)
|
|
||||||
|
|
||||||
func main() {
|
|
||||||
// empty := ""
|
|
||||||
// dump(empty)
|
|
||||||
|
|
||||||
hello := "hello"
|
|
||||||
dump(hello)
|
|
||||||
dump("hello")
|
|
||||||
dump("hello!")
|
|
||||||
|
|
||||||
for i := range hello {
|
|
||||||
dump(hello[i : i+1])
|
|
||||||
}
|
|
||||||
|
|
||||||
dump(string([]byte(hello)))
|
|
||||||
dump(string([]byte(hello)))
|
|
||||||
dump(string([]rune(hello)))
|
|
||||||
}
|
|
||||||
|
|
||||||
// StringHeader is used by a string value
|
|
||||||
// In practice, you should use: reflect.StringHeader
|
|
||||||
type StringHeader struct {
|
|
||||||
// points to a backing array's item
|
|
||||||
pointer uintptr // where it starts
|
|
||||||
length int // where it ends
|
|
||||||
}
|
|
||||||
|
|
||||||
// dump prints the string header of a string value
|
|
||||||
func dump(s string) {
|
|
||||||
ptr := *(*StringHeader)(unsafe.Pointer(&s))
|
|
||||||
fmt.Printf("%q: %+v\n", s, ptr)
|
|
||||||
}
|
|
@ -1,52 +0,0 @@
|
|||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"strings"
|
|
||||||
)
|
|
||||||
|
|
||||||
// ---------------------------------------------------------
|
|
||||||
// EXERCISE: Assign the Arrays
|
|
||||||
//
|
|
||||||
// 1. Create an array named books
|
|
||||||
// 2. Add book titles to the array
|
|
||||||
// 3. Create two more copies of the array named: upper and lower
|
|
||||||
// 4. Change the book titles to uppercase in the upper array only
|
|
||||||
// 5. Change the book titles to lowercase in the lower array only
|
|
||||||
// 6. Print all the arrays
|
|
||||||
//
|
|
||||||
// NOTE
|
|
||||||
// Check out the strings package, it has functions to convert strings to
|
|
||||||
// upper and lower cases.
|
|
||||||
//
|
|
||||||
// BONUS
|
|
||||||
// 1. Invent your own arrays with different types other than string,
|
|
||||||
// and do some manipulations on them.
|
|
||||||
//
|
|
||||||
// 👉 THISSSS--------------------------------------------------------
|
|
||||||
// 2. Find some Turkish book titles and do the same upper, lowercase conversion
|
|
||||||
// for them.
|
|
||||||
//
|
|
||||||
// Here are some books: https://www.goodreads.com/group/bookshelf/417154-bilimkurgu-kul-b?shelf=read
|
|
||||||
//
|
|
||||||
// Note: You'd need to use special functions to convert the Turkish letters.
|
|
||||||
// They're in the strings package as well.
|
|
||||||
//
|
|
||||||
// EXPECTED OUTPUT
|
|
||||||
// ?
|
|
||||||
// ---------------------------------------------------------
|
|
||||||
|
|
||||||
func main() {
|
|
||||||
books := [...]string{"Kafka's Revenge", "Stay Golden", "Everythingship"}
|
|
||||||
|
|
||||||
upper, lower := books, books
|
|
||||||
|
|
||||||
for i := range books {
|
|
||||||
upper[i] = strings.ToUpper(upper[i])
|
|
||||||
lower[i] = strings.ToLower(lower[i])
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Printf("books: %q\n", books)
|
|
||||||
fmt.Printf("upper: %q\n", upper)
|
|
||||||
fmt.Printf("lower: %q\n", lower)
|
|
||||||
}
|
|
@ -1,233 +0,0 @@
|
|||||||
# Strings Revisited
|
|
||||||
|
|
||||||
## Bytes
|
|
||||||
* ASCII
|
|
||||||
* Immutable
|
|
||||||
|
|
||||||
## Runes
|
|
||||||
* Unicode
|
|
||||||
* vs ASCII
|
|
||||||
* UTF-8
|
|
||||||
* Made up of bytes
|
|
||||||
|
|
||||||
## Slicing
|
|
||||||
* String: Read-Only Byte Slice
|
|
||||||
* Slicing -> String
|
|
||||||
* Index -> Byte
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Read-only byte slice
|
|
||||||
* A string is a read-only slice
|
|
||||||
* You can't change its data
|
|
||||||
* Indexable: Returns you a byte
|
|
||||||
* Slicable: Returns you a string
|
|
||||||
|
|
||||||
## Slicing
|
|
||||||
* Strings can be sliced just like a slice
|
|
||||||
* After slicing, Go returns you a new string
|
|
||||||
* WARNING: Indexing expression returns you a byte
|
|
||||||
* s := "hey"
|
|
||||||
* s[0] + s[1] + s[2] != "hey"
|
|
||||||
* s[0:3] == "hey"
|
|
||||||
|
|
||||||
## Underlying array
|
|
||||||
* Underlying array is a byte array
|
|
||||||
* There's no capacity this time: Only length and pointer.
|
|
||||||
|
|
||||||
* Sliced string will refer to that array
|
|
||||||
* String slicing is cheap — They share the same array
|
|
||||||
|
|
||||||
## Unicode
|
|
||||||
* At the beginning there was only ASCII code standard
|
|
||||||
* It used 7 bits to represent 128 characters
|
|
||||||
* Only English characters
|
|
||||||
* Each code corresponding to a character
|
|
||||||
|
|
||||||
* After the Internet, nothing could stay the same
|
|
||||||
* There was a need to introduce more languages
|
|
||||||
* 128 characters aren't enough for the entire world
|
|
||||||
|
|
||||||
* So: Unicode is born
|
|
||||||
* It collects all of the characters in the world's languages
|
|
||||||
* Unicode can represent every character in every imaginable language system.
|
|
||||||
|
|
||||||
* Assigns each character to a codepoint or a rune (in Go)
|
|
||||||
* Unicode assigns each character a unique number, or code point.
|
|
||||||
|
|
||||||
* A codepoint is a number that represents a character
|
|
||||||
* U+2700 -> hex
|
|
||||||
* Unicode defines codepoints for 1m+ characters
|
|
||||||
* It includes the ASCII codes too
|
|
||||||
|
|
||||||
A Chinese character: 汉
|
|
||||||
Its Unicode value: U+6C49
|
|
||||||
convert 6C49 to binary: 01101100 01001001
|
|
||||||
embed 6C49 as UTF-8: 11100110 10110001 10001001
|
|
||||||
|
|
||||||
|
|
||||||
## Unicode and Runes
|
|
||||||
* A rune is a 4-byte type for storing Unicode codepoints
|
|
||||||
|
|
||||||
* Rune data type and rune codepoints are different things!
|
|
||||||
|
|
||||||
* There's UTF-32 standard which assigns 4 bytes to each codepoint
|
|
||||||
* But that's inefficient, so Go instead uses a variable-width encoding standard called UTF-8. It assigns a different number of bytes to different codepoints.
|
|
||||||
* UTF-8 was invented by Rob Pike and Ken Thompson (two of the creators of Go)
|
|
||||||
|
|
||||||
* So, a UTF-8 encoded rune takes 1-4 bytes. It uses 1 byte for ASCII (English).
|
|
||||||
* 2-3 bytes for most of the characters.
|
|
||||||
|
|
||||||
* A string can contain runes
|
|
||||||
* Each rune can span multiple bytes
|
|
||||||
* WARNING: Getting one byte of a string may give you corrupt data
|
|
||||||
* If you're getting one part of a rune inside the string!
|
|
||||||
|
|
||||||
* In a string with runes, you can't easily index the characters
|
|
||||||
* You need to use unicode and utf8 packages
|
|
||||||
* Or you need to convert the string into a rune slice
|
|
||||||
* unicode: letters vs nums, to uppercase, ...
|
|
||||||
* utf8 : working w/bytes and runes
|
|
||||||
|
|
||||||
* RuneCountInString(s) == len([]rune(s))
|
|
||||||
* DecodeRuneInString(s) returns the first rune
|
|
||||||
|
|
||||||
## Ranging over strings
|
|
||||||
* You can range over a string like a slice
|
|
||||||
* It will jump over the runes inside the string
|
|
||||||
* The index variable will be the starting position of each rune
|
|
||||||
* And the value will be the rune itself
|
|
||||||
|
|
||||||
## Representing bytes
|
|
||||||
* Unicode characters can be hard to type in code
|
|
||||||
* So, we can use \x and \u in a string to represent bytes and runes
|
|
||||||
|
|
||||||
* A string literal is always utf-8 but a string value is not
|
|
||||||
|
|
||||||
## Convenience
|
|
||||||
* It's easy to work with runes in code: []rune
|
|
||||||
* However, it will consume more memory: Each char is 4 bytes
|
|
||||||
|
|
||||||
* "inanç"[4] = gibberish
|
|
||||||
|
|
||||||
* r := []rune("inanç") -> five elements rune slice
|
|
||||||
* r[4] = 'ç'
|
|
||||||
* string(r)
|
|
||||||
* // inanç: automatically concatenates the runes to form a string
|
|
||||||
|
|
||||||
* string(105) // i -> interprets 105 as a rune value; 'i' not 105
|
|
||||||
* string(351) // ş -> ""
|
|
||||||
|
|
||||||
* printf: %q -> 'ç' %c -> ç %d -> 231
|
|
||||||
|
|
||||||
## Bytes
|
|
||||||
* major libs:
|
|
||||||
* strings, bytes (have corresponding funcs)
|
|
||||||
* strconv, unicode
|
|
||||||
* bytes.Buffer
|
|
||||||
|
|
||||||
* []byte can be modified whereas string is immutable
|
|
||||||
* if you do a lot of string manipulations you can use []byte
|
|
||||||
|
|
||||||
* []byte <-> string convertible
|
|
||||||
* but, each conversion copies the data
|
|
||||||
* compiler optimizes it mostly
|
|
||||||
|
|
||||||
* however, do not blindly convert; use bytes pkg
|
|
||||||
* it's like the string pkg
|
|
||||||
|
|
||||||
* s := "inanc"
|
|
||||||
* b := []byte(s)
|
|
||||||
* s := string(b)
|
|
||||||
|
|
||||||
## Sprintf
|
|
||||||
* Just like printf but instead of printing it returns a string
|
|
||||||
|
|
||||||
## Builders
|
|
||||||
* bytes.Buffer
|
|
||||||
* strings.Builder
|
|
||||||
|
|
||||||
* Use WriteRune when adding rune
|
|
||||||
|
|
||||||
|
|
||||||
## Terminology:
|
|
||||||
Summary: Unicode is a large table mapping characters to numbers and the different UTF encodings specify how these numbers are encoded as bits.
|
|
||||||
|
|
||||||
* **ASCII** First character set that maps characters to codepoints or character codes. In terms of alphabets, it only supports basic latin alphabet: English. 2^7=128
|
|
||||||
|
|
||||||
* The center of the computer industry was in the USA at that time. As a consequence, they didn't need to support accents or other marks such as á, ü, ç, ñ, etc.
|
|
||||||
|
|
||||||
* Once upon a time, computer memory and storage was very expensive. And all of the computers in the world (for practical purposes) were in the hands of English-speaking countries.
|
|
||||||
|
|
||||||
* Single byte encoding only using the bottom 7 bits. Basic Latin. (Unicode code points 0-127.) No accents etc.
|
|
||||||
|
|
||||||
* **Unicode** is a coded character set. A set of characters and a mapping between the characters and integer code points representing them. Unicode is a superset of ASCII.
|
|
||||||
|
|
||||||
* You cannot save text to your hard drive as "Unicode". Unicode is an abstract representation of the text. You need to "encode" this abstract representation. That's where an encoding comes into play.
|
|
||||||
|
|
||||||
* Unicode first and foremost defines a table of code points for characters. That's a fancy way of saying "65 stands for A, 66 stands for B and 9,731 stands for ☃" (seriously, it does). How these code points are actually encoded into bits is a different topic.
|
|
||||||
|
|
||||||
* **UTF-8** is a character encoding - a way of converting from sequences of bytes to sequences of characters and vice versa. It covers the whole of the Unicode character set.
|
|
||||||
|
|
||||||
* UTF-8 uses the ASCII set for the first 128 characters. That's handy because it means ASCII text is also valid in UTF-8.
|
|
||||||
|
|
||||||
* **Character Set:** A character set is a list of characters with unique numbers (these numbers are sometimes referred to as “code points”). For example, in the Unicode character set, the number for A is 41 in hexadecimal (65 in decimal).
|
|
||||||
|
|
||||||
* **Codepoint:** Characters are referred to by their "Unicode code point".
|
|
||||||
|
|
||||||
* Written in hexadecimal (to keep the numbers shorter).
|
|
||||||
|
|
||||||
* Preceded by a "U+" (that's just what they do, it has no other meaning than "this is a Unicode code point").
|
|
||||||
|
|
||||||
* Unicode itself is a mapping, it defines codepoints and a codepoint is a number, associated with usually a character.
|
|
||||||
|
|
||||||
* Code: a system of words, letters, figures, or other symbols substituted for other words, letters, etc.
|
|
||||||
|
|
||||||
* **Encoding:** Converting data into a coded form. An encoding on the other hand, is an algorithm that translates a list of numbers to binary so it can be stored on disk. For example UTF-8 would translate the number sequence 1, 2, 3, 4 like this: `00000001 00000010 00000011 00000100`. Our data is now translated into binary and can now be saved to disk.
|
|
||||||
|
|
||||||
* To encode means to use something to represent something else. An encoding is the set of rules with which to convert something from one representation to another.
|
|
||||||
|
|
||||||
* To represent 1,114,112 different values, two bytes aren't enough. Three bytes are, but three bytes are often awkward to work with, so four bytes would be the comfortable minimum. But, unless you're actually using Chinese or some of the other characters with big numbers that take a lot of bits to encode, you're never going to use a huge chunk of those four bytes.
|
|
||||||
|
|
||||||
* If the letter "A" was always encoded to 00000000 00000000 00000000 01000001, "B" always to 00000000 00000000 00000000 01000010 and so on, any document would bloat to four times the necessary size.
|
|
||||||
|
|
||||||
* To optimize this, there are several ways to encode Unicode code points into bits. UTF-8 is one of them.
|
|
||||||
|
|
||||||
character encoding bits
|
|
||||||
A UTF-8 01000001
|
|
||||||
A UTF-16 00000000 01000001
|
|
||||||
A UTF-32 00000000 00000000 00000000 01000001
|
|
||||||
|
|
||||||
U+0000 to U+007F are (correctly) encoded with one byte
|
|
||||||
U+0080 to U+07FF are encoded with 2 bytes
|
|
||||||
U+0800 to U+FFFF are encoded with 3 bytes
|
|
||||||
U+010000 to U+10FFFF are encoded with 4 bytes
|
|
||||||
|
|
||||||
* There is NO string or text, without an accompanying encoding standard.
|
|
||||||
|
|
||||||
## REFS:
|
|
||||||
https://unicode-table.com/en/
|
|
||||||
|
|
||||||
What's the difference between ASCII and Unicode?
|
|
||||||
https://stackoverflow.com/a/41198513/115363
|
|
||||||
|
|
||||||
https://stackoverflow.com/questions/643694/what-is-the-difference-between-utf-8-and-unicode
|
|
||||||
|
|
||||||
https://stackoverflow.com/questions/3951722/whats-the-difference-between-unicode-and-utf-8
|
|
||||||
|
|
||||||
https://stackoverflow.com/questions/1543613/how-does-utf-8-variable-width-encoding-work
|
|
||||||
|
|
||||||
http://kunststube.net/encoding/
|
|
||||||
(detailed and simple)
|
|
||||||
|
|
||||||
http://www.joelonsoftware.com/articles/Unicode.html
|
|
||||||
|
|
||||||
Unicode codepoint to UTF-8 encoding answer: https://stackoverflow.com/a/27939161/115363
|
|
||||||
|
|
||||||
http://www.polylab.dk/utf8-vs-unicode.html
|
|
||||||
|
|
||||||
Characters, Symbols and the Unicode Miracle - Computerphile
|
|
||||||
https://www.youtube.com/watch?v=MijmeoH9LT4
|
|
||||||
|
|
||||||
The history of UTF-8 as told by Rob Pike
|
|
||||||
http://doc.cat-v.org/bell_labs/utf-8_history
|
|
Reference in New Issue
Block a user