optimize: log parser

add: fast text reader to log parser
This commit is contained in:
Inanc Gumus
2019-08-07 13:24:15 +03:00
parent 450018748b
commit 4b2d2a3d6b
6 changed files with 168 additions and 49 deletions

View File

@ -12,8 +12,7 @@ func pageGrouper(r result) string {
return r.domain + r.page
}
// you could have created a noopGrouper as well
// but it's not necessary i think (map allocation)
// groupBy allocates map unnecessarily
func noopGrouper(r result) string {
// with something like:
// return randomStrings()

View File

@ -15,7 +15,7 @@ func main() {
defer recoverErr()
_, err := newReport().
// filterBy(orgDomainsFilter).
// from(fastTextReader(os.Stdin)).
filterBy(notUsing(domainExtFilter("com", "io"))).
groupBy(domainGrouper).
start()

View File

@ -17,6 +17,7 @@ type report struct {
func newReport() *report {
return &report{
filter: noopFilter,
group: noopGrouper,
input: textReader(os.Stdin),
output: textWriter(os.Stdout),
}
@ -43,30 +44,19 @@ func (r *report) groupBy(fn groupFunc) *report {
}
func (r *report) start() ([]result, error) {
if r.input == nil {
panic("report input cannot be nil")
}
// input filterBy groupBy
// scanner (result) bool map[string]result
//
// stdin -> []result -> []results -> []result -> output(stdout)
results, err := r.input()
res, err := r.input()
if err != nil {
return nil, err
}
// noop if filter is nil
results = filterBy(results, r.filter)
res = filterBy(res, r.filter)
res = groupBy(res, r.group)
err = r.output(res)
// group func is more tricky
// you don't want to create an unnecessary map
if r.group != nil {
results = groupBy(results, r.group)
}
// TODO: prefer: noop writer
if r.output != nil {
if err := r.output(results); err != nil {
return nil, err
}
}
return results, nil
return res, err
}

View File

@ -9,6 +9,7 @@ package main
import (
"bufio"
"bytes"
"fmt"
"io"
"strings"
@ -16,34 +17,51 @@ import (
func textReader(r io.Reader) inputFunc {
return func() ([]result, error) {
in := bufio.NewScanner(r)
return parseText(in)
}
}
func parseText(in *bufio.Scanner) ([]result, error) {
var (
results []result
lines int
)
results = make([]result, 0, 5000000)
for in.Scan() {
lines++
res, err := parseFields(strings.Fields(in.Text()))
// first: count the lines, so the parseText can create
// enough buffer.
var buf bytes.Buffer
lines, err := countLines(io.TeeReader(r, &buf))
if err != nil {
// TODO: custom error type for line information
return nil, fmt.Errorf("line %d: %v", lines, err)
return nil, err
}
results = append(results, res)
return parseText(bufio.NewScanner(&buf), lines)
}
}
// TODO: custom error type for line information
func parseText(in *bufio.Scanner, nlines int) ([]result, error) {
res := make([]result, 0, nlines)
for l := 1; in.Scan(); l++ {
fields := strings.Fields(in.Text())
r, err := parseFields(fields)
if err != nil {
return nil, fmt.Errorf("line %d: %v", l, err)
}
res = append(res, r)
}
return res, in.Err()
}
func countLines(r io.Reader) (int, error) {
var (
lines int
buf = make([]byte, 1024<<4) // read via 16 KB blocks
)
for {
n, err := r.Read(buf)
lines += bytes.Count(buf[:n], []byte{'\n'})
if err == io.EOF {
return lines, nil
}
if err != nil {
return lines, err
}
}
if err := in.Err(); err != nil {
return nil, err
}
return results, nil
}

View File

@ -0,0 +1,106 @@
// For more tutorials: https://blog.learngoprogramming.com
//
// Copyright © 2018 Inanc Gumus
// Learn Go Programming Course
// License: https://creativecommons.org/licenses/by-nc-sa/4.0/
//
package main
import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
)
// fastTextReader returns an inputFunc that reads log lines from r and
// parses them into results. The original author notes it runs roughly
// 30-35% faster than textReader, and that it could be made faster still.
//
// What it does differently from textReader:
//
//   - sizes the result slice from the number of lines in the input
//   - splits the fields manually instead of calling strings.Fields
//   - reads each line with the scanner's Bytes() instead of Text()
//   - converts the numbers with a manual atoi
//
// The whole input is held in memory: the first pass counts the lines
// while copying everything it reads into a buffer, and the second pass
// parses from that buffer.
func fastTextReader(r io.Reader) inputFunc {
	return func() ([]result, error) {
		// First pass: count the lines so the parser can allocate the
		// result slice once. TeeReader keeps a copy of what was read.
		var buf bytes.Buffer

		l, err := countLines(io.TeeReader(r, &buf))
		if err != nil {
			return nil, err
		}

		// countLines counts newline bytes, so an input whose last line
		// is not newline-terminated holds one more line than l. Asking
		// for l+1 avoids regrowing (and copying) the whole slice on the
		// final append.
		return fastParseText(bufio.NewScanner(&buf), l+1)
	}
}
// fastParseText parses every line produced by in with fastParseFields
// and returns the results in input order.
//
// nlines is the expected number of lines; it is only used as the initial
// capacity of the result slice, so an inexact value costs a reallocation
// but does not affect the outcome.
//
// It stops at the first line that fails to parse and returns a nil slice
// with an error prefixed by that line's 1-based number. A scanner error
// is returned alongside the results parsed so far.
func fastParseText(in *bufio.Scanner, nlines int) ([]result, error) {
	// make panics on a negative capacity; treat it as "unknown".
	if nlines < 0 {
		nlines = 0
	}
	res := make([]result, 0, nlines)

	// Lines are numbered from 1 in error messages, the same way
	// parseText numbers them.
	for l := 1; in.Scan(); l++ {
		r, err := fastParseFields(in.Bytes())
		if err != nil {
			return nil, fmt.Errorf("line %d: %v", l, err)
		}
		res = append(res, r)
	}
	return res, in.Err()
}
// fastParseFields splits one log line into its four fields, in order:
// domain, page, visits and uniques.
//
// Fields are separated by one or more spaces or tabs, and leading or
// trailing separators are ignored. This matches how the textReader path
// splits a line with strings.Fields for ASCII blanks.
//
// It returns an error when the line does not hold exactly four fields,
// or when atoi rejects the visits or uniques field. On error the
// returned result is the zero value.
func fastParseFields(data []byte) (res result, err error) {
	const nfields = 4

	field := 0
	for i := 0; i < len(data); {
		// Skip the separators in front of the next field.
		if c := data[i]; c == ' ' || c == '\t' {
			i++
			continue
		}

		// Consume the field up to the next separator or end of line.
		start := i
		for i < len(data) && data[i] != ' ' && data[i] != '\t' {
			i++
		}
		tok := data[start:i]

		switch field {
		case 0:
			res.domain = string(tok)
		case 1:
			res.page = string(tok)
		case 2:
			res.visits, err = atoi(tok)
		case 3:
			res.uniques, err = atoi(tok)
		default:
			// A fifth field: no need to scan the rest of the line.
			return result{}, errors.New("wrong number of fields")
		}
		if err != nil {
			return result{}, err
		}
		field++
	}

	if field != nfields {
		return result{}, errors.New("wrong number of fields")
	}
	return res, nil
}
// atoi converts input, a sequence of ASCII decimal digits, to an int.
//
// It accepts digits only: no sign, no blanks, no separators. It returns
// an error when input is empty, contains any other byte, or spells a
// value that does not fit in an int.
func atoi(input []byte) (int, error) {
	// An empty field is not the number zero.
	if len(input) == 0 {
		return 0, errors.New("invalid number")
	}

	const maxInt = int(^uint(0) >> 1)

	val := 0
	for _, c := range input {
		if c < '0' || c > '9' {
			return 0, errors.New("invalid number")
		}
		d := int(c - '0')

		// val*10 + d would exceed maxInt: report it instead of
		// silently wrapping around.
		if val > (maxInt-d)/10 {
			return 0, errors.New("number out of range")
		}
		val = val*10 + d
	}
	return val, nil
}

View File

@ -35,3 +35,9 @@ func textWriter(w io.Writer) outputFunc {
return nil
}
}
// noWhere returns an outputFunc that ignores the results it is given
// and always reports success.
func noWhere() outputFunc {
	discard := func([]result) error { return nil }
	return discard
}