Counting different kinds of line terminator characters in files

The name of the pictureThe name of the pictureThe name of the pictureClash Royale CLAN TAG#URR8PPP





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;







up vote
2
down vote

favorite
1












Can someone take a look if this code piece is ok. I'm most interested whether the goroutines are used correctly, but also general Go best practices.



package main

import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
"regexp"
)

// Endings holds counts of the two line-terminator styles found in a file:
// crlf is the number of "\r\n" sequences, lf the number of bare "\n".
type Endings struct {
	crlf, lf uint
}


// check panics if e is non-nil.
// NOTE(review): panicking is heavy-handed for expected errors such as an
// unreadable file; callers would be better served by logging and skipping.
func check(e error) {
	if e != nil {
		panic(e)
	}
}



// getFileList walks searchDir recursively and returns the paths of all
// non-directory entries found under it.
func getFileList(searchDir string) []string {
	fileList := []string{}
	filepath.Walk(searchDir, func(path string, f os.FileInfo, err error) error {
		// Skip entries the walk could not stat (e.g. permission denied);
		// without this guard f is nil and f.IsDir() would panic.
		if err != nil {
			return nil
		}
		if !f.IsDir() {
			fileList = append(fileList, path)
		}
		return nil
	})
	return fileList
}


func getFileEndings(filename string, c chan Endings)
fileContent, err := ioutil.ReadFile(filename)
check(err)
c <- countEndings(string(fileContent))


func countEndings(s string) Endings
crlf := regexp.MustCompile("rn")
lf := regexp.MustCompile("n")

x := len(crlf.FindAllStringIndex(s, -1))
y := len(lf.FindAllStringIndex(s, -1))

return Endingsuint(x), uint(y - x)


// splitIntoChunks partitions arr into consecutive sub-slices of at most
// chunkSize elements each. The sub-slices alias arr's backing array.
// Panics if chunkSize <= 0.
func splitIntoChunks(arr []string, chunkSize int) [][]string {
	if chunkSize <= 0 {
		panic("chunk size too small")
	}

	if len(arr) <= chunkSize {
		return [][]string{arr}
	}

	// Ceiling division; the original len/chunkSize+1 formula produced a
	// trailing empty chunk whenever len(arr) was an exact multiple of
	// chunkSize.
	numChunks := (len(arr) + chunkSize - 1) / chunkSize
	chunks := make([][]string, numChunks)

	for i := 0; i < numChunks; i++ {
		l := i * chunkSize
		u := l + chunkSize

		if u > len(arr) {
			u = len(arr)
		}

		chunks[i] = arr[l:u]
	}

	return chunks
}


func main()
searchDir := os.Args[1]
c := make(chan Endings)
chunkSize := 1000

fileList := getFileList(searchDir)

count := Endings0, 0
for _, chunk := range splitIntoChunks(fileList, chunkSize)
for _, file := range chunk
go getFileEndings(file, c)


for _ = range chunk
result := <-c

count.crlf += result.crlf
count.lf += result.lf



fmt.Printf("crlf: %d, lf: %dn", count.crlf, count.lf)







share|improve this question



























    up vote
    2
    down vote

    favorite
    1












    Can someone take a look if this code piece is ok. I'm most interested whether the goroutines are used correctly, but also general Go best practices.



    package main

    import (
    "fmt"
    "io/ioutil"
    "os"
    "path/filepath"
    "regexp"
    )

    type Endings struct
    crlf, lf uint


    func check(e error)
    if e != nil
    panic(e)



    func getFileList(searchDir string) string
    fileList := string
    filepath.Walk(searchDir, func(path string, f os.FileInfo, err error) error
    if !f.IsDir()
    fileList = append(fileList, path)

    return nil
    )

    return fileList


    func getFileEndings(filename string, c chan Endings)
    fileContent, err := ioutil.ReadFile(filename)
    check(err)
    c <- countEndings(string(fileContent))


    func countEndings(s string) Endings
    crlf := regexp.MustCompile("rn")
    lf := regexp.MustCompile("n")

    x := len(crlf.FindAllStringIndex(s, -1))
    y := len(lf.FindAllStringIndex(s, -1))

    return Endingsuint(x), uint(y - x)


    func splitIntoChunks(arr string, chunkSize int) string
    if chunkSize <= 0
    panic("chunk size too small")


    if len(arr) <= chunkSize
    return stringarr


    numChunks := int(len(arr)/chunkSize) + 1
    chunks := make(string, numChunks)

    for i := 0; i < numChunks; i++
    l := i * chunkSize
    u := l + chunkSize

    if u > len(arr)
    u = len(arr)


    chunks[i] = arr[l:u]


    return chunks


    func main()
    searchDir := os.Args[1]
    c := make(chan Endings)
    chunkSize := 1000

    fileList := getFileList(searchDir)

    count := Endings0, 0
    for _, chunk := range splitIntoChunks(fileList, chunkSize)
    for _, file := range chunk
    go getFileEndings(file, c)


    for _ = range chunk
    result := <-c

    count.crlf += result.crlf
    count.lf += result.lf



    fmt.Printf("crlf: %d, lf: %dn", count.crlf, count.lf)







    share|improve this question























      up vote
      2
      down vote

      favorite
      1









      up vote
      2
      down vote

      favorite
      1






      1





      Can someone take a look if this code piece is ok. I'm most interested whether the goroutines are used correctly, but also general Go best practices.



      package main

      import (
      "fmt"
      "io/ioutil"
      "os"
      "path/filepath"
      "regexp"
      )

      type Endings struct
      crlf, lf uint


      func check(e error)
      if e != nil
      panic(e)



      func getFileList(searchDir string) string
      fileList := string
      filepath.Walk(searchDir, func(path string, f os.FileInfo, err error) error
      if !f.IsDir()
      fileList = append(fileList, path)

      return nil
      )

      return fileList


      func getFileEndings(filename string, c chan Endings)
      fileContent, err := ioutil.ReadFile(filename)
      check(err)
      c <- countEndings(string(fileContent))


      func countEndings(s string) Endings
      crlf := regexp.MustCompile("rn")
      lf := regexp.MustCompile("n")

      x := len(crlf.FindAllStringIndex(s, -1))
      y := len(lf.FindAllStringIndex(s, -1))

      return Endingsuint(x), uint(y - x)


      func splitIntoChunks(arr string, chunkSize int) string
      if chunkSize <= 0
      panic("chunk size too small")


      if len(arr) <= chunkSize
      return stringarr


      numChunks := int(len(arr)/chunkSize) + 1
      chunks := make(string, numChunks)

      for i := 0; i < numChunks; i++
      l := i * chunkSize
      u := l + chunkSize

      if u > len(arr)
      u = len(arr)


      chunks[i] = arr[l:u]


      return chunks


      func main()
      searchDir := os.Args[1]
      c := make(chan Endings)
      chunkSize := 1000

      fileList := getFileList(searchDir)

      count := Endings0, 0
      for _, chunk := range splitIntoChunks(fileList, chunkSize)
      for _, file := range chunk
      go getFileEndings(file, c)


      for _ = range chunk
      result := <-c

      count.crlf += result.crlf
      count.lf += result.lf



      fmt.Printf("crlf: %d, lf: %dn", count.crlf, count.lf)







      share|improve this question













      Can someone take a look if this code piece is ok. I'm most interested whether the goroutines are used correctly, but also general Go best practices.



      package main

      import (
      "fmt"
      "io/ioutil"
      "os"
      "path/filepath"
      "regexp"
      )

      type Endings struct
      crlf, lf uint


      func check(e error)
      if e != nil
      panic(e)



      func getFileList(searchDir string) string
      fileList := string
      filepath.Walk(searchDir, func(path string, f os.FileInfo, err error) error
      if !f.IsDir()
      fileList = append(fileList, path)

      return nil
      )

      return fileList


      func getFileEndings(filename string, c chan Endings)
      fileContent, err := ioutil.ReadFile(filename)
      check(err)
      c <- countEndings(string(fileContent))


      func countEndings(s string) Endings
      crlf := regexp.MustCompile("rn")
      lf := regexp.MustCompile("n")

      x := len(crlf.FindAllStringIndex(s, -1))
      y := len(lf.FindAllStringIndex(s, -1))

      return Endingsuint(x), uint(y - x)


      func splitIntoChunks(arr string, chunkSize int) string
      if chunkSize <= 0
      panic("chunk size too small")


      if len(arr) <= chunkSize
      return stringarr


      numChunks := int(len(arr)/chunkSize) + 1
      chunks := make(string, numChunks)

      for i := 0; i < numChunks; i++
      l := i * chunkSize
      u := l + chunkSize

      if u > len(arr)
      u = len(arr)


      chunks[i] = arr[l:u]


      return chunks


      func main()
      searchDir := os.Args[1]
      c := make(chan Endings)
      chunkSize := 1000

      fileList := getFileList(searchDir)

      count := Endings0, 0
      for _, chunk := range splitIntoChunks(fileList, chunkSize)
      for _, file := range chunk
      go getFileEndings(file, c)


      for _ = range chunk
      result := <-c

      count.crlf += result.crlf
      count.lf += result.lf



      fmt.Printf("crlf: %d, lf: %dn", count.crlf, count.lf)









      share|improve this question












      share|improve this question




      share|improve this question








      edited Mar 2 at 22:41
























      asked Mar 1 at 14:57









      Chris

      20516




      20516




















          1 Answer
          1






          active

          oldest

          votes

















          up vote
          1
          down vote



          accepted










          Your program works fine and you are correctly using goroutines, but the code isn't very idiomatic.



          Producer/Consumer architecture



          What we have here is a typical producer/consumer scenario. The producer is filepath.Walk(), which returns a list of files, and the consumers are goroutines executing a regex on the file contents.



          So instead of the splitIntoChunks() method, we could just have a chan string: the producer sends file names to the channel, and the consumers loop over it and parse the file content.



          This has two main advantages:



          • code is easier to read

          • producer and consumer can work in parallel

          So the code would look like:



          results := make(chan Endings, 2) // chan to get the results of each worker
          var wg sync.WaitGroup
          wg.Add(2)
          files := make(chan string, 1000) // channel containing file list
          for i := 0; i < 2; i++ {
              go run(files, results, &wg) // start the workers (consumers)
          }
          filepath.Walk(searchDir, func(path string, f os.FileInfo, err error) error {
              if !f.IsDir() {
                  files <- path // feed the channel
              }
              return nil
          })
          close(files)
          wg.Wait()
          close(results)


          Work on byte rather than on string



          This is a general advice regarding performances: always prefer working on byte instead of working on string to avoid extra allocations.
          The regex package has methods to work on string or on byte slice, so instead of



          fileContent, err := ioutil.ReadFile(filename)
          c <- countEndings(string(fileContent))
          ...
          x := len(crlf.FindAllStringIndex(s, -1))


          we can just use



          fileContent, err := ioutil.ReadFile(fileName)
          ...
          x := len(crlf.FindAllIndex(fileContent, -1))


          Don't panic



          The code shouldn't panic on every error. For example, if the user running the program doesn't have permission to read a file in searchDir, the program should not crash but rather log the error.



          so avoid method like this:



          func check(e error) {
              if e != nil {
                  panic(e)
              }
          }




          instead, handle the error locally:



          fileContent, err := ioutil.ReadFile(fileName)
          if err != nil {
              fmt.Printf("fail to read file %s: %v\n", fileName, err)
          }



          Other consideration



          It's better to define regex as global var:



          var (
              crlf = regexp.MustCompile("\r\n")
              lf   = regexp.MustCompile("\n")
          )


          It's also better to use the flag package to parse arguments instead of relying on os.Args



          Here is the final version of the code:



          package main

          import (
              "flag"
              "fmt"
              "io/ioutil"
              "os"
              "path/filepath"
              "regexp"
              "sync"
          )

          // Line-terminator patterns, compiled once at startup rather than per file.
          var (
              crlf = regexp.MustCompile("\r\n")
              lf   = regexp.MustCompile("\n")
          )

          // Endings accumulates counts of CRLF and bare-LF line terminators.
          type Endings struct {
              crlf, lf uint
          }

          // countEndings reads fileName and adds its terminator counts to e.
          // Read errors are logged and the file contributes nothing.
          func (e *Endings) countEndings(fileName string) {
              c, err := ioutil.ReadFile(fileName)
              if err != nil {
                  fmt.Printf("fail to read file %s: %v\n", fileName, err)
                  return
              }
              x := len(crlf.FindAllIndex(c, -1))
              y := len(lf.FindAllIndex(c, -1))

              e.crlf += uint(x)
              e.lf += uint(y - x) // every "\r\n" also matches "\n"
          }

          // run is a worker: it consumes file names until files is closed, then
          // sends its accumulated totals on results and signals wg.
          func run(files <-chan string, results chan<- Endings, wg *sync.WaitGroup) {
              defer wg.Done()
              e := &Endings{}
              for f := range files {
                  e.countEndings(f)
              }
              results <- *e
          }

          func main() {
              searchDir := flag.String("dir", "dir", "directory to search")
              nWorker := flag.Int("n", 2, "number of workers")
              flag.Parse()

              results := make(chan Endings, *nWorker)

              var wg sync.WaitGroup
              wg.Add(*nWorker)
              files := make(chan string, 1000)
              for i := 0; i < *nWorker; i++ {
                  go run(files, results, &wg)
              }

              filepath.Walk(*searchDir, func(path string, f os.FileInfo, err error) error {
                  if err != nil {
                      return nil // skip entries the walk could not stat
                  }
                  if !f.IsDir() {
                      files <- path
                  }
                  return nil
              })
              close(files)
              wg.Wait()
              close(results)

              count := &Endings{}
              for e := range results {
                  count.crlf += e.crlf
                  count.lf += e.lf
              }
              fmt.Printf("crlf: %d, lf: %d\n", count.crlf, count.lf)
          }



          Performance



          The new code is slightly faster:



          old:



          $ time ./test ~/go/src/golang.org 
          crlf: 6231, lf: 1589462
          ./test ~/go/src/golang.org 1,78s user 0,23s system 167% cpu 1,201 total


          new:



          $ time ./test -dir ~/go/src/golang.org -n 6 
          crlf: 6231, lf: 1589462
          ./test -dir ~/go/src/golang.org -n 6 1,48s user 0,18s system 181% cpu 0,911 total





          share|improve this answer























            Your Answer




            StackExchange.ifUsing("editor", function ()
            return StackExchange.using("mathjaxEditing", function ()
            StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
            StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
            );
            );
            , "mathjax-editing");

            StackExchange.ifUsing("editor", function ()
            StackExchange.using("externalEditor", function ()
            StackExchange.using("snippets", function ()
            StackExchange.snippets.init();
            );
            );
            , "code-snippets");

            StackExchange.ready(function()
            var channelOptions =
            tags: "".split(" "),
            id: "196"
            ;
            initTagRenderer("".split(" "), "".split(" "), channelOptions);

            StackExchange.using("externalEditor", function()
            // Have to fire editor after snippets, if snippets enabled
            if (StackExchange.settings.snippets.snippetsEnabled)
            StackExchange.using("snippets", function()
            createEditor();
            );

            else
            createEditor();

            );

            function createEditor()
            StackExchange.prepareEditor(
            heartbeatType: 'answer',
            convertImagesToLinks: false,
            noModals: false,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: null,
            bindNavPrevention: true,
            postfix: "",
            onDemand: true,
            discardSelector: ".discard-answer"
            ,immediatelyShowMarkdownHelp:true
            );



            );








             

            draft saved


            draft discarded


















            StackExchange.ready(
            function ()
            StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f188616%2fcounting-different-kinds-of-line-terminator-characters-in-files%23new-answer', 'question_page');

            );

            Post as a guest






























            1 Answer
            1






            active

            oldest

            votes








            1 Answer
            1






            active

            oldest

            votes









            active

            oldest

            votes






            active

            oldest

            votes








            up vote
            1
            down vote



            accepted










            Your program works fine and you are correctly using goroutines, but the code isn't very idiomatic.



            Producer/Consumer architecture



            What we have here is a typical producer/consumer scenario. The producer is filepath.Walk() wich returns a list of files, and the consumers are goroutines executing regex on file content.



            So instead of the splitIntoChunks() method, we could just have a chan string: the producer sends file names to the channel, and the consumers loop over it and parse the file content.



            This have two main advantages:



            • code is easier to read

            • producer and consumer can work in parallel

            So the code would looks like



            results := make(chan Endings, 2) // chan to get the results of each worker
            var wg sync.WaitGroup
            wg.Add(2)
            files := make(chan string, 1000) // channel containing file list
            for i := 0; i < 2; i++
            go run(files, results, &wg) // start the producers

            filepath.Walk(searchDir, func(path string, f os.FileInfo, err error) error
            if !f.IsDir()
            files <- path // feed the channel

            return nil
            )
            close(files)
            wg.Wait()
            close(results)


            Work on byte rather than on string



            This is a general advice regarding performances: always prefer working on byte instead of working on string to avoid extra allocations.
            The regex package has methods to work on string or on byte slice, so instead of



            fileContent, err := ioutil.ReadFile(filename)
            c <- countEndings(string(fileContent))
            ...
            x := len(crlf.FindAllStringIndex(s, -1))


            we can just use



            fileContent, err := ioutil.ReadFile(fileName)
            ...
            x := len(crlf.FindAllIndex(fileContent, -1))


            Don't panic



            The code shouldn't panic on every error. For example, if the user running the program don't have the permission to read a file in searchDir, the program should not crash but rather log the error



            so avoid method like this:



            func check(e error) 
            if e != nil
            panic(e)




            instead handle the error localy:



            fileContent, err := ioutil.ReadFile(fileName)
            if err != nil
            fmt.Printf("fail to read file %s: %vn", fileName, err)



            Other consideration



            It's better to define regex as global var:



            var (
            crlf = regexp.MustCompile("rn")
            lf = regexp.MustCompile("n")
            )


            It's also better to use the flag package to parse arguments instead of relying on os.Args



            Here is the final version of the code:



            package main

            import (
            "flag"
            "fmt"
            "io/ioutil"
            "os"
            "path/filepath"
            "regexp"
            "sync"
            )

            var (
            crlf = regexp.MustCompile("rn")
            lf = regexp.MustCompile("n")
            )

            type Endings struct
            crlf, lf uint


            func (e *Endings) countEndings(fileName string)
            c, err := ioutil.ReadFile(fileName)
            if err != nil
            fmt.Printf("fail to read file %s: %vn", fileName, err)

            x := len(crlf.FindAllIndex(c, -1))
            y := len(lf.FindAllIndex(c, -1))

            e.crlf += uint(x)
            e.lf += uint(y - x)


            func run(files <-chan string, results chan<- Endings, wg *sync.WaitGroup)
            e := &Endings
            for f := range files
            e.countEndings(f)

            results <- *e
            wg.Done()


            func main()

            searchDir := flag.String("dir", "dir", "directory to search")
            nWorker := flag.Int("n", 2, "number of worker")
            flag.Parse()

            results := make(chan Endings, *nWorker)

            var wg sync.WaitGroup
            wg.Add(*nWorker)
            files := make(chan string, 1000)
            for i := 0; i < *nWorker; i++
            go run(files, results, &wg)

            filepath.Walk(*searchDir, func(path string, f os.FileInfo, err error) error
            if !f.IsDir()
            files <- path

            return nil
            )
            close(files)
            wg.Wait()
            close(results)

            count := &Endings
            for e := range results
            count.crlf += e.crlf
            count.lf += e.lf

            fmt.Printf("crlf: %d, lf: %dn", count.crlf, count.lf)



            Performance



            The new code is slightly faster:



            old:



            $ time ./test ~/go/src/golang.org 
            crlf: 6231, lf: 1589462
            ./test ~/go/src/golang.org 1,78s user 0,23s system 167% cpu 1,201 total


            new:



            $ time ./test -dir ~/go/src/golang.org -n 6 
            crlf: 6231, lf: 1589462
            ./test -dir ~/go/src/golang.org -n 6 1,48s user 0,18s system 181% cpu 0,911 total





            share|improve this answer



























              up vote
              1
              down vote



              accepted










              Your program works fine and you are correctly using goroutines, but the code isn't very idiomatic.



              Producer/Consumer architecture



              What we have here is a typical producer/consumer scenario. The producer is filepath.Walk() wich returns a list of files, and the consumers are goroutines executing regex on file content.



              So instead of the splitIntoChunks() method, we could just have a chan string: the producer sends file names to the channel, and the consumers loop over it and parse the file content.



              This have two main advantages:



              • code is easier to read

              • producer and consumer can work in parallel

              So the code would looks like



              results := make(chan Endings, 2) // chan to get the results of each worker
              var wg sync.WaitGroup
              wg.Add(2)
              files := make(chan string, 1000) // channel containing file list
              for i := 0; i < 2; i++
              go run(files, results, &wg) // start the producers

              filepath.Walk(searchDir, func(path string, f os.FileInfo, err error) error
              if !f.IsDir()
              files <- path // feed the channel

              return nil
              )
              close(files)
              wg.Wait()
              close(results)


              Work on byte rather than on string



              This is a general advice regarding performances: always prefer working on byte instead of working on string to avoid extra allocations.
              The regex package has methods to work on string or on byte slice, so instead of



              fileContent, err := ioutil.ReadFile(filename)
              c <- countEndings(string(fileContent))
              ...
              x := len(crlf.FindAllStringIndex(s, -1))


              we can just use



              fileContent, err := ioutil.ReadFile(fileName)
              ...
              x := len(crlf.FindAllIndex(fileContent, -1))


              Don't panic



              The code shouldn't panic on every error. For example, if the user running the program don't have the permission to read a file in searchDir, the program should not crash but rather log the error



              so avoid method like this:



              func check(e error) 
              if e != nil
              panic(e)




              instead handle the error localy:



              fileContent, err := ioutil.ReadFile(fileName)
              if err != nil
              fmt.Printf("fail to read file %s: %vn", fileName, err)



              Other consideration



              It's better to define regex as global var:



              var (
              crlf = regexp.MustCompile("rn")
              lf = regexp.MustCompile("n")
              )


              It's also better to use the flag package to parse arguments instead of relying on os.Args



              Here is the final version of the code:



              package main

              import (
              "flag"
              "fmt"
              "io/ioutil"
              "os"
              "path/filepath"
              "regexp"
              "sync"
              )

              var (
              crlf = regexp.MustCompile("rn")
              lf = regexp.MustCompile("n")
              )

              type Endings struct
              crlf, lf uint


              func (e *Endings) countEndings(fileName string)
              c, err := ioutil.ReadFile(fileName)
              if err != nil
              fmt.Printf("fail to read file %s: %vn", fileName, err)

              x := len(crlf.FindAllIndex(c, -1))
              y := len(lf.FindAllIndex(c, -1))

              e.crlf += uint(x)
              e.lf += uint(y - x)


              func run(files <-chan string, results chan<- Endings, wg *sync.WaitGroup)
              e := &Endings
              for f := range files
              e.countEndings(f)

              results <- *e
              wg.Done()


              func main()

              searchDir := flag.String("dir", "dir", "directory to search")
              nWorker := flag.Int("n", 2, "number of worker")
              flag.Parse()

              results := make(chan Endings, *nWorker)

              var wg sync.WaitGroup
              wg.Add(*nWorker)
              files := make(chan string, 1000)
              for i := 0; i < *nWorker; i++
              go run(files, results, &wg)

              filepath.Walk(*searchDir, func(path string, f os.FileInfo, err error) error
              if !f.IsDir()
              files <- path

              return nil
              )
              close(files)
              wg.Wait()
              close(results)

              count := &Endings
              for e := range results
              count.crlf += e.crlf
              count.lf += e.lf

              fmt.Printf("crlf: %d, lf: %dn", count.crlf, count.lf)



              Performance



              The new code is slightly faster:



              old:



              $ time ./test ~/go/src/golang.org 
              crlf: 6231, lf: 1589462
              ./test ~/go/src/golang.org 1,78s user 0,23s system 167% cpu 1,201 total


              new:



              $ time ./test -dir ~/go/src/golang.org -n 6 
              crlf: 6231, lf: 1589462
              ./test -dir ~/go/src/golang.org -n 6 1,48s user 0,18s system 181% cpu 0,911 total





              share|improve this answer

























                up vote
                1
                down vote



                accepted







                up vote
                1
                down vote



                accepted






                Your program works fine and you are correctly using goroutines, but the code isn't very idiomatic.



                Producer/Consumer architecture



                What we have here is a typical producer/consumer scenario. The producer is filepath.Walk() wich returns a list of files, and the consumers are goroutines executing regex on file content.



                So instead of the splitIntoChunks() method, we could just have a chan string: the producer sends file names to the channel, and the consumers loop over it and parse the file content.



                This have two main advantages:



                • code is easier to read

                • producer and consumer can work in parallel

                So the code would looks like



                results := make(chan Endings, 2) // chan to get the results of each worker
                var wg sync.WaitGroup
                wg.Add(2)
                files := make(chan string, 1000) // channel containing file list
                for i := 0; i < 2; i++
                go run(files, results, &wg) // start the producers

                filepath.Walk(searchDir, func(path string, f os.FileInfo, err error) error
                if !f.IsDir()
                files <- path // feed the channel

                return nil
                )
                close(files)
                wg.Wait()
                close(results)


                Work on byte rather than on string



                This is a general advice regarding performances: always prefer working on byte instead of working on string to avoid extra allocations.
                The regex package has methods to work on string or on byte slice, so instead of



                fileContent, err := ioutil.ReadFile(filename)
                c <- countEndings(string(fileContent))
                ...
                x := len(crlf.FindAllStringIndex(s, -1))


                we can just use



                fileContent, err := ioutil.ReadFile(fileName)
                ...
                x := len(crlf.FindAllIndex(fileContent, -1))


                Don't panic



                The code shouldn't panic on every error. For example, if the user running the program don't have the permission to read a file in searchDir, the program should not crash but rather log the error



                so avoid method like this:



                func check(e error) 
                if e != nil
                panic(e)




                instead handle the error localy:



                fileContent, err := ioutil.ReadFile(fileName)
                if err != nil
                fmt.Printf("fail to read file %s: %vn", fileName, err)



                Other consideration



                It's better to define regex as global var:



                var (
                crlf = regexp.MustCompile("rn")
                lf = regexp.MustCompile("n")
                )


                It's also better to use the flag package to parse arguments instead of relying on os.Args



                Here is the final version of the code:



                package main

                import (
                    "flag"
                    "fmt"
                    "io/ioutil"
                    "os"
                    "path/filepath"
                    "regexp"
                    "sync"
                )

                var (
                    crlf = regexp.MustCompile("\r\n")
                    lf   = regexp.MustCompile("\n")
                )

                type Endings struct {
                    crlf, lf uint
                }

                func (e *Endings) countEndings(fileName string) {
                    c, err := ioutil.ReadFile(fileName)
                    if err != nil {
                        fmt.Printf("fail to read file %s: %v\n", fileName, err)
                    }

                    x := len(crlf.FindAllIndex(c, -1))
                    y := len(lf.FindAllIndex(c, -1))

                    e.crlf += uint(x)
                    e.lf += uint(y - x)
                }

                func run(files <-chan string, results chan<- Endings, wg *sync.WaitGroup) {
                    e := &Endings{}
                    for f := range files {
                        e.countEndings(f)
                    }
                    results <- *e
                    wg.Done()
                }

                func main() {
                    searchDir := flag.String("dir", "dir", "directory to search")
                    nWorker := flag.Int("n", 2, "number of worker")
                    flag.Parse()

                    results := make(chan Endings, *nWorker)

                    var wg sync.WaitGroup
                    wg.Add(*nWorker)
                    files := make(chan string, 1000)
                    for i := 0; i < *nWorker; i++ {
                        go run(files, results, &wg)
                    }

                    filepath.Walk(*searchDir, func(path string, f os.FileInfo, err error) error {
                        if !f.IsDir() {
                            files <- path
                        }
                        return nil
                    })
                    close(files)
                    wg.Wait()
                    close(results)

                    count := &Endings{}
                    for e := range results {
                        count.crlf += e.crlf
                        count.lf += e.lf
                    }
                    fmt.Printf("crlf: %d, lf: %d\n", count.crlf, count.lf)
                }



                Performance



                The new code is slightly faster:



                old:



                $ time ./test ~/go/src/golang.org 
                crlf: 6231, lf: 1589462
                ./test ~/go/src/golang.org 1,78s user 0,23s system 167% cpu 1,201 total


                new:



                $ time ./test -dir ~/go/src/golang.org -n 6 
                crlf: 6231, lf: 1589462
                ./test -dir ~/go/src/golang.org -n 6 1,48s user 0,18s system 181% cpu 0,911 total





                share|improve this answer















                Your program works fine and you are correctly using goroutines, but the code isn't very idiomatic.



                Producer/Consumer architecture



                What we have here is a typical producer/consumer scenario. The producer is filepath.Walk(), which returns a list of files, and the consumers are goroutines executing the regexes on the file contents.



                So instead of the splitIntoChunks() method, we could just have a chan string: the producer sends file names to the channel, and the consumers loop over it and parse the file content.



                This has two main advantages:



                • code is easier to read

                • producer and consumer can work in parallel

                So the code would look like:



                results := make(chan Endings, 2) // chan to get the results of each worker
                var wg sync.WaitGroup
                wg.Add(2)
                files := make(chan string, 1000) // channel containing file list
                for i := 0; i < 2; i++ {
                    go run(files, results, &wg) // start the consumers
                }

                filepath.Walk(searchDir, func(path string, f os.FileInfo, err error) error {
                    if !f.IsDir() {
                        files <- path // feed the channel
                    }
                    return nil
                })
                close(files)
                wg.Wait()
                close(results)



                Work on []byte rather than on string



                This is general advice regarding performance: always prefer working on []byte instead of working on string to avoid extra allocations.
                The regexp package has methods to work on string or on a byte slice, so instead of



                fileContent, err := ioutil.ReadFile(filename)
                c <- countEndings(string(fileContent))
                ...
                x := len(crlf.FindAllStringIndex(s, -1))


                we can just use



                fileContent, err := ioutil.ReadFile(fileName)
                ...
                x := len(crlf.FindAllIndex(fileContent, -1))


                Don't panic



                The code shouldn't panic on every error. For example, if the user running the program doesn't have permission to read a file in searchDir, the program should not crash but rather log the error.



                so avoid method like this:



                func check(e error) {
                    if e != nil {
                        panic(e)
                    }
                }




                instead, handle the error locally:



                fileContent, err := ioutil.ReadFile(fileName)
                if err != nil {
                    fmt.Printf("fail to read file %s: %v\n", fileName, err)
                }



                Other consideration



                It's better to define regex as global var:



                var (
                    crlf = regexp.MustCompile("\r\n")
                    lf   = regexp.MustCompile("\n")
                )



                It's also better to use the flag package to parse arguments instead of relying on os.Args



                Here is the final version of the code:



                package main

                import (
                    "flag"
                    "fmt"
                    "io/ioutil"
                    "os"
                    "path/filepath"
                    "regexp"
                    "sync"
                )

                var (
                    crlf = regexp.MustCompile("\r\n")
                    lf   = regexp.MustCompile("\n")
                )

                type Endings struct {
                    crlf, lf uint
                }

                func (e *Endings) countEndings(fileName string) {
                    c, err := ioutil.ReadFile(fileName)
                    if err != nil {
                        fmt.Printf("fail to read file %s: %v\n", fileName, err)
                    }

                    x := len(crlf.FindAllIndex(c, -1))
                    y := len(lf.FindAllIndex(c, -1))

                    e.crlf += uint(x)
                    e.lf += uint(y - x)
                }

                func run(files <-chan string, results chan<- Endings, wg *sync.WaitGroup) {
                    e := &Endings{}
                    for f := range files {
                        e.countEndings(f)
                    }
                    results <- *e
                    wg.Done()
                }

                func main() {
                    searchDir := flag.String("dir", "dir", "directory to search")
                    nWorker := flag.Int("n", 2, "number of worker")
                    flag.Parse()

                    results := make(chan Endings, *nWorker)

                    var wg sync.WaitGroup
                    wg.Add(*nWorker)
                    files := make(chan string, 1000)
                    for i := 0; i < *nWorker; i++ {
                        go run(files, results, &wg)
                    }

                    filepath.Walk(*searchDir, func(path string, f os.FileInfo, err error) error {
                        if !f.IsDir() {
                            files <- path
                        }
                        return nil
                    })
                    close(files)
                    wg.Wait()
                    close(results)

                    count := &Endings{}
                    for e := range results {
                        count.crlf += e.crlf
                        count.lf += e.lf
                    }
                    fmt.Printf("crlf: %d, lf: %d\n", count.crlf, count.lf)
                }



                Performance



                The new code is slightly faster:



                old:



                $ time ./test ~/go/src/golang.org 
                crlf: 6231, lf: 1589462
                ./test ~/go/src/golang.org 1,78s user 0,23s system 167% cpu 1,201 total


                new:



                $ time ./test -dir ~/go/src/golang.org -n 6 
                crlf: 6231, lf: 1589462
                ./test -dir ~/go/src/golang.org -n 6 1,48s user 0,18s system 181% cpu 0,911 total






                share|improve this answer















                share|improve this answer



                share|improve this answer








                edited Mar 21 at 6:47


























                answered Mar 20 at 10:59









                felix

                67829




                67829






















                     

                    draft saved


                    draft discarded


























                     


                    draft saved


                    draft discarded














                    StackExchange.ready(
                    function ()
                    StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f188616%2fcounting-different-kinds-of-line-terminator-characters-in-files%23new-answer', 'question_page');

                    );

                    Post as a guest













































































                    Popular posts from this blog

                    Greedy Best First Search implementation in Rust

                    Function to Return a JSON Like Objects Using VBA Collections and Arrays

                    C++11 CLH Lock Implementation