Counting different kinds of line terminator characters in files

The name of the pictureThe name of the pictureThe name of the pictureClash Royale CLAN TAG#URR8PPP





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;







up vote
2
down vote

favorite
1












Can someone take a look if this code piece is ok. I'm most interested whether the goroutines are used correctly, but also general Go best practices.



package main

import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
"regexp"
)

// Endings holds counts of the two line-terminator styles found in a file:
// crlf is the number of "\r\n" sequences, lf the number of bare "\n".
type Endings struct {
	crlf, lf uint
}


// check panics if e is non-nil.
// NOTE(review): panicking is heavy-handed for expected errors such as an
// unreadable file; callers would be better served by logging and skipping.
func check(e error) {
	if e != nil {
		panic(e)
	}
}



// getFileList walks searchDir recursively and returns the paths of all
// non-directory entries found under it.
func getFileList(searchDir string) []string {
	fileList := []string{}
	filepath.Walk(searchDir, func(path string, f os.FileInfo, err error) error {
		// Skip entries the walk could not stat (e.g. permission denied);
		// without this guard f is nil and f.IsDir() would panic.
		if err != nil {
			return nil
		}
		if !f.IsDir() {
			fileList = append(fileList, path)
		}
		return nil
	})
	return fileList
}


func getFileEndings(filename string, c chan Endings)
fileContent, err := ioutil.ReadFile(filename)
check(err)
c <- countEndings(string(fileContent))


func countEndings(s string) Endings
crlf := regexp.MustCompile("rn")
lf := regexp.MustCompile("n")

x := len(crlf.FindAllStringIndex(s, -1))
y := len(lf.FindAllStringIndex(s, -1))

return Endingsuint(x), uint(y - x)


// splitIntoChunks partitions arr into consecutive sub-slices of at most
// chunkSize elements each. The sub-slices alias arr's backing array.
// Panics if chunkSize <= 0.
func splitIntoChunks(arr []string, chunkSize int) [][]string {
	if chunkSize <= 0 {
		panic("chunk size too small")
	}

	if len(arr) <= chunkSize {
		return [][]string{arr}
	}

	// Ceiling division; the original len/chunkSize+1 formula produced a
	// trailing empty chunk whenever len(arr) was an exact multiple of
	// chunkSize.
	numChunks := (len(arr) + chunkSize - 1) / chunkSize
	chunks := make([][]string, numChunks)

	for i := 0; i < numChunks; i++ {
		l := i * chunkSize
		u := l + chunkSize

		if u > len(arr) {
			u = len(arr)
		}

		chunks[i] = arr[l:u]
	}

	return chunks
}


func main()
searchDir := os.Args[1]
c := make(chan Endings)
chunkSize := 1000

fileList := getFileList(searchDir)

count := Endings0, 0
for _, chunk := range splitIntoChunks(fileList, chunkSize)
for _, file := range chunk
go getFileEndings(file, c)


for _ = range chunk
result := <-c

count.crlf += result.crlf
count.lf += result.lf



fmt.Printf("crlf: %d, lf: %dn", count.crlf, count.lf)







share|improve this question



























    up vote
    2
    down vote

    favorite
    1












    Can someone take a look if this code piece is ok. I'm most interested whether the goroutines are used correctly, but also general Go best practices.



    package main

    import (
    "fmt"
    "io/ioutil"
    "os"
    "path/filepath"
    "regexp"
    )

    type Endings struct
    crlf, lf uint


    func check(e error)
    if e != nil
    panic(e)



    func getFileList(searchDir string) string
    fileList := string
    filepath.Walk(searchDir, func(path string, f os.FileInfo, err error) error
    if !f.IsDir()
    fileList = append(fileList, path)

    return nil
    )

    return fileList


    func getFileEndings(filename string, c chan Endings)
    fileContent, err := ioutil.ReadFile(filename)
    check(err)
    c <- countEndings(string(fileContent))


    func countEndings(s string) Endings
    crlf := regexp.MustCompile("rn")
    lf := regexp.MustCompile("n")

    x := len(crlf.FindAllStringIndex(s, -1))
    y := len(lf.FindAllStringIndex(s, -1))

    return Endingsuint(x), uint(y - x)


    func splitIntoChunks(arr string, chunkSize int) string
    if chunkSize <= 0
    panic("chunk size too small")


    if len(arr) <= chunkSize
    return stringarr


    numChunks := int(len(arr)/chunkSize) + 1
    chunks := make(string, numChunks)

    for i := 0; i < numChunks; i++
    l := i * chunkSize
    u := l + chunkSize

    if u > len(arr)
    u = len(arr)


    chunks[i] = arr[l:u]


    return chunks


    func main()
    searchDir := os.Args[1]
    c := make(chan Endings)
    chunkSize := 1000

    fileList := getFileList(searchDir)

    count := Endings0, 0
    for _, chunk := range splitIntoChunks(fileList, chunkSize)
    for _, file := range chunk
    go getFileEndings(file, c)


    for _ = range chunk
    result := <-c

    count.crlf += result.crlf
    count.lf += result.lf



    fmt.Printf("crlf: %d, lf: %dn", count.crlf, count.lf)







    share|improve this question























      up vote
      2
      down vote

      favorite
      1









      up vote
      2
      down vote

      favorite
      1






      1





      Can someone take a look if this code piece is ok. I'm most interested whether the goroutines are used correctly, but also general Go best practices.



      package main

      import (
      "fmt"
      "io/ioutil"
      "os"
      "path/filepath"
      "regexp"
      )

      type Endings struct
      crlf, lf uint


      func check(e error)
      if e != nil
      panic(e)



      func getFileList(searchDir string) string
      fileList := string
      filepath.Walk(searchDir, func(path string, f os.FileInfo, err error) error
      if !f.IsDir()
      fileList = append(fileList, path)

      return nil
      )

      return fileList


      func getFileEndings(filename string, c chan Endings)
      fileContent, err := ioutil.ReadFile(filename)
      check(err)
      c <- countEndings(string(fileContent))


      func countEndings(s string) Endings
      crlf := regexp.MustCompile("rn")
      lf := regexp.MustCompile("n")

      x := len(crlf.FindAllStringIndex(s, -1))
      y := len(lf.FindAllStringIndex(s, -1))

      return Endingsuint(x), uint(y - x)


      func splitIntoChunks(arr string, chunkSize int) string
      if chunkSize <= 0
      panic("chunk size too small")


      if len(arr) <= chunkSize
      return stringarr


      numChunks := int(len(arr)/chunkSize) + 1
      chunks := make(string, numChunks)

      for i := 0; i < numChunks; i++
      l := i * chunkSize
      u := l + chunkSize

      if u > len(arr)
      u = len(arr)


      chunks[i] = arr[l:u]


      return chunks


      func main()
      searchDir := os.Args[1]
      c := make(chan Endings)
      chunkSize := 1000

      fileList := getFileList(searchDir)

      count := Endings0, 0
      for _, chunk := range splitIntoChunks(fileList, chunkSize)
      for _, file := range chunk
      go getFileEndings(file, c)


      for _ = range chunk
      result := <-c

      count.crlf += result.crlf
      count.lf += result.lf



      fmt.Printf("crlf: %d, lf: %dn", count.crlf, count.lf)







      share|improve this question













      Can someone take a look if this code piece is ok. I'm most interested whether the goroutines are used correctly, but also general Go best practices.



      package main

      import (
      "fmt"
      "io/ioutil"
      "os"
      "path/filepath"
      "regexp"
      )

      type Endings struct
      crlf, lf uint


      func check(e error)
      if e != nil
      panic(e)



      func getFileList(searchDir string) string
      fileList := string
      filepath.Walk(searchDir, func(path string, f os.FileInfo, err error) error
      if !f.IsDir()
      fileList = append(fileList, path)

      return nil
      )

      return fileList


      func getFileEndings(filename string, c chan Endings)
      fileContent, err := ioutil.ReadFile(filename)
      check(err)
      c <- countEndings(string(fileContent))


      func countEndings(s string) Endings
      crlf := regexp.MustCompile("rn")
      lf := regexp.MustCompile("n")

      x := len(crlf.FindAllStringIndex(s, -1))
      y := len(lf.FindAllStringIndex(s, -1))

      return Endingsuint(x), uint(y - x)


      func splitIntoChunks(arr string, chunkSize int) string
      if chunkSize <= 0
      panic("chunk size too small")


      if len(arr) <= chunkSize
      return stringarr


      numChunks := int(len(arr)/chunkSize) + 1
      chunks := make(string, numChunks)

      for i := 0; i < numChunks; i++
      l := i * chunkSize
      u := l + chunkSize

      if u > len(arr)
      u = len(arr)


      chunks[i] = arr[l:u]


      return chunks


      func main()
      searchDir := os.Args[1]
      c := make(chan Endings)
      chunkSize := 1000

      fileList := getFileList(searchDir)

      count := Endings0, 0
      for _, chunk := range splitIntoChunks(fileList, chunkSize)
      for _, file := range chunk
      go getFileEndings(file, c)


      for _ = range chunk
      result := <-c

      count.crlf += result.crlf
      count.lf += result.lf



      fmt.Printf("crlf: %d, lf: %dn", count.crlf, count.lf)









      share|improve this question












      share|improve this question




      share|improve this question








      edited Mar 2 at 22:41
























      asked Mar 1 at 14:57









      Chris

      20516




      20516




















          1 Answer
          1






          active

          oldest

          votes

















          up vote
          1
          down vote



          accepted










          Your program works fine and you are correctly using goroutines, but the code isn't very idiomatic.



          Producer/Consumer architecture



          What we have here is a typical producer/consumer scenario. The producer is filepath.Walk(), which returns a list of files, and the consumers are goroutines executing a regex on the file contents.



          So instead of the splitIntoChunks() method, we could just have a chan string: the producer sends file names to the channel, and the consumers loop over it and parse the file content.



          This has two main advantages:



          • code is easier to read

          • producer and consumer can work in parallel

          So the code would look like:



          results := make(chan Endings, 2) // chan to get the results of each worker
          var wg sync.WaitGroup
          wg.Add(2)
          files := make(chan string, 1000) // channel containing file list
          for i := 0; i < 2; i++ {
              go run(files, results, &wg) // start the workers (consumers)
          }
          filepath.Walk(searchDir, func(path string, f os.FileInfo, err error) error {
              if !f.IsDir() {
                  files <- path // feed the channel
              }
              return nil
          })
          close(files)
          wg.Wait()
          close(results)


          Work on byte rather than on string



          This is a general advice regarding performances: always prefer working on byte instead of working on string to avoid extra allocations.
          The regex package has methods to work on string or on byte slice, so instead of



          fileContent, err := ioutil.ReadFile(filename)
          c <- countEndings(string(fileContent))
          ...
          x := len(crlf.FindAllStringIndex(s, -1))


          we can just use



          fileContent, err := ioutil.ReadFile(fileName)
          ...
          x := len(crlf.FindAllIndex(fileContent, -1))


          Don't panic



          The code shouldn't panic on every error. For example, if the user running the program doesn't have permission to read a file in searchDir, the program should not crash but rather log the error.



          so avoid method like this:



          func check(e error) {
              if e != nil {
                  panic(e)
              }
          }




          instead, handle the error locally:



          fileContent, err := ioutil.ReadFile(fileName)
          if err != nil {
              fmt.Printf("fail to read file %s: %v\n", fileName, err)
          }



          Other consideration



          It's better to define regex as global var:



          var (
              crlf = regexp.MustCompile("\r\n")
              lf   = regexp.MustCompile("\n")
          )


          It's also better to use the flag package to parse arguments instead of relying on os.Args



          Here is the final version of the code:



          package main

          import (
              "flag"
              "fmt"
              "io/ioutil"
              "os"
              "path/filepath"
              "regexp"
              "sync"
          )

          // Line-terminator patterns, compiled once at startup rather than per file.
          var (
              crlf = regexp.MustCompile("\r\n")
              lf   = regexp.MustCompile("\n")
          )

          // Endings accumulates counts of CRLF and bare-LF line terminators.
          type Endings struct {
              crlf, lf uint
          }

          // countEndings reads fileName and adds its terminator counts to e.
          // Read errors are logged and the file contributes nothing.
          func (e *Endings) countEndings(fileName string) {
              c, err := ioutil.ReadFile(fileName)
              if err != nil {
                  fmt.Printf("fail to read file %s: %v\n", fileName, err)
                  return
              }
              x := len(crlf.FindAllIndex(c, -1))
              y := len(lf.FindAllIndex(c, -1))

              e.crlf += uint(x)
              e.lf += uint(y - x) // every "\r\n" also matches "\n"
          }

          // run is a worker: it consumes file names until files is closed, then
          // sends its accumulated totals on results and signals wg.
          func run(files <-chan string, results chan<- Endings, wg *sync.WaitGroup) {
              defer wg.Done()
              e := &Endings{}
              for f := range files {
                  e.countEndings(f)
              }
              results <- *e
          }

          func main() {
              searchDir := flag.String("dir", "dir", "directory to search")
              nWorker := flag.Int("n", 2, "number of workers")
              flag.Parse()

              results := make(chan Endings, *nWorker)

              var wg sync.WaitGroup
              wg.Add(*nWorker)
              files := make(chan string, 1000)
              for i := 0; i < *nWorker; i++ {
                  go run(files, results, &wg)
              }

              filepath.Walk(*searchDir, func(path string, f os.FileInfo, err error) error {
                  if err != nil {
                      return nil // skip entries the walk could not stat
                  }
                  if !f.IsDir() {
                      files <- path
                  }
                  return nil
              })
              close(files)
              wg.Wait()
              close(results)

              count := &Endings{}
              for e := range results {
                  count.crlf += e.crlf
                  count.lf += e.lf
              }
              fmt.Printf("crlf: %d, lf: %d\n", count.crlf, count.lf)
          }



          Performance



          The new code is slightly faster:



          old:



          $ time ./test ~/go/src/golang.org 
          crlf: 6231, lf: 1589462
          ./test ~/go/src/golang.org 1,78s user 0,23s system 167% cpu 1,201 total


          new:



          $ time ./test -dir ~/go/src/golang.org -n 6 
          crlf: 6231, lf: 1589462
          ./test -dir ~/go/src/golang.org -n 6 1,48s user 0,18s system 181% cpu 0,911 total





          share|improve this answer























            Your Answer




            StackExchange.ifUsing("editor", function ()
            return StackExchange.using("mathjaxEditing", function ()
            StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
            StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
            );
            );
            , "mathjax-editing");

            StackExchange.ifUsing("editor", function ()
            StackExchange.using("externalEditor", function ()
            StackExchange.using("snippets", function ()
            StackExchange.snippets.init();
            );
            );
            , "code-snippets");

            StackExchange.ready(function()
            var channelOptions =
            tags: "".split(" "),
            id: "196"
            ;
            initTagRenderer("".split(" "), "".split(" "), channelOptions);

            StackExchange.using("externalEditor", function()
            // Have to fire editor after snippets, if snippets enabled
            if (StackExchange.settings.snippets.snippetsEnabled)
            StackExchange.using("snippets", function()
            createEditor();
            );

            else
            createEditor();

            );

            function createEditor()
            StackExchange.prepareEditor(
            heartbeatType: 'answer',
            convertImagesToLinks: false,
            noModals: false,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: null,
            bindNavPrevention: true,
            postfix: "",
            onDemand: true,
            discardSelector: ".discard-answer"
            ,immediatelyShowMarkdownHelp:true
            );



            );








             

            draft saved


            draft discarded


















            StackExchange.ready(
            function ()
            StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f188616%2fcounting-different-kinds-of-line-terminator-characters-in-files%23new-answer', 'question_page');

            );

            Post as a guest






























            1 Answer
            1






            active

            oldest

            votes








            1 Answer
            1






            active

            oldest

            votes









            active

            oldest

            votes






            active

            oldest

            votes








            up vote
            1
            down vote



            accepted










            Your program works fine and you are correctly using goroutines, but the code isn't very idiomatic.



            Producer/Consumer architecture



            What we have here is a typical producer/consumer scenario. The producer is filepath.Walk() wich returns a list of files, and the consumers are goroutines executing regex on file content.



            So instead of the splitIntoChunks() method, we could just have a chan string: the producer sends file names to the channel, and the consumers loop over it and parse the file content.



            This have two main advantages:



            • code is easier to read

            • producer and consumer can work in parallel

            So the code would looks like



            results := make(chan Endings, 2) // chan to get the results of each worker
            var wg sync.WaitGroup
            wg.Add(2)
            files := make(chan string, 1000) // channel containing file list
            for i := 0; i < 2; i++
            go run(files, results, &wg) // start the producers

            filepath.Walk(searchDir, func(path string, f os.FileInfo, err error) error
            if !f.IsDir()
            files <- path // feed the channel

            return nil
            )
            close(files)
            wg.Wait()
            close(results)


            Work on byte rather than on string



            This is a general advice regarding performances: always prefer working on byte instead of working on string to avoid extra allocations.
            The regex package has methods to work on string or on byte slice, so instead of



            fileContent, err := ioutil.ReadFile(filename)
            c <- countEndings(string(fileContent))
            ...
            x := len(crlf.FindAllStringIndex(s, -1))


            we can just use



            fileContent, err := ioutil.ReadFile(fileName)
            ...
            x := len(crlf.FindAllIndex(fileContent, -1))


            Don't panic



            The code shouldn't panic on every error. For example, if the user running the program don't have the permission to read a file in searchDir, the program should not crash but rather log the error



            so avoid method like this:



            func check(e error) 
            if e != nil
            panic(e)




            instead handle the error localy:



            fileContent, err := ioutil.ReadFile(fileName)
            if err != nil
            fmt.Printf("fail to read file %s: %vn", fileName, err)



            Other consideration



            It's better to define regex as global var:



            var (
            crlf = regexp.MustCompile("rn")
            lf = regexp.MustCompile("n")
            )


            It's also better to use the flag package to parse arguments instead of relying on os.Args



            Here is the final version of the code:



            package main

            import (
            "flag"
            "fmt"
            "io/ioutil"
            "os"
            "path/filepath"
            "regexp"
            "sync"
            )

            var (
            crlf = regexp.MustCompile("rn")
            lf = regexp.MustCompile("n")
            )

            type Endings struct
            crlf, lf uint


            func (e *Endings) countEndings(fileName string)
            c, err := ioutil.ReadFile(fileName)
            if err != nil
            fmt.Printf("fail to read file %s: %vn", fileName, err)

            x := len(crlf.FindAllIndex(c, -1))
            y := len(lf.FindAllIndex(c, -1))

            e.crlf += uint(x)
            e.lf += uint(y - x)


            func run(files <-chan string, results chan<- Endings, wg *sync.WaitGroup)
            e := &Endings
            for f := range files
            e.countEndings(f)

            results <- *e
            wg.Done()


            func main()

            searchDir := flag.String("dir", "dir", "directory to search")
            nWorker := flag.Int("n", 2, "number of worker")
            flag.Parse()

            results := make(chan Endings, *nWorker)

            var wg sync.WaitGroup
            wg.Add(*nWorker)
            files := make(chan string, 1000)
            for i := 0; i < *nWorker; i++
            go run(files, results, &wg)

            filepath.Walk(*searchDir, func(path string, f os.FileInfo, err error) error
            if !f.IsDir()
            files <- path

            return nil
            )
            close(files)
            wg.Wait()
            close(results)

            count := &Endings
            for e := range results
            count.crlf += e.crlf
            count.lf += e.lf

            fmt.Printf("crlf: %d, lf: %dn", count.crlf, count.lf)



            Performance



            The new code is slightly faster:



            old:



            $ time ./test ~/go/src/golang.org 
            crlf: 6231, lf: 1589462
            ./test ~/go/src/golang.org 1,78s user 0,23s system 167% cpu 1,201 total


            new:



            $ time ./test -dir ~/go/src/golang.org -n 6 
            crlf: 6231, lf: 1589462
            ./test -dir ~/go/src/golang.org -n 6 1,48s user 0,18s system 181% cpu 0,911 total





            share|improve this answer



























              up vote
              1
              down vote



              accepted










              Your program works fine and you are correctly using goroutines, but the code isn't very idiomatic.



              Producer/Consumer architecture



              What we have here is a typical producer/consumer scenario. The producer is filepath.Walk() wich returns a list of files, and the consumers are goroutines executing regex on file content.



              So instead of the splitIntoChunks() method, we could just have a chan string: the producer sends file names to the channel, and the consumers loop over it and parse the file content.



              This have two main advantages:



              • code is easier to read

              • producer and consumer can work in parallel

              So the code would looks like



              results := make(chan Endings, 2) // chan to get the results of each worker
              var wg sync.WaitGroup
              wg.Add(2)
              files := make(chan string, 1000) // channel containing file list
              for i := 0; i < 2; i++
              go run(files, results, &wg) // start the producers

              filepath.Walk(searchDir, func(path string, f os.FileInfo, err error) error
              if !f.IsDir()
              files <- path // feed the channel

              return nil
              )
              close(files)
              wg.Wait()
              close(results)


              Work on byte rather than on string



              This is a general advice regarding performances: always prefer working on byte instead of working on string to avoid extra allocations.
              The regex package has methods to work on string or on byte slice, so instead of



              fileContent, err := ioutil.ReadFile(filename)
              c <- countEndings(string(fileContent))
              ...
              x := len(crlf.FindAllStringIndex(s, -1))


              we can just use



              fileContent, err := ioutil.ReadFile(fileName)
              ...
              x := len(crlf.FindAllIndex(fileContent, -1))


              Don't panic



              The code shouldn't panic on every error. For example, if the user running the program don't have the permission to read a file in searchDir, the program should not crash but rather log the error



              so avoid method like this:



              func check(e error) 
              if e != nil
              panic(e)




              instead handle the error localy:



              fileContent, err := ioutil.ReadFile(fileName)
              if err != nil
              fmt.Printf("fail to read file %s: %vn", fileName, err)



              Other consideration



              It's better to define regex as global var:



              var (
              crlf = regexp.MustCompile("rn")
              lf = regexp.MustCompile("n")
              )


              It's also better to use the flag package to parse arguments instead of relying on os.Args



              Here is the final version of the code:



              package main

              import (
              "flag"
              "fmt"
              "io/ioutil"
              "os"
              "path/filepath"
              "regexp"
              "sync"
              )

              var (
              crlf = regexp.MustCompile("rn")
              lf = regexp.MustCompile("n")
              )

              type Endings struct
              crlf, lf uint


              func (e *Endings) countEndings(fileName string)
              c, err := ioutil.ReadFile(fileName)
              if err != nil
              fmt.Printf("fail to read file %s: %vn", fileName, err)

              x := len(crlf.FindAllIndex(c, -1))
              y := len(lf.FindAllIndex(c, -1))

              e.crlf += uint(x)
              e.lf += uint(y - x)


              func run(files <-chan string, results chan<- Endings, wg *sync.WaitGroup)
              e := &Endings
              for f := range files
              e.countEndings(f)

              results <- *e
              wg.Done()


              func main()

              searchDir := flag.String("dir", "dir", "directory to search")
              nWorker := flag.Int("n", 2, "number of worker")
              flag.Parse()

              results := make(chan Endings, *nWorker)

              var wg sync.WaitGroup
              wg.Add(*nWorker)
              files := make(chan string, 1000)
              for i := 0; i < *nWorker; i++
              go run(files, results, &wg)

              filepath.Walk(*searchDir, func(path string, f os.FileInfo, err error) error
              if !f.IsDir()
              files <- path

              return nil
              )
              close(files)
              wg.Wait()
              close(results)

              count := &Endings
              for e := range results
              count.crlf += e.crlf
              count.lf += e.lf

              fmt.Printf("crlf: %d, lf: %dn", count.crlf, count.lf)



              Performance



              The new code is slightly faster:



              old:



              $ time ./test ~/go/src/golang.org 
              crlf: 6231, lf: 1589462
              ./test ~/go/src/golang.org 1,78s user 0,23s system 167% cpu 1,201 total


              new:



              $ time ./test -dir ~/go/src/golang.org -n 6 
              crlf: 6231, lf: 1589462
              ./test -dir ~/go/src/golang.org -n 6 1,48s user 0,18s system 181% cpu 0,911 total





              share|improve this answer

























                up vote
                1
                down vote



                accepted







                up vote
                1
                down vote



                accepted






                Your program works fine and you are correctly using goroutines, but the code isn't very idiomatic.



                Producer/Consumer architecture



                What we have here is a typical producer/consumer scenario. The producer is filepath.Walk() wich returns a list of files, and the consumers are goroutines executing regex on file content.



                So instead of the splitIntoChunks() method, we could just have a chan string: the producer sends file names to the channel, and the consumers loop over it and parse the file content.



                This have two main advantages:



                • code is easier to read

                • producer and consumer can work in parallel

                So the code would looks like



                results := make(chan Endings, 2) // chan to get the results of each worker
                var wg sync.WaitGroup
                wg.Add(2)
                files := make(chan string, 1000) // channel containing file list
                for i := 0; i < 2; i++
                go run(files, results, &wg) // start the producers

                filepath.Walk(searchDir, func(path string, f os.FileInfo, err error) error
                if !f.IsDir()
                files <- path // feed the channel

                return nil
                )
                close(files)
                wg.Wait()
                close(results)


                Work on byte rather than on string



                This is a general advice regarding performances: always prefer working on byte instead of working on string to avoid extra allocations.
                The regex package has methods to work on string or on byte slice, so instead of



                fileContent, err := ioutil.ReadFile(filename)
                c <- countEndings(string(fileContent))
                ...
                x := len(crlf.FindAllStringIndex(s, -1))


                we can just use



                fileContent, err := ioutil.ReadFile(fileName)
                ...
                x := len(crlf.FindAllIndex(fileContent, -1))


                Don't panic



                The code shouldn't panic on every error. For example, if the user running the program don't have the permission to read a file in searchDir, the program should not crash but rather log the error



                so avoid method like this:



                func check(e error) 
                if e != nil
                panic(e)




                instead handle the error localy:



                fileContent, err := ioutil.ReadFile(fileName)
                if err != nil
                fmt.Printf("fail to read file %s: %vn", fileName, err)



                Other consideration



                It's better to define regex as global var:



                var (
                crlf = regexp.MustCompile("rn")
                lf = regexp.MustCompile("n")
                )


                It's also better to use the flag package to parse arguments instead of relying on os.Args



                Here is the final version of the code:



                package main

                import (
                    "flag"
                    "fmt"
                    "io/ioutil"
                    "os"
                    "path/filepath"
                    "regexp"
                    "sync"
                )

                var (
                    crlf = regexp.MustCompile("\r\n")
                    lf   = regexp.MustCompile("\n")
                )

                type Endings struct {
                    crlf, lf uint
                }

                func (e *Endings) countEndings(fileName string) {
                    c, err := ioutil.ReadFile(fileName)
                    if err != nil {
                        fmt.Printf("fail to read file %s: %v\n", fileName, err)
                    }

                    x := len(crlf.FindAllIndex(c, -1))
                    y := len(lf.FindAllIndex(c, -1))

                    e.crlf += uint(x)
                    e.lf += uint(y - x)
                }

                func run(files <-chan string, results chan<- Endings, wg *sync.WaitGroup) {
                    e := &Endings{}
                    for f := range files {
                        e.countEndings(f)
                    }
                    results <- *e
                    wg.Done()
                }

                func main() {
                    searchDir := flag.String("dir", "dir", "directory to search")
                    nWorker := flag.Int("n", 2, "number of worker")
                    flag.Parse()

                    results := make(chan Endings, *nWorker)

                    var wg sync.WaitGroup
                    wg.Add(*nWorker)
                    files := make(chan string, 1000)
                    for i := 0; i < *nWorker; i++ {
                        go run(files, results, &wg)
                    }

                    filepath.Walk(*searchDir, func(path string, f os.FileInfo, err error) error {
                        if !f.IsDir() {
                            files <- path
                        }
                        return nil
                    })
                    close(files)
                    wg.Wait()
                    close(results)

                    count := &Endings{}
                    for e := range results {
                        count.crlf += e.crlf
                        count.lf += e.lf
                    }
                    fmt.Printf("crlf: %d, lf: %d\n", count.crlf, count.lf)
                }



                Performance



                The new code is slightly faster:



                old:



                $ time ./test ~/go/src/golang.org 
                crlf: 6231, lf: 1589462
                ./test ~/go/src/golang.org 1,78s user 0,23s system 167% cpu 1,201 total


                new:



                $ time ./test -dir ~/go/src/golang.org -n 6 
                crlf: 6231, lf: 1589462
                ./test -dir ~/go/src/golang.org -n 6 1,48s user 0,18s system 181% cpu 0,911 total





                share|improve this answer















                Your program works fine and you are correctly using goroutines, but the code isn't very idiomatic.



                Producer/Consumer architecture



                What we have here is a typical producer/consumer scenario. The producer is filepath.Walk(), which returns a list of files, and the consumers are goroutines executing the regexes on the file contents.



                So instead of the splitIntoChunks() method, we could just have a chan string: the producer sends file names to the channel, and the consumers loop over it and parse the file content.



                This has two main advantages:



                • code is easier to read

                • producer and consumer can work in parallel

                So the code would look like:



                results := make(chan Endings, 2) // chan to get the results of each worker
                var wg sync.WaitGroup
                wg.Add(2)
                files := make(chan string, 1000) // channel containing file list
                for i := 0; i < 2; i++ {
                    go run(files, results, &wg) // start the consumers
                }

                filepath.Walk(searchDir, func(path string, f os.FileInfo, err error) error {
                    if !f.IsDir() {
                        files <- path // feed the channel
                    }
                    return nil
                })
                close(files)
                wg.Wait()
                close(results)



                Work on []byte rather than on string



                This is general advice regarding performance: always prefer working on []byte instead of working on string to avoid extra allocations.
                The regexp package has methods to work on string or on a byte slice, so instead of



                fileContent, err := ioutil.ReadFile(filename)
                c <- countEndings(string(fileContent))
                ...
                x := len(crlf.FindAllStringIndex(s, -1))


                we can just use



                fileContent, err := ioutil.ReadFile(fileName)
                ...
                x := len(crlf.FindAllIndex(fileContent, -1))


                Don't panic



                The code shouldn't panic on every error. For example, if the user running the program doesn't have permission to read a file in searchDir, the program should not crash but rather log the error.



                so avoid method like this:



                func check(e error) {
                    if e != nil {
                        panic(e)
                    }
                }




                instead, handle the error locally:



                fileContent, err := ioutil.ReadFile(fileName)
                if err != nil {
                    fmt.Printf("fail to read file %s: %v\n", fileName, err)
                }



                Other consideration



                It's better to define regex as global var:



                var (
                    crlf = regexp.MustCompile("\r\n")
                    lf   = regexp.MustCompile("\n")
                )



                It's also better to use the flag package to parse arguments instead of relying on os.Args



                Here is the final version of the code:



                package main

                import (
                    "flag"
                    "fmt"
                    "io/ioutil"
                    "os"
                    "path/filepath"
                    "regexp"
                    "sync"
                )

                var (
                    crlf = regexp.MustCompile("\r\n")
                    lf   = regexp.MustCompile("\n")
                )

                type Endings struct {
                    crlf, lf uint
                }

                func (e *Endings) countEndings(fileName string) {
                    c, err := ioutil.ReadFile(fileName)
                    if err != nil {
                        fmt.Printf("fail to read file %s: %v\n", fileName, err)
                    }

                    x := len(crlf.FindAllIndex(c, -1))
                    y := len(lf.FindAllIndex(c, -1))

                    e.crlf += uint(x)
                    e.lf += uint(y - x)
                }

                func run(files <-chan string, results chan<- Endings, wg *sync.WaitGroup) {
                    e := &Endings{}
                    for f := range files {
                        e.countEndings(f)
                    }
                    results <- *e
                    wg.Done()
                }

                func main() {
                    searchDir := flag.String("dir", "dir", "directory to search")
                    nWorker := flag.Int("n", 2, "number of worker")
                    flag.Parse()

                    results := make(chan Endings, *nWorker)

                    var wg sync.WaitGroup
                    wg.Add(*nWorker)
                    files := make(chan string, 1000)
                    for i := 0; i < *nWorker; i++ {
                        go run(files, results, &wg)
                    }

                    filepath.Walk(*searchDir, func(path string, f os.FileInfo, err error) error {
                        if !f.IsDir() {
                            files <- path
                        }
                        return nil
                    })
                    close(files)
                    wg.Wait()
                    close(results)

                    count := &Endings{}
                    for e := range results {
                        count.crlf += e.crlf
                        count.lf += e.lf
                    }
                    fmt.Printf("crlf: %d, lf: %d\n", count.crlf, count.lf)
                }



                Performance



                The new code is slightly faster:



                old:



                $ time ./test ~/go/src/golang.org 
                crlf: 6231, lf: 1589462
                ./test ~/go/src/golang.org 1,78s user 0,23s system 167% cpu 1,201 total


                new:



                $ time ./test -dir ~/go/src/golang.org -n 6 
                crlf: 6231, lf: 1589462
                ./test -dir ~/go/src/golang.org -n 6 1,48s user 0,18s system 181% cpu 0,911 total






                share|improve this answer















                share|improve this answer



                share|improve this answer








                edited Mar 21 at 6:47


























                answered Mar 20 at 10:59









                felix

                67829




                67829






















                     

                    draft saved


                    draft discarded


























                     


                    draft saved


                    draft discarded














                    StackExchange.ready(
                    function ()
                    StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f188616%2fcounting-different-kinds-of-line-terminator-characters-in-files%23new-answer', 'question_page');

                    );

                    Post as a guest













































































                    Popular posts from this blog

                    Greedy Best First Search implementation in Rust

                    Function to Return a JSON Like Objects Using VBA Collections and Arrays

                    C++11 CLH Lock Implementation