//usr/bin/go run $0 $@ ; exit package main import ( "errors" "flag" "fmt" "log" "os" "path/filepath" "regexp" "runtime" "strconv" "sync" "time" "github.com/TwiN/go-color" "github.com/gabriel-vasile/mimetype" "github.com/sirupsen/logrus" ) type LogMessagePDF struct { pdfWordCount int pdfPageTotal int pdfWordByPage int Author string Creator string CreationDate string pdfFormatVersion string pdfError string } type LogMessageXML struct { isWellFormed bool xmlError string } type LogMessage struct { corpusName string name string startAt string extension string path string mimetype string mimeEncoding string size int64 pdf LogMessagePDF xml LogMessageXML } var queueForConcurrent = make(chan struct{}, 1100) var wg sync.WaitGroup var numberFiles int = 0 var corpusPath = flag.String("p", "", "Corpus path") var outputPath = flag.String("o", "out", "Output path") var corpusName = flag.String("n", "test", "Corpus name") var configurationFolder = flag.String("c", "", "Configuration folder path") var withWordCount = flag.Bool("w", false, "Enable word count") var regexMime = regexp.MustCompile(`(.*); charset=(.*)`) func incrementProcess() { numberFiles++ if numberFiles%5000 == 0 { fmt.Printf("\rFiles processed: %d", numberFiles) } } func getAllFiles(dir string) { defer wg.Done() queueForConcurrent <- struct{}{} defer func() { <-queueForConcurrent }() visit := func(path string, file os.FileInfo, err error) error { if err != nil { fmt.Println(color.InRed("Error for getAllFiles")) fmt.Println(err) os.Exit(1) } if file.IsDir() && path != dir { wg.Add(1) go getAllFiles(path) return filepath.SkipDir } if file.Mode().IsRegular() { mtype, err := mimetype.DetectFile(path) extension := mtype.Extension() groupMime := regexMime.FindStringSubmatch(mtype.String()) mimetype := mtype.String() mimeEncoding := "binary" if len(groupMime) == 3 { mimetype = groupMime[1] mimeEncoding = groupMime[2] } absolutePath, err2 := filepath.Abs(path) if err == nil && err2 == nil { fileData := LogMessage{ corpusName: *corpusName, name: file.Name(), startAt: file.ModTime().String(), extension: extension, path: absolutePath, mimetype: mimetype, mimeEncoding: mimeEncoding, size: file.Size(), } if extension == ".pdf" { wg.Add(1) go processPDF(&fileData) } else if extension == ".xml" { wg.Add(1) go processXML(&fileData) } else { logrus.WithFields(logrus.Fields{ "corpusName": fileData.corpusName, "name": fileData.name, "startAt": fileData.startAt, "extension": fileData.extension, "path": fileData.path, "mimetype": fileData.mimetype, "mimeEncoding": fileData.mimeEncoding, "size": fileData.size, }).Info("") } } } return nil } filepath.Walk(dir, visit) } func initProcess() { // init args flag.Parse() if *corpusPath == "" { fmt.Println(color.InRed("Corpus path is not defined")) os.Exit(1) } if *withWordCount == true { fmt.Println("Count number word is enabled") } // init logger sec := time.Now().Unix() logPath := *outputPath + "/" + strconv.Itoa(int(sec)) + "-" + *corpusName log.Println(color.Green + "Begin of program with " + strconv.Itoa(runtime.NumCPU()) + " CPU" + color.Reset) log.Println("Read corpus in", *corpusPath, "and write out in", logPath) os.MkdirAll(logPath, os.ModePerm|os.ModeDir) outputFile, err := os.OpenFile(logPath+"/analyse-logs.json", os.O_WRONLY|os.O_CREATE, 0755) if err != nil { fmt.Println(color.InRed("Unable to create log file, please fix your command")) fmt.Println(err) os.Exit(1) } logrus.SetFormatter(&logrus.JSONFormatter{}) logrus.SetOutput(outputFile) return } func main() { start := time.Now() // init logger and params initProcess() wg.Add(1) getAllFiles(*corpusPath) wg.Wait() close(queueForConcurrent) elapsed := time.Since(start) fmt.Println("") log.Println(color.Green + "End of program with " + strconv.Itoa(numberFiles) + " files processed" + color.Reset) log.Printf("Total time %s", elapsed) } func Exists(name string) bool { _, err := os.Stat(name) if err == nil { return true } if errors.Is(err, os.ErrNotExist) { return false } return false }