//usr/bin/go run $0 $@ ; exit package main import ( "errors" "flag" "fmt" "log" "os" "path/filepath" "regexp" "runtime" "strconv" "sync" "time" "github.com/TwiN/go-color" "github.com/gabriel-vasile/mimetype" "github.com/sirupsen/logrus" ) type LogMessagePDF struct { pdfWordCount int pdfPageTotal int pdfWordByPage int Author string Creator string CreationDate string pdfFormatVersion string pdfError string } type LogMessageXML struct { isWellFormed bool xmlError string } type LogMessage struct { corpusname string name string startAt string extension string path string mimetype string size int64 pdf LogMessagePDF xml LogMessageXML } var queueForConcurrent = make(chan struct{}, runtime.NumCPU()*75) var wg sync.WaitGroup var numberFiles int = 0 var corpusPath = flag.String("p", "", "Corpus path") var outputPath = flag.String("o", "out", "Output path") var corpusName = flag.String("n", "test", "Corpus name") var configurationFolder = flag.String("c", "", "Configuration folder path") var withWordCount = flag.Bool("w", false, "Enable word count") func writeLog(message *LogMessage) { if message.extension == ".pdf" { logrus.WithFields(logrus.Fields{ "corpusname": message.corpusname, "name": message.name, "startAt": message.startAt, "extension": message.extension, "path": message.path, "mimetype": message.mimetype, "size": message.size, "Author": message.pdf.Author, "Creator": message.pdf.Creator, "CreationDate": message.pdf.CreationDate, "pdfFormatVersion": message.pdf.pdfFormatVersion, "pdfPageTotal": message.pdf.pdfPageTotal, "pdfWordCount": message.pdf.pdfWordCount, "pdfWordByPage": message.pdf.pdfWordByPage, "pdfError": message.pdf.pdfError, }).Info("") } else if message.extension == ".xml" { logrus.WithFields(logrus.Fields{ "corpusname": message.corpusname, "name": message.name, "startAt": message.startAt, "extension": message.extension, "path": message.path, "mimetype": message.mimetype, "size": message.size, "isWellFormed": message.xml.isWellFormed, "xmlError": message.xml.xmlError, }).Info("") } else { logrus.WithFields(logrus.Fields{ "corpusname": message.corpusname, "name": message.name, "startAt": message.startAt, "extension": message.extension, "path": message.path, "mimetype": message.mimetype, "size": message.size, }).Info("") } numberFiles++ if numberFiles%5000 == 0 { fmt.Printf("\rFiles processed: %d", numberFiles) } } func getAllFiles(dir string) { defer wg.Done() queueForConcurrent <- struct{}{} defer func() { <-queueForConcurrent }() visit := func(path string, file os.FileInfo, err error) error { if err != nil { fmt.Println(color.InRed("Error for getAllFiles")) fmt.Println(err) os.Exit(1) } if file.IsDir() && path != dir { wg.Add(1) go getAllFiles(path) return filepath.SkipDir } if file.Mode().IsRegular() { mtype, err := mimetype.DetectFile(path) extension := mtype.Extension() absolutePath, err2 := filepath.Abs(path) if err == nil && err2 == nil { fileData := LogMessage{ corpusname: *corpusName, name: file.Name(), startAt: file.ModTime().String(), extension: extension, path: absolutePath, mimetype: mtype.String(), size: file.Size(), } if extension == ".pdf" { wg.Add(1) go processPDF(&fileData) } else if extension == ".xml" { m1 := regexp.MustCompile(`;.*`) fileData.mimetype = m1.ReplaceAllString(fileData.mimetype, "") wg.Add(1) go processXML(&fileData) } else { writeLog(&fileData) } } } return nil } filepath.Walk(dir, visit) } func initProcess() { // init args flag.Parse() if *corpusPath == "" { fmt.Println(color.InRed("Corpus path is not defined")) os.Exit(1) } if *withWordCount == true { fmt.Println("Count number word is enabled") } // init logger sec := time.Now().Unix() logPath := *outputPath + "/" + strconv.Itoa(int(sec)) + "-" + *corpusName log.Println(color.Green + "Begin of program with " + strconv.Itoa(runtime.NumCPU()) + " CPU" + color.Reset) log.Println("Read corpus in", *corpusPath, "and write out in", logPath) os.MkdirAll(logPath, os.ModePerm|os.ModeDir) outputFile, err := os.OpenFile(logPath+"/analyse-logs.json", os.O_WRONLY|os.O_CREATE, 0755) if err != nil { fmt.Println(color.InRed("Unable to create log file, please fix your command")) fmt.Println(err) os.Exit(1) } logrus.SetFormatter(&logrus.JSONFormatter{}) logrus.SetOutput(outputFile) return } func main() { start := time.Now() // init logger and params initProcess() wg.Add(1) getAllFiles(*corpusPath) wg.Wait() close(queueForConcurrent) elapsed := time.Since(start) fmt.Println("") log.Println(color.Green + "End of program with " + strconv.Itoa(numberFiles) + " files processed" + color.Reset) log.Printf("Total time %s", elapsed) } func Exists(name string) bool { _, err := os.Stat(name) if err == nil { return true } if errors.Is(err, os.ErrNotExist) { return false } return false }