//usr/bin/go run $0 $@ ; exit package main import ( "errors" "flag" "fmt" "log" "os" "path/filepath" "strconv" "sync" "time" "github.com/fatih/color" "github.com/gabriel-vasile/mimetype" "github.com/sirupsen/logrus" ) type LogMessagePDF struct { pdfWordCount int pdfPageTotal int pdfWordByPage float64 /*Title string Author string Creator string ISBN string pdfFormatVersion string*/ } type LogMessageXML struct { isWellFormed bool } type LogMessage struct { corpusname string name string startAt string extension string path string mimetype string size int64 pdf LogMessagePDF xml LogMessageXML } var canal = make(chan struct{}, 12) var wg sync.WaitGroup var numberFiles int = 0 var corpusPath = flag.String("c", "test", "Corpus path") var outputPath = flag.String("o", "out", "Output path") var corpusName = flag.String("n", "test", "Corpus name") var withWordCount = flag.Bool("w", false, "Enable word count") func logger(message *LogMessage) { if message.extension == ".pdf" { logrus.WithFields(logrus.Fields{ "corpusname": message.corpusname, "name": message.name, "startAt": message.startAt, "extension": message.extension, "path": message.path, "mimetype": message.mimetype, "size": message.size, "pdfPageTotal": message.pdf.pdfPageTotal, "pdfWordCount": message.pdf.pdfWordCount, "pdfWordByPage": message.pdf.pdfWordByPage, }).Info("") } else if message.extension == ".xml" { logrus.WithFields(logrus.Fields{ "corpusname": message.corpusname, "name": message.name, "startAt": message.startAt, "extension": message.extension, "path": message.path, "mimetype": message.mimetype, "size": message.size, "isWellFormed": message.xml.isWellFormed, }).Info("") } else { logrus.WithFields(logrus.Fields{ "corpusname": message.corpusname, "name": message.name, "startAt": message.startAt, "extension": message.extension, "path": message.path, "mimetype": message.mimetype, "size": message.size, }).Info("") } numberFiles++ fmt.Printf("\rFiles processed: %d", numberFiles) } func getAllFiles(dir string) { defer wg.Done() visit := func(path string, file os.FileInfo, err error) error { if err != nil { color.Red("Error: The path for corpus resource doesn't exist") os.Exit(1) } if file.IsDir() && path != dir { wg.Add(1) go getAllFiles(path) return filepath.SkipDir } if file.Mode().IsRegular() { mtype, err := mimetype.DetectFile(path) extension := mtype.Extension() absolutePath, err2 := filepath.Abs(path) if err == nil && err2 == nil { fileData := LogMessage{ corpusname: *corpusName, name: file.Name(), startAt: file.ModTime().String(), extension: extension, path: absolutePath, mimetype: mtype.String(), size: file.Size(), } if extension == ".pdf" { wg.Add(1) go processPDF(&fileData) } else if extension == ".xml" { wg.Add(1) go processXML(&fileData) } else { logger(&fileData) } } } return nil } filepath.Walk(dir, visit) } func initProcess() { if *withWordCount == true { log.Println("Count number word is enabled") } // init logger sec := time.Now().Unix() logPath := *outputPath + "/" + strconv.Itoa(int(sec)) + "-" + *corpusName log.Println("Write log in", logPath) os.Mkdir(logPath, os.ModePerm) outputFile, err := os.OpenFile(logPath+"/analyse-logs.json", os.O_WRONLY|os.O_CREATE, 0755) if err != nil { log.Println("CreateFile") } logrus.SetFormatter(&logrus.JSONFormatter{}) logrus.SetOutput(outputFile) } func main() { start := time.Now() flag.Parse() // init logger and params initProcess() // read all files log.Println("Read corpus in", *corpusPath, "and write out in", *outputPath) wg.Add(1) getAllFiles(*corpusPath) wg.Wait() close(canal) elapsed := time.Since(start) fmt.Println("") log.Printf("Total time %s", elapsed) } func Exists(name string) bool { _, err := os.Stat(name) if err == nil { return true } if errors.Is(err, os.ErrNotExist) { return false } return false }