//usr/bin/go run $0 $@ ; exit package main import ( "bytes" "flag" "fmt" "io/ioutil" "log" "os" "os/exec" "path/filepath" "regexp" "runtime" "strconv" "strings" "sync" "time" "github.com/TwiN/go-color" "github.com/gabriel-vasile/mimetype" "github.com/goccy/go-json" "github.com/sirupsen/logrus" ) // concurrence and time var start = time.Now() var queueForConcurrent = make(chan struct{}, 1300) var wg sync.WaitGroup var numberFiles int = 0 // cli args var corpusPath = flag.String("p", "", "Corpus path") var outputPath = flag.String("o", "out", "Output path") var corpusName = flag.String("n", "test", "Corpus name") var configurationFolder = flag.String("c", "", "Configuration folder path") var withWordCount = flag.Bool("w", false, "Enable word count") // regex var regexMime = regexp.MustCompile(`(.*); charset=(.*)`) // other var logPath = "" var configDetailledAnalyze ConfigDetailledAnalyze var importType = "generique" func GetAllFiles(dir string) { defer wg.Done() queueForConcurrent <- struct{}{} defer func() { <-queueForConcurrent }() visit := func(path string, file os.FileInfo, err error) error { if err != nil { fmt.Println(color.InRed("Error for getAllFiles")) fmt.Println(err) os.Exit(1) } if file.IsDir() && path != dir { wg.Add(1) go GetAllFiles(path) return filepath.SkipDir } if file.Mode().IsRegular() { // count number files processed if numberFiles%10 == 0 { fmt.Printf("\rFiles processed: %d", numberFiles) } mtype, err := mimetype.DetectFile(path) extension := filepath.Ext(path) groupMime := regexMime.FindStringSubmatch(mtype.String()) mimetype := mtype.String() mimeEncoding := "binary" if len(groupMime) == 3 { mimetype = groupMime[1] mimeEncoding = groupMime[2] } absolutePath, err2 := filepath.Abs(path) if err == nil && err2 == nil { fileData := GeneralInfo{ corpusName: *corpusName, name: file.Name(), startAt: file.ModTime().Unix(), extension: extension, path: absolutePath, mimetype: strings.ToLower(mimetype), mimeEncoding: mimeEncoding, size: file.Size(), } logger := logrus.WithFields(logrus.Fields{ "corpusname": fileData.corpusName, "name": fileData.name, "startAt": fileData.startAt, "extension": fileData.extension, "path": fileData.path, "mimetype": fileData.mimetype, "mimeEncoding": fileData.mimeEncoding, "size": fileData.size, }) if *configurationFolder == "" { if fileData.mimetype == "application/pdf" { wg.Add(1) go ProcessPDF(&fileData, logger) } else if fileData.mimetype == "application/xml" || fileData.mimetype == "text/xml" { wg.Add(1) go ProcessXML(&fileData, logger) } else { logger.Info("") numberFiles++ } } else { if fileData.mimetype == "application/xml" || fileData.mimetype == "text/xml" { wg.Add(1) go ProcessXML(&fileData, logger) } } } } return nil } filepath.Walk(dir, visit) } func InitProcess() { // init args flag.Parse() // params for process if *corpusPath == "" { fmt.Println(color.InRed("Corpus path is not defined")) os.Exit(1) } if *withWordCount { fmt.Println(color.InBlue("Count number word is enabled")) } // init logger sec := time.Now().Unix() logPath = *outputPath + "/" + strconv.Itoa(int(sec)) + "-" + *corpusName log.Println(color.Green + "Begin of program with " + strconv.Itoa(runtime.NumCPU()) + " CPU" + color.Reset) log.Println("Read corpus in", *corpusPath, "and write out in", logPath) os.MkdirAll(logPath, os.ModePerm|os.ModeDir) outputFile, err := os.OpenFile(logPath+"/analyse-logs.json", os.O_WRONLY|os.O_CREATE, 0755) if err != nil { fmt.Println(color.InRed("Unable to create log file, please fix your command")) fmt.Println(err) os.Exit(1) } logrus.SetFormatter(&logrus.JSONFormatter{ DisableHTMLEscape: true, DisableTimestamp: true, }) logrus.SetOutput(outputFile) } func InitDetailledAnalyze() { files, _ := ioutil.ReadDir(*configurationFolder) // get file on corpus resources directory for _, file := range files { if file.IsDir() && GetStringWithPrefix(*corpusName, file.Name()) != "" { *corpusName = file.Name() } } fmt.Println(color.InGreen("The detailed analysis is enabled for"), color.InGreen(*configurationFolder+"/"+*corpusName+"/sisyphe-conf.json")) jsonFile, err := ioutil.ReadFile(*configurationFolder + "/" + *corpusName + "/sisyphe-conf.json") if err != nil { fmt.Println(color.InRed("Bad configuration file")) os.Exit(1) } if err := json.Unmarshal(bytes.TrimPrefix(jsonFile, []byte("\xef\xbb\xbf")), &configDetailledAnalyze); err != nil { log.Fatal("Error in Config file", err) os.Exit(1) } fmt.Println(color.InBlue(strconv.Itoa(len(configDetailledAnalyze.XML.ListDTD)) + " DTD files for analysis")) fmt.Println(color.InBlue(strconv.Itoa(len(configDetailledAnalyze.XML.ListXSD)) + " XSD files for analysis")) importType = "detaille" } func main() { // init logger and params InitProcess() if *configurationFolder != "" { InitDetailledAnalyze() } wg.Add(1) GetAllFiles(*corpusPath) wg.Wait() close(queueForConcurrent) elapsed := time.Since(start) fmt.Println("") log.Println(color.Green + "End of program with " + strconv.Itoa(numberFiles) + " files processed" + color.Reset) log.Printf("Total time %s", elapsed) _, err := exec.Command("/bin/bash", "indexCorpus.sh", logPath+"/analyse-logs.json", importType).CombinedOutput() if err != nil { fmt.Println(color.InRed("Error indexCorpus.sh")) panic(err) } }