Newer
Older
sisyphe-go / main.go
@Nacim Nacim on 9 Mar 2022 5 KB refactoring logging
//usr/bin/go run $0 $@ ; exit

package main

import (
	"bytes"
	"errors"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"os"
	"path/filepath"
	"regexp"
	"runtime"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/TwiN/go-color"
	"github.com/gabriel-vasile/mimetype"
	"github.com/goccy/go-json"
	"github.com/sirupsen/logrus"
)

// Timing and concurrency state shared by the whole program.
var (
	// start is captured at package initialization; main uses it to report
	// the total run time.
	start = time.Now()
	// queueForConcurrent is a buffered channel with 1300 slots. GetAllFiles
	// sends on it when it starts and receives from it when it returns, so at
	// most 1300 directory walks hold a slot at the same time.
	queueForConcurrent = make(chan struct{}, 1300)
	// wg tracks the goroutines main waits for before printing the summary.
	wg sync.WaitGroup
	// numberFiles counts the files handled so far; GetAllFiles increments it
	// and main prints it at the end of the run.
	numberFiles int
)

// Command-line flags; the pointers are only meaningful after flag.Parse().
var (
	corpusPath          = flag.String("p", "", "Corpus path")
	outputPath          = flag.String("o", "out", "Output path")
	corpusName          = flag.String("n", "test", "Corpus name")
	configurationFolder = flag.String("c", "", "Configuration folder path")
	withWordCount       = flag.Bool("w", false, "Enable word count")
)

// regexMime splits a "type; charset=encoding" MIME string into its MIME type
// (group 1) and its charset (group 2).
var regexMime = regexp.MustCompile(`(.*); charset=(.*)`)

// configDetailledAnalyze holds the configuration decoded from
// sisyphe-conf.json by InitDetailledAnalyze.
var configDetailledAnalyze ConfigDetailledAnalyze

// numberFilesMu serializes the accesses GetAllFiles makes to numberFiles.
// GetAllFiles runs in one goroutine per sub-directory, so unsynchronized
// increments would be a data race.
// NOTE(review): any other code that touches numberFiles while the walk is
// running must take this lock too; main only reads it after wg.Wait().
var numberFilesMu sync.Mutex

// GetAllFiles walks the direct entries of dir and dispatches every regular
// file to the matching processor.
//
// Each sub-directory is handed to a new GetAllFiles goroutine (registered on
// wg) instead of being descended into by this walk. The function holds one
// slot of queueForConcurrent for its whole duration and calls wg.Done when it
// returns, so the caller must have called wg.Add(1) beforehand.
//
// Dispatch rules for a regular file:
//   - XML ("application/xml" or "text/xml") is sent to ProcessXML;
//   - when no configuration folder is set, PDF is sent to ProcessPDF and any
//     other type is only logged;
//   - when a configuration folder is set, non-XML files are ignored and not
//     counted.
//
// Files whose MIME type cannot be detected or whose absolute path cannot be
// resolved are skipped silently. Any error reported by the walk itself
// terminates the program with exit code 1.
func GetAllFiles(dir string) {
	defer wg.Done()
	// Hold one semaphore slot for the duration of this directory walk.
	queueForConcurrent <- struct{}{}
	defer func() { <-queueForConcurrent }()

	visit := func(path string, file os.FileInfo, err error) error {
		if err != nil {
			fmt.Println(color.InRed("Error for getAllFiles"))
			fmt.Println(err)
			os.Exit(1)
		}
		if file.IsDir() && path != dir {
			// Walk the sub-directory concurrently and keep this walk shallow.
			wg.Add(1)
			go GetAllFiles(path)
			return filepath.SkipDir
		}
		if !file.Mode().IsRegular() {
			return nil
		}

		// Progress indicator, refreshed every time the counter is a multiple
		// of ten.
		numberFilesMu.Lock()
		processed := numberFiles
		numberFilesMu.Unlock()
		if processed%10 == 0 {
			fmt.Printf("\rFiles processed: %d", processed)
		}

		// Check the detection error before touching the result.
		mtype, detectErr := mimetype.DetectFile(path)
		if detectErr != nil {
			return nil
		}
		absolutePath, absErr := filepath.Abs(path)
		if absErr != nil {
			return nil
		}

		// Split "type; charset=enc" when a charset is present; otherwise keep
		// the full MIME string and report the encoding as "binary".
		mimeType := mtype.String()
		mimeEncoding := "binary"
		if groupMime := regexMime.FindStringSubmatch(mimeType); len(groupMime) == 3 {
			mimeType = groupMime[1]
			mimeEncoding = groupMime[2]
		}

		fileData := GeneralInfo{
			corpusName:   *corpusName,
			name:         file.Name(),
			startAt:      file.ModTime().Unix(),
			extension:    filepath.Ext(path),
			path:         absolutePath,
			mimetype:     strings.ToLower(mimeType),
			mimeEncoding: mimeEncoding,
			size:         file.Size(),
		}
		logger := logrus.WithFields(logrus.Fields{
			"corpusName":   fileData.corpusName,
			"name":         fileData.name,
			"startAt":      fileData.startAt,
			"extension":    fileData.extension,
			"path":         fileData.path,
			"mimetype":     fileData.mimetype,
			"mimeEncoding": fileData.mimeEncoding,
			"size":         fileData.size,
		})

		isXML := fileData.mimetype == "application/xml" || fileData.mimetype == "text/xml"
		switch {
		case isXML:
			wg.Add(1)
			go ProcessXML(&fileData, logger)
		case *configurationFolder != "":
			// With a configuration folder only XML files are handled; every
			// other file is ignored and not counted.
			return nil
		case fileData.mimetype == "application/pdf":
			wg.Add(1)
			go ProcessPDF(&fileData, logger)
		default:
			logger.Info("")
		}

		numberFilesMu.Lock()
		numberFiles++
		numberFilesMu.Unlock()
		return nil
	}

	// visit only ever returns nil or filepath.SkipDir (walk errors exit the
	// program), so Walk has no error left to report here.
	_ = filepath.Walk(dir, visit)
}

// InitProcess parses the command-line flags, validates them and points logrus
// at a JSON log file.
//
// The log file is <output path>/<unix seconds>-<corpus name>/analyse-logs.json;
// the directory is created if needed. The program exits with code 1 when the
// corpus path is missing or when the log directory or log file cannot be
// created.
func InitProcess() {
	// init args
	flag.Parse()

	// params for process
	if *corpusPath == "" {
		fmt.Println(color.InRed("Corpus path is not defined"))
		os.Exit(1)
	}
	if *withWordCount {
		fmt.Println(color.InBlue("Count number word is enabled"))
	}

	// init logger
	sec := time.Now().Unix()
	// filepath.Join keeps the path clean and OS-correct; an empty output path
	// therefore resolves relative to the working directory.
	logPath := filepath.Join(*outputPath, strconv.Itoa(int(sec))+"-"+*corpusName)
	log.Println(color.Green + "Begin of program with " + strconv.Itoa(runtime.NumCPU()) + " CPU" + color.Reset)
	log.Println("Read corpus in", *corpusPath, "and write out in", logPath)
	// Report a directory-creation failure directly instead of letting it
	// surface later as a less precise "open" error.
	if err := os.MkdirAll(logPath, os.ModePerm|os.ModeDir); err != nil {
		fmt.Println(color.InRed("Unable to create log file, please fix your command"))
		fmt.Println(err)
		os.Exit(1)
	}
	outputFile, err := os.OpenFile(filepath.Join(logPath, "analyse-logs.json"), os.O_WRONLY|os.O_CREATE, 0755)
	if err != nil {
		fmt.Println(color.InRed("Unable to create log file, please fix your command"))
		fmt.Println(err)
		os.Exit(1)
	}
	logrus.SetFormatter(&logrus.JSONFormatter{
		DisableHTMLEscape: true,
		DisableTimestamp:  true,
	})
	// outputFile is intentionally left open: logrus writes to it for the rest
	// of the program's life.
	logrus.SetOutput(outputFile)
}

func InitDetailledAnalyze() {
	fmt.Println(color.InGreen("The detailed analysis is enabled for"), color.InGreen(*configurationFolder+"/"+*corpusName+"/sisyphe-conf.json"))
	jsonFile, err := ioutil.ReadFile(*configurationFolder + "/" + *corpusName + "/sisyphe-conf.json")
	if err != nil {
		fmt.Println(color.InRed("Bad configuration file"))
		os.Exit(1)
	}
	if err := json.Unmarshal(bytes.TrimPrefix(jsonFile, []byte("\xef\xbb\xbf")), &configDetailledAnalyze); err != nil {
		log.Fatal("Error in Config file", err)
		os.Exit(1)
	}
	fmt.Println(color.InBlue(strconv.Itoa(len(configDetailledAnalyze.XML.ListDTD)) + " DTD files for analysis"))
	fmt.Println(color.InBlue(strconv.Itoa(len(configDetailledAnalyze.XML.ListXSD)) + " XSD files for analysis"))
}

// main sets up flags and logging, optionally loads the detailed-analysis
// configuration, walks the corpus and finally prints the number of files
// handled and the total elapsed time.
func main() {
	// Flags and logger first: everything below depends on them.
	InitProcess()

	detailedAnalysis := *configurationFolder != ""
	if detailedAnalysis {
		InitDetailledAnalyze()
	}

	// The root walk runs in this goroutine; GetAllFiles calls wg.Done itself,
	// hence the Add before the call. Wait then blocks until every goroutine
	// registered on wg has finished.
	wg.Add(1)
	GetAllFiles(*corpusPath)
	wg.Wait()
	close(queueForConcurrent)
	time.Sleep(2 * time.Second)

	// The elapsed time is measured after the pause above, as before.
	elapsed := time.Since(start)
	fmt.Println("")
	summary := color.Green + "End of program with " + strconv.Itoa(numberFiles) + " files processed" + color.Reset
	log.Println(summary)
	log.Printf("Total time %s", elapsed)
}

// Exists reports whether os.Stat succeeds for name.
//
// It returns false both when the path does not exist and when Stat fails for
// any other reason.
func Exists(name string) bool {
	_, statErr := os.Stat(name)
	switch {
	case statErr == nil:
		return true
	case errors.Is(statErr, os.ErrNotExist):
		return false
	default:
		// Any other Stat failure is also reported as "does not exist".
		return false
	}
}