Newer
Older
sisyphe-go / main.go
//usr/bin/go run $0 $@ ; exit

package main

import (
	"bytes"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"runtime"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/TwiN/go-color"
	"github.com/gabriel-vasile/mimetype"
	"github.com/goccy/go-json"
	"github.com/sirupsen/logrus"
)

// Concurrency and timing state.
var (
	// start is captured at package initialization; main uses it to report
	// the total run time.
	start = time.Now()
	// queueForConcurrent is a 1300-slot buffered channel; GetAllFiles holds
	// one slot for as long as it walks a directory.
	queueForConcurrent = make(chan struct{}, 1300)
	// wg counts the running goroutines; main blocks on it before reporting.
	wg sync.WaitGroup
	// numberFiles is printed by main as the number of processed files.
	// NOTE(review): presumably incremented by UpdateCounter, which is defined
	// elsewhere — confirm.
	numberFiles = 0
)

// Command-line arguments (parsed by InitProcess).
var (
	corpusPath          = flag.String("p", "", "Corpus path")
	outputPath          = flag.String("o", "out", "Output path")
	corpusName          = flag.String("n", "test", "Corpus name")
	configurationFolder = flag.String("c", "", "Configuration folder path")
	withWordCount       = flag.Bool("w", false, "Enable word count")
	noIndexation        = flag.Bool("noindex", false, "Disable indexation after process")
)

// regexMime splits a "<type>; charset=<encoding>" string into its two parts.
var regexMime = regexp.MustCompile(`(.*); charset=(.*)`)

// Miscellaneous run-wide state.
var (
	// logPath is the output directory of the current run (set by InitProcess).
	logPath = ""
	// corpusNameForLog is the corpus name as given on the command line; it is
	// copied by InitProcess before InitDetailledAnalyze may rewrite corpusName.
	corpusNameForLog = ""
	// configDetailledAnalyze receives the content of sisyphe-conf.json.
	configDetailledAnalyze ConfigDetailledAnalyze
	// importType is passed to indexCorpus.sh; InitDetailledAnalyze switches it
	// to "detaille".
	importType = "generique"
)

// GetAllFiles scans dir and dispatches every regular file located directly in
// it to the matching processor.
//
// Each sub-directory is handed over to a new GetAllFiles goroutine and skipped
// by the current walk, so one call only handles one directory level. The
// caller must call wg.Add(1) beforehand: the function calls wg.Done when it
// returns. One slot of queueForConcurrent is held for the whole walk.
//
// Dispatch rules, based on the detected mime type:
//   - XML files always go to ProcessXML;
//   - without a configuration folder, PDF files go to ProcessPDF and any
//     other file is only logged and counted through UpdateCounter;
//   - with a configuration folder, non-XML files are ignored.
//
// An error reported by filepath.Walk terminates the program with status 1.
// A file whose mime type or absolute path cannot be resolved is skipped and
// reported through the standard logger.
func GetAllFiles(dir string) {
	defer wg.Done()
	queueForConcurrent <- struct{}{}
	defer func() { <-queueForConcurrent }()

	visit := func(path string, file os.FileInfo, err error) error {
		if err != nil {
			fmt.Println(color.InRed("Error for getAllFiles"))
			fmt.Println(err)
			os.Exit(1)
		}
		// Sub-directories get their own goroutine; SkipDir keeps this walk
		// from descending into them a second time.
		if file.IsDir() && path != dir {
			wg.Add(1)
			go GetAllFiles(path)
			return filepath.SkipDir
		}
		if !file.Mode().IsRegular() {
			return nil
		}

		detected, detectErr := mimetype.DetectFile(path)
		if detectErr != nil {
			log.Printf("skipping %q: unable to detect mime type: %v", path, detectErr)
			return nil
		}
		absolutePath, absErr := filepath.Abs(path)
		if absErr != nil {
			log.Printf("skipping %q: unable to resolve absolute path: %v", path, absErr)
			return nil
		}

		// Split "<type>; charset=<encoding>"; without a charset part the
		// encoding is reported as "binary".
		mimeType := detected.String()
		mimeEncoding := "binary"
		if groups := regexMime.FindStringSubmatch(mimeType); len(groups) == 3 {
			mimeType = groups[1]
			mimeEncoding = groups[2]
		}

		fileData := GeneralInfo{
			corpusName:   corpusNameForLog,
			name:         file.Name(),
			startAt:      file.ModTime().Unix(),
			extension:    filepath.Ext(path),
			path:         absolutePath,
			mimetype:     strings.ToLower(mimeType),
			mimeEncoding: mimeEncoding,
			size:         file.Size(),
		}
		logger := logrus.WithFields(logrus.Fields{
			"corpusname":   fileData.corpusName,
			"name":         fileData.name,
			"startAt":      fileData.startAt,
			"extension":    fileData.extension,
			"path":         fileData.path,
			"mimetype":     fileData.mimetype,
			"mimeEncoding": fileData.mimeEncoding,
			"size":         fileData.size,
		})

		isXML := fileData.mimetype == "application/xml" || fileData.mimetype == "text/xml"
		switch {
		case isXML:
			wg.Add(1)
			go ProcessXML(&fileData, logger)
		case *configurationFolder != "":
			// Detailed analysis mode: only XML files are handled.
		case fileData.mimetype == "application/pdf":
			wg.Add(1)
			go ProcessPDF(&fileData, logger)
		default:
			logger.Info("")
			UpdateCounter()
		}
		return nil
	}

	filepath.Walk(dir, visit)
}

// InitProcess parses the command-line flags, validates the mandatory ones and
// prepares the JSON logger.
//
// It sets corpusNameForLog and logPath ("<output>/<unix time>-<corpus name>"),
// creates that directory and redirects logrus to "analyse-logs.json" inside
// it. The program exits with status 1 when the corpus path or corpus name is
// empty, or when the output directory or the log file cannot be created.
func InitProcess() {
	// init args
	flag.Parse()

	// params for process
	if *corpusPath == "" {
		fmt.Println(color.InRed("Corpus path is not defined"))
		os.Exit(1)
	}
	if *withWordCount {
		fmt.Println(color.InBlue("Count number word is enabled"))
	}

	if *corpusName == "" {
		fmt.Println(color.InRed("Corpus name is not defined"))
		os.Exit(1)
	}
	corpusNameForLog = *corpusName

	// init logger
	sec := time.Now().Unix()
	// FormatInt keeps the full 64-bit timestamp, even where int is 32 bits.
	logPath = *outputPath + "/" + strconv.FormatInt(sec, 10) + "-" + *corpusName
	log.Println(color.Green + "Begin of program with " + strconv.Itoa(runtime.NumCPU()) + " CPU" + color.Reset)
	log.Println("Read corpus in", *corpusPath, "and write out in", logPath)
	// Report a directory creation failure here, with its real cause, rather
	// than letting it surface as a log file error.
	if err := os.MkdirAll(logPath, os.ModePerm|os.ModeDir); err != nil {
		fmt.Println(color.InRed("Unable to create output directory, please fix your command"))
		fmt.Println(err)
		os.Exit(1)
	}
	outputFile, err := os.OpenFile(logPath+"/analyse-logs.json", os.O_WRONLY|os.O_CREATE, 0755)
	if err != nil {
		fmt.Println(color.InRed("Unable to create log file, please fix your command"))
		fmt.Println(err)
		os.Exit(1)
	}
	logrus.SetFormatter(&logrus.JSONFormatter{
		DisableHTMLEscape: true,
		DisableTimestamp:  true,
	})
	logrus.SetOutput(outputFile)
}

// InitDetailledAnalyze loads the detailed analysis configuration of the
// corpus.
//
// It looks in the configuration folder for a sub-directory matching the
// corpus name (as decided by GetStringWithPrefix) and, when one is found,
// replaces *corpusName with that directory name. It then reads
// "<configuration folder>/<corpus name>/sisyphe-conf.json" into
// configDetailledAnalyze, dropping a leading UTF-8 byte order mark, and
// switches importType to "detaille".
//
// The program exits with status 1 when the file cannot be read or decoded.
func InitDetailledAnalyze() {
	files, err := ioutil.ReadDir(*configurationFolder)
	if err != nil {
		// Not fatal on its own: the configuration file may still be readable
		// under the corpus name given on the command line.
		fmt.Println(color.InRed("Unable to list configuration folder"))
		fmt.Println(err)
	}
	// get file on corpus resources directory
	for _, file := range files {
		if file.IsDir() && GetStringWithPrefix(*corpusName, file.Name()) != "" {
			*corpusName = file.Name()
		}
	}

	confPath := *configurationFolder + "/" + *corpusName + "/sisyphe-conf.json"
	fmt.Println(color.InGreen("The detailed analysis is enabled for"), color.InGreen(confPath))
	jsonFile, err := ioutil.ReadFile(confPath)
	if err != nil {
		fmt.Println(color.InRed("Bad configuration file"))
		fmt.Println(err)
		os.Exit(1)
	}
	if err := json.Unmarshal(bytes.TrimPrefix(jsonFile, []byte("\xef\xbb\xbf")), &configDetailledAnalyze); err != nil {
		// log.Fatalf already exits with status 1.
		log.Fatalf("Error in Config file: %v", err)
	}
	fmt.Println(color.InBlue(strconv.Itoa(len(configDetailledAnalyze.XML.ListDTD)) + " DTD files for analysis"))
	fmt.Println(color.InBlue(strconv.Itoa(len(configDetailledAnalyze.XML.ListXSD)) + " XSD files for analysis"))
	importType = "detaille"
}

// main runs the whole analysis: initialization, concurrent walk of the
// corpus, final report and, unless -noindex is set, the indexation script.
//
// The indexation step runs "indexCorpus.sh" through /bin/bash with the log
// file path and importType as arguments, and panics when the script fails.
func main() {
	// init logger and params
	InitProcess()

	if *configurationFolder != "" {
		InitDetailledAnalyze()
	}

	// GetAllFiles calls wg.Done itself, hence the Add before the call.
	wg.Add(1)
	GetAllFiles(*corpusPath)
	wg.Wait()
	close(queueForConcurrent)
	elapsed := time.Since(start)
	fmt.Println("")
	log.Println(color.Green + "End of program with " + strconv.Itoa(numberFiles) + " files processed" + color.Reset)
	log.Printf("Total time %s", elapsed)

	if *noIndexation {
		return
	}

	// after process index analyze file
	log.Println(color.InBlue("Run indexation process"))
	result, err := exec.Command("/bin/bash", "indexCorpus.sh", logPath+"/analyse-logs.json", importType).CombinedOutput()
	if err != nil {
		fmt.Println(color.InRed("Error indexCorpus.sh"))
		// Show what the script printed before panicking: it is the only
		// diagnostic available.
		fmt.Println(string(result))
		panic(err)
	}
	fmt.Println(string(result))
}