Newer
Older
sisyphe-go / main.go
@Nacim Nacim on 7 Feb 2022 4 KB add name
//usr/bin/go run $0 $@ ; exit

package main

import (
	"errors"
	"flag"
	"fmt"
	"log"
	"os"
	"path/filepath"
	"strconv"
	"sync"
	"time"

	"github.com/fatih/color"
	"github.com/gabriel-vasile/mimetype"
	"github.com/sirupsen/logrus"
)

// LogMessagePDF holds the PDF-specific values of a log entry.
//
// logger emits the three fields under the keys "pdfWordCount",
// "pdfPageTotal" and "pdfWordByPage", and only for entries whose detected
// extension is ".pdf".
//
// NOTE(review): nothing in this file assigns these fields; presumably
// processPDF (defined elsewhere in the package) fills them in, and
// pdfWordByPage is presumably pdfWordCount divided by pdfPageTotal — confirm.
//
// The commented-out fields below are disabled metadata placeholders.
type LogMessagePDF struct {
	pdfWordCount  int
	pdfPageTotal  int
	pdfWordByPage float64
	/*Title            string
	Author           string
	Creator          string
	ISBN             string
	pdfFormatVersion string*/
}

// LogMessageXML holds the XML-specific value of a log entry.
//
// logger emits isWellFormed under the key "isWellFormed", and only for
// entries whose detected extension is ".xml".
//
// NOTE(review): nothing in this file assigns the field; presumably
// processXML (defined elsewhere in the package) sets it — confirm.
type LogMessageXML struct {
	isWellFormed bool
}
// LogMessage is the record built for every regular file found in the corpus
// and handed to logger.
//
// All fields are unexported, so logger copies them one by one into the
// structured log entry. As populated by getAllFiles:
//   - corpusname: value of the -n flag.
//   - name:       base name reported by os.FileInfo.Name.
//   - startAt:    the file's modification time formatted with Time.String
//     (despite the field name, it is not a processing start time).
//   - extension:  extension reported by MIME detection, not parsed from the
//     file name.
//   - path:       absolute path of the file.
//   - mimetype:   detected MIME type as a string.
//   - size:       value of os.FileInfo.Size.
//   - pdf, xml:   type-specific extras, logged only for ".pdf" and ".xml"
//     entries respectively.
type LogMessage struct {
	corpusname string
	name       string
	startAt    string
	extension  string
	path       string
	mimetype   string
	size       int64
	pdf        LogMessagePDF
	xml        LogMessageXML
}

// Shared state used across the walker and the per-file workers.
var (
	// canal is a buffered signalling channel with room for 12 tokens; main
	// closes it once all work is done. Nothing in this file sends or receives
	// on it — presumably the PDF/XML workers use it to bound concurrency
	// (confirm against their definitions).
	canal = make(chan struct{}, 12)

	// wg tracks every outstanding directory walk and file worker.
	wg sync.WaitGroup

	// numberFiles counts the entries written by logger so far.
	numberFiles int
)

// Command-line options; their values are only meaningful after flag.Parse.
var (
	corpusPath    = flag.String("c", "test", "Corpus path")
	outputPath    = flag.String("o", "out", "Output path")
	corpusName    = flag.String("n", "test", "Corpus name")
	withWordCount = flag.Bool("w", false, "Enable word count")
)

// numberFilesMu serialises updates to numberFiles and the progress line:
// logger is called from several goroutines at once (each directory walk runs
// in its own goroutine).
var numberFilesMu sync.Mutex

// logger writes one structured log entry for message and refreshes the
// "Files processed" progress line on standard output.
//
// Every entry carries the common file fields. Entries whose extension is
// ".pdf" additionally carry the PDF statistics, and entries whose extension
// is ".xml" additionally carry the well-formedness flag.
//
// It is safe to call from multiple goroutines: the counter update and the
// progress print happen under numberFilesMu.
func logger(message *LogMessage) {
	fields := logrus.Fields{
		"corpusname": message.corpusname,
		"name":       message.name,
		"startAt":    message.startAt,
		"extension":  message.extension,
		"path":       message.path,
		"mimetype":   message.mimetype,
		"size":       message.size,
	}

	// Add the type-specific extras on top of the shared fields.
	switch message.extension {
	case ".pdf":
		fields["pdfPageTotal"] = message.pdf.pdfPageTotal
		fields["pdfWordCount"] = message.pdf.pdfWordCount
		fields["pdfWordByPage"] = message.pdf.pdfWordByPage
	case ".xml":
		fields["isWellFormed"] = message.xml.isWellFormed
	}
	logrus.WithFields(fields).Info("")

	// Increment and print under the same lock so concurrent callers neither
	// lose updates nor interleave out-of-order progress lines.
	numberFilesMu.Lock()
	numberFiles++
	fmt.Printf("\rFiles processed: %d", numberFiles)
	numberFilesMu.Unlock()
}

// getAllFiles walks the entries directly under dir and dispatches each
// regular file for logging.
//
// Subdirectories are not descended into by the current walk: each one is
// handed to a new getAllFiles goroutine and skipped here. For regular files,
// the MIME type is detected from the file itself; ".pdf" and ".xml" files are
// passed to processPDF / processXML in their own goroutines, everything else
// is logged immediately. Files whose MIME detection or absolute-path
// resolution fails are silently skipped.
//
// Any error reported by the walk terminates the process with exit code 1.
//
// The function calls wg.Done when it returns, so the caller must have called
// wg.Add(1) beforehand. It also calls wg.Add(1) before starting a PDF or XML
// worker — presumably those workers call wg.Done themselves (confirm against
// their definitions).
func getAllFiles(dir string) {
	defer wg.Done()

	filepath.Walk(dir, func(entryPath string, info os.FileInfo, walkErr error) error {
		if walkErr != nil {
			color.Red("Error: The path for corpus resource doesn't exist")
			os.Exit(1)
		}

		// Fan out: every subdirectory gets its own walker goroutine.
		if info.IsDir() && entryPath != dir {
			wg.Add(1)
			go getAllFiles(entryPath)
			return filepath.SkipDir
		}

		if !info.Mode().IsRegular() {
			return nil
		}

		detected, detectErr := mimetype.DetectFile(entryPath)
		ext := detected.Extension()
		absPath, absErr := filepath.Abs(entryPath)
		if detectErr != nil || absErr != nil {
			return nil
		}

		record := &LogMessage{
			corpusname: *corpusName,
			name:       info.Name(),
			startAt:    info.ModTime().String(),
			extension:  ext,
			path:       absPath,
			mimetype:   detected.String(),
			size:       info.Size(),
		}

		switch ext {
		case ".pdf":
			wg.Add(1)
			go processPDF(record)
		case ".xml":
			wg.Add(1)
			go processXML(record)
		default:
			logger(record)
		}
		return nil
	})
}

// initProcess announces the run options and points logrus at a fresh JSON
// log file.
//
// The log file is <outputPath>/<unix-seconds>-<corpusName>/analyse-logs.json.
// The function reads the -o, -n and -w flag values, so flag.Parse must have
// been called first.
//
// If the log directory or the log file cannot be created the process exits
// with a message naming the cause: without a writable log file every analysis
// result would be lost.
func initProcess() {
	if *withWordCount {
		log.Println("Count number word is enabled")
	}

	// init logger
	stamp := strconv.FormatInt(time.Now().Unix(), 10)
	logPath := filepath.Join(*outputPath, stamp+"-"+*corpusName)
	log.Println("Write log in", logPath)

	// MkdirAll (rather than Mkdir) also creates the output root when it does
	// not exist yet, e.g. the default "out" directory on a first run.
	if err := os.MkdirAll(logPath, os.ModePerm); err != nil {
		log.Fatalf("create log directory %q: %v", logPath, err)
	}

	logFile := filepath.Join(logPath, "analyse-logs.json")
	outputFile, err := os.OpenFile(logFile, os.O_WRONLY|os.O_CREATE, 0755)
	if err != nil {
		log.Fatalf("create log file %q: %v", logFile, err)
	}

	logrus.SetFormatter(&logrus.JSONFormatter{})
	logrus.SetOutput(outputFile)
}

// main parses the command-line flags, sets up logging, walks the corpus and
// reports the total run time once every walker and worker has finished.
func main() {
	begin := time.Now()
	flag.Parse()

	// Logger and run parameters.
	initProcess()

	// Walk the corpus.
	log.Println("Read corpus in", *corpusPath, "and write out in", *outputPath)

	// The root walk runs on this goroutine; it still needs its own wg slot
	// because getAllFiles always calls wg.Done on return.
	wg.Add(1)
	getAllFiles(*corpusPath)
	wg.Wait()
	close(canal)

	took := time.Since(begin)
	// End the "\rFiles processed" progress line before the summary.
	fmt.Println("")
	log.Printf("Total time %s", took)
}

// Exists reports whether os.Stat succeeds for name.
//
// It returns false both when the path does not exist and when Stat fails for
// any other reason (for example a permission error on a parent directory).
func Exists(name string) bool {
	_, statErr := os.Stat(name)
	switch {
	case statErr == nil:
		return true
	case errors.Is(statErr, os.ErrNotExist):
		return false
	default:
		// Any other Stat failure is also reported as "does not exist".
		return false
	}
}