Newer
Older
sisyphe-go / main.go
@Nacim Nacim on 9 Feb 2022 4 KB add mime encoding
//usr/bin/go run $0 $@ ; exit

package main

import (
	"errors"
	"flag"
	"fmt"
	"log"
	"os"
	"path/filepath"
	"regexp"
	"runtime"
	"strconv"
	"sync"
	"time"

	"github.com/TwiN/go-color"
	"github.com/gabriel-vasile/mimetype"
	"github.com/sirupsen/logrus"
)

// LogMessagePDF groups the PDF-specific values embedded in a LogMessage
// (field "pdf"). Nothing in this part of the file assigns these fields;
// presumably processPDF fills them — confirm there.
//
// Author, Creator and CreationDate are exported while the other fields are
// not, so only those three are visible outside the package.
type LogMessagePDF struct {
	// NOTE(review): the three counters look like total words, total pages and
	// words per page — confirm against the code that sets them.
	pdfWordCount     int
	pdfPageTotal     int
	pdfWordByPage    int
	// NOTE(review): presumably copied from the PDF document metadata — verify.
	Author           string
	Creator          string
	CreationDate     string
	pdfFormatVersion string
	// NOTE(review): presumably a description of a PDF processing failure,
	// empty on success — verify.
	pdfError         string
}

// LogMessageXML groups the XML-specific values embedded in a LogMessage
// (field "xml"). Nothing in this part of the file assigns these fields;
// presumably processXML fills them — confirm there.
type LogMessageXML struct {
	// NOTE(review): presumably true when the document parsed without error.
	isWellFormed bool
	// NOTE(review): presumably the parse error text, empty on success.
	xmlError     string
}
// LogMessage is the record built by getAllFiles for every regular file it
// keeps. All fields are unexported; getAllFiles copies them by hand into
// logrus.Fields when it writes the record itself.
type LogMessage struct {
	// Value of the -n flag.
	corpusName   string
	// Base name reported by os.FileInfo.Name.
	name         string
	// String form of the file's modification time (file.ModTime().String()).
	startAt      string
	// Extension associated with the detected MIME type (not taken from name).
	extension    string
	// Absolute path of the file.
	path         string
	// Detected MIME type, without the "; charset=..." suffix when one exists.
	mimetype     string
	// Charset extracted from the detected MIME type, or "binary" when the
	// detected type carries no charset.
	mimeEncoding string
	// Size in bytes reported by os.FileInfo.Size.
	size         int64
	// Left at their zero value by getAllFiles; presumably completed by
	// processPDF / processXML — verify.
	pdf          LogMessagePDF
	xml          LogMessageXML
}

// queueForConcurrent is a buffered channel (capacity 1100) used to bound
// concurrency: getAllFiles sends one value before walking a directory and
// receives it back when it returns. main closes it once all work is done.
var queueForConcurrent = make(chan struct{}, 1100)

// wg tracks every goroutine started by getAllFiles (directory walks and the
// processPDF / processXML calls); main blocks on it before finishing.
var wg sync.WaitGroup

// numberFiles is the counter advanced by incrementProcess and reported by main.
// NOTE(review): it is a plain int with no synchronisation; if incrementProcess
// is called from several goroutines this is a data race — confirm callers.
var numberFiles int = 0

// Command-line flags; the values are valid only after flag.Parse has run
// (done in initProcess).
var corpusPath = flag.String("p", "", "Corpus path")
var outputPath = flag.String("o", "out", "Output path")
var corpusName = flag.String("n", "test", "Corpus name")

// configurationFolder is not read anywhere in the visible part of this file.
var configurationFolder = flag.String("c", "", "Configuration folder path")

// withWordCount is only consulted here by initProcess to print a notice;
// presumably the PDF processing reads it as well — verify.
var withWordCount = flag.Bool("w", false, "Enable word count")

// regexMime splits a string of the form "<type>; charset=<encoding>" into its
// type (group 1) and encoding (group 2).
var regexMime = regexp.MustCompile(`(.*); charset=(.*)`)

// incrementProcess adds one to the numberFiles counter and, each time the
// counter reaches a multiple of 5000, rewrites the progress line on standard
// output (the leading "\r" returns to the start of the line).
//
// The increment is a plain, unsynchronised "++" on a package-level int.
func incrementProcess() {
	numberFiles++
	if numberFiles%5000 != 0 {
		return
	}
	fmt.Printf("\rFiles processed: %d", numberFiles)
}

// getAllFiles scans the direct content of dir and produces one LogMessage per
// regular file found there.
//
// The caller must have incremented wg by one before the call: getAllFiles
// calls wg.Done when it returns. While it runs it holds one slot of
// queueForConcurrent.
//
// Sub-directories are not descended into by this call. Each one is handed to a
// new getAllFiles goroutine and skipped in the current walk. Entries that are
// neither directories nor regular files are ignored.
//
// Files whose detected MIME type maps to the ".pdf" or ".xml" extension are
// passed to processPDF / processXML in their own goroutine (wg is incremented
// first; presumably those functions call wg.Done — verify). Every other file
// is written directly to the logrus output.
//
// A walk error terminates the program with exit status 1. A file whose MIME
// type or absolute path cannot be determined is reported on the standard
// logger and skipped.
func getAllFiles(dir string) {
	defer wg.Done()
	// Hold a concurrency slot for the whole duration of this directory walk.
	queueForConcurrent <- struct{}{}
	defer func() { <-queueForConcurrent }()

	visit := func(path string, file os.FileInfo, err error) error {
		if err != nil {
			fmt.Println(color.InRed("Error for getAllFiles"))
			fmt.Println(err)
			os.Exit(1)
		}
		if file.IsDir() && path != dir {
			// Fan out: the sub-directory gets its own walk, so this one must
			// not descend into it.
			wg.Add(1)
			go getAllFiles(path)
			return filepath.SkipDir
		}
		if !file.Mode().IsRegular() {
			return nil
		}

		// Check each error before touching the associated result, and say
		// which file is being dropped instead of skipping it silently.
		mtype, detectErr := mimetype.DetectFile(path)
		if detectErr != nil {
			log.Printf("skipping %q: detecting mime type: %v", path, detectErr)
			return nil
		}
		absolutePath, absErr := filepath.Abs(path)
		if absErr != nil {
			log.Printf("skipping %q: resolving absolute path: %v", path, absErr)
			return nil
		}

		// The detected type may carry a charset ("text/plain; charset=utf-8");
		// split it when present, otherwise the encoding is reported as binary.
		// The local is named mimeType so it does not shadow the mimetype package.
		mimeType := mtype.String()
		mimeEncoding := "binary"
		if groups := regexMime.FindStringSubmatch(mimeType); len(groups) == 3 {
			mimeType, mimeEncoding = groups[1], groups[2]
		}
		extension := mtype.Extension()

		fileData := LogMessage{
			corpusName:   *corpusName,
			name:         file.Name(),
			startAt:      file.ModTime().String(),
			extension:    extension,
			path:         absolutePath,
			mimetype:     mimeType,
			mimeEncoding: mimeEncoding,
			size:         file.Size(),
		}

		switch extension {
		case ".pdf":
			wg.Add(1)
			go processPDF(&fileData)
		case ".xml":
			wg.Add(1)
			go processXML(&fileData)
		default:
			logrus.WithFields(logrus.Fields{
				"corpusName":   fileData.corpusName,
				"name":         fileData.name,
				"startAt":      fileData.startAt,
				"extension":    fileData.extension,
				"path":         fileData.path,
				"mimetype":     fileData.mimetype,
				"mimeEncoding": fileData.mimeEncoding,
				"size":         fileData.size,
			}).Info("")
		}
		return nil
	}

	// visit only ever returns nil or filepath.SkipDir, so no error is expected
	// here; report one anyway rather than dropping it.
	if err := filepath.Walk(dir, visit); err != nil {
		log.Printf("walking %q: %v", dir, err)
	}
}

// initProcess parses the command-line flags and directs logrus to a JSON log
// file.
//
// The log file is <outputPath>/<unix seconds>-<corpusName>/analyse-logs.json;
// the directory is created if needed. The program exits with status 1 when the
// corpus path (-p) is empty or when the log directory or file cannot be
// created.
func initProcess() {
	// init args
	flag.Parse()

	if *corpusPath == "" {
		fmt.Println(color.InRed("Corpus path is not defined"))
		os.Exit(1)
	}

	if *withWordCount {
		fmt.Println("Count number word is enabled")
	}

	// init logger
	// FormatInt keeps the full int64 timestamp; converting through int would
	// truncate it on 32-bit platforms.
	sec := time.Now().Unix()
	// filepath.Join uses the OS separator and, with an empty -o value, yields a
	// path relative to the working directory instead of one rooted at "/".
	logPath := filepath.Join(*outputPath, strconv.FormatInt(sec, 10)+"-"+*corpusName)
	log.Println(color.Green + "Begin of program with " + strconv.Itoa(runtime.NumCPU()) + " CPU" + color.Reset)
	log.Println("Read corpus in", *corpusPath, "and write out in", logPath)

	// Report a directory creation failure with its own error instead of
	// letting it surface later as a less precise OpenFile failure.
	if err := os.MkdirAll(logPath, os.ModePerm|os.ModeDir); err != nil {
		fmt.Println(color.InRed("Unable to create log file, please fix your command"))
		fmt.Println(err)
		os.Exit(1)
	}
	outputFile, err := os.OpenFile(filepath.Join(logPath, "analyse-logs.json"), os.O_WRONLY|os.O_CREATE, 0755)
	if err != nil {
		fmt.Println(color.InRed("Unable to create log file, please fix your command"))
		fmt.Println(err)
		os.Exit(1)
	}
	logrus.SetFormatter(&logrus.JSONFormatter{})
	logrus.SetOutput(outputFile)
}

// main runs the whole analysis: it initialises flags and logging, walks the
// corpus starting at the -p path, waits for every goroutine registered on wg,
// then prints the number of processed files and the elapsed time.
func main() {
	begin := time.Now()

	// Flags and logger must be ready before the walk starts.
	initProcess()

	// The root walk runs in the calling goroutine; getAllFiles releases this
	// wait-group slot itself when it returns.
	wg.Add(1)
	getAllFiles(*corpusPath)
	wg.Wait()
	close(queueForConcurrent)

	duration := time.Since(begin)
	// Terminate the "\rFiles processed" progress line before the summary.
	fmt.Println()
	summary := color.Green + "End of program with " + strconv.Itoa(numberFiles) + " files processed" + color.Reset
	log.Println(summary)
	log.Printf("Total time %s", duration)
}

// Exists reports whether os.Stat succeeds on name.
//
// It returns false for every Stat failure, not only for "does not exist": a
// path that cannot be inspected (for instance because of permissions) is
// reported as absent too.
func Exists(name string) bool {
	switch _, err := os.Stat(name); {
	case err == nil:
		return true
	case errors.Is(err, os.ErrNotExist):
		return false
	default:
		// Any other Stat error is treated the same way as a missing path.
		return false
	}
}