Newer
Older
sisyphe-go / main.go
@Nacim Nacim on 6 Feb 2022 3 KB add xml and flags
//usr/bin/go run $0 $@ ; exit

package main

import (
	"errors"
	"flag"
	"fmt"
	"log"
	"os"
	"path/filepath"
	"sync"
	"time"

	"github.com/gabriel-vasile/mimetype"
	"github.com/sirupsen/logrus"
)

// LogMessagePDF groups the PDF-specific values attached to a LogMessage.
// logger emits them under the keys "pdfWordCount", "pdfPageTotal" and
// "pdfWordByPage". Nothing in this file assigns them; presumably processPDF
// fills them in — confirm against its implementation.
type LogMessagePDF struct {
	pdfWordCount  int
	pdfPageTotal  int
	pdfWordByPage float64
	/*Title            string
	Author           string
	Creator          string
	ISBN             string
	pdfFormatVersion string*/
}

// LogMessageXML groups the XML-specific values attached to a LogMessage.
// logger emits isWellFormed under the key "isWellFormed". Nothing in this file
// assigns it; presumably processXML sets it — confirm against its
// implementation.
type LogMessageXML struct {
	isWellFormed bool
}
// LogMessage is the per-file record that logger writes as one structured log
// entry. getAllFiles builds it for every regular file it can analyse.
type LogMessage struct {
	corpusname string // value of the -n flag
	startAt    string // the file's modification time, formatted with Time.String (despite the name)
	extension  string // extension reported by mimetype detection, e.g. ".pdf" or ".xml"
	path       string // absolute path of the file
	mimetype   string // MIME type string reported by mimetype detection
	size       int64  // size reported by os.FileInfo.Size
	pdf        LogMessagePDF // left at its zero value by getAllFiles
	xml        LogMessageXML // left at its zero value by getAllFiles
}

// Package-level state and command-line options shared by the whole program.
var (
	// canal is a buffered channel with room for 12 tokens. This file only
	// closes it (in main); presumably the PDF/XML workers use it as a
	// concurrency limiter — confirm against their implementation.
	canal = make(chan struct{}, 12)

	// wg tracks every in-flight directory walk and file worker.
	wg sync.WaitGroup

	// numberFiles counts the records written by logger.
	numberFiles int

	// corpusPath is the -c flag: directory to scan.
	corpusPath = flag.String("c", "test", "Corpus path")

	// outPath is the -o flag: directory receiving the JSON log.
	outPath = flag.String("o", "out", "Output path")

	// corpusName is the -n flag: name copied into every record.
	corpusName = flag.String("n", "test", "Corpus name")

	// withWordCount is the -w flag.
	withWordCount = flag.Bool("w", false, "Enable word count")
)

// progressMu serialises updates to numberFiles and the progress line printed
// by logger, which is called from several goroutines at once.
var progressMu sync.Mutex

// logger writes message as one structured logrus entry (Info level, empty
// message text) and refreshes the "Files processed" counter on stdout.
//
// getAllFiles runs in multiple goroutines and calls logger directly, so the
// shared counter is updated under progressMu to avoid a data race and
// interleaved progress output.
func logger(message *LogMessage) {
	logrus.WithFields(logrus.Fields{
		"corpusname":    message.corpusname,
		"startAt":       message.startAt,
		"extension":     message.extension,
		"path":          message.path,
		"mimetype":      message.mimetype,
		"size":          message.size,
		"pdfPageTotal":  message.pdf.pdfPageTotal,
		"pdfWordCount":  message.pdf.pdfWordCount,
		"pdfWordByPage": message.pdf.pdfWordByPage,
		"isWellFormed":  message.xml.isWellFormed,
	}).Info("")

	progressMu.Lock()
	defer progressMu.Unlock()
	numberFiles++
	// "\r" rewrites the same terminal line on every call.
	fmt.Printf("\rFiles processed: %d", numberFiles)
}

// getAllFiles walks dir and produces one LogMessage per regular file.
//
// The caller must call wg.Add(1) before invoking it; the matching wg.Done is
// deferred here. Every sub-directory is handed to a new goroutine running
// getAllFiles and skipped by the current walk, so each call only handles the
// direct children of dir. Files detected as ".pdf" or ".xml" are passed to
// processPDF / processXML in their own goroutines (after wg.Add(1);
// presumably those workers call wg.Done — confirm); every other file is
// logged immediately.
//
// Entries that cannot be inspected are reported on the standard logger and
// skipped; they never abort the walk.
func getAllFiles(dir string) {
	defer wg.Done()

	visit := func(path string, file os.FileInfo, err error) error {
		// filepath.Walk reports lstat/readdir failures through err, and file
		// may be nil in that case, so check it before touching file.
		if err != nil {
			log.Printf("skipping %q: %v", path, err)
			return nil
		}
		if file.IsDir() && path != dir {
			wg.Add(1)
			go getAllFiles(path)
			return filepath.SkipDir
		}
		if !file.Mode().IsRegular() {
			return nil
		}

		mtype, err := mimetype.DetectFile(path)
		if err != nil {
			log.Printf("skipping %q: detecting MIME type: %v", path, err)
			return nil
		}
		absolutePath, err := filepath.Abs(path)
		if err != nil {
			log.Printf("skipping %q: resolving absolute path: %v", path, err)
			return nil
		}

		extension := mtype.Extension()
		fileData := LogMessage{
			corpusname: *corpusName,
			startAt:    file.ModTime().String(),
			extension:  extension,
			path:       absolutePath,
			mimetype:   mtype.String(),
			size:       file.Size(),
		}
		switch extension {
		case ".pdf":
			wg.Add(1)
			go processPDF(&fileData)
		case ".xml":
			wg.Add(1)
			go processXML(&fileData)
		default:
			logger(&fileData)
		}
		return nil
	}

	if err := filepath.Walk(dir, visit); err != nil {
		log.Printf("walking %q: %v", dir, err)
	}
}

// initProcess announces the enabled options and points logrus at a fresh
// JSON log file named "file.json" inside the -o output directory.
//
// The output directory is created when missing. Any previous file.json is
// removed; if that removal fails for a reason other than the file being
// absent, the problem is logged and the file is truncated on open so no stale
// content survives. It panics when the directory or the file cannot be
// created, because the program cannot record anything without its output.
func initProcess() {
	if *withWordCount {
		log.Println("Count number word is enabled")
	}

	// init logger
	if err := os.MkdirAll(*outPath, 0755); err != nil {
		panic(err)
	}
	outFile := filepath.Join(*outPath, "file.json")
	if err := os.Remove(outFile); err != nil && !errors.Is(err, os.ErrNotExist) {
		log.Printf("removing previous %q: %v", outFile, err)
	}
	// The file stays open for the lifetime of the process: logrus keeps
	// writing to it until the program exits.
	outputFile, err := os.OpenFile(outFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0755)
	if err != nil {
		panic(err)
	}
	logrus.SetFormatter(&logrus.JSONFormatter{})
	logrus.SetOutput(outputFile)
}

// main parses the command-line flags, prepares the JSON log output, walks the
// corpus directory and finally reports how long the whole run took.
func main() {
	begin := time.Now()
	flag.Parse()

	// Options and the log destination are set up once the flags are parsed.
	initProcess()

	log.Println("Read corpus in", *corpusPath, "and write out in", *outPath)

	// The corpus root is walked on the main goroutine; the WaitGroup slot
	// reserved here is released by getAllFiles itself.
	wg.Add(1)
	getAllFiles(*corpusPath)
	wg.Wait()
	close(canal)

	took := time.Since(begin)
	// Terminate the "\rFiles processed" progress line before the summary.
	fmt.Println("")
	log.Printf("Total time %s", took)
}

// Exists reports whether os.Stat succeeds for name. Every stat failure,
// whether the path is missing or the error is of another kind, yields false.
func Exists(name string) bool {
	_, statErr := os.Stat(name)
	switch {
	case statErr == nil:
		return true
	case errors.Is(statErr, os.ErrNotExist):
		return false
	default:
		// Other failures (e.g. permission denied) are also treated as
		// "does not exist".
		return false
	}
}