Newer
Older
sisyphe-go / pdf.go
@Nacim Nacim on 9 Mar 2022 2 KB refactoring logging
package main

import (
	"math"
	"os/exec"
	"strconv"
	"strings"

	"github.com/sirupsen/logrus"
)

// ProcessPDF collects PDF information for message.path and emits it as one
// structured log entry on logger.
//
// The metadata and page count are always logged. When the withWordCount flag
// is set and the document reports at least one page, the word count and the
// words-per-page average (rounded down) are logged as well, together with
// pdfError. Otherwise pdfError is logged only when it is non-empty.
func ProcessPDF(message *GeneralInfo, logger *logrus.Entry) {
	// Mark this job as finished on wg whatever happens below.
	defer wg.Done()

	// Take a slot in queueForConcurrent before touching the file and give it
	// back on return; this limits how many PDFs are read in parallel.
	queueForConcurrent <- struct{}{}
	defer func() { <-queueForConcurrent }()

	info := GetMetadata(message)
	fields := logrus.Fields{
		"pdfMetadata":  info.pdfMetadata,
		"pdfPageTotal": info.pdfPageTotal,
	}

	switch {
	case *withWordCount && info.pdfPageTotal != 0:
		words := GetNumberWords(message.path)
		perPage := float64(words) / float64(info.pdfPageTotal)

		info.pdfWordCount = words
		info.pdfWordByPage = int(math.Floor(perPage))

		fields["pdfWordCount"] = info.pdfWordCount
		fields["pdfWordByPage"] = info.pdfWordByPage
		fields["pdfError"] = info.pdfError
	case info.pdfError != "":
		fields["pdfError"] = info.pdfError
	}

	// The entry carries everything in its fields; the message itself is empty.
	logger.WithFields(fields).Info("")
}

// GetMetadata runs pdfinfo on message.path and returns the metadata and page
// count parsed from its "Key: value" output lines.
//
// If pdfinfo cannot be run or exits unsuccessfully, only pdfError is set and
// the output is not parsed. If the "Pages" entry is missing or is not an
// integer, pdfPageTotal stays 0 and the parse failure is recorded in pdfError
// rather than being silently dropped.
func GetMetadata(message *GeneralInfo) PDFInfo {
	pdfData := PDFInfo{}

	metaStr, err := exec.Command("pdfinfo", message.path).Output()
	if err != nil {
		pdfData.pdfError = err.Error()
		return pdfData
	}

	// Split each line at the first colon only, so values that themselves
	// contain colons are kept whole.
	metaResult := make(map[string]string)
	for _, line := range strings.Split(string(metaStr), "\n") {
		if parts := strings.SplitN(line, ":", 2); len(parts) > 1 {
			metaResult[strings.TrimSpace(parts[0])] = strings.TrimSpace(parts[1])
		}
	}

	pdfData.pdfMetadata.Author = metaResult["Author"]
	pdfData.pdfMetadata.Creator = metaResult["Creator"]
	pdfData.pdfMetadata.CreationDate = metaResult["CreationDate"]
	pdfData.pdfMetadata.PDFFormatVersion = metaResult["PDF version"]

	numberPages, err := strconv.Atoi(metaResult["Pages"])
	if err != nil {
		// Surface malformed output so the caller can log why no page count
		// is available; the metadata gathered above is still returned.
		pdfData.pdfError = "invalid page count in pdfinfo output: " + err.Error()
		return pdfData
	}
	pdfData.pdfPageTotal = numberPages
	return pdfData
}

// GetNumberWords runs pdftotext on the file at path, captures the command's
// standard output and returns how many whitespace-separated words it
// contains.
//
// The command's error is deliberately discarded: whatever output was
// captured (possibly none, giving 0) is what gets counted.
func GetNumberWords(path string) int {
	cmd := exec.Command(
		"pdftotext",
		"-q",
		"-nopgbrk",
		"-enc", "UTF-8",
		"-eol", "unix",
		path,
		"-",
	)
	out, _ := cmd.Output()
	return len(strings.Fields(string(out)))
}