Newer
Older
sisyphe-go / pdf.go
@Nacim Nacim on 10 Feb 2022 2 KB add more validation for xml
package main

import (
	"math"
	"os/exec"
	"strconv"
	"strings"

	"github.com/sirupsen/logrus"
)

// MetaResult pairs a metadata key/value map with an error.
// NOTE(review): nothing in the visible part of this file references this
// type; presumably it was meant to carry the outcome of a metadata
// extraction. Confirm it is still used elsewhere.
type MetaResult struct {
	// meta maps a metadata key to its value.
	meta map[string]string
	// err is the error associated with this result, if any.
	err error
}

// processPDF collects the PDF metadata for message, optionally adds word
// statistics, writes one structured log entry describing the file and then
// calls incrementProcess.
//
// It calls wg.Done when it returns and occupies one slot of
// queueForConcurrent for its whole duration.
func processPDF(message *Message) {
	defer wg.Done()

	// Take a slot in the queue to limit the number of PDF files read in
	// parallel; the slot is handed back when the function returns.
	queueForConcurrent <- struct{}{}
	defer func() { <-queueForConcurrent }()

	info := getMetadata(message)

	// Word statistics are computed only when requested and when the page
	// count is non-zero, which also keeps the division below well defined.
	if *withWordCount && info.pdfPageTotal != 0 {
		words := getNumberWords(message.path)
		wordsPerPage := float64(words) / float64(info.pdfPageTotal)

		info.pdfWordCount = words
		info.pdfWordByPage = int(math.Floor(wordsPerPage))
	}

	fields := logrus.Fields{
		"corpusName":    message.corpusName,
		"name":          message.name,
		"startAt":       message.startAt,
		"extension":     message.extension,
		"path":          message.path,
		"mimetype":      message.mimetype,
		"size":          message.size,
		"pdfMetadata":   info.pdfMetadata,
		"pdfPageTotal":  info.pdfPageTotal,
		"pdfWordCount":  info.pdfWordCount,
		"pdfWordByPage": info.pdfWordByPage,
		"pdfError":      info.pdfError,
	}
	logrus.WithFields(fields).Info("")

	incrementProcess()
}

// getMetadata runs the external `pdfinfo` command on message.path and returns
// a MessagePDF filled from its "Key: value" output.
//
// When the command succeeds, the "Author", "Creator", "CreationDate" and
// "PDF version" entries are copied into pdfMetadata and the "Pages" entry is
// parsed into pdfPageTotal. If "Pages" is missing or is not a valid integer,
// pdfPageTotal receives the value strconv.Atoi returned alongside its error
// and pdfError records that error instead of it being silently dropped.
//
// When the command fails, only pdfError is set (to the command error's text)
// and the output is not parsed.
func getMetadata(message *Message) MessagePDF {
	pdfData := MessagePDF{}

	metaStr, err := exec.Command("pdfinfo", message.path).Output()
	if err != nil {
		// Nothing from a failed run is used, so skip parsing altogether.
		pdfData.pdfError = err.Error()
		return pdfData
	}

	// Parse meta output: one "Key: value" pair per line.
	metaResult := make(map[string]string)
	for _, line := range strings.Split(string(metaStr), "\n") {
		// Split on the first colon only, so colons inside a value are kept.
		if parts := strings.SplitN(line, ":", 2); len(parts) > 1 {
			metaResult[strings.TrimSpace(parts[0])] = strings.TrimSpace(parts[1])
		}
	}

	pdfData.pdfMetadata.Author = metaResult["Author"]
	pdfData.pdfMetadata.Creator = metaResult["Creator"]
	pdfData.pdfMetadata.CreationDate = metaResult["CreationDate"]
	pdfData.pdfMetadata.pdfFormatVersion = metaResult["PDF version"]

	numberPages, err := strconv.Atoi(metaResult["Pages"])
	if err != nil {
		// Surface an unusable page count rather than reporting a clean result.
		pdfData.pdfError = "pdfinfo page count: " + err.Error()
	}
	pdfData.pdfPageTotal = numberPages

	return pdfData
}

// getNumberWords returns the number of whitespace-separated words in the text
// that the external `pdftotext` command writes to standard output for the PDF
// at path.
func getNumberWords(path string) int {
	cmd := exec.Command("pdftotext", "-q", "-nopgbrk", "-enc", "UTF-8", "-eol", "unix", path, "-")

	// The command error is intentionally ignored: the signature has no way
	// to report it, so whatever was captured from stdout (possibly nothing)
	// is counted as-is.
	extracted, _ := cmd.Output()

	return len(strings.Fields(string(extracted)))
}