Newer
Older
sisyphe-go / pdf.go
@Nacim Nacim on 9 Feb 2022 2 KB add mime encoding
package main

import (
	"math"
	"os/exec"
	"strconv"
	"strings"

	"github.com/sirupsen/logrus"
)

// MetaResult pairs a set of string key/value metadata entries with an error.
// NOTE(review): nothing in the visible part of this file references this
// type — confirm it is still used elsewhere before relying on it.
type MetaResult struct {
	// meta holds the metadata entries, keyed by name.
	meta map[string]string
	// err is the error carried alongside the metadata.
	err  error
}

// processPDF gathers the PDF details for message and writes them out as a
// single structured logrus entry.
//
// It signals wg when it returns and holds one slot of queueForConcurrent for
// the duration of the call, which limits how many PDFs are read in parallel.
// (Presumably wg is a sync.WaitGroup and queueForConcurrent a buffered
// channel declared elsewhere in the package — verify against their
// declarations.)
//
// When the word-count option is enabled and the document reports at least one
// page, the total word count and the (floored) average number of words per
// page are recorded as well. incrementProcess is called once the entry has
// been logged.
func processPDF(message *LogMessage) {
	defer wg.Done()

	// Take a slot before touching the file; hand it back on the way out.
	queueForConcurrent <- struct{}{}
	defer func() { <-queueForConcurrent }()

	info := getMetadata(message)

	// A page total of zero means there is nothing to average over, so the
	// (comparatively expensive) text extraction is skipped in that case.
	if *withWordCount && info.pdf.pdfPageTotal != 0 {
		words := getNumberWords(message.path)
		perPage := math.Floor(float64(words) / float64(info.pdf.pdfPageTotal))

		message.pdf.pdfWordCount = words
		message.pdf.pdfWordByPage = int(perPage)
	}

	entry := logrus.Fields{
		// Generic file information.
		"corpusName": message.corpusName,
		"name":       message.name,
		"startAt":    message.startAt,
		"extension":  message.extension,
		"path":       message.path,
		"mimetype":   message.mimetype,
		"size":       message.size,
		// PDF-specific information.
		"Author":           message.pdf.Author,
		"Creator":          message.pdf.Creator,
		"CreationDate":     message.pdf.CreationDate,
		"pdfFormatVersion": message.pdf.pdfFormatVersion,
		"pdfPageTotal":     message.pdf.pdfPageTotal,
		"pdfWordCount":     message.pdf.pdfWordCount,
		"pdfWordByPage":    message.pdf.pdfWordByPage,
		"pdfError":         message.pdf.pdfError,
	}
	logrus.WithFields(entry).Info("")

	incrementProcess()
}

// getMetadata runs the external `pdfinfo` command on message.path and copies
// the values it reports into message.pdf.
//
// On success the Author, Creator, CreationDate, pdfFormatVersion and
// pdfPageTotal fields are filled from the "Author", "Creator",
// "CreationDate", "PDF version" and "Pages" entries of the output. If the
// "Pages" entry is missing or is not a valid integer, pdfPageTotal is set to 0
// and the problem is recorded in pdfError instead of being silently dropped.
//
// If pdfinfo cannot be started or exits unsuccessfully, only pdfError is set
// (to the error text) and the other fields are left untouched.
//
// The returned pointer is the same message that was passed in.
func getMetadata(message *LogMessage) *LogMessage {
	output, err := exec.Command("pdfinfo", message.path).Output()
	if err != nil {
		// Output of a failed run is not used, so there is no point parsing it.
		message.pdf.pdfError = err.Error()
		return message
	}

	fields := parsePDFInfo(string(output))

	message.pdf.Author = fields["Author"]
	message.pdf.Creator = fields["Creator"]
	message.pdf.CreationDate = fields["CreationDate"]
	message.pdf.pdfFormatVersion = fields["PDF version"]

	rawPages := fields["Pages"]
	pages, err := strconv.Atoi(rawPages)
	if err != nil {
		// Surface the malformed value: otherwise a document whose page count
		// could not be read is indistinguishable from a valid 0-page result.
		pages = 0
		message.pdf.pdfError = "pdfinfo: invalid page count " + strconv.Quote(rawPages)
	}
	message.pdf.pdfPageTotal = pages

	return message
}

// parsePDFInfo converts "Key: value" lines into a map, trimming surrounding
// whitespace from both key and value; lines without a colon are ignored.
func parsePDFInfo(output string) map[string]string {
	fields := make(map[string]string)
	for _, line := range strings.Split(output, "\n") {
		// Split on the first colon only so that values which themselves
		// contain colons (e.g. timestamps) are kept whole.
		parts := strings.SplitN(line, ":", 2)
		if len(parts) < 2 {
			continue
		}
		fields[strings.TrimSpace(parts[0])] = strings.TrimSpace(parts[1])
	}
	return fields
}

// getNumberWords runs the external `pdftotext` command on the file at path and
// returns the number of whitespace-separated words found in what the command
// writes to its standard output.
//
// Any error from running the command is ignored: the count is computed from
// whatever output was captured, which yields 0 when nothing was produced.
func getNumberWords(path string) int {
	cmd := exec.Command(
		"pdftotext",
		"-q", "-nopgbrk",
		"-enc", "UTF-8",
		"-eol", "unix",
		path, "-",
	)

	// The error is intentionally discarded; see the function comment.
	extracted, _ := cmd.Output()

	return len(strings.Fields(string(extracted)))
}