Newer
Older
sisyphe-go / pdf.go
package main

import (
	"math"
	"os/exec"
	"strconv"
	"strings"
)

// MetaResult bundles a metadata key/value map with an error value.
type MetaResult struct {
	// meta maps a metadata field name to its value.
	// NOTE(review): presumably pdfinfo fields such as "Author" — confirm against its users.
	meta map[string]string
	// err is the error associated with this result, if any.
	// NOTE(review): nothing in this file's visible code sets it — confirm it is still needed.
	err  error
}

// processPDF gathers PDF information for message.path, stores it on
// message.pdf, and passes the message to writeLog.
//
// Metadata comes from getMetadata: on success the Author, Creator,
// CreationDate, PDF version and page count fields are filled in; on failure
// only pdfError is set to the error text. When *withWordCount is true the
// word count from getNumberWords is recorded too, along with the floored
// words-per-page average (0 when the page count is not positive).
//
// The function signals wg.Done() on return, so the caller is expected to
// have added it to wg beforehand.
func processPDF(message *LogMessage) {
	// Take a slot on canal before reading the PDF and release it on return.
	// NOTE(review): canal looks like a buffered channel used to cap the
	// number of concurrent PDF reads — confirm at its declaration.
	canal <- struct{}{}
	defer func() { <-canal }()
	defer wg.Done()

	metadata, err := getMetadata(message.path)
	if err != nil {
		message.pdf.pdfError = err.Error()
	} else {
		message.pdf.Author = metadata["Author"]
		message.pdf.Creator = metadata["Creator"]
		message.pdf.CreationDate = metadata["CreationDate"]
		message.pdf.pdfFormatVersion = metadata["PDF version"]
		// The Atoi error is deliberately ignored: a missing or non-numeric
		// "Pages" entry is recorded as 0 pages.
		numberPages, _ := strconv.Atoi(metadata["Pages"])
		message.pdf.pdfPageTotal = numberPages
	}

	if *withWordCount {
		pdfWordCount := getNumberWords(message.path)
		message.pdf.pdfWordCount = pdfWordCount

		// Only divide by a positive page count. With 0 pages (metadata
		// failure or unparsable "Pages") the float division yields Inf/NaN,
		// and converting that to int is implementation-dependent.
		pdfWordByPage := 0
		if pages := message.pdf.pdfPageTotal; pages > 0 {
			pdfWordByPage = int(math.Floor(float64(pdfWordCount) / float64(pages)))
		}
		message.pdf.pdfWordByPage = pdfWordByPage
	}

	writeLog(message)
}

// getMetadata runs the external pdfinfo command on path and parses its
// standard output into a map.
//
// Each output line is split at its first colon; the key and the value are
// trimmed of surrounding whitespace, and lines without a colon are skipped.
// Because only the first colon splits, values that themselves contain colons
// are kept whole.
//
// If pdfinfo cannot be started or exits unsuccessfully, the error from
// os/exec is returned with a nil map, so no partial metadata is exposed
// alongside an error.
func getMetadata(path string) (map[string]string, error) {
	out, err := exec.Command("pdfinfo", path).Output()
	if err != nil {
		return nil, err
	}

	meta := make(map[string]string)
	for _, line := range strings.Split(string(out), "\n") {
		parts := strings.SplitN(line, ":", 2)
		if len(parts) < 2 {
			// Not a "Key: value" line (e.g. the trailing empty line).
			continue
		}
		meta[strings.TrimSpace(parts[0])] = strings.TrimSpace(parts[1])
	}
	return meta, nil
}

// getNumberWords runs the external pdftotext command on path, writing the
// extracted text to standard output ("-"), and returns how many
// whitespace-separated fields that output contains.
func getNumberWords(path string) int {
	args := []string{"-q", "-nopgbrk", "-enc", "UTF-8", "-eol", "unix", path, "-"}

	// The command error is intentionally discarded: whatever reached stdout
	// is counted, which is nothing (0 words) when no output was produced.
	extracted, _ := exec.Command("pdftotext", args...).Output()

	return len(strings.Fields(string(extracted)))
}