package main import ( "math" "os/exec" "strconv" "strings" "github.com/sirupsen/logrus" ) type MetaResult struct { meta map[string]string err error } func processPDF(message *LogMessage) { // queue for read pdf (limit number of parallel read files) defer wg.Done() queueForConcurrent <- struct{}{} defer func() { <-queueForConcurrent }() metadata := getMetadata(message) if *withWordCount == true && metadata.pdf.pdfPageTotal != 0 { pdfWordCount := getNumberWords(message.path) message.pdf.pdfWordCount = pdfWordCount message.pdf.pdfWordByPage = int(math.Floor(float64(pdfWordCount) / float64(metadata.pdf.pdfPageTotal))) } logrus.WithFields(logrus.Fields{ "corpusName": message.corpusName, "name": message.name, "startAt": message.startAt, "extension": message.extension, "path": message.path, "mimetype": message.mimetype, "size": message.size, "Author": message.pdf.Author, "Creator": message.pdf.Creator, "CreationDate": message.pdf.CreationDate, "pdfFormatVersion": message.pdf.pdfFormatVersion, "pdfPageTotal": message.pdf.pdfPageTotal, "pdfWordCount": message.pdf.pdfWordCount, "pdfWordByPage": message.pdf.pdfWordByPage, "pdfError": message.pdf.pdfError, }).Info("") incrementProcess() return } func getMetadata(message *LogMessage) *LogMessage { metaResult := make(map[string]string) metaStr, err := exec.Command("pdfinfo", message.path).Output() // Parse meta output for _, line := range strings.Split(string(metaStr), "\n") { if parts := strings.SplitN(line, ":", 2); len(parts) > 1 { metaResult[strings.TrimSpace(parts[0])] = strings.TrimSpace(parts[1]) } } if err == nil { message.pdf.Author = metaResult["Author"] message.pdf.Creator = metaResult["Creator"] message.pdf.CreationDate = metaResult["CreationDate"] message.pdf.pdfFormatVersion = metaResult["PDF version"] numberPages, _ := strconv.Atoi(metaResult["Pages"]) message.pdf.pdfPageTotal = numberPages } else { message.pdf.pdfError = err.Error() } return message } func getNumberWords(path string) int { text, _ := exec.Command("pdftotext", "-q", "-nopgbrk", "-enc", "UTF-8", "-eol", "unix", path, "-").Output() pdfWordCount := len(strings.Fields(string(text))) return pdfWordCount }