package main

import (
	"math"
	"os/exec"
	"strconv"
	"strings"

	"github.com/sirupsen/logrus"
)

// MetaResult pairs a map of metadata key/value strings with an error.
type MetaResult struct {
	meta map[string]string
	err  error
}

// processPDF gathers the PDF metadata for message (and, when the
// withWordCount flag is set and the page count is non-zero, the word
// statistics), then emits one structured log entry describing the file.
//
// It marks wg as done on return and holds a slot in queueForConcurrent
// while it runs, which limits the number of PDF files read in parallel.
func processPDF(message *Message) {
	defer wg.Done()

	// Take a slot in the queue; the deferred receive frees it on return.
	queueForConcurrent <- struct{}{}
	defer func() { <-queueForConcurrent }()

	pdfData := getMetadata(message)

	// A zero page count means the metadata step failed (or the document is
	// empty), so there is nothing to divide the word count by.
	if *withWordCount && pdfData.pdfPageTotal != 0 {
		pdfWordCount, err := countPDFWords(message.path)
		if err != nil {
			// Surface the failure in the log entry instead of reporting a
			// silent, misleading word count.
			pdfData.pdfError = "pdftotext: " + err.Error()
		}
		pdfData.pdfWordCount = pdfWordCount
		pdfData.pdfWordByPage = int(math.Floor(float64(pdfWordCount) / float64(pdfData.pdfPageTotal)))
	}

	logrus.WithFields(logrus.Fields{
		"corpusName":    message.corpusName,
		"name":          message.name,
		"startAt":       message.startAt,
		"extension":     message.extension,
		"path":          message.path,
		"mimetype":      message.mimetype,
		"size":          message.size,
		"pdfMetadata":   pdfData.pdfMetadata,
		"pdfPageTotal":  pdfData.pdfPageTotal,
		"pdfWordCount":  pdfData.pdfWordCount,
		"pdfWordByPage": pdfData.pdfWordByPage,
		"pdfError":      pdfData.pdfError,
	}).Info("")

	// NOTE(review): presumably advances a global progress counter; defined
	// outside this file.
	incrementProcess()
}

// getMetadata runs `pdfinfo` on message.path and returns a MessagePDF filled
// with the author, creator, creation date, PDF version and page count it
// reports.
//
// If the command fails, or the reported page count is not a valid integer,
// pdfError is set and pdfPageTotal is left at zero.
func getMetadata(message *Message) MessagePDF {
	pdfData := MessagePDF{}

	metaStr, err := exec.Command("pdfinfo", message.path).Output()
	if err != nil {
		pdfData.pdfError = err.Error()
		return pdfData
	}

	metaResult := parsePDFInfoOutput(string(metaStr))
	pdfData.pdfMetadata.Author = metaResult["Author"]
	pdfData.pdfMetadata.Creator = metaResult["Creator"]
	pdfData.pdfMetadata.CreationDate = metaResult["CreationDate"]
	pdfData.pdfMetadata.PDFFormatVersion = metaResult["PDF version"]

	numberPages, err := strconv.Atoi(metaResult["Pages"])
	if err != nil {
		// Report a missing or malformed "Pages" entry rather than silently
		// treating the document as having zero pages.
		pdfData.pdfError = "pdfinfo: invalid page count: " + err.Error()
		return pdfData
	}
	pdfData.pdfPageTotal = numberPages

	return pdfData
}

// parsePDFInfoOutput turns "Key: value" lines into a map. Each line is split
// on its first colon only, so values that themselves contain colons (such as
// dates) are kept whole; lines without a colon are skipped.
func parsePDFInfoOutput(output string) map[string]string {
	fields := make(map[string]string)
	for _, line := range strings.Split(output, "\n") {
		parts := strings.SplitN(line, ":", 2)
		if len(parts) < 2 {
			continue
		}
		fields[strings.TrimSpace(parts[0])] = strings.TrimSpace(parts[1])
	}
	return fields
}

// getNumberWords returns the number of whitespace-separated words that
// `pdftotext` extracts from the PDF at path.
//
// The signature cannot carry an error, so an extraction failure is dropped
// here and the count of whatever text was produced is returned; callers that
// need the error should use countPDFWords.
func getNumberWords(path string) int {
	pdfWordCount, _ := countPDFWords(path)
	return pdfWordCount
}

// countPDFWords runs `pdftotext` on path, writing UTF-8 text to stdout, and
// counts the whitespace-separated words in that output. The count reflects
// whatever text was produced even when the returned error is non-nil.
func countPDFWords(path string) (int, error) {
	text, err := exec.Command(
		"pdftotext", "-q", "-nopgbrk", "-enc", "UTF-8", "-eol", "unix", path, "-",
	).Output()
	return len(strings.Fields(string(text))), err
}