package main

import (
	"math"
	"os/exec"
	"strconv"
	"strings"

	"github.com/sirupsen/logrus"
)

// ProcessPDF gathers metadata for the PDF referenced by message and emits it
// as one structured log entry on logger. When the word-count option is
// enabled and the page count is non-zero, the total word count and the
// average number of words per page are logged as well.
//
// The function calls wg.Done when it returns and holds one slot of
// queueForConcurrent for the duration of the work; the queue is what limits
// how many PDF files are read in parallel (presumably a buffered channel
// sized by the caller — confirm where it is created).
func ProcessPDF(message *GeneralInfo, logger *logrus.Entry) {
	defer wg.Done()

	// Acquire a slot before touching the file and give it back on return.
	queueForConcurrent <- struct{}{}
	defer func() { <-queueForConcurrent }()

	pdfData := GetMetadata(message)
	logger = logger.WithFields(logrus.Fields{
		"pdfMetadata":  pdfData.pdfMetadata,
		"pdfPageTotal": pdfData.pdfPageTotal,
	})

	switch {
	case *withWordCount && pdfData.pdfPageTotal != 0:
		// The page count is known to be non-zero here, so the division
		// below cannot divide by zero.
		pdfWordCount := GetNumberWords(message.path)
		pdfData.pdfWordCount = pdfWordCount
		pdfData.pdfWordByPage = int(math.Floor(float64(pdfWordCount) / float64(pdfData.pdfPageTotal)))
		// pdfError is included so the set of logged fields stays the same
		// as before for consumers of these log entries.
		logger = logger.WithFields(logrus.Fields{
			"pdfWordCount":  pdfData.pdfWordCount,
			"pdfWordByPage": pdfData.pdfWordByPage,
			"pdfError":      pdfData.pdfError,
		})
	case pdfData.pdfError != "":
		logger = logger.WithFields(logrus.Fields{
			"pdfError": pdfData.pdfError,
		})
	}

	// All information travels in the fields; the message itself is empty.
	logger.Info("")
}

// GetMetadata runs the external `pdfinfo` command on message.path and returns
// the parsed result.
//
// On success the Author, Creator, CreationDate and "PDF version" entries of
// the pdfinfo output are copied into pdfMetadata and the "Pages" entry is
// stored in pdfPageTotal. If the command fails, pdfError holds the error text
// and every other field keeps its zero value. If the command succeeds but the
// page count is missing or not a number, the metadata is still filled in,
// pdfPageTotal stays 0 and pdfError explains why.
func GetMetadata(message *GeneralInfo) PDFInfo {
	pdfData := PDFInfo{}

	metaStr, err := exec.Command("pdfinfo", message.path).Output()
	if err != nil {
		// Nothing useful to parse when the command itself failed.
		pdfData.pdfError = err.Error()
		return pdfData
	}

	// pdfinfo prints one "Key: value" pair per line. Split on the first
	// colon only, because values such as dates contain colons themselves.
	metaResult := make(map[string]string)
	for _, line := range strings.Split(string(metaStr), "\n") {
		if parts := strings.SplitN(line, ":", 2); len(parts) > 1 {
			metaResult[strings.TrimSpace(parts[0])] = strings.TrimSpace(parts[1])
		}
	}

	pdfData.pdfMetadata.Author = metaResult["Author"]
	pdfData.pdfMetadata.Creator = metaResult["Creator"]
	pdfData.pdfMetadata.CreationDate = metaResult["CreationDate"]
	pdfData.pdfMetadata.PDFFormatVersion = metaResult["PDF version"]

	pages, ok := metaResult["Pages"]
	if !ok {
		pdfData.pdfError = "pdfinfo output has no Pages field"
		return pdfData
	}
	numberPages, err := strconv.Atoi(pages)
	if err != nil {
		// Report the problem instead of silently pretending the file has
		// zero pages.
		pdfData.pdfError = "invalid page count: " + err.Error()
		return pdfData
	}
	pdfData.pdfPageTotal = numberPages

	return pdfData
}

// GetNumberWords returns the number of whitespace-separated words in the text
// that the external `pdftotext` command extracts from the PDF at path.
func GetNumberWords(path string) int {
	// The command error is deliberately ignored: the signature has no way
	// to report it, so the count is best-effort over whatever text was
	// produced (an empty output yields 0).
	text, _ := exec.Command("pdftotext", "-q", "-nopgbrk", "-enc", "UTF-8", "-eol", "unix", path, "-").Output()
	return len(strings.Fields(string(text)))
}