Newer
Older
sisyphe-go / pdf.go
@Nacim Nacim on 6 Feb 2022 1 KB add xml and flags
package main

import (
	"math"

	"github.com/ledongthuc/pdf"
)

// processPDF fills in the PDF statistics of message (only when the
// withWordCount flag is set) and then passes message to logger.
//
// The function takes a slot on canal before doing any work and gives it back
// when it returns; it also calls wg.Done on return.
func processPDF(message *LogMessage) {
	// canal acts as the queue for PDF reads: the send blocks until a slot is
	// accepted, and the deferred receive frees it again.
	canal <- struct{}{}
	defer func() { <-canal }()
	// Deferred calls run in reverse order, so wg.Done fires before the slot
	// on canal is released.
	defer wg.Done()

	if *withWordCount {
		message.pdf.pdfWordCount, message.pdf.pdfPageTotal, message.pdf.pdfWordByPage = readPdf(message.path)
	}

	logger(message)
}

// readPdf opens the PDF at path and returns three values, in order:
//
//   - the sum of len(row.Content) over every text row of every non-null page,
//   - the page count reported by the reader,
//   - the first value divided by the second, rounded down.
//
// All three values are zero when the file cannot be opened. The per-page
// figure is zero when the reader reports no pages.
//
// NOTE(review): callers treat the first value as a word count, but it counts
// the entries of row.Content; whether one entry equals one word depends on
// the pdf package — confirm.
func readPdf(path string) (int, int, float64) {
	pdfFile, r, err := pdf.Open(path)
	if err != nil {
		// Check the error before deferring Close. pdf.Open can apparently
		// return an open file together with an error (e.g. when the reader
		// cannot be built), so release the handle here instead of leaking it.
		if pdfFile != nil {
			pdfFile.Close()
		}
		return 0, 0, 0
	}
	defer pdfFile.Close()

	pdfPageTotal := r.NumPage()
	pdfWordCount := 0

	// Page numbers are 1-based.
	for pageIndex := 1; pageIndex <= pdfPageTotal; pageIndex++ {
		p := r.Page(pageIndex)
		if p.V.IsNull() {
			continue
		}

		// Best effort: the extraction error is deliberately dropped so that
		// one unreadable page does not discard the counts of the others.
		rows, _ := p.GetTextByRow()
		for _, row := range rows {
			pdfWordCount += len(row.Content)
		}
	}

	// Guard the division explicitly instead of testing for NaN afterwards.
	if pdfPageTotal == 0 {
		return pdfWordCount, pdfPageTotal, 0
	}
	pdfWordByPage := math.Floor(float64(pdfWordCount) / float64(pdfPageTotal))
	return pdfWordCount, pdfPageTotal, pdfWordByPage
}