diff --git a/docker-compose.yml b/docker-compose.yml index d9c3c43..cc07ce8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,6 +1,9 @@ version: '3.7' services: go: + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} build: context: . args: diff --git a/go.mod b/go.mod index a680e7e..6b71083 100644 --- a/go.mod +++ b/go.mod @@ -5,17 +5,14 @@ require ( github.com/TwiN/go-color v1.1.0 github.com/gabriel-vasile/mimetype v1.4.0 - github.com/lestrrat-go/libxml2 v0.0.0-20201123224832-e6d9de61b80d github.com/sirupsen/logrus v1.8.1 github.com/stretchr/testify v1.6.1 ) require ( github.com/davecgh/go-spew v1.1.1 // indirect - github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd // indirect golang.org/x/sys v0.0.0-20220204135822-1c1b9b1eba6a // indirect - gopkg.in/xmlpath.v1 v1.0.0-20140413065638-a146725ea6e7 // indirect gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c // indirect ) diff --git a/go.sum b/go.sum index 552a259..7a26493 100644 --- a/go.sum +++ b/go.sum @@ -5,10 +5,6 @@ github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/gabriel-vasile/mimetype v1.4.0 h1:Cn9dkdYsMIu56tGho+fqzh7XmvY2YyGU0FnbhiOsEro= github.com/gabriel-vasile/mimetype v1.4.0/go.mod h1:fA8fi6KUiG7MgQQ+mEWotXoEOvmxRtOJlERCzSmRvr8= -github.com/lestrrat-go/libxml2 v0.0.0-20201123224832-e6d9de61b80d h1:7uUkdtm6TC3olmG0I9lIAwBJQianl8YT5H8zcw6Mkpk= -github.com/lestrrat-go/libxml2 v0.0.0-20201123224832-e6d9de61b80d/go.mod h1:fy/ZVbgyB83mtricxwSW3zqIRXWOVpKG2PvdUDFeC58= -github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= -github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/sirupsen/logrus v1.8.1 h1:dJKuHgqk1NNQlqoA6BTlM1Wf9DOH3NBjQyu0h9+AZZE= @@ -34,7 +30,5 @@ golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/xmlpath.v1 v1.0.0-20140413065638-a146725ea6e7 h1:zibSPXbkfB1Dwl76rJgLa68xcdHu42qmFTe6vAnU4wA= -gopkg.in/xmlpath.v1 v1.0.0-20140413065638-a146725ea6e7/go.mod h1:wo0SW5T6XqIKCCAge330Cd5sm+7VI6v85OrQHIk50KM= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/main.go b/main.go index ab8b80a..6c3ce26 100644 --- a/main.go +++ b/main.go @@ -20,35 +20,6 @@ "github.com/sirupsen/logrus" ) -type LogMessagePDF struct { - pdfWordCount int - pdfPageTotal int - pdfWordByPage int - Author string - Creator string - CreationDate string - pdfFormatVersion string - pdfError string -} - -type LogMessageXML struct { - isWellFormed bool - doctype string - xmlError string -} -type LogMessage struct { - corpusName string - name string - startAt string - extension string - path string - mimetype string - mimeEncoding string - size int64 - pdf LogMessagePDF - xml LogMessageXML -} - var queueForConcurrent = make(chan struct{}, 1111) var wg sync.WaitGroup var numberFiles int = 0 diff --git a/pdf.go b/pdf.go index 89b9057..ddafc3a 100644 --- a/pdf.go +++ b/pdf.go @@ -20,37 +20,36 @@ queueForConcurrent <- struct{}{} defer func() { <-queueForConcurrent }() - metadata := getMetadata(message) + pdfData := getMetadata(message) - if *withWordCount == true && metadata.pdf.pdfPageTotal != 0 { + if *withWordCount == true && pdfData.pdfPageTotal != 0 { pdfWordCount := getNumberWords(message.path) message.pdf.pdfWordCount = pdfWordCount - message.pdf.pdfWordByPage = int(math.Floor(float64(pdfWordCount) / float64(metadata.pdf.pdfPageTotal))) + message.pdf.pdfWordByPage = int(math.Floor(float64(pdfWordCount) / float64(pdfData.pdfPageTotal))) } logrus.WithFields(logrus.Fields{ - "corpusName": message.corpusName, - "name": message.name, - "startAt": message.startAt, - "extension": message.extension, - "path": message.path, - "mimetype": message.mimetype, - "size": message.size, - "Author": message.pdf.Author, - "Creator": message.pdf.Creator, - "CreationDate": message.pdf.CreationDate, - "pdfFormatVersion": message.pdf.pdfFormatVersion, - "pdfPageTotal": message.pdf.pdfPageTotal, - "pdfWordCount": message.pdf.pdfWordCount, - "pdfWordByPage": message.pdf.pdfWordByPage, - "pdfError": message.pdf.pdfError, + "corpusName": message.corpusName, + "name": message.name, + "startAt": message.startAt, + "extension": message.extension, + "path": message.path, + "mimetype": message.mimetype, + "size": message.size, + "pdfMetadata": pdfData.pdfMetadata, + "pdfPageTotal": pdfData.pdfPageTotal, + "pdfWordCount": pdfData.pdfWordCount, + "pdfWordByPage": pdfData.pdfWordByPage, + "pdfError": pdfData.pdfError, }).Info("") incrementProcess() return } -func getMetadata(message *LogMessage) *LogMessage { +// return LogMessage with metadata info +func getMetadata(message *LogMessage) LogMessagePDF { + pdfData := LogMessagePDF{} metaResult := make(map[string]string) metaStr, err := exec.Command("pdfinfo", message.path).Output() @@ -62,18 +61,19 @@ } if err == nil { - message.pdf.Author = metaResult["Author"] - message.pdf.Creator = metaResult["Creator"] - message.pdf.CreationDate = metaResult["CreationDate"] - message.pdf.pdfFormatVersion = metaResult["PDF version"] + pdfData.pdfMetadata.Author = metaResult["Author"] + pdfData.pdfMetadata.Creator = metaResult["Creator"] + pdfData.pdfMetadata.CreationDate = metaResult["CreationDate"] + pdfData.pdfMetadata.pdfFormatVersion = metaResult["PDF version"] numberPages, _ := strconv.Atoi(metaResult["Pages"]) - message.pdf.pdfPageTotal = numberPages + pdfData.pdfPageTotal = numberPages } else { - message.pdf.pdfError = err.Error() + pdfData.pdfError = err.Error() } - return message + return pdfData } +// return number word in pdf func getNumberWords(path string) int { text, _ := exec.Command("pdftotext", "-q", "-nopgbrk", "-enc", "UTF-8", "-eol", "unix", path, "-").Output() pdfWordCount := len(strings.Fields(string(text))) diff --git a/pdf_test.go b/pdf_test.go index 3987bf3..8a03cb4 100644 --- a/pdf_test.go +++ b/pdf_test.go @@ -19,10 +19,10 @@ // Test metadata func TestMetadata(t *testing.T) { result := getMetadata(&pdfData) - assert.Equal(t, result.pdf.Author, "manshanden", "author is equal") - assert.Equal(t, result.pdf.Creator, "PScript5.dll Version 5.2", "Creator is equal") - assert.Equal(t, result.pdf.pdfFormatVersion, "1.4", "pdfFormatVersion is equal") - assert.Equal(t, result.pdf.pdfError, "", "No error") + assert.Equal(t, result.pdfMetadata.Author, "manshanden", "author is equal") + assert.Equal(t, result.pdfMetadata.Creator, "PScript5.dll Version 5.2", "Creator is equal") + assert.Equal(t, result.pdfMetadata.pdfFormatVersion, "1.4", "pdfFormatVersion is equal") + assert.Equal(t, result.pdfError, "", "No error") } // test number word diff --git a/struct.go b/struct.go new file mode 100644 index 0000000..e9705b5 --- /dev/null +++ b/struct.go @@ -0,0 +1,40 @@ +package main + +type LogMetadataPDF struct { + Author string + Creator string + CreationDate string + pdfFormatVersion string +} +type LogMessagePDF struct { + pdfWordCount int + pdfPageTotal int + pdfWordByPage int + pdfMetadata LogMetadataPDF + pdfError string +} + +type DoctypeXML struct { + name string + pubid string + sysid string +} + +type LogMessageXML struct { + isWellFormed bool + wellFormedErrors string + doctype DoctypeXML +} + +type LogMessage struct { + corpusName string + name string + startAt string + extension string + path string + mimetype string + mimeEncoding string + size int64 + pdf LogMessagePDF + xml LogMessageXML +} diff --git a/xml.go b/xml.go index 356c866..5f622ab 100644 --- a/xml.go +++ b/xml.go @@ -10,6 +10,7 @@ ) var regexDoctype = regexp.MustCompile("") +var regexDtd = regexp.MustCompile("[a-zA-Z0-9-]*.dtd") func processXML(message *LogMessage) { // queue for read xml (limit number of parallel read files) @@ -17,27 +18,30 @@ queueForConcurrent <- struct{}{} defer func() { <-queueForConcurrent }() - isWellFormed, doctype, errorXML := getXMlData(message.path) + xmlData := getXMlData(message) logrus.WithFields(logrus.Fields{ - "corpusName": message.corpusName, - "name": message.name, - "startAt": message.startAt, - "extension": message.extension, - "path": message.path, - "mimetype": message.mimetype, - "size": message.size, - "isWellFormed": isWellFormed, - "doctype": doctype, - "xmlError": errorXML, + "corpusName": message.corpusName, + "name": message.name, + "startAt": message.startAt, + "extension": message.extension, + "path": message.path, + "mimetype": message.mimetype, + "size": message.size, + "isWellFormed": xmlData.isWellFormed, + "doctype": xmlData.doctype, + "wellFormedErrors": xmlData.wellFormedErrors, }).Info("") incrementProcess() return } -func getXMlData(path string) (bool, string, string) { - xmlFile, errOpen := os.Open(path) +func getXMlData(message *LogMessage) LogMessageXML { + xmlData := LogMessageXML{isWellFormed: true} + xmlFile, errOpen := os.Open(message.path) if errOpen != nil { - return false, "", errOpen.Error() + xmlData.isWellFormed = false + xmlData.wellFormedErrors = errOpen.Error() + return xmlData } // defer the closing of our xmlFile so that we can parse it later on defer xmlFile.Close() @@ -45,19 +49,27 @@ byteValue, errRead := ioutil.ReadAll(xmlFile) if errRead != nil { - return false, "", errRead.Error() + xmlData.isWellFormed = false + xmlData.wellFormedErrors = errRead.Error() + return xmlData } - errorUnmarshal := xml.Unmarshal(byteValue, new(interface{})) - if errorUnmarshal != nil { - return false, "", errorUnmarshal.Error() + errUnmarshal := xml.Unmarshal(byteValue, new(interface{})) + if errUnmarshal != nil { + xmlData.isWellFormed = false + xmlData.wellFormedErrors = errUnmarshal.Error() + return xmlData } - // xmlData + // get doctype from xml doctype := regexDoctype.FindStringSubmatch(string(byteValue)) - if len(doctype[0]) > 0 { - return true, doctype[0], "" + if doctype != nil && len(doctype[0]) > 0 { + dtd := regexDtd.FindStringSubmatch(string(doctype[0])) + if dtd != nil && len(dtd[0]) > 0 { + xmlData.doctype.sysid = dtd[0] + } + return xmlData } - return true, "", "" + return xmlData } diff --git a/xml_test.go b/xml_test.go index b1bd5e2..427a337 100644 --- a/xml_test.go +++ b/xml_test.go @@ -17,13 +17,16 @@ } func TestValidXML(t *testing.T) { - result, _, err := getXMlData("./example/xml/test-default.xml") - assert.Equal(t, result, true, "XML is well formed") - assert.Equal(t, err, "", "Return empty if xml is not well formed") + result := getXMlData(&xmlData) + assert.Equal(t, result.isWellFormed, true, "XML is well formed") + assert.Equal(t, result.wellFormedErrors, "", "Return empty if xml is not well formed") + assert.Equal(t, result.doctype.sysid, "mydoctype.dtd", "Doctype is valid") } -func TestInValidXML(t *testing.T) { - result, _, err := getXMlData("./example/xml/test-not-wellformed.xml") - assert.Equal(t, result, false, "XML is not well formed") - assert.Equal(t, err, "XML syntax error on line 6: element closed by ", "Return error if xml is not well formed") +func TestInvalidXML(t *testing.T) { + xmlData.path = "./example/xml/test-not-wellformed.xml" + result := getXMlData(&xmlData) + assert.Equal(t, result.isWellFormed, false, "XML is not well formed") + assert.Equal(t, result.wellFormedErrors, "XML syntax error on line 6: element closed by ", "Return error if xml is not well formed") + assert.Equal(t, result.doctype.sysid, "", "Not get doctype if xml is invalid") } \ No newline at end of file