diff --git a/example/xml/test-default.xml b/example/xml/test-default.xml index dd5c90d..e6e5b63 100644 --- a/example/xml/test-default.xml +++ b/example/xml/test-default.xml @@ -1,8 +1,8 @@ -Tove -Jani -Reminder -Don't forget me this weekend! + Tove + Jani + Reminder + Don't forget me this weekend! \ No newline at end of file diff --git a/example/xml/test-not-wellformed.xml b/example/xml/test-not-wellformed.xml index 6997eb1..b36b5a6 100644 --- a/example/xml/test-not-wellformed.xml +++ b/example/xml/test-not-wellformed.xml @@ -1,7 +1,8 @@ - - - - - trezaq - - + + + + Tove + Jani + Reminder + Don't forget me this weekend! + diff --git a/main.go b/main.go index f63df52..50d5044 100644 --- a/main.go +++ b/main.go @@ -22,7 +22,7 @@ ) var start = time.Now() -var queueForConcurrent = make(chan struct{}, 1250) +var queueForConcurrent = make(chan struct{}, 1300) var wg sync.WaitGroup var numberFiles int = 0 var corpusPath = flag.String("p", "", "Corpus path") diff --git a/pdf.go b/pdf.go index 452b8b0..5749287 100644 --- a/pdf.go +++ b/pdf.go @@ -27,22 +27,35 @@ pdfData.pdfWordCount = pdfWordCount pdfData.pdfWordByPage = int(math.Floor(float64(pdfWordCount) / float64(pdfData.pdfPageTotal))) - } - logrus.WithFields(logrus.Fields{ - "corpusName": message.corpusName, - "name": message.name, - "startAt": message.startAt, - "extension": message.extension, - "path": message.path, - "mimetype": message.mimetype, - "size": message.size, - "pdfMetadata": pdfData.pdfMetadata, - "pdfPageTotal": pdfData.pdfPageTotal, - "pdfWordCount": pdfData.pdfWordCount, - "pdfWordByPage": pdfData.pdfWordByPage, - "pdfError": pdfData.pdfError, - }).Info("") + logrus.WithFields(logrus.Fields{ + "corpusName": message.corpusName, + "name": message.name, + "startAt": message.startAt, + "extension": message.extension, + "path": message.path, + "mimetype": message.mimetype, + "size": message.size, + "pdfMetadata": pdfData.pdfMetadata, + "pdfPageTotal": pdfData.pdfPageTotal, + "pdfWordCount": pdfData.pdfWordCount, + "pdfWordByPage": pdfData.pdfWordByPage, + "pdfError": pdfData.pdfError, + }).Info("") + } else { + logrus.WithFields(logrus.Fields{ + "corpusName": message.corpusName, + "name": message.name, + "startAt": message.startAt, + "extension": message.extension, + "path": message.path, + "mimetype": message.mimetype, + "size": message.size, + "pdfMetadata": pdfData.pdfMetadata, + "pdfPageTotal": pdfData.pdfPageTotal, + "pdfError": pdfData.pdfError, + }).Info("") + } incrementProcess() return } diff --git a/pdf_test.go b/pdf_test.go index e7a5f9a..0f38c99 100644 --- a/pdf_test.go +++ b/pdf_test.go @@ -21,7 +21,7 @@ result := getMetadata(&pdfData) assert.Equal(t, result.pdfMetadata.Author, "manshanden", "author is equal") assert.Equal(t, result.pdfMetadata.Creator, "PScript5.dll Version 5.2", "Creator is equal") - assert.Equal(t, result.pdfMetadata.pdfFormatVersion, "1.4", "pdfFormatVersion is equal") + assert.Equal(t, result.pdfMetadata.PDFFormatVersion, "1.4", "PdfFormatVersion is equal") assert.Equal(t, result.pdfError, "", "No error") } diff --git a/struct.go b/struct.go index 9784d9a..e5165a7 100644 --- a/struct.go +++ b/struct.go @@ -1,10 +1,10 @@ package main type MetadataPDF struct { - Author string - Creator string - CreationDate string - PDFFormatVersion string + Author string `json:"author"` + Creator string `json:"creator"` + CreationDate string `json:"creationDate"` + PDFFormatVersion string `json:"PDFFormatVersion"` } type MessagePDF struct { pdfWordCount int @@ -15,14 +15,19 @@ } type DoctypeXML struct { - name string - pubid string - sysid string + Name string `json:"name"` + Pubid string `json:"pubid"` + Sysid string `json:"sysid"` +} + +type WellFormedErrorXML struct { + Message string `json:"message"` + File string `json:"file"` } type MessageXML struct { isWellFormed bool - wellFormedErrors string + wellFormedErrors WellFormedErrorXML doctype DoctypeXML metadataXML struct{} } diff --git a/xml.go b/xml.go index 6c883ce..01f7bbd 100644 --- a/xml.go +++ b/xml.go @@ -1,9 +1,9 @@ package main import ( - "encoding/xml" "io/ioutil" "os" + "os/exec" "regexp" "strings" @@ -20,70 +20,71 @@ defer func() { <-queueForConcurrent }() xmlData := getXMlData(message) - logrus.WithFields(logrus.Fields{ - "corpusName": message.corpusName, - "name": message.name, - "startAt": message.startAt, - "extension": message.extension, - "path": message.path, - "mimetype": message.mimetype, - "size": message.size, - "isWellFormed": xmlData.isWellFormed, - "doctype": xmlData.doctype.sysid, - "wellFormedErrors": xmlData.wellFormedErrors, - }).Info("") + if xmlData.wellFormedErrors.Message != "" { + logrus.WithFields(logrus.Fields{ + "corpusName": message.corpusName, + "name": message.name, + "startAt": message.startAt, + "extension": message.extension, + "path": message.path, + "mimetype": message.mimetype, + "size": message.size, + "isWellFormed": xmlData.isWellFormed, + "wellFormedErrors": xmlData.wellFormedErrors, + }).Info("") + } else { + logrus.WithFields(logrus.Fields{ + "corpusName": message.corpusName, + "name": message.name, + "startAt": message.startAt, + "extension": message.extension, + "path": message.path, + "mimetype": message.mimetype, + "size": message.size, + "isWellFormed": xmlData.isWellFormed, + "doctype": xmlData.doctype, + }).Info("") + } + incrementProcess() return } func getXMlData(message *Message) MessageXML { xmlMessage := MessageXML{isWellFormed: true} + + // check with xmlstarlet (slow) + result, _ := exec.Command("xmlstarlet", "val", "-w", "-e", message.path).CombinedOutput() + if result != nil && strings.Contains(string(result), "invalid") { + xmlMessage.isWellFormed = false + xmlMessage.wellFormedErrors = WellFormedErrorXML{string(result), message.path} + return xmlMessage + } + // check if able to open xmlFile, errOpen := os.Open(message.path) if errOpen != nil { xmlMessage.isWellFormed = false - xmlMessage.wellFormedErrors = errOpen.Error() + xmlMessage.wellFormedErrors = WellFormedErrorXML{errOpen.Error(), message.path} return xmlMessage } // defer the closing of our xmlFile so that we can parse it later on defer xmlFile.Close() // read our opened xmlFile1 as a byte array. here I am checking if the file is valid or not byteValue, errRead := ioutil.ReadAll(xmlFile) - if errRead != nil { xmlMessage.isWellFormed = false - xmlMessage.wellFormedErrors = errRead.Error() + xmlMessage.wellFormedErrors = WellFormedErrorXML{errRead.Error(), message.path} return xmlMessage } - // check if unmarshal - errUnmarshal := xml.Unmarshal(byteValue, new(interface{})) - if errUnmarshal != nil { - xmlMessage.isWellFormed = false - xmlMessage.wellFormedErrors = errUnmarshal.Error() - return xmlMessage - } - - // check with xmlstarlet - /*exec.Command("xmlstarlet", "val", "-w", "-e", message.path).CombinedOutput() - if result != nil && strings.Contains(string(result), "invalid") { - xmlMessage.isWellFormed = false - xmlMessage.wellFormedErrors = string(result) - }*/ - xmlData := string(byteValue) - - if !strings.HasPrefix(xmlData, " 0 { dtd := regexDtd.FindStringSubmatch(string(doctype[0])) if dtd != nil && len(dtd[0]) > 0 { - xmlMessage.doctype.sysid = dtd[0] + xmlMessage.doctype.Sysid = dtd[0] } } diff --git a/xml_test.go b/xml_test.go index e37161f..40d075d 100644 --- a/xml_test.go +++ b/xml_test.go @@ -19,14 +19,15 @@ func TestValidXML(t *testing.T) { result := getXMlData(&xmlData) assert.Equal(t, result.isWellFormed, true, "XML is well formed") - assert.Equal(t, result.wellFormedErrors, "", "Return empty if xml is not well formed") - assert.Equal(t, result.doctype.sysid, "Note.dtd", "Doctype is valid") + assert.Equal(t, result.wellFormedErrors.Message, "", "Return empty if xml is not well formed") + assert.Equal(t, result.wellFormedErrors.File, "", "Return empty if xml is not well formed") + assert.Equal(t, result.doctype.Sysid, "Note.dtd", "Doctype is valid") } func TestInvalidXML(t *testing.T) { xmlData.path = "./example/xml/test-not-wellformed.xml" result := getXMlData(&xmlData) assert.Equal(t, result.isWellFormed, false, "XML is not well formed") - assert.Equal(t, result.wellFormedErrors, "XML syntax error on line 6: element closed by ", "Return error if xml is not well formed") - assert.Equal(t, result.doctype.sysid, "", "Not get doctype if xml is invalid") + assert.Equal(t, result.wellFormedErrors.File, xmlData.path) + assert.Equal(t, result.doctype.Sysid, "", "Not get doctype if xml is invalid") }