diff --git a/example/xml/test-default.xml b/example/xml/test-default.xml
index dd5c90d..e6e5b63 100644
--- a/example/xml/test-default.xml
+++ b/example/xml/test-default.xml
@@ -1,8 +1,8 @@
-Tove
-Jani
-Reminder
-Don't forget me this weekend!
+ Tove
+ Jani
+ Reminder
+ Don't forget me this weekend!
\ No newline at end of file
diff --git a/example/xml/test-not-wellformed.xml b/example/xml/test-not-wellformed.xml
index 6997eb1..b36b5a6 100644
--- a/example/xml/test-not-wellformed.xml
+++ b/example/xml/test-not-wellformed.xml
@@ -1,7 +1,8 @@
-
-
-
-
- trezaq
-
-
+
+
+
+ Tove
+ Jani
+ Reminder
+ Don't forget me this weekend!
+
diff --git a/main.go b/main.go
index f63df52..50d5044 100644
--- a/main.go
+++ b/main.go
@@ -22,7 +22,7 @@
)
var start = time.Now()
-var queueForConcurrent = make(chan struct{}, 1250)
+var queueForConcurrent = make(chan struct{}, 1300)
var wg sync.WaitGroup
var numberFiles int = 0
var corpusPath = flag.String("p", "", "Corpus path")
diff --git a/pdf.go b/pdf.go
index 452b8b0..5749287 100644
--- a/pdf.go
+++ b/pdf.go
@@ -27,22 +27,35 @@
pdfData.pdfWordCount = pdfWordCount
pdfData.pdfWordByPage = int(math.Floor(float64(pdfWordCount) / float64(pdfData.pdfPageTotal)))
- }
- logrus.WithFields(logrus.Fields{
- "corpusName": message.corpusName,
- "name": message.name,
- "startAt": message.startAt,
- "extension": message.extension,
- "path": message.path,
- "mimetype": message.mimetype,
- "size": message.size,
- "pdfMetadata": pdfData.pdfMetadata,
- "pdfPageTotal": pdfData.pdfPageTotal,
- "pdfWordCount": pdfData.pdfWordCount,
- "pdfWordByPage": pdfData.pdfWordByPage,
- "pdfError": pdfData.pdfError,
- }).Info("")
+ logrus.WithFields(logrus.Fields{
+ "corpusName": message.corpusName,
+ "name": message.name,
+ "startAt": message.startAt,
+ "extension": message.extension,
+ "path": message.path,
+ "mimetype": message.mimetype,
+ "size": message.size,
+ "pdfMetadata": pdfData.pdfMetadata,
+ "pdfPageTotal": pdfData.pdfPageTotal,
+ "pdfWordCount": pdfData.pdfWordCount,
+ "pdfWordByPage": pdfData.pdfWordByPage,
+ "pdfError": pdfData.pdfError,
+ }).Info("")
+ } else {
+ logrus.WithFields(logrus.Fields{
+ "corpusName": message.corpusName,
+ "name": message.name,
+ "startAt": message.startAt,
+ "extension": message.extension,
+ "path": message.path,
+ "mimetype": message.mimetype,
+ "size": message.size,
+ "pdfMetadata": pdfData.pdfMetadata,
+ "pdfPageTotal": pdfData.pdfPageTotal,
+ "pdfError": pdfData.pdfError,
+ }).Info("")
+ }
incrementProcess()
return
}
diff --git a/pdf_test.go b/pdf_test.go
index e7a5f9a..0f38c99 100644
--- a/pdf_test.go
+++ b/pdf_test.go
@@ -21,7 +21,7 @@
result := getMetadata(&pdfData)
assert.Equal(t, result.pdfMetadata.Author, "manshanden", "author is equal")
assert.Equal(t, result.pdfMetadata.Creator, "PScript5.dll Version 5.2", "Creator is equal")
- assert.Equal(t, result.pdfMetadata.pdfFormatVersion, "1.4", "pdfFormatVersion is equal")
+ assert.Equal(t, result.pdfMetadata.PDFFormatVersion, "1.4", "PDFFormatVersion is equal")
assert.Equal(t, result.pdfError, "", "No error")
}
diff --git a/struct.go b/struct.go
index 9784d9a..e5165a7 100644
--- a/struct.go
+++ b/struct.go
@@ -1,10 +1,10 @@
package main
type MetadataPDF struct {
- Author string
- Creator string
- CreationDate string
- PDFFormatVersion string
+ Author string `json:"author"`
+ Creator string `json:"creator"`
+ CreationDate string `json:"creationDate"`
+ PDFFormatVersion string `json:"PDFFormatVersion"`
}
type MessagePDF struct {
pdfWordCount int
@@ -15,14 +15,19 @@
}
type DoctypeXML struct {
- name string
- pubid string
- sysid string
+ Name string `json:"name"`
+ Pubid string `json:"pubid"`
+ Sysid string `json:"sysid"`
+}
+
+type WellFormedErrorXML struct {
+ Message string `json:"message"`
+ File string `json:"file"`
}
type MessageXML struct {
isWellFormed bool
- wellFormedErrors string
+ wellFormedErrors WellFormedErrorXML
doctype DoctypeXML
metadataXML struct{}
}
diff --git a/xml.go b/xml.go
index 6c883ce..01f7bbd 100644
--- a/xml.go
+++ b/xml.go
@@ -1,9 +1,9 @@
package main
import (
- "encoding/xml"
"io/ioutil"
"os"
+ "os/exec"
"regexp"
"strings"
@@ -20,70 +20,71 @@
defer func() { <-queueForConcurrent }()
xmlData := getXMlData(message)
- logrus.WithFields(logrus.Fields{
- "corpusName": message.corpusName,
- "name": message.name,
- "startAt": message.startAt,
- "extension": message.extension,
- "path": message.path,
- "mimetype": message.mimetype,
- "size": message.size,
- "isWellFormed": xmlData.isWellFormed,
- "doctype": xmlData.doctype.sysid,
- "wellFormedErrors": xmlData.wellFormedErrors,
- }).Info("")
+ if xmlData.wellFormedErrors.Message != "" {
+ logrus.WithFields(logrus.Fields{
+ "corpusName": message.corpusName,
+ "name": message.name,
+ "startAt": message.startAt,
+ "extension": message.extension,
+ "path": message.path,
+ "mimetype": message.mimetype,
+ "size": message.size,
+ "isWellFormed": xmlData.isWellFormed,
+ "wellFormedErrors": xmlData.wellFormedErrors,
+ }).Info("")
+ } else {
+ logrus.WithFields(logrus.Fields{
+ "corpusName": message.corpusName,
+ "name": message.name,
+ "startAt": message.startAt,
+ "extension": message.extension,
+ "path": message.path,
+ "mimetype": message.mimetype,
+ "size": message.size,
+ "isWellFormed": xmlData.isWellFormed,
+ "doctype": xmlData.doctype,
+ }).Info("")
+ }
+
incrementProcess()
return
}
func getXMlData(message *Message) MessageXML {
xmlMessage := MessageXML{isWellFormed: true}
+
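+ // check well-formedness with the external xmlstarlet tool (slow: spawns one process per call); NOTE(review): the exec error is discarded below, so a missing xmlstarlet binary is not reported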
+ result, _ := exec.Command("xmlstarlet", "val", "-w", "-e", message.path).CombinedOutput()
+ if result != nil && strings.Contains(string(result), "invalid") {
+ xmlMessage.isWellFormed = false
+ xmlMessage.wellFormedErrors = WellFormedErrorXML{string(result), message.path}
+ return xmlMessage
+ }
+
// check if able to open
xmlFile, errOpen := os.Open(message.path)
if errOpen != nil {
xmlMessage.isWellFormed = false
- xmlMessage.wellFormedErrors = errOpen.Error()
+ xmlMessage.wellFormedErrors = WellFormedErrorXML{errOpen.Error(), message.path}
return xmlMessage
}
// defer the closing of our xmlFile so that we can parse it later on
defer xmlFile.Close()
// read our opened xmlFile1 as a byte array. here I am checking if the file is valid or not
byteValue, errRead := ioutil.ReadAll(xmlFile)
-
if errRead != nil {
xmlMessage.isWellFormed = false
- xmlMessage.wellFormedErrors = errRead.Error()
+ xmlMessage.wellFormedErrors = WellFormedErrorXML{errRead.Error(), message.path}
return xmlMessage
}
- // check if unmarshal
- errUnmarshal := xml.Unmarshal(byteValue, new(interface{}))
- if errUnmarshal != nil {
- xmlMessage.isWellFormed = false
- xmlMessage.wellFormedErrors = errUnmarshal.Error()
- return xmlMessage
- }
-
- // check with xmlstarlet
- /*exec.Command("xmlstarlet", "val", "-w", "-e", message.path).CombinedOutput()
- if result != nil && strings.Contains(string(result), "invalid") {
- xmlMessage.isWellFormed = false
- xmlMessage.wellFormedErrors = string(result)
- }*/
-
xmlData := string(byteValue)
-
- if !strings.HasPrefix(xmlData, " 0 {
dtd := regexDtd.FindStringSubmatch(string(doctype[0]))
if dtd != nil && len(dtd[0]) > 0 {
- xmlMessage.doctype.sysid = dtd[0]
+ xmlMessage.doctype.Sysid = dtd[0]
}
}
diff --git a/xml_test.go b/xml_test.go
index e37161f..40d075d 100644
--- a/xml_test.go
+++ b/xml_test.go
@@ -19,14 +19,15 @@
func TestValidXML(t *testing.T) {
result := getXMlData(&xmlData)
assert.Equal(t, result.isWellFormed, true, "XML is well formed")
- assert.Equal(t, result.wellFormedErrors, "", "Return empty if xml is not well formed")
- assert.Equal(t, result.doctype.sysid, "Note.dtd", "Doctype is valid")
+ assert.Equal(t, result.wellFormedErrors.Message, "", "Error message is empty if xml is well formed")
+ assert.Equal(t, result.wellFormedErrors.File, "", "Error file is empty if xml is well formed")
+ assert.Equal(t, result.doctype.Sysid, "Note.dtd", "Doctype is valid")
}
func TestInvalidXML(t *testing.T) {
xmlData.path = "./example/xml/test-not-wellformed.xml"
result := getXMlData(&xmlData)
assert.Equal(t, result.isWellFormed, false, "XML is not well formed")
- assert.Equal(t, result.wellFormedErrors, "XML syntax error on line 6: element closed by ", "Return error if xml is not well formed")
- assert.Equal(t, result.doctype.sysid, "", "Not get doctype if xml is invalid")
+ assert.Equal(t, result.wellFormedErrors.File, xmlData.path)
+ assert.Equal(t, result.doctype.Sysid, "", "Not get doctype if xml is invalid")
}