diff --git a/main.go b/main.go index 03ccec6..bd5dbf1 100644 --- a/main.go +++ b/main.go @@ -42,7 +42,7 @@ var configDetailledAnalyze ConfigDetailledAnalyze -func getAllFiles(dir string) { +func GetAllFiles(dir string) { defer wg.Done() queueForConcurrent <- struct{}{} defer func() { <-queueForConcurrent }() @@ -55,7 +55,7 @@ } if file.IsDir() && path != dir { wg.Add(1) - go getAllFiles(path) + go GetAllFiles(path) return filepath.SkipDir } if file.Mode().IsRegular() { @@ -74,7 +74,7 @@ } absolutePath, err2 := filepath.Abs(path) if err == nil && err2 == nil { - fileData := Message{ + fileData := GeneralInfo{ corpusName: *corpusName, name: file.Name(), startAt: file.ModTime().Unix(), @@ -84,32 +84,33 @@ mimeEncoding: mimeEncoding, size: file.Size(), } + logger := logrus.WithFields(logrus.Fields{ + "corpusName": fileData.corpusName, + "name": fileData.name, + "startAt": fileData.startAt, + "extension": fileData.extension, + "path": fileData.path, + "mimetype": fileData.mimetype, + "mimeEncoding": fileData.mimeEncoding, + "size": fileData.size, + }) if *configurationFolder == "" { if fileData.mimetype == "application/pdf" { wg.Add(1) - go processPDF(&fileData) + go ProcessPDF(&fileData, logger) numberFiles++ } else if fileData.mimetype == "application/xml" || fileData.mimetype == "text/xml" { wg.Add(1) - go processXML(&fileData) + go ProcessXML(&fileData, logger) numberFiles++ } else { - logrus.WithFields(logrus.Fields{ - "corpusName": fileData.corpusName, - "name": fileData.name, - "startAt": fileData.startAt, - "extension": fileData.extension, - "path": fileData.path, - "mimetype": fileData.mimetype, - "mimeEncoding": fileData.mimeEncoding, - "size": fileData.size, - }).Info("") + logger.Info("") numberFiles++ } } else { if fileData.mimetype == "application/xml" || fileData.mimetype == "text/xml" { wg.Add(1) - go processXML(&fileData) + go ProcessXML(&fileData, logger) numberFiles++ } } @@ -121,7 +122,7 @@ filepath.Walk(dir, visit) } -func initProcess() { +func InitProcess() { // init args flag.Parse() @@ -130,7 +131,7 @@ fmt.Println(color.InRed("Corpus path is not defined")) os.Exit(1) } - if *withWordCount == true { + if *withWordCount { fmt.Println(color.InBlue("Count number word is enabled")) } @@ -153,7 +154,7 @@ logrus.SetOutput(outputFile) } -func initDetailledAnalyze() { +func InitDetailledAnalyze() { fmt.Println(color.InGreen("The detailed analysis is enabled for"), color.InGreen(*configurationFolder+"/"+*corpusName+"/sisyphe-conf.json")) jsonFile, err := ioutil.ReadFile(*configurationFolder + "/" + *corpusName + "/sisyphe-conf.json") if err != nil { @@ -170,14 +171,14 @@ func main() { // init logger and params - initProcess() + InitProcess() if *configurationFolder != "" { - initDetailledAnalyze() + InitDetailledAnalyze() } wg.Add(1) - getAllFiles(*corpusPath) + GetAllFiles(*corpusPath) wg.Wait() close(queueForConcurrent) time.Sleep(2 * time.Second) diff --git a/pdf.go b/pdf.go index ab226e9..2e211b1 100644 --- a/pdf.go +++ b/pdf.go @@ -9,53 +9,40 @@ "github.com/sirupsen/logrus" ) -func processPDF(message *Message) { +func ProcessPDF(message *GeneralInfo, logger *logrus.Entry) { // queue for read pdf (limit number of parallel read files) defer wg.Done() queueForConcurrent <- struct{}{} defer func() { <-queueForConcurrent }() - pdfData := getMetadata(message) + pdfData := GetMetadata(message) + logger = logger.WithFields(logrus.Fields{ + "pdfMetadata": pdfData.pdfMetadata, + "pdfPageTotal": pdfData.pdfPageTotal, + }) if *withWordCount && pdfData.pdfPageTotal != 0 { - pdfWordCount := getNumberWords(message.path) + pdfWordCount := GetNumberWords(message.path) pdfData.pdfWordCount = pdfWordCount pdfData.pdfWordByPage = int(math.Floor(float64(pdfWordCount) / float64(pdfData.pdfPageTotal))) - logrus.WithFields(logrus.Fields{ - "corpusName": message.corpusName, - "name": message.name, - "startAt": message.startAt, - "extension": message.extension, - "path": message.path, - "mimetype": message.mimetype, - "size": message.size, - "pdfMetadata": pdfData.pdfMetadata, - "pdfPageTotal": pdfData.pdfPageTotal, + logger = logger.WithFields(logrus.Fields{ "pdfWordCount": pdfData.pdfWordCount, "pdfWordByPage": pdfData.pdfWordByPage, "pdfError": pdfData.pdfError, - }).Info("") - } else { - logrus.WithFields(logrus.Fields{ - "corpusName": message.corpusName, - "name": message.name, - "startAt": message.startAt, - "extension": message.extension, - "path": message.path, - "mimetype": message.mimetype, - "size": message.size, - "pdfMetadata": pdfData.pdfMetadata, - "pdfPageTotal": pdfData.pdfPageTotal, - "pdfError": pdfData.pdfError, - }).Info("") + }) + } else if pdfData.pdfError != "" { + logger = logger.WithFields(logrus.Fields{ + "pdfError": pdfData.pdfError, + }) } + logger.Info("") } // return Message with metadata info -func getMetadata(message *Message) MessagePDF { - pdfData := MessagePDF{} +func GetMetadata(message *GeneralInfo) PDFInfo { + pdfData := PDFInfo{} metaResult := make(map[string]string) metaStr, err := exec.Command("pdfinfo", message.path).Output() @@ -80,7 +67,7 @@ } // return number word in pdf -func getNumberWords(path string) int { +func GetNumberWords(path string) int { text, _ := exec.Command("pdftotext", "-q", "-nopgbrk", "-enc", "UTF-8", "-eol", "unix", path, "-").Output() pdfWordCount := len(strings.Fields(string(text))) return pdfWordCount diff --git a/pdf_test.go b/pdf_test.go index fa9053b..dde62ea 100644 --- a/pdf_test.go +++ b/pdf_test.go @@ -6,7 +6,7 @@ "github.com/stretchr/testify/assert" ) -var pdfData = Message{ +var pdfData = GeneralInfo{ corpusName: "test", name: "test.pdf", startAt: 1456693426, @@ -18,7 +18,7 @@ // Test metadata func TestMetadata(t *testing.T) { - result := getMetadata(&pdfData) + result := GetMetadata(&pdfData) assert.Equal(t, result.pdfMetadata.Author, "manshanden", "author is equal") assert.Equal(t, result.pdfMetadata.Creator, "PScript5.dll Version 5.2", "Creator is equal") assert.Equal(t, result.pdfMetadata.PDFFormatVersion, "1.4", "PdfFormatVersion is equal") @@ -27,6 +27,6 @@ // test number word func TestNumberWord(t *testing.T) { - numberWord := getNumberWords(pdfData.path) + numberWord := GetNumberWords(pdfData.path) assert.Equal(t, numberWord, 573, "Number word is equal") } diff --git a/struct.go b/struct.go index 5a56c85..b79526e 100644 --- a/struct.go +++ b/struct.go @@ -8,7 +8,7 @@ CreationDate string PDFFormatVersion string } -type MessagePDF struct { +type PDFInfo struct { pdfWordCount int pdfPageTotal int pdfWordByPage int @@ -22,14 +22,15 @@ Sysid string `json:"sysid"` } -type WellFormedErrorXML struct { +type ErrorXML struct { Message string `json:"message"` Line string `json:"line"` } -type MessageXML struct { +type XMLInfo struct { + data string isWellFormed bool - wellFormedErrors []WellFormedErrorXML + wellFormedErrors ErrorXML doctype DoctypeXML } @@ -64,15 +65,16 @@ } type DetailledAnalysis struct { - isValidAgainstDTD bool - validationDTDInfos string - validationsErrors []string + isValidAgainstDTD bool + validationDTDInfos string + // validationsErrors []ErrorXML + validationsErrors string isValidAgainstSchema bool validationSchemaErrors []string xpath json.RawMessage } -type Message struct { +type GeneralInfo struct { corpusName string name string startAt int64 diff --git a/xml.go b/xml.go index 2c1d98e..ba0307a 100644 --- a/xml.go +++ b/xml.go @@ -17,124 +17,104 @@ var regexDoctype = regexp.MustCompile("") var regexDtd = regexp.MustCompile("[a-zA-Z0-9-_./:]*.dtd") -var regexLine = regexp.MustCompile("line ([0-9]{1,}) ") -var regexErrorMessage = regexp.MustCompile(` ([A-Z].*)\^`) +var regexLineWellFormed = regexp.MustCompile("line ([0-9]{1,}) ") +var regexErrorWellFormedMessage = regexp.MustCompile(` ([A-Z].*)\^`) -func processXML(message *Message) { +// var regexLineValidation = regexp.MustCompile(`(?m):([0-9]{1,}):`) +// var regexErrorValidationMessage = regexp.MustCompile(` ([A-Z].*)`) + +func ProcessXML(message *GeneralInfo, logger *logrus.Entry) { // queue for read xml (limit number of parallel read files) defer wg.Done() queueForConcurrent <- struct{}{} defer func() { <-queueForConcurrent }() - xmlDataLog, xmlData := getXMlData(message.path) - if len(xmlDataLog.wellFormedErrors) > 0 { - logrus.WithFields(logrus.Fields{ - "corpusName": message.corpusName, - "name": message.name, - "startAt": message.startAt, - "extension": message.extension, - "path": message.path, - "mimetype": message.mimetype, - "size": message.size, - "isWellFormed": xmlDataLog.isWellFormed, - "wellFormedErrors": xmlDataLog.wellFormedErrors, - }).Info("") + logger, xmlInfo := CheckIfXmlIsWellFormed(message.path, logger) + if *configurationFolder != "" && len(xmlInfo.data) > 0 { + detailledInfo := DetailledAnalysis{} + detailledInfo, logger = CheckXMLValidation(message.path, xmlInfo, logger) + ProcessXpath(xmlInfo, detailledInfo, logger) } else { - if *configurationFolder == "" { - logrus.WithFields(logrus.Fields{ - "corpusName": message.corpusName, - "name": message.name, - "startAt": message.startAt, - "extension": message.extension, - "path": message.path, - "mimetype": message.mimetype, - "size": message.size, - "isWellFormed": xmlDataLog.isWellFormed, - "doctype": xmlDataLog.doctype, - }).Info("") - } else { - detailledAnalysis := processDetailledAnalysis(message.path, xmlData, xmlDataLog.doctype.Sysid) - logrus.WithFields(logrus.Fields{ - "corpusName": message.corpusName, - "name": message.name, - "startAt": message.startAt, - "extension": message.extension, - "path": message.path, - "mimetype": message.mimetype, - "size": message.size, - "isWellFormed": xmlDataLog.isWellFormed, - "doctype": xmlDataLog.doctype, - "isValidAgainstDTD": detailledAnalysis.isValidAgainstDTD, - "validationDTDInfos": detailledAnalysis.validationDTDInfos, - "validationsErrors": detailledAnalysis.validationsErrors, - "isValidAgainstSchema": detailledAnalysis.isValidAgainstSchema, - "validationSchemaErrors": detailledAnalysis.validationSchemaErrors, - "xpath": &detailledAnalysis.xpath, - }).Info("") - } + logger.Info("") } } -func getXMlData(xmlPath string) (MessageXML, string) { - xmlMessage := MessageXML{isWellFormed: true} - +func CheckIfXmlIsWellFormed(xmlPath string, logger *logrus.Entry) (*logrus.Entry, XMLInfo) { + xmlInfo := XMLInfo{isWellFormed: false} // check with xmlstarlet (slow) result, _ := exec.Command("xmlstarlet", "val", "-w", "-e", xmlPath).CombinedOutput() if result != nil && strings.Contains(string(result), "invalid") { - xmlMessage.isWellFormed = false - xmlMessage.wellFormedErrors = append(xmlMessage.wellFormedErrors, formatError(string(result))) - return xmlMessage, "" + logger = logger.WithFields(logrus.Fields{ + "isWellFormed": false, + "wellFormedErrors": FormatWellFormedError(string(result)), + }) + return logger, xmlInfo } // check if able to open xmlFile, errOpen := os.Open(xmlPath) if errOpen != nil { - xmlMessage.isWellFormed = false - xmlMessage.wellFormedErrors = append(xmlMessage.wellFormedErrors, WellFormedErrorXML{Message: errOpen.Error()}) - return xmlMessage, "" + logger = logger.WithFields(logrus.Fields{ + "isWellFormed": false, + "wellFormedErrors": ErrorXML{Message: errOpen.Error()}, + }) + return logger, xmlInfo } // defer the closing of our xmlFile so that we can parse it later on defer xmlFile.Close() // read our opened xmlFile1 as a byte array. here I am checking if the file is valid or not byteValue, errRead := ioutil.ReadAll(xmlFile) if errRead != nil { - xmlMessage.isWellFormed = false - xmlMessage.wellFormedErrors = append(xmlMessage.wellFormedErrors, WellFormedErrorXML{Message: errRead.Error()}) - return xmlMessage, "" + xmlInfo.wellFormedErrors = ErrorXML{Message: errOpen.Error()} + logger = logger.WithFields(logrus.Fields{ + "isWellFormed": false, + "wellFormedErrors": xmlInfo.wellFormedErrors, + }) + return logger, xmlInfo } - xmlData := string(byteValue) + xmlInfo.data = string(byteValue) // get doctype from xml - doctype := regexDoctype.FindString(xmlData) + doctype := regexDoctype.FindString(xmlInfo.data) if doctype != "" { dtd := regexDtd.FindString(doctype) if dtd != "" { - xmlMessage.doctype.Sysid = dtd + xmlDoctype := DoctypeXML{Sysid: dtd} + logger = logger.WithFields(logrus.Fields{ + "isWellFormed": true, + "doctype": xmlDoctype, + }) + xmlInfo.doctype = xmlDoctype } } - return xmlMessage, xmlData + return logger, xmlInfo } -func formatError(resultError string) WellFormedErrorXML { - messageFormatted := strings.NewReplacer(`\s+`, " ").Replace(resultError) - messageLine := regexLine.FindStringSubmatch(messageFormatted) +func FormatWellFormedError(resultError string) ErrorXML { + messageFormatted := strings.NewReplacer("\n", "", ` `, "").Replace(resultError) + messageLine := regexLineWellFormed.FindStringSubmatch(messageFormatted) line := "0" - message := resultError - errorMessage := regexErrorMessage.FindStringSubmatch(messageFormatted) + errorMessage := regexErrorWellFormedMessage.FindStringSubmatch(messageFormatted) if len(messageLine) >= 2 && len(messageLine[1]) > 0 { line = messageLine[1] } if len(errorMessage) >= 2 && len(errorMessage[1]) > 0 { - message = strings.TrimSpace(errorMessage[1]) + messageFormatted = strings.TrimSpace(errorMessage[1]) } - return WellFormedErrorXML{ - Message: message, + return ErrorXML{ + Message: messageFormatted, Line: line, } } -func processDetailledAnalysis(pathXml string, xmlData string, dtdInDoctype string) DetailledAnalysis { +func FormatValidationError(resultError string) []ErrorXML { + // errorMessage := regexErrorValidationMessage.FindStringSubmatch(resultError) + // messageLine := regexLineValidation.FindStringSubmatch(resultError) + return []ErrorXML{} +} + +func CheckXMLValidation(pathXml string, xmlInfo XMLInfo, logger *logrus.Entry) (DetailledAnalysis, *logrus.Entry) { xmlDetailled := DetailledAnalysis{isValidAgainstDTD: false, isValidAgainstSchema: false} pathExec := *configurationFolder + "/" + *corpusName @@ -145,13 +125,15 @@ // if dtd exist in xml file process only this if len(configDetailledAnalyze.XML.ListDTD) > 0 { - if dtdInDoctype != "" { + if xmlInfo.doctype.Sysid != "" { for _, dtdPath := range configDetailledAnalyze.XML.ListDTD { - if strings.HasSuffix(dtdPath, dtdInDoctype) { + if strings.HasSuffix(dtdPath, xmlInfo.doctype.Sysid) { result, _ := exec.Command("xmllint", "--dtdvalid", pathExec+"/dtd/"+dtdPath, pathXml, "--noout", "--nowarning").CombinedOutput() if string(result) == "" { xmlDetailled.isValidAgainstDTD = true xmlDetailled.validationDTDInfos = dtdPath + } else { + xmlDetailled.validationsErrors = string(result) } } } @@ -164,11 +146,21 @@ xmlDetailled.validationDTDInfos = dtdPath break } else { - xmlDetailled.validationsErrors = append(xmlDetailled.validationsErrors, string(result)) + xmlDetailled.validationsErrors = string(result) } } } + logger = logger.WithFields(logrus.Fields{ + "isValidAgainstDTD": xmlDetailled.isValidAgainstDTD, + "validationDTDInfos": xmlDetailled.validationDTDInfos, + }) + if !xmlDetailled.isValidAgainstDTD { + logger = logger.WithFields(logrus.Fields{ + "validationsErrors": xmlDetailled.validationsErrors, + }) + } + } else if len(configDetailledAnalyze.XML.ListXSD) > 0 { // if xsd is present check schema validation for _, xsdPath := range configDetailledAnalyze.XML.ListXSD { @@ -177,19 +169,26 @@ xmlDetailled.isValidAgainstSchema = true break } else { - xmlDetailled.validationSchemaErrors = append(xmlDetailled.validationsErrors, string(result)) + xmlDetailled.validationSchemaErrors = append(xmlDetailled.validationSchemaErrors, string(result)) } } + logger = logger.WithFields(logrus.Fields{ + "isValidAgainstSchema": xmlDetailled.isValidAgainstSchema, + "validationDTDInfos": xmlDetailled.validationDTDInfos, + }) + + if !xmlDetailled.isValidAgainstSchema { + logger = logger.WithFields(logrus.Fields{ + "validationsErrors": xmlDetailled.validationsErrors, + }) + } } - // get xpath - xmlDetailled.xpath = GetXpath(xmlData) - - return xmlDetailled + return xmlDetailled, logger } -func GetXpath(xmlData string) []byte { - doc, err := xmlquery.Parse(strings.NewReader(xmlData)) +func ProcessXpath(xmlInfo XMLInfo, detailledInfo DetailledAnalysis, logger *logrus.Entry) { + doc, err := xmlquery.Parse(strings.NewReader(xmlInfo.data)) tmpXpath := "{" if err == nil { @@ -209,11 +208,16 @@ tmpXpath = strings.TrimSuffix(tmpXpath, ",") tmpXpath += "}" - data := []byte(tmpXpath) + finalXpath := []byte(tmpXpath) jsonData := XpathStructure{} - json.Unmarshal(data, &jsonData) + json.Unmarshal(finalXpath, &jsonData) - return data + detailledInfo.xpath = finalXpath + + logger = logger.WithFields(logrus.Fields{ + "xpath": detailledInfo.xpath, + }) + logger.Info("") } func FormatTextForXpath(field MetadataConfigDetailledAnalyze, text string, hasQuote bool) string { diff --git a/xml_test.go b/xml_test.go index 54e3e3c..b9e931b 100644 --- a/xml_test.go +++ b/xml_test.go @@ -3,10 +3,11 @@ import ( "testing" + "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" ) -var xmlData = Message{ +var xmlData = GeneralInfo{ corpusName: "test", name: "test-default.xml", startAt: 1456693426, @@ -15,9 +16,10 @@ mimetype: "text/xml", size: 7123, } +var logger *logrus.Entry func TestValidXML(t *testing.T) { - result, _ := getXMlData(xmlData.path) + result, _ := CheckIfXmlIsWellFormed(xmlData.path, logger) assert.Equal(t, result.doctype.Sysid, "note.dtd", "Doctype is valid") assert.Equal(t, result.isWellFormed, true, "XML is well formed") assert.Equal(t, len(result.wellFormedErrors), 0, "Return empty if xml is not well formed") @@ -26,30 +28,30 @@ func TestInvalidXML(t *testing.T) { xmlData.path = "./example/xml/test-not-wellformed.xml" - result, _ := getXMlData(xmlData.path) - assert.Equal(t, result.doctype.Sysid, "", "Not get doctype if xml is invalid") - assert.Equal(t, result.isWellFormed, false, "XML is not well formed") - assert.Equal(t, result.wellFormedErrors[0].Message, "Opening and ending tag mismatch: from line 4 and Ffrom Jani", "Return empty if xml is not well formed") - assert.Equal(t, result.wellFormedErrors[0].Line, "4", "Return empty if xml is not well formed") + result, _ := CheckIfXmlIsWellFormed(xmlData.path, logger) + assert.Equal(t, result.Data["doctype"], "", "Not get doctype if xml is invalid") + assert.Equal(t, result.Data["isWellFormed"], false, "XML is not well formed") + assert.Equal(t, result.Data["wellFormedErrors"].Message, "Opening and ending tag mismatch: from line 4 and Ffrom Jani", "Return empty if xml is not well formed") + assert.Equal(t, result.Data["wellFormedErrors"].Line, "4", "Return empty if xml is not well formed") } func TestValidDTD(t *testing.T) { xmlData.path = "./example/xml/test-default.xml" configDetailledAnalyze.XML.ListDTD = append(configDetailledAnalyze.XML.ListDTD, "note.dtd") - result := processDetailledAnalysis(xmlData.path, "", "note.dtd") + result, _ := CheckXMLValidation(xmlData.path, "note.dtd", logger) assert.Equal(t, result.isValidAgainstDTD, true, "XML must be valid according to the DTD") } func TestInvalidDTD(t *testing.T) { xmlData.path = "./example/xml/test-default-bad-doctype.xml" configDetailledAnalyze.XML.ListDTD = append(configDetailledAnalyze.XML.ListDTD, "bad-doctype.dtd") - result := processDetailledAnalysis(xmlData.path, "", "bad-doctype.dtd") + result, _ := CheckXMLValidation(xmlData.path, "bad-doctype.dtd", logger) assert.Equal(t, result.isValidAgainstDTD, false, "XML must not be valid according to the DTD") } func TestValidSchema(t *testing.T) { configDetailledAnalyze.XML.ListXSD = append(configDetailledAnalyze.XML.ListXSD, "note.xsd") xmlData.path = "./example/xml/test-default.xml" - result := processDetailledAnalysis(xmlData.path, "", "") + result, _ := CheckXMLValidation(xmlData.path, "", logger) assert.Equal(t, result.isValidAgainstSchema, true, "XML must be valid according to the Schema") }