package main

import (
	"flag"
	"io/ioutil"
	"os"
	"os/exec"
	"regexp"
	"strings"

	"github.com/antchfx/xmlquery"
	"github.com/sirupsen/logrus"
)

// Patterns used to inspect XML content and to parse xmlstarlet output.
var (
	// regexDoctype matches a DOCTYPE declaration, optionally followed by a
	// bracketed internal subset. The subset group used to be written without
	// its escapes ("([[sS]*?])?"), which matched a run of the characters
	// '[', 's', 'S' followed by ']' instead of a "[ ... ]" section.
	regexDoctype = regexp.MustCompile(`<!DOCTYPE.*(\[[\s\S]*?\])?>`)
	// regexDtd extracts a DTD reference (name, path or URL ending in "dtd")
	// from a DOCTYPE declaration.
	regexDtd = regexp.MustCompile("[a-zA-Z0-9-_./:]*.dtd")
	// regexSpace matches any run of whitespace.
	regexSpace = regexp.MustCompile(`\s+`)
	// regexLine captures the number following "line " in a validator message.
	regexLine = regexp.MustCompile("line ([0-9]{1,}) ")
	// regexErrorMessage captures the text that starts with an upper-case
	// letter and runs up to the '^' marker of a validator message.
	regexErrorMessage = regexp.MustCompile(` ([A-Z].*)\^`)
)

// processXML analyses the XML file described by message and emits a single
// structured log entry with the outcome.
//
// It signals wg when it returns and holds a slot of queueForConcurrent while
// working, which bounds the number of files read in parallel. When the file is
// not well formed only the well-formedness errors are logged; otherwise the
// doctype is logged and, if a configuration folder is set, the results of the
// detailed analysis (DTD/XSD validation and XPath extraction) as well.
func processXML(message *Message) {
	defer wg.Done()

	// Take a slot in the queue and give it back once the file is processed.
	queueForConcurrent <- struct{}{}
	defer func() { <-queueForConcurrent }()

	xmlDataLog, xmlData := getXMlData(message.path)

	// Fields shared by every kind of log entry.
	fields := logrus.Fields{
		"corpusName":   message.corpusName,
		"name":         message.name,
		"startAt":      message.startAt,
		"extension":    message.extension,
		"path":         message.path,
		"mimetype":     message.mimetype,
		"size":         message.size,
		"isWellFormed": xmlDataLog.isWellFormed,
	}

	if len(xmlDataLog.wellFormedErrors) > 0 {
		fields["wellFormedErrors"] = xmlDataLog.wellFormedErrors
	} else if *configurationFolder == "" {
		fields["doctype"] = xmlDataLog.doctype
	} else {
		detailledAnalysis := processDetailledAnalysis(message.path, xmlData, xmlDataLog.doctype.Sysid)
		fields["doctype"] = xmlDataLog.doctype
		fields["isValidAgainstDTD"] = detailledAnalysis.isValidAgainstDTD
		fields["validationDTDInfos"] = detailledAnalysis.validationDTDInfos
		fields["validationsErrors"] = detailledAnalysis.validationsErrors
		fields["isValidAgainstSchema"] = detailledAnalysis.isValidAgainstSchema
		fields["validationSchemaErrors"] = detailledAnalysis.validationSchemaErrors
		fields["xpath"] = detailledAnalysis.xpath
	}

	logrus.WithFields(fields).Info("")
}

// getXMlData checks that the file at xmlPath is well formed and reads it.
//
// It returns the well-formedness report together with the file content. When
// the file is reported invalid by xmlstarlet, or cannot be opened or read, the
// report carries the error and the returned content is empty. On success the
// DTD reference found in the DOCTYPE declaration, if any, is stored in
// doctype.Sysid.
func getXMlData(xmlPath string) (MessageXML, string) {
	xmlMessage := MessageXML{isWellFormed: true}

	// Check with xmlstarlet (slow). The command error is ignored on purpose:
	// the verdict is read from the combined output.
	result, _ := exec.Command("xmlstarlet", "val", "-w", "-e", xmlPath).CombinedOutput()
	output := string(result)
	// xmlstarlet echoes the file path in its output, so the path is removed
	// before looking for the verdict; otherwise a valid file stored under a
	// path containing "invalid" would be reported as malformed.
	if strings.Contains(strings.Replace(output, xmlPath, "", -1), "invalid") {
		xmlMessage.isWellFormed = false
		xmlMessage.wellFormedErrors = append(xmlMessage.wellFormedErrors, formatError(output))
		return xmlMessage, ""
	}

	// Check that the file can be opened.
	xmlFile, errOpen := os.Open(xmlPath)
	if errOpen != nil {
		xmlMessage.isWellFormed = false
		xmlMessage.wellFormedErrors = append(xmlMessage.wellFormedErrors, WellFormedErrorXML{Message: errOpen.Error()})
		return xmlMessage, ""
	}
	defer xmlFile.Close()

	// Read the whole file; a read failure is reported like a malformed file.
	byteValue, errRead := ioutil.ReadAll(xmlFile)
	if errRead != nil {
		xmlMessage.isWellFormed = false
		xmlMessage.wellFormedErrors = append(xmlMessage.wellFormedErrors, WellFormedErrorXML{Message: errRead.Error()})
		return xmlMessage, ""
	}
	xmlData := string(byteValue)

	// Extract the DTD reference from the DOCTYPE declaration, if there is one.
	if doctype := regexDoctype.FindString(xmlData); doctype != "" {
		if dtd := regexDtd.FindString(doctype); dtd != "" {
			xmlMessage.doctype.Sysid = dtd
		}
	}

	return xmlMessage, xmlData
}

// formatError turns raw validator output into a WellFormedErrorXML.
//
// Whitespace runs are collapsed before matching. Line is the number captured
// by regexLine, or "0" when none is found. Message is the trimmed text
// captured by regexErrorMessage, or the unmodified resultError when the
// pattern does not match.
func formatError(resultError string) WellFormedErrorXML {
	messageFormatted := regexSpace.ReplaceAllString(resultError, " ")

	formatted := WellFormedErrorXML{
		Message: resultError,
		Line:    "0",
	}
	if m := regexLine.FindStringSubmatch(messageFormatted); len(m) >= 2 && len(m[1]) > 0 {
		formatted.Line = m[1]
	}
	if m := regexErrorMessage.FindStringSubmatch(messageFormatted); len(m) >= 2 && len(m[1]) > 0 {
		formatted.Message = strings.TrimSpace(m[1])
	}
	return formatted
}

// processDetailledAnalysis validates the XML file at pathXml with xmllint and
// extracts the configured XPath values from xmlData.
//
// DTDs and schemas are looked up under "<configurationFolder>/<corpusName>"
// ("./example" when running under "go test"). If DTDs are configured, the file
// is validated against those whose path ends with dtdInDoctype, or against
// every DTD until one succeeds when dtdInDoctype is empty. XSDs are only used
// when no DTD is configured. If the applicable validation fails, the result is
// returned without XPath extraction.
func processDetailledAnalysis(pathXml string, xmlData string, dtdInDoctype string) DetailledAnalysis {
	xmlDetailled := DetailledAnalysis{isValidAgainstDTD: false, isValidAgainstSchema: false, xpath: ""}

	pathExec := *configurationFolder + "/" + *corpusName
	// The "test.v" flag only exists when running under "go test".
	if flag.Lookup("test.v") != nil {
		pathExec = "./example"
	}

	// xmllint validates pathXml in the given mode ("--dtdvalid" or "--schema")
	// against definition and returns the combined output. The command error is
	// ignored on purpose: the verdict is read from the output.
	xmllint := func(mode, definition string) string {
		out, _ := exec.Command("xmllint", mode, definition, pathXml, "--noout", "--nowarning").CombinedOutput()
		return string(out)
	}

	if len(configDetailledAnalyze.XML.ListDTD) > 0 {
		if dtdInDoctype != "" {
			// A DTD is declared in the file: only try the configured DTDs
			// whose path ends with it.
			for _, dtdPath := range configDetailledAnalyze.XML.ListDTD {
				if !strings.HasSuffix(dtdPath, dtdInDoctype) {
					continue
				}
				// An empty output means xmllint found nothing to report.
				if xmllint("--dtdvalid", pathExec+"/dtd/"+dtdPath) == "" {
					xmlDetailled.isValidAgainstDTD = true
					xmlDetailled.validationDTDInfos = dtdPath
				}
			}
		} else {
			// No DTD declared: try them all and stop at the first success.
			for _, dtdPath := range configDetailledAnalyze.XML.ListDTD {
				out := xmllint("--dtdvalid", pathExec+"/dtd/"+dtdPath)
				if out == "" {
					xmlDetailled.isValidAgainstDTD = true
					xmlDetailled.validationDTDInfos = dtdPath
					break
				}
				xmlDetailled.validationsErrors = append(xmlDetailled.validationsErrors, out)
			}
		}
		// Skip the XPath extraction when DTD validation failed.
		if !xmlDetailled.isValidAgainstDTD {
			return xmlDetailled
		}
	} else if len(configDetailledAnalyze.XML.ListXSD) > 0 {
		// No DTD configured: validate against the schemas instead.
		for _, xsdPath := range configDetailledAnalyze.XML.ListXSD {
			out := xmllint("--schema", pathExec+"/xsd/"+xsdPath)
			if strings.Contains(out, "validates") {
				xmlDetailled.isValidAgainstSchema = true
				break
			}
			// Accumulate on the schema error list itself; appending to
			// validationsErrors here dropped the schema errors collected so far.
			xmlDetailled.validationSchemaErrors = append(xmlDetailled.validationSchemaErrors, out)
		}
		if !xmlDetailled.isValidAgainstSchema {
			return xmlDetailled
		}
	}

	// Extract the configured XPath values. A parse failure leaves xpath empty.
	doc, err := xmlquery.Parse(strings.NewReader(xmlData))
	if err != nil {
		return xmlDetailled
	}

	// NOTE(review): the value is opened with "{" but never closed and keeps a
	// trailing comma. The format is left untouched because consumers of the
	// log may rely on it — confirm before changing.
	var xpath strings.Builder
	xpath.WriteString("{")
	for _, field := range configDetailledAnalyze.XML.XPATH {
		if node := xmlquery.FindOne(doc, field.Xpath); node != nil {
			xpath.WriteString(field.Name + ":" + node.InnerText() + ",")
		}
	}
	xmlDetailled.xpath = xpath.String()

	return xmlDetailled
}