package main

import (
	"flag"
	"fmt"
	"io/ioutil"
	"os"
	"os/exec"
	"regexp"
	"strings"

	"github.com/antchfx/xmlquery"
	"github.com/antchfx/xpath"
	"github.com/goccy/go-json"
	"github.com/sirupsen/logrus"
)

// Regular expressions used to extract the doctype from an XML document and to
// pull line numbers / messages out of the output of the external validators.
var (
	regexDoctype                = regexp.MustCompile("<!DOCTYPE.*([[sS]*?])?>")
	regexDtd                    = regexp.MustCompile("[a-zA-Z0-9-_./:]*.dtd")
	regexLineWellFormed         = regexp.MustCompile("line ([0-9]{1,}) ")
	regexErrorWellFormedMessage = regexp.MustCompile(` ([A-Z].*)\^`)
	regexLineValidation         = regexp.MustCompile(`:([-]?[0-9]{1,}):`)
	regexErrorValidationMessage = regexp.MustCompile(` ([A-Z].*)\n`)
)

// ProcessXML runs the well-formedness check on message.path and, when a
// configuration folder is set and the file content could be read, the
// DTD/XSD validation and the XPath extraction. All findings are attached to
// logger as fields and emitted in a single log entry.
//
// It signals wg when it returns and holds one slot of queueForConcurrent
// while it works, which bounds the number of files processed in parallel.
func ProcessXML(message *GeneralInfo, logger *logrus.Entry) {
	// queue for read xml (limit number of parallel read files)
	defer wg.Done()
	queueForConcurrent <- struct{}{}
	defer func() { <-queueForConcurrent }()

	xmlInfo, logger := CheckIfXmlIsWellFormed(message.path, logger)
	if *configurationFolder != "" && len(xmlInfo.data) > 0 {
		_, logger = CheckXMLValidation(message.path, xmlInfo, logger)
		_, logger = ProcessXpath(message.path, xmlInfo, logger)
	}

	logger.Info("")
	UpdateCounter()
}

// CheckIfXmlIsWellFormed checks xmlPath with xmlstarlet, then loads the file
// content and extracts the DTD system identifier from its doctype, if any.
//
// It returns the collected XMLInfo and logger enriched with "isWellFormed"
// plus either "wellFormedErrors" or "doctype".
func CheckIfXmlIsWellFormed(xmlPath string, logger *logrus.Entry) (XMLInfo, *logrus.Entry) {
	xmlInfo := XMLInfo{isWellFormed: false}

	// markInvalid records the error on xmlInfo and on the logger.
	markInvalid := func(errXML ErrorXML) (XMLInfo, *logrus.Entry) {
		xmlInfo.wellFormedErrors = errXML
		return xmlInfo, logger.WithFields(logrus.Fields{
			"isWellFormed":     false,
			"wellFormedErrors": xmlInfo.wellFormedErrors,
		})
	}

	// check with xmlstarlet (slow). The command error is deliberately ignored:
	// only an output containing "invalid" is treated as a failure.
	result, _ := exec.Command("xmlstarlet", "val", "-w", "-e", xmlPath).CombinedOutput()
	if output := string(result); strings.Contains(output, "invalid") {
		return markInvalid(FormatWellFormedError(output))
	}

	// check if able to open
	xmlFile, errOpen := os.Open(xmlPath)
	if errOpen != nil {
		return markInvalid(ErrorXML{Message: errOpen.Error()})
	}
	defer xmlFile.Close()

	// read the opened file as a byte array
	byteValue, errRead := ioutil.ReadAll(xmlFile)
	if errRead != nil {
		// Report the read error itself; errOpen is nil at this point.
		return markInvalid(ErrorXML{Message: errRead.Error()})
	}

	xmlInfo.data = string(byteValue)
	xmlInfo.isWellFormed = true
	logger = logger.WithFields(logrus.Fields{
		"isWellFormed": true,
	})

	// get doctype from xml
	if doctype := regexDoctype.FindString(xmlInfo.data); doctype != "" {
		if dtd := regexDtd.FindString(doctype); dtd != "" {
			xmlInfo.doctype = DoctypeXML{Sysid: dtd}
			logger = logger.WithFields(logrus.Fields{
				"doctype": xmlInfo.doctype,
			})
		}
	}

	return xmlInfo, logger
}

// FormatWellFormedError converts the raw xmlstarlet output into an ErrorXML.
// Line defaults to "0" when no line number is found, and Message falls back to
// the flattened output when no message can be isolated.
func FormatWellFormedError(resultError string) ErrorXML {
	// NOTE(review): the second pattern is a single space in the source under
	// review; the regexes below need spaces to match, so it may originally
	// have been a tab character - confirm against the upstream file.
	messageFormatted := strings.NewReplacer("\n", "", ` `, "").Replace(resultError)

	line := "0"
	if messageLine := regexLineWellFormed.FindStringSubmatch(messageFormatted); len(messageLine) >= 2 && len(messageLine[1]) > 0 {
		line = messageLine[1]
	}
	if errorMessage := regexErrorWellFormedMessage.FindStringSubmatch(messageFormatted); len(errorMessage) >= 2 && len(errorMessage[1]) > 0 {
		messageFormatted = strings.TrimSpace(errorMessage[1])
	}

	return ErrorXML{
		Message: messageFormatted,
		Line:    line,
	}
}

// FormatValidationError parses the output of a validator run and returns one
// ErrorXML per message found, tagged with file. Messages are paired with line
// numbers by position; a message without a matching line number gets no Line.
// The result is empty (never nil) when no message or no line number is found.
func FormatValidationError(resultError string, file string) []ErrorXML {
	listError := []ErrorXML{}

	errorsMessages := regexErrorValidationMessage.FindAllStringSubmatch(resultError, -1)
	linesMessages := regexLineValidation.FindAllStringSubmatch(resultError, -1)
	if errorsMessages == nil || linesMessages == nil {
		return listError
	}

	for i, match := range errorsMessages {
		entry := ErrorXML{Message: strings.ReplaceAll(match[1], `"`, ""), File: file}
		if i < len(linesMessages) {
			entry.Line = linesMessages[i][1]
		} else {
			fmt.Print("XML validation error...")
		}
		listError = append(listError, entry)
	}
	return listError
}

// validateWithDTD runs xmllint against pathExec/dtd/dtdPath for pathXml.
// The document is valid only when xmllint succeeds and prints nothing. When
// xmllint fails without output (for instance because it could not be started)
// the command error is returned as the only ErrorXML.
func validateWithDTD(pathExec, dtdPath, pathXml string) (bool, []ErrorXML) {
	output, err := exec.Command("xmllint", "--dtdvalid", pathExec+"/dtd/"+dtdPath, pathXml, "--noout", "--nowarning").CombinedOutput()
	if len(output) == 0 {
		if err == nil {
			return true, nil
		}
		return false, []ErrorXML{{Message: err.Error(), File: dtdPath}}
	}
	return false, FormatValidationError(string(output), dtdPath)
}

// CheckXMLValidation validates pathXml with xmllint against the configured
// DTDs or, when no DTD is configured, against the configured XSDs.
//
// With DTDs configured: if the document declares a system identifier, only the
// configured DTD whose name ends with it is tried; otherwise, unless
// parseOnlyWithSysid is set, every configured DTD is tried until one succeeds.
// The outcome is returned and also attached to logger.
func CheckXMLValidation(pathXml string, xmlInfo XMLInfo, logger *logrus.Entry) (DetailledAnalysis, *logrus.Entry) {
	xmlDetailled := DetailledAnalysis{isValidAgainstDTD: false, isValidAgainstSchema: false}

	pathExec := *configurationFolder + "/" + *corpusName
	// when running under "go test" use the bundled example resources
	if flag.Lookup("test.v") != nil {
		pathExec = "./example"
	}

	switch {
	/**
	 * VALIDATION DTD
	 */
	case len(configDetailledAnalyze.XML.ListDTD) > 0:
		if xmlInfo.doctype.Sysid != "" {
			// if dtd exist in xml file process only this
			sysId := xmlInfo.doctype.Sysid
			if strings.Contains(sysId, "/") {
				parts := strings.Split(sysId, "/")
				sysId = parts[len(parts)-1]
			}

			dtdPath := GetStringWithSuffixInList(configDetailledAnalyze.XML.ListDTD, sysId)
			if dtdPath == "" {
				xmlDetailled.validationsErrors = append(xmlDetailled.validationsErrors, ErrorXML{Message: "Dtd not present in corpus-resources"})
			} else if valid, validationErrs := validateWithDTD(pathExec, dtdPath, pathXml); valid {
				xmlDetailled.isValidAgainstDTD = true
				xmlDetailled.validationDTDInfos = dtdPath
			} else {
				xmlDetailled.validationsErrors = append(xmlDetailled.validationsErrors, validationErrs...)
			}
		} else if !*parseOnlyWithSysid {
			// check with all dtd and stop if one is true
			for _, dtdPath := range configDetailledAnalyze.XML.ListDTD {
				valid, validationErrs := validateWithDTD(pathExec, dtdPath, pathXml)
				if valid {
					xmlDetailled.isValidAgainstDTD = true
					xmlDetailled.validationDTDInfos = dtdPath
					break
				}
				xmlDetailled.validationsErrors = append(xmlDetailled.validationsErrors, validationErrs...)
			}
		}

		logger = logger.WithFields(logrus.Fields{
			"isValidAgainstDTD": xmlDetailled.isValidAgainstDTD,
		})
		if xmlDetailled.isValidAgainstDTD {
			logger = logger.WithFields(logrus.Fields{
				"validationDTDInfos": xmlDetailled.validationDTDInfos,
			})
		} else {
			logger = logger.WithFields(logrus.Fields{
				"validationsErrors": xmlDetailled.validationsErrors,
			})
		}

	/**
	 * VALIDATION XSD
	 */
	case len(configDetailledAnalyze.XML.ListXSD) > 0:
		// if xsd is present check schema validation
		for _, xsdPath := range configDetailledAnalyze.XML.ListXSD {
			// The command error is ignored on purpose: success is detected
			// from the "validates" marker in the output.
			result, _ := exec.Command("xmllint", "--schema", pathExec+"/xsd/"+xsdPath, pathXml, "--noout", "--nowarning").CombinedOutput()
			if strings.Contains(string(result), "validates") {
				xmlDetailled.isValidAgainstSchema = true
				xmlDetailled.validationXSDInfos = xsdPath
				break
			}
			xmlDetailled.validationSchemaErrors = append(xmlDetailled.validationSchemaErrors, FormatValidationError(string(result), xsdPath)...)
		}

		logger = logger.WithFields(logrus.Fields{
			"isValidAgainstSchema": xmlDetailled.isValidAgainstSchema,
		})
		if xmlDetailled.isValidAgainstSchema {
			logger = logger.WithFields(logrus.Fields{
				"validationXSDInfos": xmlDetailled.validationXSDInfos,
			})
		} else {
			logger = logger.WithFields(logrus.Fields{
				"validationSchemaErrors": xmlDetailled.validationSchemaErrors,
			})
		}
	}

	return xmlDetailled, logger
}

// ProcessXpath evaluates every configured XPath field against xmlInfo.data
// and assembles the results into a JSON object. When that object unmarshals
// into XpathStructure it is stored in the returned DetailledAnalysis and
// logged as "xpath"; otherwise the failure is logged as "xpathError".
//
// An XPath expression or field regex that does not compile is reported as
// "xpathError" and skipped instead of aborting the program.
func ProcessXpath(xmlPath string, xmlInfo XMLInfo, logger *logrus.Entry) (DetailledAnalysis, *logrus.Entry) {
	detailledInfo := DetailledAnalysis{}
	if len(configDetailledAnalyze.XML.XPATH) == 0 {
		return detailledInfo, logger
	}

	var b strings.Builder
	b.WriteString("{")

	// A document that cannot be parsed simply yields an empty object.
	if doc, errParse := xmlquery.Parse(strings.NewReader(xmlInfo.data)); errParse == nil {
		for _, field := range configDetailledAnalyze.XML.XPATH {
			// Compile the optional regex once per field rather than once per xpath.
			var fieldRegex *regexp.Regexp
			if field.Regex != "" {
				compiled, errRegex := regexp.Compile(field.Regex)
				if errRegex != nil {
					logger = logger.WithFields(logrus.Fields{
						"xpathError": "File " + xmlPath + " : " + errRegex.Error(),
					})
					continue
				}
				fieldRegex = compiled
			}

			// Remembers which xpath strings of this field already produced a value.
			previousXpath := make(map[string]string)
			for _, xpathContent := range field.Xpath {
				goodXPath, hasQuote := FormatXpathByType(field.Type, xpathContent)
				expr, errCompile := xpath.Compile(goodXPath)
				if errCompile != nil {
					// expr is nil here; evaluating it would panic.
					logger = logger.WithFields(logrus.Fields{
						"xpathError": "File " + xmlPath + " : " + errCompile.Error(),
					})
					continue
				}
				text := fmt.Sprintf("%v", expr.Evaluate(xmlquery.CreateXPathNavigator(doc)))

				// not process xpath if previous is good
				if previousXpath[xpathContent] != "" || len(text) == 0 {
					continue
				}
				previousXpath[xpathContent] = xpathContent

				// if regex add element if isValid
				if fieldRegex != nil {
					filterText := fieldRegex.FindString(text)
					if filterText != "0" {
						if len(filterText) > 0 {
							b.WriteString(`"` + field.Name + `IsValid":` + "true" + `,`)
						} else {
							b.WriteString(`"` + field.Name + `IsValid":` + "false" + `,`)
						}
					}
				}

				switch {
				case field.Type == "Count":
					// if fieldType = Count authorize 0 value
					b.WriteString(`"` + field.Name + `":` + text + `,`)
				case text != "0":
					text = strings.ReplaceAll(text, "\"", "")
					if hasQuote {
						// for string
						b.WriteString(`"` + field.Name + `":"` + text + `",`)
					} else {
						// for boolean and number
						b.WriteString(`"` + field.Name + `":` + text + `,`)
					}
				}
			}
		}
	}

	tmpXpath := standardizeSpaces(strings.TrimSuffix(b.String(), ",") + "}")
	finalXpath := []byte(tmpXpath)

	// if no error add xpath to analyze
	jsonData := XpathStructure{}
	if errJSON := json.Unmarshal(finalXpath, &jsonData); errJSON != nil {
		logger = logger.WithFields(logrus.Fields{
			"xpathError": "File " + xmlPath + " : " + errJSON.Error(),
		})
	} else {
		detailledInfo.xpath = finalXpath
		logger = logger.WithFields(logrus.Fields{
			"xpath": detailledInfo.xpath,
		})
	}

	return detailledInfo, logger
}

// FormatXpathByType wraps xpath in the XPath conversion function matching
// fieldType ("Number", "Boolean" or "Count"); any other type is wrapped in
// string(). The boolean result reports whether the evaluated value must be
// quoted when written as JSON, which is only the case for the string() form.
func FormatXpathByType(fieldType string, xpath string) (string, bool) {
	switch fieldType {
	case "Number":
		return "number(" + xpath + ")", false
	case "Boolean":
		return "boolean(" + xpath + ")", false
	case "Count":
		return "count(" + xpath + ")", false
	default:
		// String and Attribute
		return "string(" + xpath + ")", true
	}
}