package main import ( "flag" "fmt" "io/ioutil" "os" "os/exec" "regexp" "strings" "github.com/antchfx/xmlquery" "github.com/antchfx/xpath" "github.com/goccy/go-json" "github.com/sirupsen/logrus" ) var regexDoctype = regexp.MustCompile("<!DOCTYPE.*([[sS]*?])?>") var regexDtd = regexp.MustCompile("[a-zA-Z0-9-_./:]*.dtd") var regexLineWellFormed = regexp.MustCompile("line ([0-9]{1,}) ") var regexErrorWellFormedMessage = regexp.MustCompile(` ([A-Z].*)\^`) // var regexLineValidation = regexp.MustCompile(`(?m):([0-9]{1,}):`) // var regexErrorValidationMessage = regexp.MustCompile(` ([A-Z].*)`) func ProcessXML(message *GeneralInfo, logger *logrus.Entry) { // queue for read xml (limit number of parallel read files) defer wg.Done() queueForConcurrent <- struct{}{} defer func() { <-queueForConcurrent }() logger, xmlInfo := CheckIfXmlIsWellFormed(message.path, logger) if *configurationFolder != "" && len(xmlInfo.data) > 0 { detailledInfo := DetailledAnalysis{} detailledInfo, logger = CheckXMLValidation(message.path, xmlInfo, logger) ProcessXpath(xmlInfo, detailledInfo, logger) } else { logger.Info("") } } func CheckIfXmlIsWellFormed(xmlPath string, logger *logrus.Entry) (*logrus.Entry, XMLInfo) { xmlInfo := XMLInfo{isWellFormed: false} // check with xmlstarlet (slow) result, _ := exec.Command("xmlstarlet", "val", "-w", "-e", xmlPath).CombinedOutput() if result != nil && strings.Contains(string(result), "invalid") { logger = logger.WithFields(logrus.Fields{ "isWellFormed": false, "wellFormedErrors": FormatWellFormedError(string(result)), }) return logger, xmlInfo } // check if able to open xmlFile, errOpen := os.Open(xmlPath) if errOpen != nil { logger = logger.WithFields(logrus.Fields{ "isWellFormed": false, "wellFormedErrors": ErrorXML{Message: errOpen.Error()}, }) return logger, xmlInfo } // defer the closing of our xmlFile so that we can parse it later on defer xmlFile.Close() // read our opened xmlFile1 as a byte array. here I am checking if the file is valid or not byteValue, errRead := ioutil.ReadAll(xmlFile) if errRead != nil { xmlInfo.wellFormedErrors = ErrorXML{Message: errOpen.Error()} logger = logger.WithFields(logrus.Fields{ "isWellFormed": false, "wellFormedErrors": xmlInfo.wellFormedErrors, }) return logger, xmlInfo } xmlInfo.data = string(byteValue) // get doctype from xml doctype := regexDoctype.FindString(xmlInfo.data) if doctype != "" { dtd := regexDtd.FindString(doctype) if dtd != "" { xmlDoctype := DoctypeXML{Sysid: dtd} logger = logger.WithFields(logrus.Fields{ "isWellFormed": true, "doctype": xmlDoctype, }) xmlInfo.doctype = xmlDoctype } } return logger, xmlInfo } func FormatWellFormedError(resultError string) ErrorXML { messageFormatted := strings.NewReplacer("\n", "", ` `, "").Replace(resultError) messageLine := regexLineWellFormed.FindStringSubmatch(messageFormatted) line := "0" errorMessage := regexErrorWellFormedMessage.FindStringSubmatch(messageFormatted) if len(messageLine) >= 2 && len(messageLine[1]) > 0 { line = messageLine[1] } if len(errorMessage) >= 2 && len(errorMessage[1]) > 0 { messageFormatted = strings.TrimSpace(errorMessage[1]) } return ErrorXML{ Message: messageFormatted, Line: line, } } func FormatValidationError(resultError string) []ErrorXML { // errorMessage := regexErrorValidationMessage.FindStringSubmatch(resultError) // messageLine := regexLineValidation.FindStringSubmatch(resultError) return []ErrorXML{} } func CheckXMLValidation(pathXml string, xmlInfo XMLInfo, logger *logrus.Entry) (DetailledAnalysis, *logrus.Entry) { xmlDetailled := DetailledAnalysis{isValidAgainstDTD: false, isValidAgainstSchema: false} pathExec := *configurationFolder + "/" + *corpusName // if test if flag.Lookup("test.v") != nil { pathExec = "./example" } // if dtd exist in xml file process only this if len(configDetailledAnalyze.XML.ListDTD) > 0 { if xmlInfo.doctype.Sysid != "" { for _, dtdPath := range configDetailledAnalyze.XML.ListDTD { if strings.HasSuffix(dtdPath, xmlInfo.doctype.Sysid) { result, _ := exec.Command("xmllint", "--dtdvalid", pathExec+"/dtd/"+dtdPath, pathXml, "--noout", "--nowarning").CombinedOutput() if string(result) == "" { xmlDetailled.isValidAgainstDTD = true xmlDetailled.validationDTDInfos = dtdPath } else { xmlDetailled.validationsErrors = string(result) } } } } else { // check with all dtd and stop if one is true for _, dtdPath := range configDetailledAnalyze.XML.ListDTD { result, _ := exec.Command("xmllint", "--dtdvalid", pathExec+"/dtd/"+dtdPath, pathXml, "--noout", "--nowarning").CombinedOutput() if string(result) == "" { xmlDetailled.isValidAgainstDTD = true xmlDetailled.validationDTDInfos = dtdPath break } else { xmlDetailled.validationsErrors = string(result) } } } logger = logger.WithFields(logrus.Fields{ "isValidAgainstDTD": xmlDetailled.isValidAgainstDTD, "validationDTDInfos": xmlDetailled.validationDTDInfos, }) if !xmlDetailled.isValidAgainstDTD { logger = logger.WithFields(logrus.Fields{ "validationsErrors": xmlDetailled.validationsErrors, }) } } else if len(configDetailledAnalyze.XML.ListXSD) > 0 { // if xsd is present check schema validation for _, xsdPath := range configDetailledAnalyze.XML.ListXSD { result, _ := exec.Command("xmllint", "--schema", pathExec+"/xsd/"+xsdPath, pathXml, "--noout", "--nowarning").CombinedOutput() if strings.Contains(string(result), "validates") { xmlDetailled.isValidAgainstSchema = true break } else { xmlDetailled.validationSchemaErrors = append(xmlDetailled.validationSchemaErrors, string(result)) } } logger = logger.WithFields(logrus.Fields{ "isValidAgainstSchema": xmlDetailled.isValidAgainstSchema, "validationDTDInfos": xmlDetailled.validationDTDInfos, }) if !xmlDetailled.isValidAgainstSchema { logger = logger.WithFields(logrus.Fields{ "validationsErrors": xmlDetailled.validationsErrors, }) } } return xmlDetailled, logger } func ProcessXpath(xmlInfo XMLInfo, detailledInfo DetailledAnalysis, logger *logrus.Entry) { doc, err := xmlquery.Parse(strings.NewReader(xmlInfo.data)) tmpXpath := "{" if err == nil { for _, field := range configDetailledAnalyze.XML.XPATH { for _, xpathContent := range field.Xpath { goodXPath, hasQuote := FormatXpathByType(field.Type, xpathContent) expr, _ := xpath.Compile(goodXPath) result := expr.Evaluate(xmlquery.CreateXPathNavigator(doc)) str := fmt.Sprintf("%v", result) if len(str) > 0 && str != "0" { tmpXpath += FormatTextForXpath(field, str, hasQuote) } } } } tmpXpath = strings.TrimSuffix(tmpXpath, ",") tmpXpath += "}" finalXpath := []byte(tmpXpath) jsonData := XpathStructure{} json.Unmarshal(finalXpath, &jsonData) detailledInfo.xpath = finalXpath logger = logger.WithFields(logrus.Fields{ "xpath": detailledInfo.xpath, }) logger.Info("") } func FormatTextForXpath(field MetadataConfigDetailledAnalyze, text string, hasQuote bool) string { if field.Regex != "" { filterText := regexp.MustCompile(field.Regex).FindString(text) if len(filterText) > 0 { text = filterText } } if hasQuote { return `"` + field.Name + `":"` + strings.NewReplacer("\n", "").Replace(text) + `",` } return `"` + field.Name + `":` + text + `,` } func FormatXpathByType(fieldType string, xpath string) (string, bool) { switch fieldType { case "Number": return "number(" + xpath + ")", false case "Boolean": return "boolean(" + xpath + ")", false case "Count": return "count(" + xpath + ")", false } // String and Attrivute return "string(" + xpath + ")", true }