// sisyphe-go / xml.go
package main

import (
	"flag"
	"fmt"
	"io/ioutil"
	"os"
	"os/exec"
	"regexp"
	"strings"

	"github.com/antchfx/xmlquery"
	"github.com/antchfx/xpath"
	"github.com/goccy/go-json"
	"github.com/sirupsen/logrus"
)

// Regular expressions shared by the XML checks below. They are compiled once
// at package initialisation.
var (
	// regexDoctype matches a DOCTYPE declaration, optionally followed by an
	// internal subset ("[ ... ]") that may span several lines.
	regexDoctype = regexp.MustCompile(`<!DOCTYPE.*(\[[\s\S]*?\])?>`)
	// regexDtd extracts a path or URL ending in the literal suffix ".dtd".
	regexDtd = regexp.MustCompile(`[a-zA-Z0-9-_./:]*\.dtd`)
	// regexLineWellFormed captures the number following "line " in a
	// well-formedness error report.
	regexLineWellFormed = regexp.MustCompile("line ([0-9]{1,}) ")
	// regexErrorWellFormedMessage captures the text starting at an upper-case
	// letter (preceded by a space) and running up to the last "^" marker.
	regexErrorWellFormedMessage = regexp.MustCompile(` ([A-Z].*)\^`)

	// regexLineValidation captures a number enclosed in colons (":12:").
	regexLineValidation = regexp.MustCompile(`:([0-9]{1,}):`)
	// regexErrorValidationMessage captures the text starting at an upper-case
	// letter (preceded by a space) and running to the end of the line.
	regexErrorValidationMessage = regexp.MustCompile(` ([A-Z].*)\n`)
)

// ProcessXML runs the XML checks for the file described by message and emits
// one log entry carrying every field collected along the way.
//
// The function marks wg as done when it returns and holds one slot of
// queueForConcurrent for its whole duration, which limits how many files are
// read in parallel.
func ProcessXML(message *GeneralInfo, logger *logrus.Entry) {
	defer wg.Done()

	// Take a slot in the queue; it is handed back when the function returns.
	queueForConcurrent <- struct{}{}
	defer func() { <-queueForConcurrent }()

	info, entry := CheckIfXmlIsWellFormed(message.path, logger)

	// The detailed analysis needs a configuration folder and the file content.
	detailedRequested := *configurationFolder != ""
	if detailedRequested && info.data != "" {
		_, entry = CheckXMLValidation(message.path, info, entry)
		_, entry = ProcessXpath(info, entry)
	}

	entry.Info("")
	UpdateCounter()
}

// CheckIfXmlIsWellFormed checks the file at xmlPath with xmlstarlet and, when
// no problem is reported, loads its content and extracts the DTD referenced by
// its DOCTYPE declaration (if any).
//
// It returns the collected XMLInfo together with logger enriched with the
// "isWellFormed" field and, depending on the outcome, "wellFormedErrors" or
// "doctype". On any failure (xmlstarlet reports "invalid", xmlstarlet could not
// be run, or the file cannot be opened/read) isWellFormed stays false and the
// data field is left empty.
func CheckIfXmlIsWellFormed(xmlPath string, logger *logrus.Entry) (XMLInfo, *logrus.Entry) {
	xmlInfo := XMLInfo{isWellFormed: false}

	// fail records the error on the result and on the log entry.
	fail := func(errXML ErrorXML) (XMLInfo, *logrus.Entry) {
		xmlInfo.wellFormedErrors = errXML
		return xmlInfo, logger.WithFields(logrus.Fields{
			"isWellFormed":     false,
			"wellFormedErrors": xmlInfo.wellFormedErrors,
		})
	}

	// check with xmlstarlet (slow)
	result, errExec := exec.Command("xmlstarlet", "val", "-w", "-e", xmlPath).CombinedOutput()
	output := string(result)
	if strings.Contains(output, "invalid") {
		return fail(FormatWellFormedError(output))
	}
	// An error without any output means the command did not produce a verdict
	// (for instance the binary is missing); do not report the file as
	// well-formed in that case.
	if errExec != nil && len(result) == 0 {
		return fail(ErrorXML{Message: errExec.Error()})
	}

	// check if able to open
	xmlFile, errOpen := os.Open(xmlPath)
	if errOpen != nil {
		return fail(ErrorXML{Message: errOpen.Error()})
	}
	defer xmlFile.Close()

	// read the whole file; its content is reused by the later analysis steps
	byteValue, errRead := ioutil.ReadAll(xmlFile)
	if errRead != nil {
		// Report the read error itself: errOpen is nil on this path.
		return fail(ErrorXML{Message: errRead.Error()})
	}

	xmlInfo.data = string(byteValue)
	xmlInfo.isWellFormed = true
	logger = logger.WithFields(logrus.Fields{
		"isWellFormed": true,
	})

	// get doctype from xml
	if doctype := regexDoctype.FindString(xmlInfo.data); doctype != "" {
		if dtd := regexDtd.FindString(doctype); dtd != "" {
			xmlInfo.doctype = DoctypeXML{Sysid: dtd}
			logger = logger.WithFields(logrus.Fields{
				"doctype": xmlInfo.doctype,
			})
		}
	}
	return xmlInfo, logger
}

// FormatWellFormedError turns the raw well-formedness report resultError into
// an ErrorXML.
//
// Newlines and double spaces are removed from the report first. Line is the
// number captured by regexLineWellFormed, or "0" when none is found. Message is
// the trimmed text captured by regexErrorWellFormedMessage, or the whole
// flattened report when that pattern does not match.
func FormatWellFormedError(resultError string) ErrorXML {
	flattened := strings.NewReplacer("\n", "", `  `, "").Replace(resultError)

	formatted := ErrorXML{Message: flattened, Line: "0"}
	if m := regexLineWellFormed.FindStringSubmatch(flattened); len(m) > 1 && m[1] != "" {
		formatted.Line = m[1]
	}
	if m := regexErrorWellFormedMessage.FindStringSubmatch(flattened); len(m) > 1 && m[1] != "" {
		formatted.Message = strings.TrimSpace(m[1])
	}
	return formatted
}

// FormatValidationError converts the raw validation report resultError into a
// list of ErrorXML, each tagged with file.
//
// Messages (regexErrorValidationMessage) and line numbers
// (regexLineValidation) are extracted independently and paired by position.
// Only pairs for which both a message and a line number exist are returned, so
// a report with fewer line numbers than messages no longer causes an
// out-of-range panic. Double quotes are stripped from the messages. The
// returned slice is never nil.
func FormatValidationError(resultError string, file string) []ErrorXML {
	listError := []ErrorXML{}
	errorsMessages := regexErrorValidationMessage.FindAllStringSubmatch(resultError, -1)
	linesMessages := regexLineValidation.FindAllStringSubmatch(resultError, -1)

	// Pair only as many entries as both lists provide.
	count := len(errorsMessages)
	if len(linesMessages) < count {
		count = len(linesMessages)
	}

	for i := 0; i < count; i++ {
		listError = append(listError, ErrorXML{
			Message: strings.ReplaceAll(errorsMessages[i][1], `"`, ""),
			Line:    linesMessages[i][1],
			File:    file,
		})
	}
	return listError
}

// CheckXMLValidation validates the file at pathXml with xmllint against the
// DTDs or, when no DTD is configured, the XSDs listed in
// configDetailledAnalyze.
//
// DTD mode: when xmlInfo carries a DOCTYPE system identifier, only the
// configured DTD whose name ends with that identifier is tried; otherwise
// every configured DTD is tried until one validates. XSD mode: every
// configured schema is tried until one validates.
//
// It returns the analysis result and logger enriched with the validation
// status plus either the name of the matching DTD/XSD or the collected errors.
func CheckXMLValidation(pathXml string, xmlInfo XMLInfo, logger *logrus.Entry) (DetailledAnalysis, *logrus.Entry) {
	xmlDetailled := DetailledAnalysis{isValidAgainstDTD: false, isValidAgainstSchema: false}
	pathExec := *configurationFolder + "/" + *corpusName

	// The "test.v" flag is only registered when running under "go test"; use
	// the bundled example resources in that case.
	if flag.Lookup("test.v") != nil {
		pathExec = "./example"
	}

	switch {
	/**
	* VALIDATION DTD
	 */
	case len(configDetailledAnalyze.XML.ListDTD) > 0:
		if sysid := xmlInfo.doctype.Sysid; sysid != "" {
			// if dtd exist in xml file process only this
			dtdPath := GetStringWithSuffixInList(configDetailledAnalyze.XML.ListDTD, sysid)
			if dtdPath == "" {
				xmlDetailled.validationsErrors = append(xmlDetailled.validationsErrors, ErrorXML{Message: "Dtd not present in corpus-resources"})
			} else if valid, errs := runDTDValidation(pathExec, dtdPath, pathXml); valid {
				xmlDetailled.isValidAgainstDTD = true
				xmlDetailled.validationDTDInfos = dtdPath
			} else {
				xmlDetailled.validationsErrors = append(xmlDetailled.validationsErrors, errs...)
			}
		} else {
			// check with all dtd and stop if one is true
			for _, dtdPath := range configDetailledAnalyze.XML.ListDTD {
				valid, errs := runDTDValidation(pathExec, dtdPath, pathXml)
				if valid {
					xmlDetailled.isValidAgainstDTD = true
					xmlDetailled.validationDTDInfos = dtdPath
					break
				}
				xmlDetailled.validationsErrors = append(xmlDetailled.validationsErrors, errs...)
			}
		}

		logger = logger.WithFields(logrus.Fields{
			"isValidAgainstDTD": xmlDetailled.isValidAgainstDTD,
		})
		if xmlDetailled.isValidAgainstDTD {
			logger = logger.WithFields(logrus.Fields{
				"validationDTDInfos": xmlDetailled.validationDTDInfos,
			})
		} else {
			logger = logger.WithFields(logrus.Fields{
				"validationsErrors": xmlDetailled.validationsErrors,
			})
		}

	/**
	* VALIDATION XSD
	 */
	case len(configDetailledAnalyze.XML.ListXSD) > 0:
		// if xsd is present check schema validation
		for _, xsdPath := range configDetailledAnalyze.XML.ListXSD {
			valid, errs := runXSDValidation(pathExec, xsdPath, pathXml)
			if valid {
				xmlDetailled.isValidAgainstSchema = true
				xmlDetailled.validationXSDInfos = xsdPath
				break
			}
			xmlDetailled.validationSchemaErrors = append(xmlDetailled.validationSchemaErrors, errs...)
		}

		logger = logger.WithFields(logrus.Fields{
			"isValidAgainstSchema": xmlDetailled.isValidAgainstSchema,
		})
		if xmlDetailled.isValidAgainstSchema {
			logger = logger.WithFields(logrus.Fields{
				"validationXSDInfos": xmlDetailled.validationXSDInfos,
			})
		} else {
			logger = logger.WithFields(logrus.Fields{
				"validationSchemaErrors": xmlDetailled.validationSchemaErrors,
			})
		}
	}

	return xmlDetailled, logger
}

// runDTDValidation runs "xmllint --dtdvalid" for pathXml against the DTD
// dtdPath located under pathExec+"/dtd/". The document is considered valid
// only when the command succeeds without printing anything; a command that
// fails without output (for instance because xmllint cannot be started) is
// reported as an error instead of being mistaken for a valid document.
func runDTDValidation(pathExec, dtdPath, pathXml string) (bool, []ErrorXML) {
	output, err := exec.Command("xmllint", "--dtdvalid", pathExec+"/dtd/"+dtdPath, pathXml, "--noout", "--nowarning").CombinedOutput()
	if len(output) == 0 {
		if err != nil {
			return false, []ErrorXML{{Message: err.Error(), File: dtdPath}}
		}
		return true, nil
	}
	return false, FormatValidationError(string(output), dtdPath)
}

// runXSDValidation runs "xmllint --schema" for pathXml against the schema
// xsdPath located under pathExec+"/xsd/". The document is considered valid
// when the output contains "validates". A command that fails without output
// is reported with the execution error as its message.
func runXSDValidation(pathExec, xsdPath, pathXml string) (bool, []ErrorXML) {
	output, err := exec.Command("xmllint", "--schema", pathExec+"/xsd/"+xsdPath, pathXml, "--noout", "--nowarning").CombinedOutput()
	report := string(output)
	if strings.Contains(report, "validates") {
		return true, nil
	}
	if report == "" && err != nil {
		return false, []ErrorXML{{Message: err.Error(), File: xsdPath}}
	}
	return false, FormatValidationError(report, xsdPath)
}

// ProcessXpath evaluates the XPath expressions configured in
// configDetailledAnalyze.XML.XPATH against xmlInfo.data and assembles the
// results into a JSON object, stored in the returned analysis and attached to
// logger under the "xpath" key.
//
// For each configured field every expression of field.Xpath is wrapped
// according to field.Type (see FormatXpathByType) and evaluated. A non-empty
// result adds `"<Name>": <value>` to the object; results equal to "0" are
// dropped unless the field type is "Count". When field.Regex is set, a
// `"<Name>IsValid"` boolean tells whether the result matches it. Expressions
// that fail to compile are skipped. If the document cannot be parsed the
// object is empty. When no XPath is configured nothing is computed and logger
// is returned unchanged.
func ProcessXpath(xmlInfo XMLInfo, logger *logrus.Entry) (DetailledAnalysis, *logrus.Entry) {
	doc, err := xmlquery.Parse(strings.NewReader(xmlInfo.data))
	detailledInfo := DetailledAnalysis{}

	if len(configDetailledAnalyze.XML.XPATH) == 0 {
		return detailledInfo, logger
	}

	var fields strings.Builder
	fields.WriteString("{")
	if err == nil {
		for _, field := range configDetailledAnalyze.XML.XPATH {
			// Expressions already handled for this field, keyed by their text.
			previousXpath := make(map[string]string)
			// Compiled on first use so the pattern is built at most once per field.
			var fieldRegex *regexp.Regexp

			for _, xpathContent := range field.Xpath {
				goodXPath, hasQuote := FormatXpathByType(field.Type, xpathContent)
				expr, errCompile := xpath.Compile(goodXPath)
				if errCompile != nil {
					// An invalid expression yields a nil *Expr; evaluating it
					// would panic, so it is skipped.
					continue
				}
				result := expr.Evaluate(xmlquery.CreateXPathNavigator(doc))
				text := fmt.Sprintf("%v", result)

				// not process xpath if previous is good
				if previousXpath[xpathContent] != "" || len(text) == 0 {
					continue
				}
				previousXpath[xpathContent] = xpathContent

				// if regex add element if isValid
				if field.Regex != "" {
					if fieldRegex == nil {
						fieldRegex = regexp.MustCompile(field.Regex)
					}
					filterText := fieldRegex.FindString(text)
					if len(filterText) > 0 && filterText != "0" {
						fields.WriteString(`"` + field.Name + `IsValid":true,`)
					} else if filterText != "0" {
						fields.WriteString(`"` + field.Name + `IsValid":false,`)
					}
				}

				switch {
				case field.Type == "Count":
					// if fieldType = Count authorize 0 value
					fields.WriteString(`"` + field.Name + `":` + text + `,`)
				case text != "0":
					text = strings.NewReplacer("\"", "").Replace(text)
					if hasQuote { // for string
						fields.WriteString(`"` + field.Name + `":"` + text + `",`)
					} else { // for boolean and number
						fields.WriteString(`"` + field.Name + `":` + text + `,`)
					}
				}
			}
		}
	}

	tmpXpath := strings.TrimSuffix(fields.String(), ",") + "}"
	tmpXpath = standardizeSpaces(tmpXpath)

	finalXpath := []byte(tmpXpath)
	// The decoded value is not used afterwards; the call is kept as in the
	// original code.
	// NOTE(review): a decoding error here means the hand-built document is not
	// valid JSON — consider surfacing it instead of discarding it.
	jsonData := XpathStructure{}
	_ = json.Unmarshal(finalXpath, &jsonData)

	detailledInfo.xpath = finalXpath

	logger = logger.WithFields(logrus.Fields{
		"xpath": detailledInfo.xpath,
	})

	return detailledInfo, logger
}

// FormatXpathByType wraps xpath in the XPath conversion function matching
// fieldType and reports whether the resulting value has to be quoted.
//
// "Number", "Boolean" and "Count" map to number(), boolean() and count() and
// are not quoted. Any other type (String and Attribute) maps to string() and
// is quoted.
func FormatXpathByType(fieldType string, xpath string) (string, bool) {
	var function string
	switch fieldType {
	case "Number":
		function = "number"
	case "Boolean":
		function = "boolean"
	case "Count":
		function = "count"
	default:
		// String and Attribute
		return "string(" + xpath + ")", true
	}
	return function + "(" + xpath + ")", false
}