Newer
Older
sisyphe-go / xml.go
@Nacim Nacim on 12 Mar 2022 8 KB run script for index corpus
package main

import (
	"flag"
	"fmt"
	"io/ioutil"
	"os"
	"os/exec"
	"regexp"
	"strings"

	"github.com/antchfx/xmlquery"
	"github.com/antchfx/xpath"
	"github.com/goccy/go-json"
	"github.com/sirupsen/logrus"
)

// Patterns shared by the XML checks in this file: two for locating the DOCTYPE
// declaration and its DTD reference, four for pulling line numbers and messages
// out of the external tools' textual output.
var (
	// regexDoctype finds a DOCTYPE declaration in the raw document text.
	// NOTE(review): the group `([[sS]*?])?` looks like `(\[[\s\S]*?\])?` whose
	// backslashes were lost; as written `.` does not cross newlines, so a
	// declaration spread over several lines is not matched. Confirm the intent.
	regexDoctype = regexp.MustCompile("<!DOCTYPE.*([[sS]*?])?>")
	// regexDtd picks a path-like token ending in "dtd" out of the DOCTYPE text.
	// The dot before "dtd" is unescaped, so it accepts any character there.
	regexDtd = regexp.MustCompile("[a-zA-Z0-9-_./:]*.dtd")

	// regexLineWellFormed captures the number following "line " in the
	// well-formedness report handled by FormatWellFormedError.
	regexLineWellFormed = regexp.MustCompile("line ([0-9]{1,}) ")
	// regexErrorWellFormedMessage captures the text between the first
	// " <uppercase letter>" and the caret that ends the report.
	regexErrorWellFormedMessage = regexp.MustCompile(` ([A-Z].*)\^`)

	// regexLineValidation captures a number enclosed in colons (":12:") in the
	// validation report handled by FormatValidationError.
	regexLineValidation = regexp.MustCompile(`:([0-9]{1,}):`)
	// regexErrorValidationMessage captures, per report line, the text running
	// from the first " <uppercase letter>" to the end of that line.
	regexErrorValidationMessage = regexp.MustCompile(` ([A-Z].*)\n`)
)

// ProcessXML runs the XML checks for a single file described by message.
//
// It first checks well-formedness. When a configuration folder is set and the
// file content could be read, it then runs the DTD/XSD validation and the XPath
// extraction (which emits the log entry). Otherwise it emits the log entry
// itself with whatever fields the well-formedness check attached.
func ProcessXML(message *GeneralInfo, logger *logrus.Entry) {
	// Mark this job as finished on the wait group once the function returns.
	defer wg.Done()
	// Take a slot in the queue that limits how many files are read in parallel,
	// and hand it back on return.
	queueForConcurrent <- struct{}{}
	defer func() { <-queueForConcurrent }()

	xmlInfo, logger := CheckIfXmlIsWellFormed(message.path, logger)

	detailedAnalysisEnabled := *configurationFolder != ""
	hasContent := len(xmlInfo.data) > 0
	if !detailedAnalysisEnabled || !hasContent {
		logger.Info("")
	} else {
		detailledInfo, validationLogger := CheckXMLValidation(message.path, xmlInfo, logger)
		ProcessXpath(xmlInfo, detailledInfo, validationLogger)
	}

	// NOTE(review): numberFiles is incremented with no synchronization visible
	// in this function; if the queue admits more than one holder at a time,
	// confirm the counter is protected elsewhere.
	numberFiles++
}

// CheckIfXmlIsWellFormed checks the file at xmlPath and loads its content.
//
// It runs `xmlstarlet val -w -e` first; if the output contains "invalid" the
// parsed error is stored in the returned XMLInfo and attached to the logger.
// Otherwise the file is opened and read into XMLInfo.data, and the DOCTYPE is
// searched for a DTD reference. Open and read failures are reported the same
// way as a well-formedness error, using the error text as the message.
//
// It returns the populated XMLInfo and the logger enriched with the
// "isWellFormed", "wellFormedErrors" and "doctype" fields that apply.
//
// NOTE(review): isWellFormed is only set to true (and logged) when a DTD
// reference is found in the DOCTYPE; a document without one keeps
// isWellFormed == false although its data is loaded. Confirm this is intended.
func CheckIfXmlIsWellFormed(xmlPath string, logger *logrus.Entry) (XMLInfo, *logrus.Entry) {
	xmlInfo := XMLInfo{isWellFormed: false}

	// reportFailure records the error on xmlInfo and attaches it to the logger.
	reportFailure := func(wellFormedError ErrorXML) (XMLInfo, *logrus.Entry) {
		xmlInfo.wellFormedErrors = wellFormedError
		return xmlInfo, logger.WithFields(logrus.Fields{
			"isWellFormed":     false,
			"wellFormedErrors": wellFormedError,
		})
	}

	// check with xmlstarlet (slow). The command error is ignored on purpose and
	// only the output text is inspected.
	// NOTE(review): a missing xmlstarlet binary therefore goes unnoticed here.
	result, _ := exec.Command("xmlstarlet", "val", "-w", "-e", xmlPath).CombinedOutput()
	if result != nil && strings.Contains(string(result), "invalid") {
		return reportFailure(FormatWellFormedError(string(result)))
	}

	// check if able to open
	xmlFile, errOpen := os.Open(xmlPath)
	if errOpen != nil {
		return reportFailure(ErrorXML{Message: errOpen.Error()})
	}
	defer xmlFile.Close()

	byteValue, errRead := ioutil.ReadAll(xmlFile)
	if errRead != nil {
		// Report the read error itself: errOpen is nil on this path, so using
		// it here would panic.
		return reportFailure(ErrorXML{Message: errRead.Error()})
	}

	xmlInfo.data = string(byteValue)
	// get doctype from xml
	doctype := regexDoctype.FindString(xmlInfo.data)
	if doctype == "" {
		return xmlInfo, logger
	}
	dtd := regexDtd.FindString(doctype)
	if dtd == "" {
		return xmlInfo, logger
	}

	xmlDoctype := DoctypeXML{Sysid: dtd}
	logger = logger.WithFields(logrus.Fields{
		"isWellFormed": true,
		"doctype":      xmlDoctype,
	})
	xmlInfo.isWellFormed = true
	xmlInfo.doctype = xmlDoctype
	return xmlInfo, logger
}

// FormatWellFormedError turns the raw well-formedness report into an ErrorXML.
//
// The report is flattened first (newlines and double spaces removed). Line is
// the number found after "line ", or "0" when none is found. Message is the
// trimmed capture of regexErrorWellFormedMessage, or the whole flattened report
// when that pattern does not match.
func FormatWellFormedError(resultError string) ErrorXML {
	flattened := strings.NewReplacer("\n", "", `  `, "").Replace(resultError)

	// Start from the fallbacks and overwrite each part that can be extracted.
	formatted := ErrorXML{Message: flattened, Line: "0"}

	if lineMatch := regexLineWellFormed.FindStringSubmatch(flattened); len(lineMatch) >= 2 && lineMatch[1] != "" {
		formatted.Line = lineMatch[1]
	}
	if textMatch := regexErrorWellFormedMessage.FindStringSubmatch(flattened); len(textMatch) >= 2 && textMatch[1] != "" {
		formatted.Message = strings.TrimSpace(textMatch[1])
	}
	return formatted
}

// FormatValidationError parses a validation report into a list of ErrorXML.
//
// Messages and line numbers are extracted independently with
// regexErrorValidationMessage and regexLineValidation and paired by position.
// Double quotes are stripped from each message and file is copied into every
// entry. When either pattern finds nothing, the returned list is empty (never nil).
//
// A message that has no line number at the same position keeps an empty Line
// instead of indexing past the end of the line matches.
func FormatValidationError(resultError string, file string) []ErrorXML {
	listError := []ErrorXML{}
	errorsMessages := regexErrorValidationMessage.FindAllStringSubmatch(resultError, -1)
	linesMessages := regexLineValidation.FindAllStringSubmatch(resultError, -1)
	if errorsMessages == nil || linesMessages == nil {
		return listError
	}

	for i, messageMatch := range errorsMessages {
		entry := ErrorXML{
			Message: strings.ReplaceAll(messageMatch[1], `"`, ""),
			File:    file,
		}
		// The two match lists can differ in length (e.g. a report line with a
		// message but no ":<line>:" marker).
		if i < len(linesMessages) {
			entry.Line = linesMessages[i][1]
		}
		listError = append(listError, entry)
	}
	return listError
}

// CheckXMLValidation validates the file at pathXml with xmllint against the
// DTDs, or failing that the XSDs, listed in configDetailledAnalyze.
//
// When at least one DTD is configured:
//   - if the document declares a DTD (xmlInfo.doctype.Sysid), only the
//     configured DTD whose name ends with that identifier is tried; if none
//     matches, a "Dtd not present in corpus-resources" error is recorded;
//   - otherwise every configured DTD is tried in order until one validates;
//     the errors kept are those of the last DTD tried.
//
// XSD validation only runs when no DTD is configured. Resources are looked up
// under <configurationFolder>/<corpusName>, or ./example when running under
// `go test`.
//
// It returns the analysis result and the logger enriched with the validation
// outcome fields.
func CheckXMLValidation(pathXml string, xmlInfo XMLInfo, logger *logrus.Entry) (DetailledAnalysis, *logrus.Entry) {
	xmlDetailled := DetailledAnalysis{isValidAgainstDTD: false, isValidAgainstSchema: false}
	pathExec := *configurationFolder + "/" + *corpusName

	// The "test.v" flag is only registered by the testing package, so its
	// presence means we are running under `go test`.
	if flag.Lookup("test.v") != nil {
		pathExec = "./example"
	}

	switch {
	/**
	* VALIDATION DTD
	 */
	case len(configDetailledAnalyze.XML.ListDTD) > 0:
		if xmlInfo.doctype.Sysid != "" {
			// if dtd exist in xml file process only this
			dtdPath := GetStringWithSuffixInList(configDetailledAnalyze.XML.ListDTD, xmlInfo.doctype.Sysid)
			if dtdPath == "" {
				xmlDetailled.validationsErrors = append(xmlDetailled.validationsErrors, ErrorXML{Message: "Dtd not present in corpus-resources"})
			} else if valid, errors := runDTDValidation(pathExec, dtdPath, pathXml); valid {
				xmlDetailled.isValidAgainstDTD = true
				xmlDetailled.validationDTDInfos = dtdPath
			} else if len(errors) > 0 {
				xmlDetailled.validationsErrors = append(xmlDetailled.validationsErrors, errors...)
			}
		} else {
			// check with all dtd and stop if one is true
			for _, dtdPath := range configDetailledAnalyze.XML.ListDTD {
				valid, errors := runDTDValidation(pathExec, dtdPath, pathXml)
				if valid {
					xmlDetailled.isValidAgainstDTD = true
					xmlDetailled.validationDTDInfos = dtdPath
					break
				}
				xmlDetailled.validationsErrors = errors
			}
		}

		logger = logger.WithFields(logrus.Fields{
			"isValidAgainstDTD": xmlDetailled.isValidAgainstDTD,
		})
		if xmlDetailled.isValidAgainstDTD {
			logger = logger.WithFields(logrus.Fields{
				"validationDTDInfos": xmlDetailled.validationDTDInfos,
			})
		} else {
			logger = logger.WithFields(logrus.Fields{
				"validationsErrors": xmlDetailled.validationsErrors,
			})
		}

	/**
	* VALIDATION XSD
	 */
	case len(configDetailledAnalyze.XML.ListXSD) > 0:
		// if xsd is present check schema validation
		for _, xsdPath := range configDetailledAnalyze.XML.ListXSD {
			result, err := exec.Command("xmllint", "--schema", pathExec+"/xsd/"+xsdPath, pathXml, "--noout", "--nowarning").CombinedOutput()
			if strings.Contains(string(result), "validates") {
				xmlDetailled.isValidAgainstSchema = true
				break
			}
			if len(result) == 0 && err != nil {
				// xmllint produced nothing at all: report why it failed
				// instead of an empty error list.
				xmlDetailled.validationSchemaErrors = []ErrorXML{{Message: err.Error(), File: xsdPath}}
			} else {
				xmlDetailled.validationSchemaErrors = FormatValidationError(string(result), xsdPath)
			}
		}
		logger = logger.WithFields(logrus.Fields{
			"isValidAgainstSchema": xmlDetailled.isValidAgainstSchema,
		})

		if !xmlDetailled.isValidAgainstSchema {
			logger = logger.WithFields(logrus.Fields{
				"validationSchemaErrors": xmlDetailled.validationSchemaErrors,
			})
		}
	}

	return xmlDetailled, logger
}

// runDTDValidation runs xmllint on pathXml against pathExec/dtd/dtdPath and
// reports whether the document validated, plus the errors to record otherwise.
//
// The document is considered valid only when xmllint ran successfully AND
// printed nothing. Empty output together with a command error (for instance
// xmllint could not be started) is reported as a failure carrying the error
// text, rather than being mistaken for a successful validation.
func runDTDValidation(pathExec string, dtdPath string, pathXml string) (bool, []ErrorXML) {
	result, err := exec.Command("xmllint", "--dtdvalid", pathExec+"/dtd/"+dtdPath, pathXml, "--noout", "--nowarning").CombinedOutput()
	if len(result) > 0 {
		return false, FormatValidationError(string(result), dtdPath)
	}
	if err != nil {
		return false, []ErrorXML{{Message: err.Error(), File: dtdPath}}
	}
	return true, nil
}

// ProcessXpath evaluates the configured XPath fields against the document held
// in xmlInfo.data, assembles the results into a JSON object, attaches it to the
// logger under "xpath" and emits the log entry.
//
// For every configured field, each of its XPath expressions is wrapped by
// FormatXpathByType and evaluated. A non-empty result produces:
//   - "<Name>IsValid": true/false when the field has a regex (true when the
//     regex finds a non-empty match other than "0");
//   - "<Name>": the value itself; "0" is only kept for fields of type "Count".
//
// If the document cannot be parsed the object is empty ("{}"). XPath
// expressions that fail to compile are skipped, and a field regex that fails
// to compile makes "<Name>IsValid" false instead of panicking.
func ProcessXpath(xmlInfo XMLInfo, detailledInfo DetailledAnalysis, logger *logrus.Entry) {
	doc, err := xmlquery.Parse(strings.NewReader(xmlInfo.data))

	// Newlines are dropped from string values; backslashes, double quotes,
	// carriage returns and tabs are escaped so the value cannot break out of
	// the JSON string it is embedded in.
	escapeJSONString := strings.NewReplacer("\n", "", `\`, `\\`, `"`, `\"`, "\r", `\r`, "\t", `\t`)

	entries := []string{}
	if err == nil {
		for _, field := range configDetailledAnalyze.XML.XPATH {
			// Compile the field regex once per field rather than once per
			// xpath. The error is dropped on purpose: a nil regex is handled
			// below as "never valid".
			var fieldRegex *regexp.Regexp
			if field.Regex != "" {
				fieldRegex, _ = regexp.Compile(field.Regex)
			}

			// NOTE(review): this map is keyed by the xpath text, so it only
			// suppresses a repeated identical xpath; several different xpaths
			// that all return a value each add an entry for the same field.
			// Confirm whether "stop at the first good xpath" was intended.
			previousXpath := make(map[string]string)
			for _, xpathContent := range field.Xpath {
				goodXPath, hasQuote := FormatXpathByType(field.Type, xpathContent)
				expr, errCompile := xpath.Compile(goodXPath)
				if errCompile != nil {
					// Compile returned no usable expression; evaluating it
					// would dereference nil.
					continue
				}
				result := expr.Evaluate(xmlquery.CreateXPathNavigator(doc))
				text := fmt.Sprintf("%v", result)
				// not process xpath if previous is good
				if previousXpath[xpathContent] != "" || len(text) == 0 {
					continue
				}
				previousXpath[xpathContent] = xpathContent

				// if regex add substring isValid
				if field.Regex != "" {
					isValid := "false"
					if fieldRegex != nil {
						if filterText := fieldRegex.FindString(text); len(filterText) > 0 && filterText != "0" {
							isValid = "true"
						}
					}
					entries = append(entries, `"`+field.Name+`IsValid":`+isValid)
				}

				switch {
				case field.Type == "Count":
					// if fieldType = Count authorize 0 value
					entries = append(entries, `"`+field.Name+`":`+text)
				case text == "0":
					// "0" is treated as "no value" for every other type.
				case hasQuote:
					// for string
					entries = append(entries, `"`+field.Name+`":"`+escapeJSONString.Replace(text)+`"`)
				default:
					// for boolean and number
					// NOTE(review): a numeric result is written as formatted
					// by fmt; confirm it is always a valid JSON number.
					entries = append(entries, `"`+field.Name+`":`+text)
				}
			}
		}
	}

	finalXpath := []byte("{" + strings.Join(entries, ",") + "}")
	// NOTE(review): the decoded value and the error are both discarded here;
	// confirm whether this call is meant as a validity check on the object.
	jsonData := XpathStructure{}
	json.Unmarshal(finalXpath, &jsonData)

	detailledInfo.xpath = finalXpath

	logger = logger.WithFields(logrus.Fields{
		"xpath": detailledInfo.xpath,
	})
	logger.Info("")
}

// FormatXpathByType wraps xpath in the XPath conversion function that matches
// fieldType and tells whether the resulting value must be quoted.
//
// "Number", "Boolean" and "Count" map to number(), boolean() and count() and
// return false. Any other type (String, Attribute, ...) maps to string() and
// returns true.
func FormatXpathByType(fieldType string, xpath string) (string, bool) {
	unquotedWrappers := map[string]string{
		"Number":  "number",
		"Boolean": "boolean",
		"Count":   "count",
	}
	if wrapper, found := unquotedWrappers[fieldType]; found {
		return wrapper + "(" + xpath + ")", false
	}
	// String and Attribute
	return "string(" + xpath + ")", true
}