// Source: sisyphe-go / xml.go (7 KB)
// Last change: Nacim, 9 Mar 2022 - "refactoring logging"
package main

import (
	"flag"
	"fmt"
	"io/ioutil"
	"os"
	"os/exec"
	"regexp"
	"strings"

	"github.com/antchfx/xmlquery"
	"github.com/antchfx/xpath"
	"github.com/goccy/go-json"
	"github.com/sirupsen/logrus"
)

var (
	// regexDoctype captures a whole DOCTYPE declaration: everything up to the
	// first '>' outside of an optional internal subset ("[ ... ]"). The
	// declaration may span several lines.
	regexDoctype = regexp.MustCompile(`<!DOCTYPE[^>\[]*(\[[\s\S]*?\])?\s*>`)
	// regexDtd extracts a path or URL ending in a literal ".dtd" from a
	// DOCTYPE declaration.
	regexDtd = regexp.MustCompile(`[a-zA-Z0-9_./:-]*\.dtd`)
	// regexLineWellFormed captures the line number reported by xmlstarlet.
	regexLineWellFormed = regexp.MustCompile("line ([0-9]{1,}) ")
	// regexErrorWellFormedMessage captures the error text that precedes the
	// '^' position marker in xmlstarlet's output.
	regexErrorWellFormedMessage = regexp.MustCompile(` ([A-Z].*)\^`)
)

// var regexLineValidation = regexp.MustCompile(`(?m):([0-9]{1,}):`)
// var regexErrorValidationMessage = regexp.MustCompile(` ([A-Z].*)`)

// ProcessXML analyses one XML file and emits a single log entry for it.
//
// It signals wg when finished and holds a slot of queueForConcurrent for the
// duration of the analysis, which bounds how many files are read in parallel.
// The well-formedness check always runs; DTD/XSD validation and XPath
// extraction only run when a configuration folder is set and the file content
// could be loaded.
func ProcessXML(message *GeneralInfo, logger *logrus.Entry) {
	defer wg.Done()

	// Take a slot in the queue and give it back once the file is processed.
	queueForConcurrent <- struct{}{}
	defer func() { <-queueForConcurrent }()

	entry, parsed := CheckIfXmlIsWellFormed(message.path, logger)

	detailsRequested := *configurationFolder != ""
	contentLoaded := len(parsed.data) > 0
	if !detailsRequested || !contentLoaded {
		// Nothing more to analyse: flush what has been collected so far.
		entry.Info("")
		return
	}

	// ProcessXpath writes the final log entry itself.
	analysis, entry := CheckXMLValidation(message.path, parsed, entry)
	ProcessXpath(parsed, analysis, entry)
}

// CheckIfXmlIsWellFormed checks that the file at xmlPath is well-formed XML,
// loads its content and extracts the DTD referenced by its DOCTYPE, if any.
//
// It returns the logger enriched with the "isWellFormed" field (plus
// "wellFormedErrors" on failure or "doctype" when a DTD is declared) and an
// XMLInfo. On failure XMLInfo.data is left empty, which callers use to skip
// the detailed analysis.
func CheckIfXmlIsWellFormed(xmlPath string, logger *logrus.Entry) (*logrus.Entry, XMLInfo) {
	xmlInfo := XMLInfo{isWellFormed: false}

	// reject records a failure on both the returned info and the log entry.
	reject := func(failure ErrorXML) (*logrus.Entry, XMLInfo) {
		xmlInfo.wellFormedErrors = failure
		return logger.WithFields(logrus.Fields{
			"isWellFormed":     false,
			"wellFormedErrors": failure,
		}), xmlInfo
	}

	// check with xmlstarlet (slow). The command error is deliberately ignored:
	// the verdict is read from the output text. As a consequence, when
	// xmlstarlet cannot be started the output is empty and the file is assumed
	// to be well-formed.
	// NOTE(review): a path that itself contains "invalid" would be reported as
	// malformed by this substring test.
	result, _ := exec.Command("xmlstarlet", "val", "-w", "-e", xmlPath).CombinedOutput()
	if report := string(result); strings.Contains(report, "invalid") {
		return reject(FormatWellFormedError(report))
	}

	// check if able to open
	xmlFile, errOpen := os.Open(xmlPath)
	if errOpen != nil {
		return reject(ErrorXML{Message: errOpen.Error()})
	}
	defer xmlFile.Close()

	byteValue, errRead := ioutil.ReadAll(xmlFile)
	if errRead != nil {
		// Report the read error itself; errOpen is nil at this point and
		// dereferencing it would panic.
		return reject(ErrorXML{Message: errRead.Error()})
	}

	// The file passed every check: mark it well-formed whether or not it
	// declares a DOCTYPE.
	xmlInfo.isWellFormed = true
	xmlInfo.data = string(byteValue)
	logger = logger.WithFields(logrus.Fields{"isWellFormed": true})

	// get doctype from xml
	if doctype := regexDoctype.FindString(xmlInfo.data); doctype != "" {
		if dtd := regexDtd.FindString(doctype); dtd != "" {
			xmlInfo.doctype = DoctypeXML{Sysid: dtd}
			logger = logger.WithFields(logrus.Fields{"doctype": xmlInfo.doctype})
		}
	}

	return logger, xmlInfo
}

// FormatWellFormedError turns the raw output of the well-formedness check
// into an ErrorXML.
//
// Newlines and double spaces are removed from resultError first. Line is the
// number captured by regexLineWellFormed, or "0" when none is found. Message
// is the trimmed text captured by regexErrorWellFormedMessage, or the whole
// flattened output when that pattern does not match.
func FormatWellFormedError(resultError string) ErrorXML {
	flattened := strings.NewReplacer("\n", "", `  `, "").Replace(resultError)

	// Start from the fallback values and refine them with whatever matches.
	formatted := ErrorXML{Message: flattened, Line: "0"}

	if found := regexLineWellFormed.FindStringSubmatch(flattened); len(found) >= 2 && found[1] != "" {
		formatted.Line = found[1]
	}
	if found := regexErrorWellFormedMessage.FindStringSubmatch(flattened); len(found) >= 2 && found[1] != "" {
		formatted.Message = strings.TrimSpace(found[1])
	}

	return formatted
}

// FormatValidationError is a placeholder for parsing validation output into
// ErrorXML values. Parsing is not implemented yet: resultError is ignored and
// an empty, non-nil slice is always returned.
func FormatValidationError(resultError string) []ErrorXML {
	noErrors := make([]ErrorXML, 0)
	return noErrors
}

// CheckXMLValidation validates the file at pathXml with xmllint, against the
// configured DTDs or, when no DTD is configured, against the configured XSD
// schemas. DTD and schema files are looked up under
// <configurationFolder>/<corpusName>/dtd and .../xsd.
//
// When the document declares a DTD (xmlInfo.doctype.Sysid), only configured
// DTDs whose path ends with that identifier are tried; otherwise every
// configured DTD is tried. Validation stops at the first DTD or schema that
// accepts the document.
//
// It returns the analysis result and the logger enriched with the outcome
// ("isValidAgainstDTD" or "isValidAgainstSchema", "validationDTDInfos", and
// "validationsErrors" when validation failed).
func CheckXMLValidation(pathXml string, xmlInfo XMLInfo, logger *logrus.Entry) (DetailledAnalysis, *logrus.Entry) {
	xmlDetailled := DetailledAnalysis{isValidAgainstDTD: false, isValidAgainstSchema: false}
	pathExec := *configurationFolder + "/" + *corpusName

	// The "test.v" flag is only registered when running under "go test":
	// use the bundled example configuration in that case.
	if flag.Lookup("test.v") != nil {
		pathExec = "./example"
	}

	switch {
	case len(configDetailledAnalyze.XML.ListDTD) > 0:
		sysid := xmlInfo.doctype.Sysid
		for _, dtdPath := range configDetailledAnalyze.XML.ListDTD {
			// A declared DTD restricts the candidates to the matching ones.
			if sysid != "" && !strings.HasSuffix(dtdPath, sysid) {
				continue
			}
			report, valid := runDTDValidation(pathExec+"/dtd/"+dtdPath, pathXml)
			if valid {
				xmlDetailled.isValidAgainstDTD = true
				xmlDetailled.validationDTDInfos = dtdPath
				break
			}
			xmlDetailled.validationsErrors = report
		}

		logger = logger.WithFields(logrus.Fields{
			"isValidAgainstDTD":  xmlDetailled.isValidAgainstDTD,
			"validationDTDInfos": xmlDetailled.validationDTDInfos,
		})
		if !xmlDetailled.isValidAgainstDTD {
			logger = logger.WithFields(logrus.Fields{
				"validationsErrors": xmlDetailled.validationsErrors,
			})
		}

	case len(configDetailledAnalyze.XML.ListXSD) > 0:
		// if xsd is present check schema validation
		for _, xsdPath := range configDetailledAnalyze.XML.ListXSD {
			result, err := exec.Command("xmllint", "--schema", pathExec+"/xsd/"+xsdPath, pathXml, "--noout", "--nowarning").CombinedOutput()
			report := string(result)
			if strings.Contains(report, "validates") {
				xmlDetailled.isValidAgainstSchema = true
				break
			}
			if report == "" && err != nil {
				// xmllint produced no output (e.g. it could not be started):
				// keep the reason instead of recording an empty error.
				report = err.Error()
			}
			xmlDetailled.validationSchemaErrors = append(xmlDetailled.validationSchemaErrors, report)
		}

		logger = logger.WithFields(logrus.Fields{
			"isValidAgainstSchema": xmlDetailled.isValidAgainstSchema,
			"validationDTDInfos":   xmlDetailled.validationDTDInfos,
		})
		if !xmlDetailled.isValidAgainstSchema {
			// Report the errors collected for each schema; validationsErrors is
			// never filled in on this path.
			logger = logger.WithFields(logrus.Fields{
				"validationsErrors": strings.Join(xmlDetailled.validationSchemaErrors, "\n"),
			})
		}
	}

	return xmlDetailled, logger
}

// runDTDValidation validates xmlFile against dtdFile with xmllint. It returns
// xmllint's combined output and whether the document is valid, i.e. xmllint
// ran successfully and printed nothing. When xmllint fails without output
// (for instance because it is not installed) the error text is returned
// instead, so that an empty output is never mistaken for a valid document.
func runDTDValidation(dtdFile string, xmlFile string) (string, bool) {
	result, err := exec.Command("xmllint", "--dtdvalid", dtdFile, xmlFile, "--noout", "--nowarning").CombinedOutput()
	report := string(result)
	if err != nil && report == "" {
		report = err.Error()
	}
	return report, err == nil && report == ""
}

// ProcessXpath evaluates every XPath configured in configDetailledAnalyze
// against the document held in xmlInfo.data, assembles the results into a JSON
// object and writes the final log entry with that object in the "xpath" field.
//
// Results that are empty or equal to "0" are left out. If the document cannot
// be parsed, an empty object ("{}") is logged.
func ProcessXpath(xmlInfo XMLInfo, detailledInfo DetailledAnalysis, logger *logrus.Entry) {
	var members strings.Builder

	doc, err := xmlquery.Parse(strings.NewReader(xmlInfo.data))
	if err == nil {
		for _, field := range configDetailledAnalyze.XML.XPATH {
			for _, xpathContent := range field.Xpath {
				goodXPath, hasQuote := FormatXpathByType(field.Type, xpathContent)
				expr, errCompile := xpath.Compile(goodXPath)
				if errCompile != nil {
					// Invalid expression in the configuration: skip it, since
					// calling Evaluate on the nil expression would panic.
					continue
				}
				result := expr.Evaluate(xmlquery.CreateXPathNavigator(doc))
				str := fmt.Sprintf("%v", result)
				if len(str) == 0 || str == "0" {
					continue
				}
				if !hasQuote && (str == "NaN" || str == "+Inf" || str == "-Inf") {
					// Non-finite numbers have no JSON representation; writing
					// them bare would corrupt the whole object.
					continue
				}
				members.WriteString(FormatTextForXpath(field, str, hasQuote))
			}
		}
	}

	// Every member ends with a comma: drop the last one and close the object.
	finalXpath := []byte("{" + strings.TrimSuffix(members.String(), ",") + "}")

	// The decode is only a sanity check of the assembled document. Its result
	// and error are intentionally unused: the raw document is logged either
	// way, as it always has been.
	jsonData := XpathStructure{}
	_ = json.Unmarshal(finalXpath, &jsonData)

	detailledInfo.xpath = finalXpath

	logger.WithFields(logrus.Fields{
		"xpath": detailledInfo.xpath,
	}).Info("")
}

// FormatTextForXpath renders one XPath result as a JSON object member,
// `"<name>":<value>,`, including the trailing comma.
//
// When field.Regex is set and matches, text is reduced to the first match.
// With hasQuote the value is emitted as a JSON string: newlines are removed
// and quotes, backslashes and control characters are escaped so the member
// stays valid JSON. Without hasQuote the text is emitted verbatim (numbers
// and booleans).
func FormatTextForXpath(field MetadataConfigDetailledAnalyze, text string, hasQuote bool) string {
	if field.Regex != "" {
		// The pattern comes from the configuration: an invalid one disables
		// the filter instead of panicking in the middle of a run.
		if filter, err := regexp.Compile(field.Regex); err == nil {
			if filtered := filter.FindString(text); len(filtered) > 0 {
				text = filtered
			}
		}
	}

	if hasQuote {
		return `"` + field.Name + `":"` + escapeXpathText(text) + `",`
	}
	return `"` + field.Name + `":` + text + `,`
}

// escapeXpathText prepares text for use inside a JSON string literal: newlines
// are dropped, '"' and '\' are backslash-escaped and the remaining control
// characters are written as \u00XX. All other bytes are copied unchanged.
func escapeXpathText(text string) string {
	const hexDigits = "0123456789abcdef"

	var escaped strings.Builder
	escaped.Grow(len(text))
	// Byte-wise iteration is safe: every byte of a multi-byte UTF-8 sequence
	// is >= 0x80 and therefore falls through to the default case.
	for i := 0; i < len(text); i++ {
		switch c := text[i]; {
		case c == '\n':
			// Newlines are removed rather than escaped.
		case c == '"' || c == '\\':
			escaped.WriteByte('\\')
			escaped.WriteByte(c)
		case c < 0x20:
			escaped.WriteString(`\u00`)
			escaped.WriteByte(hexDigits[c>>4])
			escaped.WriteByte(hexDigits[c&0x0f])
		default:
			escaped.WriteByte(c)
		}
	}
	return escaped.String()
}

// FormatXpathByType wraps xpath in the XPath conversion function that matches
// fieldType and reports whether the result must be quoted in JSON output.
//
// "Number", "Boolean" and "Count" map to number(), boolean() and count() and
// are not quoted. Any other type (String, Attribute, ...) maps to string()
// and is quoted.
func FormatXpathByType(fieldType string, xpath string) (string, bool) {
	// Default: treat the value as a string.
	function, quoted := "string", true

	switch fieldType {
	case "Number":
		function, quoted = "number", false
	case "Boolean":
		function, quoted = "boolean", false
	case "Count":
		function, quoted = "count", false
	}

	return function + "(" + xpath + ")", quoted
}