Newer
Older
sisyphe-go / xml.go
@Nacim Nacim on 4 Mar 2022 6 KB add xpath
package main

import (
	"flag"
	"io/ioutil"
	"os"
	"os/exec"
	"regexp"
	"strings"

	"github.com/antchfx/xmlquery"
	"github.com/sirupsen/logrus"
)

// regexDoctype matches a DOCTYPE declaration up to its closing '>'. The
// optional capture group is the internal subset between '[' and ']', which
// may span several lines.
var regexDoctype = regexp.MustCompile(`<!DOCTYPE[^>\[]*(\[[\s\S]*?\])?\s*>`)

// regexDtd matches a file name, path or URL ending with the literal ".dtd"
// extension.
var regexDtd = regexp.MustCompile(`[a-zA-Z0-9_./:-]*\.dtd`)

// regexSpace matches any run of whitespace (including line breaks).
var regexSpace = regexp.MustCompile(`\s+`)

// regexLine captures the number following the word "line" in a validator
// message.
var regexLine = regexp.MustCompile("line ([0-9]{1,}) ")

// regexErrorMessage captures the text that starts at a capitalised word and
// runs up to the '^' marker printed under the offending source excerpt.
var regexErrorMessage = regexp.MustCompile(` ([A-Z].*)\^`)

// processXML analyses one XML file and emits a single structured log entry
// describing it.
//
// The entry always carries the file metadata and the well-formedness flag.
// Depending on the outcome it additionally carries the well-formedness
// errors, or the doctype, or the doctype plus the results of the detailed
// analysis (only when a configuration folder is set).
func processXML(message *Message) {
	// Signal completion to the wait group and take a slot in the queue that
	// limits how many files are read in parallel; the slot is released on exit.
	defer wg.Done()
	queueForConcurrent <- struct{}{}
	defer func() { <-queueForConcurrent }()

	xmlDataLog, xmlData := getXMlData(message.path)

	// Fields common to every kind of entry.
	entry := logrus.Fields{
		"corpusName":   message.corpusName,
		"name":         message.name,
		"startAt":      message.startAt,
		"extension":    message.extension,
		"path":         message.path,
		"mimetype":     message.mimetype,
		"size":         message.size,
		"isWellFormed": xmlDataLog.isWellFormed,
	}

	switch {
	case len(xmlDataLog.wellFormedErrors) > 0:
		// The file could not be validated or read: report the errors only.
		entry["wellFormedErrors"] = xmlDataLog.wellFormedErrors
	case *configurationFolder == "":
		// No configuration folder: basic report with the doctype.
		entry["doctype"] = xmlDataLog.doctype
	default:
		analysis := processDetailledAnalysis(message.path, xmlData, xmlDataLog.doctype.Sysid)
		entry["doctype"] = xmlDataLog.doctype
		entry["isValidAgainstDTD"] = analysis.isValidAgainstDTD
		entry["validationDTDInfos"] = analysis.validationDTDInfos
		entry["validationsErrors"] = analysis.validationsErrors
		entry["isValidAgainstSchema"] = analysis.isValidAgainstSchema
		entry["validationSchemaErrors"] = analysis.validationSchemaErrors
		entry["xpath"] = analysis.xpath
	}

	logrus.WithFields(entry).Info("")
}

// getXMlData checks that the file at xmlPath is well-formed and readable, and
// extracts the DTD named in its DOCTYPE declaration.
//
// It returns the collected MessageXML together with the file content. When
// xmlstarlet reports the document as invalid, or the file cannot be opened or
// read, isWellFormed is false, wellFormedErrors holds one entry and the
// returned content is empty.
func getXMlData(xmlPath string) (MessageXML, string) {
	xmlMessage := MessageXML{isWellFormed: true}

	// check with xmlstarlet (slow)
	// The verdict is read from the output, not from the exit status.
	// NOTE(review): a failure to start xmlstarlet also ends up here with an
	// empty output and is treated as "no problem found" — confirm this
	// best-effort behaviour is intended.
	result, _ := exec.Command("xmlstarlet", "val", "-w", "-e", xmlPath).CombinedOutput()
	output := string(result)
	for _, line := range strings.Split(output, "\n") {
		// Look for the "<file> - invalid" verdict line rather than the bare
		// word "invalid", which may also appear in the path of a valid file
		// (echoed in "<file> - valid").
		if !strings.HasSuffix(strings.TrimSpace(line), " - invalid") {
			continue
		}
		xmlMessage.isWellFormed = false
		xmlMessage.wellFormedErrors = append(xmlMessage.wellFormedErrors, formatError(output))
		return xmlMessage, ""
	}

	// check if able to open
	xmlFile, errOpen := os.Open(xmlPath)
	if errOpen != nil {
		xmlMessage.isWellFormed = false
		xmlMessage.wellFormedErrors = append(xmlMessage.wellFormedErrors, WellFormedErrorXML{Message: errOpen.Error()})
		return xmlMessage, ""
	}
	defer xmlFile.Close()

	// read the whole file; a read failure is reported like an open failure
	byteValue, errRead := ioutil.ReadAll(xmlFile)
	if errRead != nil {
		xmlMessage.isWellFormed = false
		xmlMessage.wellFormedErrors = append(xmlMessage.wellFormedErrors, WellFormedErrorXML{Message: errRead.Error()})
		return xmlMessage, ""
	}

	xmlData := string(byteValue)

	// get doctype from xml, then the DTD it refers to
	if doctype := regexDoctype.FindString(xmlData); doctype != "" {
		if dtd := regexDtd.FindString(doctype); dtd != "" {
			xmlMessage.doctype.Sysid = dtd
		}
	}

	return xmlMessage, xmlData
}

// formatError turns the raw validator output into a WellFormedErrorXML.
//
// Whitespace runs are first collapsed to single spaces. Line is the number
// captured by regexLine, or "0" when none is found. Message is the trimmed
// text captured by regexErrorMessage, or the untouched resultError when that
// pattern does not match.
func formatError(resultError string) WellFormedErrorXML {
	flattened := regexSpace.ReplaceAllString(resultError, " ")

	// Start from the fallbacks and overwrite them with whatever is found.
	formatted := WellFormedErrorXML{
		Message: resultError,
		Line:    "0",
	}
	if found := regexLine.FindStringSubmatch(flattened); len(found) >= 2 && found[1] != "" {
		formatted.Line = found[1]
	}
	if found := regexErrorMessage.FindStringSubmatch(flattened); len(found) >= 2 && found[1] != "" {
		formatted.Message = strings.TrimSpace(found[1])
	}
	return formatted
}

// processDetailledAnalysis validates the XML file at pathXml against the DTDs
// or XSDs listed in configDetailledAnalyze and, when validation succeeds (or
// when neither list is configured), extracts the configured XPath fields from
// xmlData.
//
// dtdInDoctype is the DTD named in the document's DOCTYPE; when it is not
// empty only the configured DTDs whose path ends with it are tried. XSD
// validation is attempted only when no DTD is configured. The xpath field of
// the result is formatted as "{name:value,name:value}" and stays empty when
// xmlData cannot be parsed.
func processDetailledAnalysis(pathXml string, xmlData string, dtdInDoctype string) DetailledAnalysis {
	var xmlDetailled DetailledAnalysis
	pathExec := *configurationFolder + "/" + *corpusName

	// if test
	if flag.Lookup("test.v") != nil {
		pathExec = "./example"
	}

	// runXmllint validates pathXml with xmllint in the given mode ("--dtdvalid"
	// or "--schema") against the given definition file.
	runXmllint := func(mode string, definition string) (string, error) {
		output, err := exec.Command("xmllint", mode, definition, pathXml, "--noout", "--nowarning").CombinedOutput()
		return string(output), err
	}
	// failureDetail returns the text to record for a failed validation: the
	// xmllint output, or the execution error when xmllint printed nothing
	// (for instance when it could not be started).
	failureDetail := func(output string, err error) string {
		if output == "" && err != nil {
			return err.Error()
		}
		return output
	}

	if len(configDetailledAnalyze.XML.ListDTD) > 0 {
		if dtdInDoctype != "" {
			// if dtd exist in xml file process only this
			for _, dtdPath := range configDetailledAnalyze.XML.ListDTD {
				if !strings.HasSuffix(dtdPath, dtdInDoctype) {
					continue
				}
				// An empty output only means "valid" when xmllint actually ran
				// and succeeded.
				output, err := runXmllint("--dtdvalid", pathExec+"/dtd/"+dtdPath)
				if err == nil && output == "" {
					xmlDetailled.isValidAgainstDTD = true
					xmlDetailled.validationDTDInfos = dtdPath
				}
			}
		} else {
			// check with all dtd and stop if one is true
			for _, dtdPath := range configDetailledAnalyze.XML.ListDTD {
				output, err := runXmllint("--dtdvalid", pathExec+"/dtd/"+dtdPath)
				if err == nil && output == "" {
					xmlDetailled.isValidAgainstDTD = true
					xmlDetailled.validationDTDInfos = dtdPath
					break
				}
				xmlDetailled.validationsErrors = append(xmlDetailled.validationsErrors, failureDetail(output, err))
			}
		}

		// don't process if dtd is not valid
		if !xmlDetailled.isValidAgainstDTD {
			return xmlDetailled
		}
	} else if len(configDetailledAnalyze.XML.ListXSD) > 0 {
		// if xsd is present check schema validation
		for _, xsdPath := range configDetailledAnalyze.XML.ListXSD {
			output, err := runXmllint("--schema", pathExec+"/xsd/"+xsdPath)
			// Require a successful run as well: the word "validates" alone
			// could come from a file path echoed in a failure message.
			if err == nil && strings.Contains(output, "validates") {
				xmlDetailled.isValidAgainstSchema = true
				break
			}
			// Accumulate in the schema error list itself so that errors from
			// earlier schemas are kept.
			xmlDetailled.validationSchemaErrors = append(xmlDetailled.validationSchemaErrors, failureDetail(output, err))
		}

		if !xmlDetailled.isValidAgainstSchema {
			return xmlDetailled
		}
	}

	// get xpath
	doc, err := xmlquery.Parse(strings.NewReader(xmlData))
	if err != nil {
		return xmlDetailled
	}

	// Keep only the fields whose expression matches a node, then join them so
	// the result has balanced braces and no dangling separator.
	pairs := make([]string, 0, len(configDetailledAnalyze.XML.XPATH))
	for _, field := range configDetailledAnalyze.XML.XPATH {
		if node := xmlquery.FindOne(doc, field.Xpath); node != nil {
			pairs = append(pairs, field.Name+":"+node.InnerText())
		}
	}
	xmlDetailled.xpath = "{" + strings.Join(pairs, ",") + "}"

	return xmlDetailled
}