// sisyphe-go / xml.go
// @Nacim, 10 Feb 2022, 2 KB: add more validation for xml

package main

import (
	"encoding/xml"
	"io/ioutil"
	"os"
	"os/exec"
	"regexp"
	"strings"

	"github.com/sirupsen/logrus"
)

// Patterns used to pull the DTD reference out of a document prolog.
var (
	// regexDoctype matches a whole document type declaration. Following the
	// XML grammar it accepts line breaks between the root name, the external
	// identifier and the closing '>', plus an optional internal subset in
	// square brackets (which may itself span several lines).
	regexDoctype = regexp.MustCompile(`<!DOCTYPE[^\[>]*(\[[\s\S]*?\])?\s*>`)

	// regexDtd matches a path or URL that ends in a literal ".dtd". The
	// hyphen sits last in the class so it can only be read as a literal.
	regexDtd = regexp.MustCompile(`[a-zA-Z0-9_./:-]*\.dtd`)
)

// processXML analyses the XML file described by message and writes a single
// structured log entry combining the file's metadata with the analysis result.
//
// The wait group is signalled when the function returns, and one slot of
// queueForConcurrent is held for the whole analysis (presumably a buffered
// channel that caps how many files are read in parallel — confirm at its
// declaration).
func processXML(message *Message) {
	defer wg.Done()

	// Take a slot before touching the file; hand it back on the way out.
	queueForConcurrent <- struct{}{}
	defer func() {
		<-queueForConcurrent
	}()

	report := getXMlData(message)

	fields := logrus.Fields{
		"corpusName":       message.corpusName,
		"name":             message.name,
		"startAt":          message.startAt,
		"extension":        message.extension,
		"path":             message.path,
		"mimetype":         message.mimetype,
		"size":             message.size,
		"isWellFormed":     report.isWellFormed,
		"doctype":          report.doctype.sysid,
		"wellFormedErrors": report.wellFormedErrors,
	}
	// The entry carries everything in its fields; the message itself is empty.
	logrus.WithFields(fields).Info("")

	incrementProcess()
}

// getXMlData checks whether the file at message.path is well-formed XML and
// extracts the DTD referenced by its DOCTYPE declaration, if there is one.
//
// The checks run in this order:
//  1. the file can be opened and read;
//  2. encoding/xml accepts the document;
//  3. xmlstarlet ("val -w -e") does not report it as invalid.
//
// A failure of step 1 or 2 returns immediately with isWellFormed set to false
// and wellFormedErrors set to the error text. A failure of step 3 stores the
// xmlstarlet output in wellFormedErrors and still goes on to look for the
// DOCTYPE. doctype.sysid stays empty when no ".dtd" reference is found.
//
// NOTE(review): encoding/xml rejects documents that declare a non-UTF-8
// encoding unless a CharsetReader is configured, so such files are reported
// as malformed by step 2 — confirm this is intended.
func getXMlData(message *Message) MessageXML {
	xmlData := MessageXML{isWellFormed: true}

	xmlFile, errOpen := os.Open(message.path)
	if errOpen != nil {
		xmlData.isWellFormed = false
		xmlData.wellFormedErrors = errOpen.Error()
		return xmlData
	}
	defer xmlFile.Close()

	byteValue, errRead := ioutil.ReadAll(xmlFile)
	if errRead != nil {
		xmlData.isWellFormed = false
		xmlData.wellFormedErrors = errRead.Error()
		return xmlData
	}

	// First opinion: Go's own parser.
	if errUnmarshal := xml.Unmarshal(byteValue, new(interface{})); errUnmarshal != nil {
		xmlData.isWellFormed = false
		xmlData.wellFormedErrors = errUnmarshal.Error()
		return xmlData
	}

	// Second opinion: xmlstarlet. The error from running the command is not
	// inspected; only the combined output is examined, so when xmlstarlet
	// cannot be started the output is empty and the verdict above stands.
	output, _ := exec.Command("xmlstarlet", "val", "-w", "-e", message.path).CombinedOutput()
	report := string(output)
	// xmlstarlet echoes the file name in its output. Drop it before looking
	// for the verdict so that a path which itself contains "invalid" (for
	// example a directory called "invalid-samples") cannot flag a valid file.
	if strings.Contains(strings.ReplaceAll(report, message.path, ""), "invalid") {
		xmlData.isWellFormed = false
		xmlData.wellFormedErrors = report
	}

	// Pull the DTD reference out of the DOCTYPE declaration. Matching on the
	// raw bytes avoids copying the whole file into a string.
	if decl := regexDoctype.Find(byteValue); len(decl) > 0 {
		if dtd := regexDtd.Find(decl); len(dtd) > 0 {
			xmlData.doctype.sysid = string(dtd)
		}
	}

	return xmlData
}