Newer
Older
sisyphe-go / xml.go
@Nacim Nacim on 15 Feb 2022 2 KB change struct json
package main

import (
	"io/ioutil"
	"os"
	"os/exec"
	"regexp"
	"strings"

	"github.com/sirupsen/logrus"
)

// regexDoctype matches one complete DOCTYPE declaration, from "<!DOCTYPE" up
// to its closing ">". The declaration may span several lines and may carry an
// internal subset between "[" and "]"; markup declarations inside that subset
// contain ">" themselves and do not end the match early. Text that follows the
// declaration on the same line is not part of the match.
var regexDoctype = regexp.MustCompile(`<!DOCTYPE[^>\[]*(\[[\s\S]*?\])?\s*>`)

// regexDtd matches a DTD reference: an optional run of path/URL characters
// (letters, digits, "-", "_", ".", "/", ":") followed by a literal ".dtd".
var regexDtd = regexp.MustCompile(`[a-zA-Z0-9_./:-]*\.dtd`)

// processXML analyses the XML file described by message and writes one
// structured log entry with the outcome.
//
// The entry always carries the file's identifying fields and isWellFormed.
// It additionally carries wellFormedErrors when an error message was recorded,
// and doctype otherwise. When the function returns it has signalled wg and
// called incrementProcess.
func processXML(message *Message) {
	defer wg.Done()

	// queueForConcurrent limits how many XML files are read in parallel:
	// take a slot before doing any work and give it back on the way out.
	queueForConcurrent <- struct{}{}
	defer func() { <-queueForConcurrent }()

	xmlData := getXMlData(message)

	// Fields common to both outcomes.
	fields := logrus.Fields{
		"corpusName":   message.corpusName,
		"name":         message.name,
		"startAt":      message.startAt,
		"extension":    message.extension,
		"path":         message.path,
		"mimetype":     message.mimetype,
		"size":         message.size,
		"isWellFormed": xmlData.isWellFormed,
	}

	// Exactly one outcome-specific field is attached.
	if xmlData.wellFormedErrors.Message != "" {
		fields["wellFormedErrors"] = xmlData.wellFormedErrors
	} else {
		fields["doctype"] = xmlData.doctype
	}

	logrus.WithFields(fields).Info("")

	incrementProcess()
}

// getXMlData checks the file at message.path for well-formedness and, when the
// check passes, extracts the DTD reference from its DOCTYPE declaration.
//
// The check shells out to `xmlstarlet val -w -e`. The returned MessageXML has
// isWellFormed set to false and wellFormedErrors filled in when:
//   - xmlstarlet's output reports the file as invalid,
//   - xmlstarlet could not be started at all (e.g. it is not installed),
//   - the file cannot be opened or read.
//
// Otherwise isWellFormed is true and doctype.Sysid holds the first DTD
// reference found inside the DOCTYPE declaration, or stays empty when there is
// none.
func getXMlData(message *Message) MessageXML {
	// malformed builds the result returned on every failure path.
	malformed := func(detail string) MessageXML {
		return MessageXML{
			isWellFormed:     false,
			wellFormedErrors: WellFormedErrorXML{detail, message.path},
		}
	}

	// check with xmlstarlet (slow)
	output, errRun := exec.Command("xmlstarlet", "val", "-w", "-e", message.path).CombinedOutput()
	report := string(output)

	// xmlstarlet's report is expected to echo the file path next to its
	// verdict, so the path is removed before looking for "invalid": a valid
	// file stored under a path that itself contains "invalid" must not fail.
	if strings.Contains(strings.ReplaceAll(report, message.path, ""), "invalid") {
		return malformed(report)
	}

	// A command that ran and exited with a non-zero status yields an
	// *exec.ExitError; that case is judged from the report above. Any other
	// error means xmlstarlet never ran, so nothing was validated and the file
	// must not be reported as well-formed.
	if errRun != nil {
		if _, exited := errRun.(*exec.ExitError); !exited {
			return malformed(errRun.Error())
		}
	}

	// check if able to open
	xmlFile, errOpen := os.Open(message.path)
	if errOpen != nil {
		return malformed(errOpen.Error())
	}
	defer xmlFile.Close()

	content, errRead := ioutil.ReadAll(xmlFile)
	if errRead != nil {
		return malformed(errRead.Error())
	}

	xmlMessage := MessageXML{isWellFormed: true}

	// Look for the DTD reference only inside the DOCTYPE declaration. The
	// search runs on the raw bytes to avoid copying the whole file into a
	// string.
	if doctype := regexDoctype.Find(content); len(doctype) > 0 {
		if dtd := regexDtd.Find(doctype); len(dtd) > 0 {
			xmlMessage.doctype.Sysid = string(dtd)
		}
	}

	return xmlMessage
}