Newer
Older
sisyphe-go / xml.go
@Nacim Nacim on 10 Feb 2022 1 KB refactor code
package main

import (
	"encoding/xml"
	"io/ioutil"
	"os"
	"regexp"

	"github.com/sirupsen/logrus"
)

// regexDoctype matches one complete DOCTYPE declaration: everything from
// "<!DOCTYPE" up to the ">" that closes it, including an optional internal
// subset written between square brackets. The declaration may span several
// lines. The negated class stops at the first ">" or "[", so markup that
// follows the declaration on the same line is never swallowed; the internal
// subset is matched lazily up to the first "]" that is followed (after
// optional whitespace) by ">".
var regexDoctype = regexp.MustCompile(`<!DOCTYPE[^>\[]*(\[[\s\S]*?\])?\s*>`)

// regexDtd matches a DTD file name such as "schema.dtd". The dot is escaped
// so that only a literal "." before "dtd" is accepted.
var regexDtd = regexp.MustCompile(`[a-zA-Z0-9-]*\.dtd`)

// processXML inspects the XML file described by message and writes one log
// entry with the outcome.
//
// The function signals wg when it returns. While it runs it occupies one slot
// of queueForConcurrent, which (per the original author's note) is what limits
// how many files are read in parallel; the channel itself is defined elsewhere.
// The log entry carries the descriptive fields of message together with the
// well-formedness flag, doctype and error text produced by getXMlData.
// incrementProcess is called once after the entry has been logged.
func processXML(message *LogMessage) {
	defer wg.Done()

	// Take a slot before touching the file and hand it back on the way out.
	queueForConcurrent <- struct{}{}
	defer func() { <-queueForConcurrent }()

	result := getXMlData(message)

	entry := logrus.WithFields(logrus.Fields{
		"corpusName":       message.corpusName,
		"name":             message.name,
		"startAt":          message.startAt,
		"extension":        message.extension,
		"path":             message.path,
		"mimetype":         message.mimetype,
		"size":             message.size,
		"isWellFormed":     result.isWellFormed,
		"doctype":          result.doctype,
		"wellFormedErrors": result.wellFormedErrors,
	})
	entry.Info("")

	incrementProcess()
}

// getXMlData reads the file at message.path and reports whether it could be
// unmarshalled as XML, plus the DTD file name found in its DOCTYPE
// declaration when there is one.
//
// If opening, reading or unmarshalling the file fails, the returned value has
// isWellFormed set to false and wellFormedErrors set to the error's text.
// Otherwise isWellFormed is true, and doctype.sysid is filled in only when
// regexDoctype matches the content and regexDtd matches inside that match.
func getXMlData(message *LogMessage) LogMessageXML {
	result := LogMessageXML{isWellFormed: true}

	// fail records err on the result and returns it; shared by every error path.
	fail := func(err error) LogMessageXML {
		result.isWellFormed = false
		result.wellFormedErrors = err.Error()
		return result
	}

	file, err := os.Open(message.path)
	if err != nil {
		return fail(err)
	}
	defer file.Close()

	content, err := ioutil.ReadAll(file)
	if err != nil {
		return fail(err)
	}

	// The decoded value is thrown away; only the error is inspected.
	if err := xml.Unmarshal(content, new(interface{})); err != nil {
		return fail(err)
	}

	// Locate the DOCTYPE declaration, then look for a DTD file name inside it.
	declaration := regexDoctype.FindString(string(content))
	if declaration == "" {
		return result
	}
	if sysid := regexDtd.FindString(declaration); sysid != "" {
		result.doctype.sysid = sysid
	}
	return result
}