package main

import (
	"encoding/xml"
	"io/ioutil"
	"os"
	"os/exec"
	"regexp"
	"strings"

	"github.com/sirupsen/logrus"
)

var (
	// regexDoctype matches one complete DOCTYPE declaration: the keyword, the
	// name and external identifier (which may span several lines), an
	// optional internal subset enclosed in [...], and the closing '>'.
	// The prefix stops at the first '[' or '>' so the match can never run
	// past the end of the declaration into the document body.
	regexDoctype = regexp.MustCompile(`(?s)<!DOCTYPE[^>\[]*(\[.*?\])?\s*>`)

	// regexDtd matches a path or URL that ends in a literal ".dtd".
	regexDtd = regexp.MustCompile(`[a-zA-Z0-9_./:-]*\.dtd`)
)

// processXML checks the XML file described by message and writes one
// structured log entry containing the file metadata and the check results.
//
// It signals wg when it returns and holds a slot in queueForConcurrent for
// the duration of the check, which limits how many files are read in
// parallel (presumably wg is a sync.WaitGroup and queueForConcurrent a
// buffered channel; both are declared elsewhere).
func processXML(message *Message) {
	defer wg.Done()

	// Acquire a slot; the send blocks while the queue is full. The slot is
	// released when this function returns.
	queueForConcurrent <- struct{}{}
	defer func() { <-queueForConcurrent }()

	xmlData := getXMlData(message)

	fields := logrus.Fields{
		"corpusName":       message.corpusName,
		"name":             message.name,
		"startAt":          message.startAt,
		"extension":        message.extension,
		"path":             message.path,
		"mimetype":         message.mimetype,
		"size":             message.size,
		"isWellFormed":     xmlData.isWellFormed,
		"doctype":          xmlData.doctype.sysid,
		"wellFormedErrors": xmlData.wellFormedErrors,
	}
	logrus.WithFields(fields).Info("")

	incrementProcess()
}

// getXMlData reads the file at message.path and reports whether it is
// well-formed XML, together with the DTD referenced by its DOCTYPE
// declaration when there is one.
//
// The file is considered not well-formed when it cannot be opened or read,
// when encoding/xml fails to decode it, or when the output of
// "xmlstarlet val -w -e" contains the word "invalid". In each case the
// corresponding message is stored in wellFormedErrors. Open, read and decode
// failures return immediately, without looking for a DOCTYPE.
func getXMlData(message *Message) MessageXML {
	xmlData := MessageXML{isWellFormed: true}

	// fail marks the result as not well-formed and records the reason.
	fail := func(reason string) MessageXML {
		xmlData.isWellFormed = false
		xmlData.wellFormedErrors = reason
		return xmlData
	}

	xmlFile, errOpen := os.Open(message.path)
	if errOpen != nil {
		return fail(errOpen.Error())
	}
	defer xmlFile.Close()

	// Load the whole document: it is needed both by the decoder and by the
	// DOCTYPE lookup below.
	byteValue, errRead := ioutil.ReadAll(xmlFile)
	if errRead != nil {
		return fail(errRead.Error())
	}

	// First check: the document must decode with encoding/xml. The decoded
	// value itself is discarded.
	if errUnmarshal := xml.Unmarshal(byteValue, new(interface{})); errUnmarshal != nil {
		return fail(errUnmarshal.Error())
	}

	// Second check: xmlstarlet. The command error is deliberately ignored
	// because the verdict is taken from the combined output only; if the
	// tool cannot be run, the output does not contain "invalid" and the
	// file keeps the verdict of the previous check.
	// NOTE(review): a path that itself contains "invalid" would presumably
	// be echoed in the output and trigger a false positive - confirm.
	result, _ := exec.Command("xmlstarlet", "val", "-w", "-e", message.path).CombinedOutput()
	if output := string(result); strings.Contains(output, "invalid") {
		xmlData.isWellFormed = false
		xmlData.wellFormedErrors = output
	}

	if sysid := doctypeSystemID(byteValue); sysid != "" {
		xmlData.doctype.sysid = sysid
	}
	return xmlData
}

// doctypeSystemID returns the first ".dtd" reference found inside the first
// DOCTYPE declaration of content, or "" when there is no DOCTYPE or it does
// not reference a DTD. It works on the raw bytes so the document is not
// copied into a string.
func doctypeSystemID(content []byte) string {
	doctype := regexDoctype.Find(content)
	if doctype == nil {
		return ""
	}
	return string(regexDtd.Find(doctype))
}