package main

import (
	"encoding/xml"
	"io/ioutil"
	"os"
	"regexp"

	"github.com/sirupsen/logrus"
)

var (
	// regexDoctype matches one complete <!DOCTYPE ...> declaration: the head
	// (everything up to the first '[' or '>'), an optional internal subset
	// enclosed in [ ... ], and the closing '>'. The head uses a negated class
	// rather than ".*" so that the match may span several lines and stops at
	// the end of the declaration instead of at the last '>' on the line.
	regexDoctype = regexp.MustCompile(`<!DOCTYPE[^\[>]*(\[[\s\S]*?\])?\s*>`)

	// regexDtd matches a DTD file name such as "JATS-journalpublishing1.dtd".
	// The dot is escaped so that only a literal ".dtd" suffix qualifies.
	regexDtd = regexp.MustCompile(`[a-zA-Z0-9-]*\.dtd`)
)

// processXML inspects the XML file described by message and writes one
// structured log entry containing the message metadata together with the
// well-formedness result and the detected DTD name.
//
// It calls wg.Done() when it returns and takes a slot in queueForConcurrent
// for the duration of the work; both are declared elsewhere in the package
// (presumably a sync.WaitGroup and a buffered channel used as a semaphore to
// limit the number of files read in parallel -- confirm at the declaration).
func processXML(message *LogMessage) {
	defer wg.Done()

	// Take a slot before touching the file; give it back on the way out.
	queueForConcurrent <- struct{}{}
	defer func() { <-queueForConcurrent }()

	xmlData := getXMlData(message)

	logrus.WithFields(logrus.Fields{
		"corpusName":       message.corpusName,
		"name":             message.name,
		"startAt":          message.startAt,
		"extension":        message.extension,
		"path":             message.path,
		"mimetype":         message.mimetype,
		"size":             message.size,
		"isWellFormed":     xmlData.isWellFormed,
		"doctype":          xmlData.doctype,
		"wellFormedErrors": xmlData.wellFormedErrors,
	}).Info("")

	incrementProcess()
}

// getXMlData reads the file at message.path, checks that encoding/xml can
// decode it, and extracts the DTD file name from its DOCTYPE declaration.
//
// The returned value has isWellFormed set to true unless opening, reading or
// decoding the file failed; in that case isWellFormed is false and
// wellFormedErrors holds the error text. doctype.sysid is filled in only when
// the document decoded successfully and a "*.dtd" name was found inside the
// DOCTYPE declaration.
func getXMlData(message *LogMessage) LogMessageXML {
	xmlData := LogMessageXML{isWellFormed: true}

	// fail records err as the reason the document is not well formed.
	fail := func(err error) LogMessageXML {
		xmlData.isWellFormed = false
		xmlData.wellFormedErrors = err.Error()
		return xmlData
	}

	xmlFile, errOpen := os.Open(message.path)
	if errOpen != nil {
		return fail(errOpen)
	}
	defer xmlFile.Close()

	byteValue, errRead := ioutil.ReadAll(xmlFile)
	if errRead != nil {
		return fail(errRead)
	}

	// Only the error matters here; the decoded value is thrown away.
	if errUnmarshal := xml.Unmarshal(byteValue, new(interface{})); errUnmarshal != nil {
		return fail(errUnmarshal)
	}

	// Match on the raw bytes so the whole file is not copied into a string.
	doctype := regexDoctype.Find(byteValue)
	if len(doctype) == 0 {
		return xmlData
	}
	if dtd := regexDtd.Find(doctype); len(dtd) > 0 {
		xmlData.doctype.sysid = string(dtd)
	}
	return xmlData
}