package main import ( "encoding/xml" "io/ioutil" "os" "regexp" "github.com/sirupsen/logrus" ) var regexDoctype = regexp.MustCompile("<!DOCTYPE.*([[sS]*?])?>") func processXML(message *LogMessage) { // queue for read xml (limit number of parallel read files) defer wg.Done() queueForConcurrent <- struct{}{} defer func() { <-queueForConcurrent }() isWellFormed, doctype, errorXML := getXMlData(message.path) logrus.WithFields(logrus.Fields{ "corpusName": message.corpusName, "name": message.name, "startAt": message.startAt, "extension": message.extension, "path": message.path, "mimetype": message.mimetype, "size": message.size, "isWellFormed": isWellFormed, "doctype": doctype, "xmlError": errorXML, }).Info("") incrementProcess() return } func getXMlData(path string) (bool, string, string) { xmlFile, errOpen := os.Open(path) if errOpen != nil { return false, "", errOpen.Error() } // defer the closing of our xmlFile so that we can parse it later on defer xmlFile.Close() // read our opened xmlFile1 as a byte array. here I am checking if the file is valid or not byteValue, errRead := ioutil.ReadAll(xmlFile) if errRead != nil { return false, "", errRead.Error() } errorUnmarshal := xml.Unmarshal(byteValue, new(interface{})) if errorUnmarshal != nil { return false, "", errorUnmarshal.Error() } // xmlData doctype := regexDoctype.FindStringSubmatch(string(byteValue)) if len(doctype[0]) > 0 { return true, doctype[0], "" } return true, "", "" }