package main

import (
	"io/ioutil"
	"os"
	"os/exec"
	"regexp"
	"strings"

	"github.com/sirupsen/logrus"
)

// regexDoctype matches a DOCTYPE declaration: everything from "<!DOCTYPE" up
// to the closing ">", including an optional internal subset in square
// brackets. The declaration may span several lines.
var regexDoctype = regexp.MustCompile(`<!DOCTYPE[^>\[]*(\[[\s\S]*?\])?\s*>`)

// regexDtd matches a run of letters, digits and "_ . / : -" that ends in the
// literal suffix ".dtd" (a file name, path or URL of a DTD).
var regexDtd = regexp.MustCompile(`[a-zA-Z0-9_./:-]*\.dtd`)

// processXML inspects the XML file described by message and emits one logrus
// Info entry (with an empty message) carrying the file's metadata. The entry
// holds "wellFormedErrors" when getXMlData reported an error message and
// "doctype" otherwise. It calls wg.Done when it returns and incrementProcess
// once the entry has been logged.
func processXML(message *Message) {
	defer wg.Done()

	// queueForConcurrent limits the number of files read in parallel: take a
	// slot before touching the file and hand it back on return.
	queueForConcurrent <- struct{}{}
	defer func() { <-queueForConcurrent }()

	xmlData := getXMlData(message)

	// Fields shared by both outcomes.
	fields := logrus.Fields{
		"corpusName":   message.corpusName,
		"name":         message.name,
		"startAt":      message.startAt,
		"extension":    message.extension,
		"path":         message.path,
		"mimetype":     message.mimetype,
		"size":         message.size,
		"isWellFormed": xmlData.isWellFormed,
	}
	if xmlData.wellFormedErrors.Message != "" {
		fields["wellFormedErrors"] = xmlData.wellFormedErrors
	} else {
		fields["doctype"] = xmlData.doctype
	}
	logrus.WithFields(fields).Info("")

	incrementProcess()
}

// getXMlData validates the file at message.path and extracts the DTD
// reference from its DOCTYPE declaration.
//
// The returned MessageXML has isWellFormed set to false and wellFormedErrors
// filled in when xmlstarlet reports the file as invalid or when the file
// cannot be opened or read. Otherwise isWellFormed is true and doctype.Sysid
// holds the first ".dtd" reference found inside the DOCTYPE declaration, if
// there is one.
func getXMlData(message *Message) MessageXML {
	xmlMessage := MessageXML{isWellFormed: true}

	// malformed records reason as the well-formedness error and returns the
	// message to hand back to the caller.
	malformed := func(reason string) MessageXML {
		xmlMessage.isWellFormed = false
		xmlMessage.wellFormedErrors = WellFormedErrorXML{reason, message.path}
		return xmlMessage
	}

	// Check with xmlstarlet (slow). The error is deliberately not inspected:
	// the verdict is read from the tool's combined output, not from its exit
	// status.
	// NOTE(review): a failure to start xmlstarlet at all is ignored as well,
	// in which case the file is reported as well-formed - confirm this is
	// the intended behaviour.
	result, _ := exec.Command("xmlstarlet", "val", "-w", "-e", message.path).CombinedOutput()
	if strings.Contains(string(result), "invalid") {
		return malformed(string(result))
	}

	// Make sure the file can be opened and read in full.
	xmlFile, errOpen := os.Open(message.path)
	if errOpen != nil {
		return malformed(errOpen.Error())
	}
	defer xmlFile.Close()

	content, errRead := ioutil.ReadAll(xmlFile)
	if errRead != nil {
		return malformed(errRead.Error())
	}

	// Extract the DTD reference from the DOCTYPE declaration. Matching is
	// done on the raw bytes so the whole file is not copied into a string.
	doctype := regexDoctype.Find(content)
	if doctype == nil {
		return xmlMessage
	}
	if sysid := regexDtd.Find(doctype); sysid != nil {
		xmlMessage.doctype.Sysid = string(sysid)
	}
	return xmlMessage
}