package main import ( "io/ioutil" "os" "os/exec" "regexp" "strings" "github.com/sirupsen/logrus" ) var regexDoctype = regexp.MustCompile("<!DOCTYPE.*([[sS]*?])?>") var regexDtd = regexp.MustCompile("[a-zA-Z0-9-_./:]*.dtd") var regexSpace = regexp.MustCompile(`\s+`) var regexLine = regexp.MustCompile("line ([0-9]{1,}) ") var regexErrorMessage = regexp.MustCompile(` ([A-Z].*)\^`) func processXML(message *Message) { // queue for read xml (limit number of parallel read files) defer wg.Done() queueForConcurrent <- struct{}{} defer func() { <-queueForConcurrent }() xmlData := getXMlData(message) if len(xmlData.wellFormedErrors) > 0 { logrus.WithFields(logrus.Fields{ "corpusName": message.corpusName, "name": message.name, "startAt": message.startAt, "extension": message.extension, "path": message.path, "mimetype": message.mimetype, "size": message.size, "isWellFormed": xmlData.isWellFormed, "wellFormedErrors": xmlData.wellFormedErrors, }).Info("") } else { logrus.WithFields(logrus.Fields{ "corpusName": message.corpusName, "name": message.name, "startAt": message.startAt, "extension": message.extension, "path": message.path, "mimetype": message.mimetype, "size": message.size, "isWellFormed": xmlData.isWellFormed, "doctype": xmlData.doctype, }).Info("") } return } func getXMlData(message *Message) MessageXML { xmlMessage := MessageXML{isWellFormed: true} // check with xmlstarlet (slow) result, _ := exec.Command("xmlstarlet", "val", "-w", "-e", message.path).CombinedOutput() if result != nil && strings.Contains(string(result), "invalid") { xmlMessage.isWellFormed = false xmlMessage.wellFormedErrors = append(xmlMessage.wellFormedErrors, formatError(string(result))) return xmlMessage } // check if able to open xmlFile, errOpen := os.Open(message.path) if errOpen != nil { xmlMessage.isWellFormed = false xmlMessage.wellFormedErrors = append(xmlMessage.wellFormedErrors, WellFormedErrorXML{Message: errOpen.Error()}) return xmlMessage } // defer the closing of our xmlFile so that we can parse it later on defer xmlFile.Close() // read our opened xmlFile1 as a byte array. here I am checking if the file is valid or not byteValue, errRead := ioutil.ReadAll(xmlFile) if errRead != nil { xmlMessage.isWellFormed = false xmlMessage.wellFormedErrors = append(xmlMessage.wellFormedErrors, WellFormedErrorXML{Message: errRead.Error()}) return xmlMessage } xmlData := string(byteValue) // get doctype from xml doctype := regexDoctype.FindStringSubmatch(xmlData) if doctype != nil && len(doctype[0]) > 0 { dtd := regexDtd.FindStringSubmatch(string(doctype[0])) if dtd != nil && len(dtd[0]) > 0 { xmlMessage.doctype.Sysid = dtd[0] } } return xmlMessage } func formatError(resultError string) WellFormedErrorXML { messageFormatted := regexSpace.ReplaceAllString(resultError, " ") messageLine := regexLine.FindStringSubmatch(messageFormatted) line := "0" message := resultError errorMessage := regexErrorMessage.FindStringSubmatch(messageFormatted) if messageLine != nil && len(messageLine) >= 2 && len(messageLine[1]) > 0 { line = messageLine[1] } if errorMessage != nil && len(errorMessage) >= 2 && len(errorMessage[1]) > 0 { message = strings.TrimSpace(errorMessage[1]) } return WellFormedErrorXML{ Message: message, Line: line, } }