diff --git a/main.go b/main.go index 6c3ce26..25b1169 100644 --- a/main.go +++ b/main.go @@ -12,6 +12,7 @@ "regexp" "runtime" "strconv" + "strings" "sync" "time" @@ -76,23 +77,28 @@ mimeEncoding: mimeEncoding, size: file.Size(), } - if extension == ".pdf" { - wg.Add(1) - go processPDF(&fileData) - } else if extension == ".xml" { + if *configurationFolder == "" { + if extension == ".pdf" { + wg.Add(1) + go processPDF(&fileData) + } else if extension == ".xml" && strings.Contains(fileData.mimetype, "text") { + wg.Add(1) + go processXML(&fileData) + } else { + logrus.WithFields(logrus.Fields{ + "corpusName": fileData.corpusName, + "name": fileData.name, + "startAt": fileData.startAt, + "extension": fileData.extension, + "path": fileData.path, + "mimetype": fileData.mimetype, + "mimeEncoding": fileData.mimeEncoding, + "size": fileData.size, + }).Info("") + } + } else { wg.Add(1) go processXML(&fileData) - } else { - logrus.WithFields(logrus.Fields{ - "corpusName": fileData.corpusName, - "name": fileData.name, - "startAt": fileData.startAt, - "extension": fileData.extension, - "path": fileData.path, - "mimetype": fileData.mimetype, - "mimeEncoding": fileData.mimeEncoding, - "size": fileData.size, - }).Info("") } } } @@ -112,7 +118,7 @@ } if *withWordCount == true { - fmt.Println("Count number word is enabled") + fmt.Println(color.InBlue("Count number word is enabled")) } // init logger sec := time.Now().Unix() @@ -126,7 +132,10 @@ fmt.Println(err) os.Exit(1) } - logrus.SetFormatter(&logrus.JSONFormatter{}) + logrus.SetFormatter(&logrus.JSONFormatter{ + DisableHTMLEscape: true, + DisableTimestamp: true, + }) logrus.SetOutput(outputFile) } diff --git a/xml.go b/xml.go index 5f622ab..d5cdc8a 100644 --- a/xml.go +++ b/xml.go @@ -10,7 +10,7 @@ ) var regexDoctype = regexp.MustCompile("") -var regexDtd = regexp.MustCompile("[a-zA-Z0-9-]*.dtd") +var regexDtd = regexp.MustCompile("[a-zA-Z0-9-_./:]*.dtd") func processXML(message *LogMessage) { // queue for read xml (limit number of parallel read files)