diff --git a/README.md b/README.md index 7ea5a5d..408093b 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ ```bash docker-compose up -d -docker exec sisyphe_go_go_1 -it go run . -n corpusname -c corpuspath -o outputpath +docker exec sisyphe_go_go_1 -it go run . -n corpusName -c corpuspath -o outputpath ``` ### Install it on local @@ -49,7 +49,7 @@ Sisyphe is now working in background with all your computer thread. Just take a coffee and wait , it will prevent you when it's done :) -The results of sisyphe are present @ `sisyphe/out/{timestamp}-corpusname/` (errors,info,duration..) +The results of sisyphe are present @ `sisyphe/out/{timestamp}-corpusName/` (errors,info,duration..) ### Test diff --git a/main.go b/main.go index 53500ca..d38da3a 100644 --- a/main.go +++ b/main.go @@ -36,15 +36,16 @@ xmlError string } type LogMessage struct { - corpusname string - name string - startAt string - extension string - path string - mimetype string - size int64 - pdf LogMessagePDF - xml LogMessageXML + corpusName string + name string + startAt string + extension string + path string + mimetype string + mimeEncoding string + size int64 + pdf LogMessagePDF + xml LogMessageXML } var queueForConcurrent = make(chan struct{}, 1100) @@ -56,6 +57,8 @@ var configurationFolder = flag.String("c", "", "Configuration folder path") var withWordCount = flag.Bool("w", false, "Enable word count") +var regexMime = regexp.MustCompile(`(.*); charset=(.*)`) + func incrementProcess() { numberFiles++ if numberFiles%5000 == 0 { @@ -82,34 +85,41 @@ if file.Mode().IsRegular() { mtype, err := mimetype.DetectFile(path) extension := mtype.Extension() + groupMime := regexMime.FindStringSubmatch(mtype.String()) + mimetype := mtype.String() + mimeEncoding := "binary" + if len(groupMime) == 3 { + mimetype = groupMime[1] + mimeEncoding = groupMime[2] + } absolutePath, err2 := filepath.Abs(path) if err == nil && err2 == nil { fileData := LogMessage{ - corpusname: *corpusName, - name: file.Name(), - startAt: file.ModTime().String(), - extension: extension, - path: absolutePath, - mimetype: mtype.String(), - size: file.Size(), + corpusName: *corpusName, + name: file.Name(), + startAt: file.ModTime().String(), + extension: extension, + path: absolutePath, + mimetype: mimetype, + mimeEncoding: mimeEncoding, + size: file.Size(), } if extension == ".pdf" { wg.Add(1) go processPDF(&fileData) } else if extension == ".xml" { - m1 := regexp.MustCompile(`;.*`) - fileData.mimetype = m1.ReplaceAllString(fileData.mimetype, "") wg.Add(1) go processXML(&fileData) } else { logrus.WithFields(logrus.Fields{ - "corpusname": fileData.corpusname, - "name": fileData.name, - "startAt": fileData.startAt, - "extension": fileData.extension, - "path": fileData.path, - "mimetype": fileData.mimetype, - "size": fileData.size, + "corpusName": fileData.corpusName, + "name": fileData.name, + "startAt": fileData.startAt, + "extension": fileData.extension, + "path": fileData.path, + "mimetype": fileData.mimetype, + "mimeEncoding": fileData.mimeEncoding, + "size": fileData.size, }).Info("") } } diff --git a/pdf.go b/pdf.go index 767beaa..89b9057 100644 --- a/pdf.go +++ b/pdf.go @@ -30,7 +30,7 @@ } logrus.WithFields(logrus.Fields{ - "corpusname": message.corpusname, + "corpusName": message.corpusName, "name": message.name, "startAt": message.startAt, "extension": message.extension, diff --git a/pdf_test.go b/pdf_test.go index 77a51d7..55f2ef3 100644 --- a/pdf_test.go +++ b/pdf_test.go @@ -8,7 +8,7 @@ ) var pdfData = LogMessage{ - corpusname: "test", + corpusName: "test", name: "test.pdf", startAt: "Thu Mar 4 13:08:00 2010 CET", extension: ".pdf", diff --git a/xml.go b/xml.go index ee42e6e..5b72857 100644 --- a/xml.go +++ b/xml.go @@ -16,7 +16,7 @@ isWellFormed, errorXML := isValidXML(message.path) logrus.WithFields(logrus.Fields{ - "corpusname": message.corpusname, + "corpusName": message.corpusName, "name": message.name, "startAt": message.startAt, "extension": message.extension, @@ -30,21 +30,23 @@ return } -func isValidXML(path string) (bool, error) { - xmlFile, err := os.Open(path) - if err != nil { - return false, err +func isValidXML(path string) (bool, string) { + xmlFile, errOpen := os.Open(path) + if errOpen != nil { + return false, errOpen.Error() } // defer the closing of our xmlFile so that we can parse it later on defer xmlFile.Close() // read our opened xmlFile1 as a byte array. here I am checking if the file is valid or not - byteValue, err := ioutil.ReadAll(xmlFile) - if err != nil { - return false, err + byteValue, errRead := ioutil.ReadAll(xmlFile) + + if errRead != nil { + return false, errRead.Error() } - if xml.Unmarshal(byteValue, new(interface{})) != nil { - return false, err + errorUnmarshal := xml.Unmarshal(byteValue, new(interface{})) + if errorUnmarshal != nil { + return false, errorUnmarshal.Error() } - return true, nil + return true, "" } diff --git a/xml_test.go b/xml_test.go index 0b913f5..10c3df6 100644 --- a/xml_test.go +++ b/xml_test.go @@ -7,7 +7,7 @@ ) var xmlData = LogMessage{ - corpusname: "test", + corpusName: "test", name: "test-default.xml", startAt: "Thu Mar 4 13:08:00 2010 CET", extension: ".xml", @@ -17,11 +17,13 @@ } func TestValidXML(t *testing.T) { - result := isValidXML("./example/xml/test-default.xml") + result, err := isValidXML("./example/xml/test-default.xml") assert.Equal(t, result, true, "XML is well formed") + assert.Equal(t, err, "", "Return empty if xml is not well formed") } func TestInValidXML(t *testing.T) { - result := isValidXML("./example/xml/test-not-wellformed.xml") + result, err := isValidXML("./example/xml/test-not-wellformed.xml") assert.Equal(t, result, false, "XML is not well formed") + assert.Equal(t, err, "XML syntax error on line 6: element closed by ", "Return error if xml is not well formed") } \ No newline at end of file