diff --git a/.gitignore b/.gitignore index f401cb1..69efa02 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.log *.json out/* -example/* \ No newline at end of file +example/* +corpus-resources/ diff --git a/example/xml/test-bad-default.xml b/example/xml/test-bad-default.xml deleted file mode 100644 index 41ba3ac..0000000 --- a/example/xml/test-bad-default.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - trezaq - - diff --git a/example/xml/test-bad-doctype.xml b/example/xml/test-bad-doctype.xml deleted file mode 100644 index abee98e..0000000 --- a/example/xml/test-bad-doctype.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - trezaq - - - diff --git a/example/xml/test-default.xml b/example/xml/test-default.xml index d42409e..dd5c90d 100644 --- a/example/xml/test-default.xml +++ b/example/xml/test-default.xml @@ -1,8 +1,8 @@ - - - - - trezaq - - - + + + +Tove +Jani +Reminder +Don't forget me this weekend! + \ No newline at end of file diff --git a/example/xml/test-getxpaths.xml b/example/xml/test-getxpaths.xml deleted file mode 100644 index 807b8b9..0000000 --- a/example/xml/test-getxpaths.xml +++ /dev/null @@ -1,15 +0,0 @@ - - - - - trezaqwx -
trezaqwx end
- 1234 - - <p>lorem ipsum dolor sit amet</p> - <p>lorem ipsum dolor sit amet</p> - <p someattribute="test">blabla</p> - <p someattribute="test3">blabla</p> - </my> -</to> -</xpath> \ No newline at end of file diff --git a/example/xml/test-not-valid-dtd.xml b/example/xml/test-not-valid-dtd.xml deleted file mode 100644 index 79c0816..0000000 --- a/example/xml/test-not-valid-dtd.xml +++ /dev/null @@ -1,7 +0,0 @@ -<!DOCTYPE article PUBLIC "my doctype of doom" "mydoctype.dtd"> -<data> - Parthénogénese du poulpe en milieu sub-aquatique - <p>My other data</p> - <div>My other data 2</div> - <li>My other data 3</li> -</data> diff --git a/example/xml/test-tei.xml b/example/xml/test-tei.xml deleted file mode 100644 index 82399a1..0000000 --- a/example/xml/test-tei.xml +++ /dev/null @@ -1,136 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" version="3.1.0" xsi:schemaLocation="http://www.tei-c.org/ns/1.0 http://lodel.org/ns/tei/tei.openedition.1.6.1/document.xsd"> - <teiHeader> - <fileDesc> - <titleStmt> - <title level="a" type="main"> - <hi xml:lang="fr">Analyser un corpus avec Sisyphe</hi> - - Un logiciel libre d'exploration et d'analyse de fichiers orienté XML et PDF - - - Jean-Michel - ISTEX - - - INIST-CNRSUPS 076Équipe ISTEXCNRScontact@istex.fr - - - - - Gérard - MANSOIF - - - Université des jeux de mots laids - - - - - CNRS - UPS 076 (Institut de l'Information Scientifique et Technique) - ISTEX - 2019 - -

ISTEX www.istex.fr

-
- https://api.istex.fr/document/01234567890123456789 - 10.9999/abc.123 -
- - - - - - <respStmt> - <resp>auteur</resp> - <name>Jean-Michel ISTEX</name> - </respStmt> - <respStmt> - <resp>auteur</resp> - <name>Gérard MANSOIF</name> - </respStmt> - </titleStmt> - <publicationStmt> - <publisher/> - <date>2019</date> - <idno type="pp">25-47</idno> - </publicationStmt> - </biblFull> - <biblStruct type="article"> - <analytic> - <title level="a" type="main">Analyser un corpus avec Sisyphe - Un logiciel libre d'exploration et d'analyse de fichiers orienté XML et PDF - - - Jean-Michel - ISTEX - - - INIST-CNRSUPS 076Équipe ISTEXCNRScontact@istex.fr - - - - - Gérard - MANSOIF - - - Université des jeux de mots laids - - - - - ISTEX : des articles très cools - 9876-5432 - 8765-4321 - Vrac - 10.9999/abc.12 - https://www/istex.fr/abc/12 - - CNRS - UPS 076 - 25-47 - 6 - 2019 - - - - - - - - fr - - - - - France - - - - - Archive - Documentation scientifique - - - - - - - -
-

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

-
-
- -

Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo.

Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

- - -
- - ma première réf - ma deuxième réf - -
-
-
- diff --git a/example/xml/test-unknown-doctype.xml b/example/xml/test-unknown-doctype.xml deleted file mode 100644 index 5d226b5..0000000 --- a/example/xml/test-unknown-doctype.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - trezaq - - - diff --git a/main.go b/main.go index f7f50b1..f63df52 100644 --- a/main.go +++ b/main.go @@ -21,6 +21,7 @@ "github.com/sirupsen/logrus" ) +var start = time.Now() var queueForConcurrent = make(chan struct{}, 1250) var wg sync.WaitGroup var numberFiles int = 0 @@ -73,7 +74,7 @@ startAt: file.ModTime().String(), extension: extension, path: absolutePath, - mimetype: mimetype, + mimetype: strings.ToLower(mimetype), mimeEncoding: mimeEncoding, size: file.Size(), } @@ -81,7 +82,7 @@ if fileData.mimetype == "application/pdf" { wg.Add(1) go processPDF(&fileData) - } else if extension == ".xml" && strings.Contains(fileData.mimetype, "text") { + } else if fileData.mimetype == "application/xml" || fileData.mimetype == "text/xml" { wg.Add(1) go processXML(&fileData) } else { @@ -97,8 +98,10 @@ }).Info("") } } else { - wg.Add(1) - go processXML(&fileData) + if fileData.mimetype == "application/xml" || fileData.mimetype == "text/xml" { + wg.Add(1) + go processXML(&fileData) + } } } } @@ -140,8 +143,6 @@ } func main() { - start := time.Now() - // init logger and params initProcess() diff --git a/pdf.go b/pdf.go index daa82f0..452b8b0 100644 --- a/pdf.go +++ b/pdf.go @@ -64,7 +64,7 @@ pdfData.pdfMetadata.Author = metaResult["Author"] pdfData.pdfMetadata.Creator = metaResult["Creator"] pdfData.pdfMetadata.CreationDate = metaResult["CreationDate"] - pdfData.pdfMetadata.pdfFormatVersion = metaResult["PDF version"] + pdfData.pdfMetadata.PDFFormatVersion = metaResult["PDF version"] numberPages, _ := strconv.Atoi(metaResult["Pages"]) pdfData.pdfPageTotal = numberPages } else { diff --git a/struct.go b/struct.go index 2d38136..9784d9a 100644 --- a/struct.go +++ b/struct.go @@ -4,7 +4,7 @@ Author string Creator string CreationDate string - pdfFormatVersion string + PDFFormatVersion string } type MessagePDF struct { pdfWordCount int @@ -24,6 +24,7 @@ isWellFormed bool wellFormedErrors string doctype DoctypeXML + metadataXML struct{} } type Message struct { diff --git a/xml.go b/xml.go index a187998..6c883ce 100644 --- a/xml.go +++ b/xml.go @@ -4,7 +4,6 @@ "encoding/xml" "io/ioutil" "os" - "os/exec" "regexp" "strings" @@ -38,13 +37,13 @@ } func getXMlData(message *Message) MessageXML { - xmlData := MessageXML{isWellFormed: true} + xmlMessage := MessageXML{isWellFormed: true} // check if able to open xmlFile, errOpen := os.Open(message.path) if errOpen != nil { - xmlData.isWellFormed = false - xmlData.wellFormedErrors = errOpen.Error() - return xmlData + xmlMessage.isWellFormed = false + xmlMessage.wellFormedErrors = errOpen.Error() + return xmlMessage } // defer the closing of our xmlFile so that we can parse it later on defer xmlFile.Close() @@ -52,35 +51,41 @@ byteValue, errRead := ioutil.ReadAll(xmlFile) if errRead != nil { - xmlData.isWellFormed = false - xmlData.wellFormedErrors = errRead.Error() - return xmlData + xmlMessage.isWellFormed = false + xmlMessage.wellFormedErrors = errRead.Error() + return xmlMessage } // check if unmarshal errUnmarshal := xml.Unmarshal(byteValue, new(interface{})) if errUnmarshal != nil { - xmlData.isWellFormed = false - xmlData.wellFormedErrors = errUnmarshal.Error() - return xmlData + xmlMessage.isWellFormed = false + xmlMessage.wellFormedErrors = errUnmarshal.Error() + return xmlMessage } // check with xmlstarlet - result, _ := exec.Command("xmlstarlet", "val", "-w", "-e", message.path).CombinedOutput() + /*exec.Command("xmlstarlet", "val", "-w", "-e", message.path).CombinedOutput() if result != nil && strings.Contains(string(result), "invalid") { - xmlData.isWellFormed = false - xmlData.wellFormedErrors = string(result) + xmlMessage.isWellFormed = false + xmlMessage.wellFormedErrors = string(result) + }*/ + + xmlData := string(byteValue) + + if !strings.HasPrefix(xmlData, " 0 { dtd := regexDtd.FindStringSubmatch(string(doctype[0])) if dtd != nil && len(dtd[0]) > 0 { - xmlData.doctype.sysid = dtd[0] + xmlMessage.doctype.sysid = dtd[0] } - return xmlData } - return xmlData + return xmlMessage } diff --git a/xml_test.go b/xml_test.go index 3b31f02..e37161f 100644 --- a/xml_test.go +++ b/xml_test.go @@ -20,7 +20,7 @@ result := getXMlData(&xmlData) assert.Equal(t, result.isWellFormed, true, "XML is well formed") assert.Equal(t, result.wellFormedErrors, "", "Return empty if xml is not well formed") - assert.Equal(t, result.doctype.sysid, "mydoctype.dtd", "Doctype is valid") + assert.Equal(t, result.doctype.sysid, "Note.dtd", "Doctype is valid") } func TestInvalidXML(t *testing.T) {