diff --git a/README.md b/README.md index 7ecd6be..d4d0305 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,3 @@ - XML Usage of poppler function (`pdftotext` and `pdfinfo`) - - -//"xpath": ["/article/@article-type", "/article/front/article-meta/article-categories/subj-group[@subj-group-type=\"document-type-name\"]/subject"] \ No newline at end of file diff --git a/go.mod b/go.mod index 15758b5..66604cb 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,7 @@ require ( github.com/TwiN/go-color v1.1.0 github.com/antchfx/xmlquery v1.3.9 + github.com/antchfx/xpath v1.2.0 github.com/gabriel-vasile/mimetype v1.4.0 github.com/goccy/go-json v0.9.4 github.com/sirupsen/logrus v1.8.1 @@ -12,7 +13,6 @@ ) require ( - github.com/antchfx/xpath v1.2.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect github.com/pmezard/go-difflib v1.0.0 // indirect diff --git a/main.go b/main.go index 142d525..03ccec6 100644 --- a/main.go +++ b/main.go @@ -60,7 +60,6 @@ } if file.Mode().IsRegular() { // count number files processed - numberFiles++ if numberFiles%10 == 0 { fmt.Printf("\rFiles processed: %d", numberFiles) } @@ -89,9 +88,11 @@ if fileData.mimetype == "application/pdf" { wg.Add(1) go processPDF(&fileData) + numberFiles++ } else if fileData.mimetype == "application/xml" || fileData.mimetype == "text/xml" { wg.Add(1) go processXML(&fileData) + numberFiles++ } else { logrus.WithFields(logrus.Fields{ "corpusName": fileData.corpusName, @@ -103,11 +104,13 @@ "mimeEncoding": fileData.mimeEncoding, "size": fileData.size, }).Info("") + numberFiles++ } } else { if fileData.mimetype == "application/xml" || fileData.mimetype == "text/xml" { wg.Add(1) go processXML(&fileData) + numberFiles++ } } } diff --git a/struct.go b/struct.go index 253a361..7107e5b 100644 --- a/struct.go +++ b/struct.go @@ -63,8 +63,8 @@ } type MetadataConfigDetailledAnalyze struct { - Name string `json:"name"` - Type string `json:"type"` - Regex string `json:"regex"` - Xpath string `json:"xpath"` + Name string `json:"name"` + Type string `json:"type"` + Regex string `json:"regex"` + Xpath []string `json:"xpath"` } diff --git a/xml.go b/xml.go index 84f4bfb..d6ae97b 100644 --- a/xml.go +++ b/xml.go @@ -2,6 +2,7 @@ import ( "flag" + "fmt" "io/ioutil" "os" "os/exec" @@ -9,6 +10,7 @@ "strings" "github.com/antchfx/xmlquery" + "github.com/antchfx/xpath" "github.com/sirupsen/logrus" ) @@ -67,7 +69,7 @@ "validationsErrors": detailledAnalysis.validationsErrors, "isValidAgainstSchema": detailledAnalysis.isValidAgainstSchema, "validationSchemaErrors": detailledAnalysis.validationSchemaErrors, - "xpath": detailledAnalysis.xpath, + "xpath": &detailledAnalysis.xpath, }).Info("") } } @@ -134,7 +136,7 @@ } func processDetailledAnalysis(pathXml string, xmlData string, dtdInDoctype string) DetailledAnalysis { - xmlDetailled := DetailledAnalysis{isValidAgainstDTD: false, isValidAgainstSchema: false, xpath: ""} + xmlDetailled := DetailledAnalysis{isValidAgainstDTD: false, isValidAgainstSchema: false} pathExec := *configurationFolder + "/" + *corpusName // if test @@ -193,20 +195,38 @@ // get xpath doc, err := xmlquery.Parse(strings.NewReader(xmlData)) + tmpXpath := "" if err == nil { - xmlDetailled.xpath = "{" + tmpXpath = "{" for _, field := range configDetailledAnalyze.XML.XPATH { - channel := xmlquery.FindOne(doc, field.Xpath) - if channel != nil { - xmlDetailled.xpath += field.Name + ":" + channel.InnerText() + "," + for _, xpathContent := range field.Xpath { + expr, _ := xpath.Compile(FormatXpathByType(field.Type, xpathContent)) + result := expr.Evaluate(xmlquery.CreateXPathNavigator(doc)) + str := fmt.Sprintf("%v", result) + if len(str) > 0 { + tmpXpath += `"`+field.Name+`"` + `:"` + str + `",` + } } + } } - xmlDetailled.xpath += "}" + tmpXpath = strings.TrimSuffix(tmpXpath, ",") + tmpXpath += "}" + + xmlDetailled.xpath = tmpXpath return xmlDetailled } -/*func FormatByType() { - -}*/ \ No newline at end of file +func FormatXpathByType(fieldType string, xpath string) string { + switch fieldType { + case "Number": + return "number(" + xpath + ")" + case "Boolean": + return "boolean(" + xpath + ")" + case "Count": + return "count(" + xpath + ")" + } + // String and Attrivute + return "string(" + xpath + ")" +}