diff --git a/Dockerfile b/Dockerfile index d1f589b..b6cbd75 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1.18-alpine +FROM golang:1.18.1-alpine RUN apk add git bash curl poppler-utils gcc musl-dev xmlstarlet libxml2-utils # build diff --git a/example/dtd/bad-doctype.dtd b/example/dtd/bad-doctype.dtd new file mode 100644 index 0000000..5fdbc6e --- /dev/null +++ b/example/dtd/bad-doctype.dtd @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/example/xml/test-default-bad-doctype.xml b/example/xml/test-default-bad-doctype.xml new file mode 100644 index 0000000..222f92e --- /dev/null +++ b/example/xml/test-default-bad-doctype.xml @@ -0,0 +1,8 @@ + + + + Tove + Jani + Reminder + Don't forget me this weekend! + \ No newline at end of file diff --git a/go.mod b/go.mod index 83b6c76..b8ca042 100644 --- a/go.mod +++ b/go.mod @@ -7,17 +7,17 @@ github.com/antchfx/xmlquery v1.3.10 github.com/antchfx/xpath v1.2.0 github.com/gabriel-vasile/mimetype v1.4.0 - github.com/goccy/go-json v0.9.5 + github.com/goccy/go-json v0.9.7 github.com/sirupsen/logrus v1.8.1 - github.com/stretchr/testify v1.7.0 + github.com/stretchr/testify v1.7.1 ) require ( github.com/davecgh/go-spew v1.1.1 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - golang.org/x/net v0.0.0-20220225172249-27dd8689420f // indirect - golang.org/x/sys v0.0.0-20220310020820-b874c991c1a5 // indirect + golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4 // indirect + golang.org/x/sys v0.0.0-20220429233432-b5fbb4746d32 // indirect golang.org/x/text v0.3.7 // indirect gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect ) diff --git a/go.sum b/go.sum index d1334ff..74ada56 100644 --- a/go.sum +++ b/go.sum @@ -11,6 +11,8 @@ github.com/gabriel-vasile/mimetype v1.4.0/go.mod h1:fA8fi6KUiG7MgQQ+mEWotXoEOvmxRtOJlERCzSmRvr8= github.com/goccy/go-json v0.9.5 h1:ooSMW526ZjK+EaL5elrSyN2EzIfi/3V0m4+HJEDYLik= github.com/goccy/go-json v0.9.5/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I= +github.com/goccy/go-json v0.9.7 h1:IcB+Aqpx/iMHu5Yooh7jEzJk1JZ7Pjtmys2ukPr7EeM= +github.com/goccy/go-json v0.9.7/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I= github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= @@ -22,6 +24,8 @@ github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1 h1:5TQK59W5E3v0r2duFAb7P95B6hEeOyEnHRa8MjYSMTY= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= @@ -29,6 +33,8 @@ golang.org/x/net v0.0.0-20210505024714-0287a6fb4125/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220225172249-27dd8689420f h1:oA4XRj0qtSt8Yo1Zms0CUlsT3KG69V2UGQWPBxujDmc= golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= +golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4 h1:HVyaeDAYux4pnY+D/SiwmLOR36ewZ4iGQIIrtnuCjFA= +golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -39,6 +45,8 @@ golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220310020820-b874c991c1a5 h1:y/woIyUBFbpQGKS0u1aHF/40WUDnek3fPOyD08H5Vng= golang.org/x/sys v0.0.0-20220310020820-b874c991c1a5/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220429233432-b5fbb4746d32 h1:Js08h5hqB5xyWR789+QqueR6sDE8mk+YvpETZ+F6X9Y= +golang.org/x/sys v0.0.0-20220429233432-b5fbb4746d32/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= diff --git a/indexCorpus.sh b/indexCorpus.sh index 9e3593c..e55f77f 100755 --- a/indexCorpus.sh +++ b/indexCorpus.sh @@ -13,6 +13,7 @@ dashboardTemplateFolder="/app/kibanatemplates" genericDashboardTemplate="dashboard-generique.ndjson" detailDashboardTemplate="dashboard-detail.ndjson" +chunksMaxLines=4000 #Help and args check if [ -z $2 ] @@ -53,13 +54,29 @@ echo "Converting file for bulk" #create buffer folder for json convert and kibana dashboard template mkdir -p $outCurlFolder"/"$corpusName - #convert json file to json ready for bulk - sed 's/{"corpusname":/{ "index" : {} }\n{"corpusname":/gi' $fullPath > $outCurlFolder"/"$corpusName"/"$filePatternReadyForCurl # Suppression index elasticsearch du meme nom si existant curl -k --noproxy '*' -XDELETE $ELASTIC_URL":"$ELASTIC_PORT"/analyse-"$corpusName > /dev/null 2>&1 - #indexing - echo "Sending to Index" - curl -k --noproxy '*' -XPOST $ELASTIC_URL":"$ELASTIC_PORT"/analyse-"$corpusName"/_bulk?pretty" -H "Content-Type: application/json" --data-binary "@"$outCurlFolder"/"$corpusName"/"$filePatternReadyForCurl > /dev/null 2>&1 + #Check if import file is too big - https://www.elastic.co/guide/en/app-search/current/limits.html - Split in chunks with $chunksMaxLines max lines + nbLinesImport=`wc -l $fullPath | awk '{print $1}'` + if [ "$nbLinesImport" -gt "$chunksMaxLines" ] + then + echo $fullPath" has "$nbLinesImport" lines, splitting it before indexing" + mkdir -p $outCurlFolder"/"$corpusName"/split" + split -l $chunksMaxLines $outCurlFolder"/"$corpusName"/"$filePatternReadyForCurl $outCurlFolder"/"$corpusName"/split/split" + for chunk in `ls $outCurlFolder"/"$corpusName"/split/"` + do echo "indexing "$chunk + #convert json file to json ready for bulk + sed 's/{"corpusname":/{ "index" : {} }\n{"corpusname":/gi' $outCurlFolder"/"$corpusName"/"$chunk > $outCurlFolder"/"$corpusName"/"$chunk"-curlReady" + curl -k --noproxy '*' -XPOST $ELASTIC_URL":"$ELASTIC_PORT"/analyse-"$corpusName"/_bulk?pretty" -H "Content-Type: application/json" --data-binary "@"$outCurlFolder"/"$corpusName"/"$chunk"-curlReady" > /dev/null 2>&1 + rm $outCurlFolder"/"$corpusName"/"$chunk"-curlReady" + done + rm -rf $outCurlFolder"/"$corpusName"/split" + else + echo "Sending to Index" + #convert json file to json ready for bulk + sed 's/{"corpusname":/{ "index" : {} }\n{"corpusname":/gi' $fullPath > $outCurlFolder"/"$corpusName"/"$filePatternReadyForCurl + curl -k --noproxy '*' -XPOST $ELASTIC_URL":"$ELASTIC_PORT"/analyse-"$corpusName"/_bulk?pretty" -H "Content-Type: application/json" --data-binary "@"$outCurlFolder"/"$corpusName"/"$filePatternReadyForCurl > /dev/null 2>&1 + fi #Import Dashboard - check type first if [ "$analysisType" = "generique" ] @@ -67,13 +84,13 @@ echo "Generic Analysis" cat $dashboardTemplateFolder"/"$genericDashboardTemplate | sed "s/CORPUSNAMEREPLACE/"$corpusName"/" | sed "s/DASHBOARDTITLEGEN/"$corpusName"/" > $outCurlFolder"/"$corpusName"/dashboardGEN.ndjson" echo "Creating dashboard" - curl -k --noproxy '*' -X POST $ELASTIC_URL":"$KIBANA_PORT"/api/saved_objects/_import?createNewCopies=true" -H "kbn-xsrf: true" --form "file=@"$outCurlFolder"/"$corpusName"/dashboardGEN.ndjson" + curl -k --noproxy '*' -X POST $ELASTIC_URL":"$KIBANA_PORT"/api/saved_objects/_import?createNewCopies=true" -H "kbn-xsrf: true" --form "file=@"$outCurlFolder"/"$corpusName"/dashboardGEN.ndjson" elif [ "$analysisType" = "detaille" ] then echo "Detailled dashboard" cat $dashboardTemplateFolder"/"$detailDashboardTemplate | sed "s/CORPUSNAMEREPLACE/"$corpusName"/" | sed "s/DASHBOARDTITLEGEN/"$corpusName"/" > $outCurlFolder"/"$corpusName"/dashboardDET.ndjson" echo "Creating dashboard" - curl -k --noproxy '*' -X POST $ELASTIC_URL":"$KIBANA_PORT"/api/saved_objects/_import?createNewCopies=true" -H "kbn-xsrf: true" --form "file=@"$outCurlFolder"/"$corpusName"/dashboardDET.ndjson" + curl -k --noproxy '*' -X POST $ELASTIC_URL":"$KIBANA_PORT"/api/saved_objects/_import?createNewCopies=true" -H "kbn-xsrf: true" --form "file=@"$outCurlFolder"/"$corpusName"/dashboardDET.ndjson" else echo "Unknown Analysis type : "$analysisType fi @@ -87,4 +104,4 @@ fi #Removing buffer folder -rm -rf $outCurlFolder"/"$corpusName \ No newline at end of file +rm -rf $outCurlFolder"/"$corpusName diff --git a/main.go b/main.go index 6df6f28..99fe6a0 100644 --- a/main.go +++ b/main.go @@ -173,7 +173,8 @@ os.Exit(1) } if err := json.Unmarshal(bytes.TrimPrefix(jsonFile, []byte("\xef\xbb\xbf")), &configDetailledAnalyze); err != nil { - log.Fatal("Error in Config file", err) + fmt.Println(color.InRed("Error in config file")) + fmt.Println(err) os.Exit(1) } fmt.Println(color.InBlue(strconv.Itoa(len(configDetailledAnalyze.XML.ListDTD)) + " DTD files for analysis")) @@ -204,8 +205,8 @@ result, err := exec.Command("/bin/bash", "indexCorpus.sh", logPath+"/analyse-logs.json", importType).CombinedOutput() if err != nil { fmt.Println(color.InRed("Error indexCorpus.sh")) - panic(err) } fmt.Println(string(result)) } + fmt.Println(color.InGreen("Done!")) } diff --git a/struct.go b/struct.go index d22d28b..ac00171 100644 --- a/struct.go +++ b/struct.go @@ -35,8 +35,8 @@ type XpathStructure struct { ContentType string `json:"content-type"` - PublicationYear string `json:"publicationYear"` - Doi int `json:"doi"` + PublicationYear int `json:"publicationYear"` + Doi string `json:"doi"` Pmid string `json:"pmid"` Issn string `json:"issn"` Eissn string `json:"eissn"` diff --git a/util.go b/util.go index a19a046..58b2000 100644 --- a/util.go +++ b/util.go @@ -21,6 +21,10 @@ return "" } +func standardizeSpaces(s string) string { + return strings.Join(strings.Fields(s), " ") +} + func UpdateCounter() { numberFiles++ // count number files processed diff --git a/xml.go b/xml.go index 774f053..5ce93b8 100644 --- a/xml.go +++ b/xml.go @@ -251,9 +251,10 @@ if field.Type == "Count" { tmpXpath += `"` + field.Name + `":` + text + `,` } else if text != "0" { + text = strings.NewReplacer("\"", "").Replace(text) // for string if hasQuote { - tmpXpath += `"` + field.Name + `":"` + strings.NewReplacer("\n", "").Replace(text) + `",` + tmpXpath += `"` + field.Name + `":"` + text + `",` } else { // for boolean and number tmpXpath += `"` + field.Name + `":` + text + `,` } @@ -266,6 +267,7 @@ tmpXpath = strings.TrimSuffix(tmpXpath, ",") tmpXpath += "}" + tmpXpath = standardizeSpaces(tmpXpath) finalXpath := []byte(tmpXpath) jsonData := XpathStructure{}