Newer
Older
sisyphe-go / xpath.sh
#!/bin/sh

# Debugging: uncomment the next line to trace every command
#set -x

# Positional arguments:
#   $1  corpusPath  full path of the corpus
#   $2  outPath     directory in which xpaths.csv is written
#   $3  attValue    "attval" to include attribute values; anything else
#                   (conventionally "noattval") leaves them out
corpusPath=$1
outPath=$2
attValue=$3

# Exactly three arguments are required. Otherwise print the usage text on
# stderr and stop with a non-zero status so callers can detect the failure.
if [ "$#" -ne 3 ]
then
	{
		echo ""
		echo "Generation xpaths.csv"
		echo ""
		echo "Script Usage : $0 \$corpusPath \$outPath \$attValue"
		echo " \$corpusPath = full path of the corpus"
		echo " \$outPath = xpaths.csv file directory path"
		echo " \$attValue = with or without attribute value [noattval/attval]"
		echo ""
		echo "Exemple : $0 /work/elsevier/elsevier-2019-livraison-2020-09-15 /applis/panist/home/sisyphe_out/1660119726-elsevier2019-2022-08-10-generique noattval"
		echo ""
	} >&2
	exit 1
fi

# Count the "processor" entries in /proc/cpuinfo; the result is used as the
# thread count for sort --parallel.
cpus=$(grep -ci '^processor' /proc/cpuinfo 2>/dev/null)
# GNU sort rejects a zero or empty --parallel value, so fall back to a single
# thread when /proc/cpuinfo is unreadable or lists no processor line.
case "$cpus" in
	''|0) cpus=1 ;;
esac

# Choose the xmlstarlet "el" flag according to the requested mode:
#   -v  element paths with attributes and their values
#   -a  element paths with attributes only
if [ "$attValue" = "attval" ]
then
	echo "Generation xpaths.csv avec les valeurs des attributs"
	elFlag=-v
else
	echo "Generation xpaths.csv sans les valeurs des attributs"
	elFlag=-a
fi

# Build xpaths.csv:
#   1. find lists every *.xml file under the corpus. The pattern is quoted so
#      the shell does not expand it against the current directory first.
#   2. parallel runs xmlstarlet on each file; its error output is discarded.
#   3. sort orders the paths byte-wise (LC_ALL=C), keeping its gzip-compressed
#      temporary files in the output directory.
#   4. uniq -c prefixes each distinct path with its number of occurrences.
find "$corpusPath" -name '*.xml' |
	parallel --silent xmlstarlet el "$elFlag" {} 2>/dev/null |
	LC_ALL=C sort --temporary-directory="$outPath" --compress-program=gzip --buffer-size=50% --parallel="$cpus" |
	LC_ALL=C uniq -c > "$outPath/xpaths.csv"