diff --git a/bash/get_page.sh b/bash/get_page.sh index f681512..ea114c0 100755 --- a/bash/get_page.sh +++ b/bash/get_page.sh @@ -6,13 +6,21 @@ # ######################################################################## -# Arguments ldc -DOC_IDX="$1" -PAGE="$2" +# Arguments ligne de commande + # Outils JQ="jq -c -M -r" +# On récupère les différents documents (hits) de la page +for DOC_IDX in $(seq 1 $NB_HITS) + do + echo "--> Téléchargement de la page $PAGE_N/$TOTAL_PAGE" + FROM=$(wcalc -q "$PAGE_N.0 * $PAGE_SIZE") + URL="$ISTEX_URI&from=$FROM&output=fulltext,metadata" + PAGE=$(curl -s $URL) + NB_HITS=$(echo $PAGE | $JQ ".hits | length") + DOC_IDX=$(wcalc -q "$DOC_IDX - 1") DOC=$(echo $PAGE | $JQ ".hits[$DOC_IDX]") DOC_ISTEXID=$(echo $DOC | $JQ ".id") @@ -44,3 +52,4 @@ curl -s $URI > $FILENAME echo "--> Plein texte téléchargé : $FILENAME" done + done diff --git a/bash/harvest-perf.sh b/bash/harvest-perf.sh index fc32456..7715896 100755 --- a/bash/harvest-perf.sh +++ b/bash/harvest-perf.sh @@ -35,18 +35,19 @@ echo "--> Téléchargement de $TOTAL_DOC documents" # On récupère les pages de resultats une par une -for PAGE_N in $(seq 0 $TOTAL_PAGE) -do - echo "--> Téléchargement de la page $PAGE_N/$TOTAL_PAGE" - FROM=$(wcalc -q "$PAGE_N.0 * $PAGE_SIZE") - URL="$ISTEX_URI&from=$FROM&output=fulltext,metadata" - PAGE=$(curl -s $URL) - NB_HITS=$(echo $PAGE | $JQ ".hits | length") +seq 0 $TOTAL_PAGE | parallel --gnu "./get_page.sh {}" + +#for PAGE_N in $(seq 0 $TOTAL_PAGE) +#do + + #echo "--> Téléchargement de la page $PAGE_N/$TOTAL_PAGE" + #FROM=$(wcalc -q "$PAGE_N.0 * $PAGE_SIZE") + #URL="$ISTEX_URI&from=$FROM&output=fulltext,metadata" + #PAGE=$(curl -s $URL) + #NB_HITS=$(echo $PAGE | $JQ ".hits | length") # On récupère les différents documents (hits) de la page - seq 1 $NB_HITS | parallel --gnu "./get_docs.sh {} $PAGE" - #for DOC_IDX in $(seq 1 $NB_HITS) #do #DOC_IDX=$(wcalc -q "$DOC_IDX - 1")