Commit -- 11/08/2015 @ 09:15:26
1 parent 69f9096 commit 05622e104488c2420a412f5693b4ac8863855bae
@perrin perrin authored on 11 Aug 2015
Showing 2 changed files
View
16
bash/get_page.sh
# (pour être parallélisé)
#
########################################################################
# Command-line arguments ("ldc" = "ligne de commande").
# NOTE(review): presumably $1 is the 1-based position of a hit and $2 the JSON
# text of a result page; that is how DOC_IDX and PAGE are used further down in
# this script -- confirm against the caller.
DOC_IDX="$1"
PAGE="$2"
# Tools
# jq flags: -c compact output, -M monochrome (no colour), -r raw strings.
# The value is a command plus its flags, so $JQ is meant to be expanded
# unquoted where it is invoked.
JQ="jq -c -M -r"
# Download one page of results, then download every document (hit) it lists.
#
# NOTE(review): PAGE_N, TOTAL_PAGE, PAGE_SIZE, ISTEX_URI, FILETYPE and URI are
# not assigned anywhere in the visible part of this script -- confirm where
# they are set before relying on this block.

echo "--> Téléchargement de la page $PAGE_N/$TOTAL_PAGE"
# Offset of the first hit of this page, passed to the API as "from".
FROM=$(wcalc -q "$PAGE_N.0 * $PAGE_SIZE")
URL="$ISTEX_URI&from=$FROM&output=fulltext,metadata"
PAGE=$(curl -s "$URL")
# $JQ holds a command plus its flags, hence the deliberate unquoted expansion.
# shellcheck disable=SC2086
NB_HITS=$(printf '%s\n' "$PAGE" | $JQ ".hits | length")

# The page is fetched once, above the loop: the loop header needs NB_HITS, and
# none of the fetch statements depend on the loop variable.
# "$NB_HITS" is quoted so that an empty value (failed download) makes seq fail
# and the loop run zero times, instead of silently becoming "seq 1".
# shellcheck disable=SC2086
for DOC_IDX in $(seq 1 "$NB_HITS"); do
  # seq counts from 1, jq array indices start at 0.
  DOC_IDX=$(wcalc -q "$DOC_IDX - 1")
  DOC=$(printf '%s\n' "$PAGE" | $JQ ".hits[$DOC_IDX]")
  DOC_ISTEXID=$(printf '%s\n' "$DOC" | $JQ ".id")
  # NOTE(review): DOC_FULLTEXT is not read again in the visible code.
  DOC_FULLTEXT=$(printf '%s\n' "$DOC" | $JQ ".fulltext")
  FILENAME="$DOC_ISTEXID.$FILETYPE"
  curl -s "$URI" > "$FILENAME"
  echo "--> Plein texte téléchargé : $FILENAME"
done
done
View
24
bash/harvest-perf.sh
# Announce how many documents are about to be downloaded.
echo "--> Téléchargement de $TOTAL_DOC documents"
# Fetch the result pages one at a time
for
PAGE_N
in
$(
seq
0
$TOTAL_PAGE
)
do
# Fetch result page number PAGE_N and count the hits it contains.
echo "--> Téléchargement de la page $PAGE_N/$TOTAL_PAGE"
# Offset of the first hit of this page, passed to the API as "from".
FROM=$(wcalc -q "$PAGE_N.0 * $PAGE_SIZE")
URL="$ISTEX_URI&from=$FROM&output=fulltext,metadata"
PAGE=$(curl -s "$URL")
# "$PAGE" is quoted so the JSON reaches jq without word splitting or globbing.
# $JQ is a command plus its flags, hence the deliberate unquoted expansion.
# shellcheck disable=SC2086
NB_HITS=$(printf '%s\n' "$PAGE" | $JQ ".hits | length")
# Hand every page number from 0 to TOTAL_PAGE to GNU parallel, which runs
# ./get_page.sh once per number ({} is replaced by the number).
seq 0 "$TOTAL_PAGE" | parallel --gnu "./get_page.sh {}"
#for PAGE_N in $(seq 0 $TOTAL_PAGE)
#do
#echo "--> Téléchargement de la page $PAGE_N/$TOTAL_PAGE"
#FROM=$(wcalc -q "$PAGE_N.0 * $PAGE_SIZE")
#URL="$ISTEX_URI&from=$FROM&output=fulltext,metadata"
#PAGE=$(curl -s $URL)
#NB_HITS=$(echo $PAGE | $JQ ".hits | length")
# Fetch the individual documents (hits) of the page: one ./get_docs.sh run per
# hit number, with the page JSON as second argument.
# --quote (-q) makes parallel shell-quote the command words, so the JSON in
# $PAGE arrives as a single argument instead of being re-parsed by the shell
# that parallel spawns.
seq 1 "$NB_HITS" | parallel --gnu -q ./get_docs.sh {} "$PAGE"
#for DOC_IDX in $(seq 1 $NB_HITS)
#do
#DOC_IDX=$(wcalc -q "$DOC_IDX - 1")
#DOC=$(echo $PAGE | $JQ ".hits[$DOC_IDX]")