# rnsr-geo-ml-dvc / dvc.yaml
# DVC pipeline definition: every key under `stages` is one stage.
stages:
  # One `prepare` stage instance is generated per entry of the `prepare`
  # template variable; each entry supplies an `input` and an `output` path.
  prepare:
    foreach: "${prepare}"
    do:
      desc: Assign a geographical area to each address
      # The script's stdout is redirected into the stage output file.
      # NOTE(review): the output path is not shell-quoted, unlike the input;
      # fine as long as the configured paths contain no whitespace.
      cmd: >-
        node bin/address-to-area.mjs  "${item.input}" >  ${item.output}
      deps:
      # Path pattern noted by the original author:
      # data/addresses-40-cnrs-rnsr-big-classes-${item}.txt
      - "${item.input}"
      - data/netscity-ville-aire-uniq.tsv
      - bin/address-to-area.mjs
      - libs/geo.mjs
      outs:
      # Path pattern noted by the original author:
      # data/area-address-${item}.tsv
      - "${item.output}"
  # Expanded into one stage per dataset split listed in `foreach`
  # (extract-areas@train and extract-areas@test).
  extract-areas:
    foreach: [train, test]
    do:
      desc: Extract geographic areas
      # Keep the first tab-separated column of the split's TSV file, then
      # sort it and drop duplicates. The two lines below fold into a single
      # command line joined by one space.
      cmd: >-
        cat data/area-address-${item}.tsv|
        cut -f1 | sort -u > data/areas-${item}.txt
      deps:
      - data/area-address-${item}.tsv
      outs:
      - data/areas-${item}.txt
  # Rebuilds the `areas` directory tree from the list of training areas and
  # runs the scripts that fill it.
  split:
    desc: Split the addresses into the tree of areas
    cmd:
    # Start from an empty tree.
    - rm -rf areas
    # Create one directory under areas/ per line of data/areas-train.txt.
    # NOTE(review): xargs splits its input on whitespace and interprets
    # quotes, so an area name containing a space or an apostrophe would break
    # this step; confirm the area names are safe.
    - cat data/areas-train.txt | sed -e 's|^|areas/|' | xargs mkdir -p
    - bash bin/split-addresses.sh
    - bash bin/gather-little-areas.sh
    # NOTE(review): invoked directly rather than through `node` (unlike
    # bin/address-to-area.mjs in the prepare stage), so this relies on the
    # script being executable; confirm that is intended.
    - bin/assign-to-areas.mjs data/area-address-test.tsv
    deps:
    - data/areas-train.txt
    - data/area-address-train.tsv
    - bin/split-addresses.sh
    - bin/gather-little-areas.sh
    - bin/assign-to-areas.mjs
    - data/area-address-test.tsv
    outs:
    - areas
  # Runs the training script over the `areas` tree; its three arguments are
  # filled in from the `train` template variables.
  train:
    desc: Train a model in each area
    cmd:
    # Create the `models` output directory first.
    - mkdir -p models
    # Folded scalar: the lines below are joined with single spaces into one
    # command line (script, duration, areas, model size).
    - >-
      bin/train-areas.sh
      ${train.fasttext.duration}
      "${train.areas}"
      ${train.fasttext.modelsize}
    deps:
    - bin/fasttext
    - bin/train-areas.sh
    - areas
    - data/areas-train.txt
    - data/areas-test.txt
    outs:
    - models
  # Runs the evaluation script against the trained models and declares its
  # results as a DVC metrics file and a scatter plot.
  evaluate:
    desc: Evaluate the precision of the models in each area
    cmd:
    - bin/evaluate-areas.sh
    # NOTE(review): only `models` and the script itself are declared; if the
    # script also reads test data (e.g. from `areas`), declare it here.
    deps:
    - models
    - bin/evaluate-areas.sh
    # `cache: false` keeps these result files out of the DVC cache.
    metrics:
    - metrics.json:
        desc: Statistiques sur les précisions obtenues par aire géographique
        cache: false
    plots:
    - precision.json:
        cache: false
        # Field of precision.json plotted on the y axis.
        y: precision
        template: scatter
        title: Précision par aire géographique
        x_label: Aire
        y_label: Précision