# rnsr-geo-ml-dvc / dvc.yaml
stages:
  # Runs bin/address-to-area.mjs once per entry of the `prepare` template
  # variable; each entry must provide an `input` and an `output` path.
  # NOTE(review): `prepare` is presumably defined in params.yaml or in a
  # `vars` section that is not visible here -- confirm.
  prepare:
    foreach: ${prepare}
    do:
      desc: Assign a geographical area to each address
      # Both paths are quoted so that a value containing whitespace reaches
      # the shell as a single word (the redirect target used to be unquoted).
      cmd: >-
        node bin/address-to-area.mjs "${item.input}" > "${item.output}"
      deps:
      # Per-item input, e.g. data/addresses-40-cnrs-rnsr-big-classes-<name>.txt
      - ${item.input}
      - data/netscity-ville-aire-uniq.tsv
      - bin/address-to-area.mjs
      - libs/geo.mjs
      outs:
      # Per-item output, e.g. data/area-address-<name>.tsv
      - ${item.output}
  # For each split (train, test): keep the first tab-separated column of
  # data/area-address-<split>.tsv, de-duplicate it with `sort -u`, and write
  # the result to data/areas-<split>.txt.
  extract-areas:
    foreach: [train, test]
    do:
      desc: Extract geographic areas
      # Folded scalar: the two lines below are joined by a single space.
      cmd: >-
        cat data/area-address-${item}.tsv| cut -f1 | sort -u
        > data/areas-${item}.txt
      deps:
      - data/area-address-${item}.tsv
      outs:
      - data/areas-${item}.txt
  # Rebuilds the `areas` directory from scratch: creates the directories
  # named in data/areas-train.txt, then runs the two helper scripts.
  split:
    desc: Split the addresses into the tree of areas
    cmd:
    # Start from an empty tree so stale directories never survive a re-run.
    - rm -rf areas
    # Prefix every line with "areas/" and create the resulting directories.
    # NOTE(review): xargs splits its input on whitespace, so an area name
    # containing a space would yield several directories -- confirm that
    # area names never contain whitespace or quotes.
    - cat data/areas-train.txt | sed -e 's|^|areas/|' | xargs mkdir -p
    # NOTE(review): presumably distributes data/area-address-train.tsv into
    # the tree (it is a declared dependency) -- confirm in the script.
    - bash bin/split-addresses.sh
    - bash bin/gather-little-areas.sh
    deps:
    - data/areas-train.txt
    - data/area-address-train.tsv
    - bin/split-addresses.sh
    - bin/gather-little-areas.sh
    outs:
    - areas
  # Runs bin/assign-to-areas.mjs on data/area-address-test.tsv; the stage
  # also depends on the `areas` tree.
  # NOTE(review): no `outs` are declared in the lines visible here -- confirm
  # where the script writes its result.
  test-to-areas:
    desc: Assign test addresses to existing (training) areas
    # The script is executed directly rather than through `node`.
    # NOTE(review): presumably relies on a shebang line and the executable
    # bit being set on the script -- confirm.
    cmd: >-
      bin/assign-to-areas.mjs data/area-address-test.tsv
    deps:
    - data/area-address-test.tsv
    - bin/assign-to-areas.mjs
    - areas