# rnsr-geo-ml-dvc / dvc.lock
# Lock-file format version; quoted so it stays the string '2.0' rather than a float.
schema: '2.0'
# One entry per pipeline stage: the command that was run plus the md5/size
# recorded for each of its dependencies and outputs.
stages:
  # Runs bin/address-to-area.mjs with node and redirects its standard output
  # to data/area-address.tsv.
  # NOTE(review): the recorded output hash is identical to the one recorded for
  # data/area-address-train.tsv by prepare@train, while the hash recorded here
  # for bin/address-to-area.mjs differs from the one in prepare@train /
  # prepare@test. This entry may be a leftover from an earlier pipeline
  # layout - confirm against dvc.yaml.
  prepare:
    cmd: node bin/address-to-area.mjs > data/area-address.tsv
    # Files whose content hash is tracked as an input of this stage.
    deps:
    - path: bin/address-to-area.mjs
      md5: 586c0f15529591e46a6b22e2a05770d5
      size: 942
    - path: data/addresses-40-cnrs-rnsr-big-classes-train.txt
      md5: 612c9731294d230f5fd4ed8c10d67468
      size: 3694626
    - path: data/netscity-ville-aire-uniq.tsv
      md5: 85e405b8cc452b953e8dfd12dae5d8d6
      size: 341275
    - path: libs/geo.mjs
      md5: af499b4463e041aeb5f4ca58af486565
      size: 2443
    # Parameter value from params.yaml recorded for this stage; it names the
    # same file as the second dependency above.
    params:
      params.yaml:
        prepare.input: data/addresses-40-cnrs-rnsr-big-classes-train.txt
    outs:
    - path: data/area-address.tsv
      md5: 179a8eb09f2ddf72dd9aa2add136fb87
      size: 4010846
  # Keeps the first field (cut -f1) of every line of data/area-address.tsv and
  # writes the sorted, de-duplicated values (sort -u) to data/areas.txt.
  # NOTE(review): the recorded output hash equals the one recorded for
  # data/areas-train.txt by extract-areas@train - confirm this stage is still
  # defined in dvc.yaml.
  extract-areas:
    cmd: cat data/area-address.tsv| cut -f1 | sort -u > data/areas.txt
    deps:
    - path: data/area-address.tsv
      md5: 179a8eb09f2ddf72dd9aa2add136fb87
      size: 4010846
    outs:
    - path: data/areas.txt
      md5: 6102ce8099fd61bccb6eb606b66ef260
      size: 964
  # Deletes the areas directory, then prefixes every line of data/areas.txt
  # with "areas/" and hands the resulting paths to `mkdir -p`.
  # NOTE(review): `areas` is also recorded as an output of the `split` stage;
  # check in dvc.yaml whether this stage still exists.
  create-tree:
    cmd:
    - rm -rf areas
    - cat data/areas.txt | sed -e 's|^|areas/|' | xargs mkdir -p
    deps:
    - path: data/areas.txt
      md5: 6102ce8099fd61bccb6eb606b66ef260
      size: 964
    outs:
    # Recorded with zero files and zero bytes.
    - path: areas
      md5: d751713988987e9331980363e24189ce.dir
      size: 0
      nfiles: 0
  # Deletes the areas directory. No dependencies or outputs are recorded for
  # this stage.
  clean-tree:
    cmd: rm -rf areas
  # Rebuilds the areas directory: deletes it, creates one "areas/<line>"
  # directory per line of data/areas-train.txt, then runs
  # bin/split-addresses.sh, bin/gather-little-areas.sh and
  # bin/assign-to-areas.mjs (the latter with data/area-address-test.tsv as its
  # argument), in that order.
  split:
    cmd:
    - rm -rf areas
    - cat data/areas-train.txt | sed -e 's|^|areas/|' | xargs mkdir -p
    - bash bin/split-addresses.sh
    - bash bin/gather-little-areas.sh
    - bin/assign-to-areas.mjs data/area-address-test.tsv
    # The three scripts invoked above plus the train/test data files.
    deps:
    - path: bin/assign-to-areas.mjs
      md5: d9343a744f33ccdd9ea79a9e97d33be8
      size: 673
    - path: bin/gather-little-areas.sh
      md5: f8363cdb0c36496eac9fcb89f367bffd
      size: 305
    - path: bin/split-addresses.sh
      md5: e64b86f9e4c88ef778e19a7423e59b0b
      size: 176
    - path: data/area-address-test.tsv
      md5: 72ff16c01ed5e62d7ec3e2daafc4f257
      size: 3700326
    - path: data/area-address-train.tsv
      md5: 179a8eb09f2ddf72dd9aa2add136fb87
      size: 4010846
    - path: data/areas-train.txt
      md5: 6102ce8099fd61bccb6eb606b66ef260
      size: 964
    outs:
    # Directory output; the same hash and file count are recorded as a
    # dependency of the `train` stage.
    - path: areas
      md5: 025c79e8e821b743ea5f9a2820c8d9f5.dir
      size: 7104921
      nfiles: 96
  # Runs bin/address-to-area.mjs with the train address file as its argument
  # and redirects standard output to data/area-address-train.tsv.
  # The "@train" suffix is presumably DVC's naming for a stage generated from a
  # `foreach` definition - confirm in dvc.yaml.
  prepare@train:
    cmd: node bin/address-to-area.mjs  "data/addresses-40-cnrs-rnsr-big-classes-train.txt"
      >  data/area-address-train.tsv
    deps:
    - path: bin/address-to-area.mjs
      md5: 34802f4af057d735b12ea9aa8ddd70cf
      size: 889
    - path: data/addresses-40-cnrs-rnsr-big-classes-train.txt
      md5: 612c9731294d230f5fd4ed8c10d67468
      size: 3694626
    - path: data/netscity-ville-aire-uniq.tsv
      md5: 85e405b8cc452b953e8dfd12dae5d8d6
      size: 341275
    - path: libs/geo.mjs
      md5: af499b4463e041aeb5f4ca58af486565
      size: 2443
    outs:
    - path: data/area-address-train.tsv
      md5: 179a8eb09f2ddf72dd9aa2add136fb87
      size: 4010846
  # Keeps the first field (cut -f1) of every line of
  # data/area-address-train.tsv and writes the sorted, de-duplicated values
  # (sort -u) to data/areas-train.txt.
  extract-areas@train:
    cmd: cat data/area-address-train.tsv| cut -f1 | sort -u > data/areas-train.txt
    deps:
    - path: data/area-address-train.tsv
      md5: 179a8eb09f2ddf72dd9aa2add136fb87
      size: 4010846
    outs:
    - path: data/areas-train.txt
      md5: 6102ce8099fd61bccb6eb606b66ef260
      size: 964
  # Runs bin/address-to-area.mjs with the test address file as its argument
  # and redirects standard output to data/area-address-test.tsv.
  # The "@test" suffix is presumably DVC's naming for a stage generated from a
  # `foreach` definition - confirm in dvc.yaml.
  prepare@test:
    cmd: node bin/address-to-area.mjs  "data/addresses-40-cnrs-rnsr-big-classes-test.txt"
      >  data/area-address-test.tsv
    deps:
    - path: bin/address-to-area.mjs
      md5: 34802f4af057d735b12ea9aa8ddd70cf
      size: 889
    - path: data/addresses-40-cnrs-rnsr-big-classes-test.txt
      md5: 88402e9e874960f0b6f7eeb8e3c306d4
      size: 3410283
    - path: data/netscity-ville-aire-uniq.tsv
      md5: 85e405b8cc452b953e8dfd12dae5d8d6
      size: 341275
    - path: libs/geo.mjs
      md5: af499b4463e041aeb5f4ca58af486565
      size: 2443
    outs:
    - path: data/area-address-test.tsv
      md5: 72ff16c01ed5e62d7ec3e2daafc4f257
      size: 3700326
  # Keeps the first field (cut -f1) of every line of
  # data/area-address-test.tsv and writes the sorted, de-duplicated values
  # (sort -u) to data/areas-test.txt.
  extract-areas@test:
    cmd: cat data/area-address-test.tsv| cut -f1 | sort -u > data/areas-test.txt
    deps:
    - path: data/area-address-test.tsv
      md5: 72ff16c01ed5e62d7ec3e2daafc4f257
      size: 3700326
    outs:
    - path: data/areas-test.txt
      md5: b4464b712659fc2b0ebfaece2ba6b695
      size: 962
  # Runs bin/assign-to-areas.mjs with data/area-address-test.tsv as its
  # argument; this is the same command as the last step of the `split` stage.
  # No outputs are recorded for this stage.
  # NOTE(review): the hashes recorded here for `areas` (97 files) and for
  # bin/assign-to-areas.mjs differ from the ones recorded by `split`
  # (96 files), so this entry may be stale - confirm against dvc.yaml.
  test-to-areas:
    cmd: bin/assign-to-areas.mjs data/area-address-test.tsv
    deps:
    - path: areas
      md5: 7979ecda9047f54d236d01ad6ae3cc7f.dir
      size: 7104921
      nfiles: 97
    - path: bin/assign-to-areas.mjs
      md5: 6697688dfe19ee59dae600e49c0af4af
      size: 657
    - path: data/area-address-test.tsv
      md5: 72ff16c01ed5e62d7ec3e2daafc4f257
      size: 3700326
  # Creates the models directory if needed, then runs bin/train-areas.sh with
  # the arguments 60, "LILLE METZ PARIS" and 4M (their meaning is defined by
  # the script, which is not part of this file).
  train:
    cmd:
    - mkdir -p models
    - bin/train-areas.sh 60 "LILLE METZ PARIS" 4M
    deps:
    # Same hash and file count as the `areas` output recorded by `split`.
    - path: areas
      md5: 025c79e8e821b743ea5f9a2820c8d9f5.dir
      size: 7104921
      nfiles: 96
    - path: bin/fasttext
      md5: 2fbc2c71ba6e474327503d21206ec9b8
      size: 462216
    - path: bin/train-areas.sh
      md5: b51c7308c8fca67995e77f6fd6237639
      size: 464
    - path: data/areas-test.txt
      md5: b4464b712659fc2b0ebfaece2ba6b695
      size: 962
    - path: data/areas-train.txt
      md5: 6102ce8099fd61bccb6eb606b66ef260
      size: 964
    outs:
    # Directory output containing 6 files; consumed by the `evaluate` stage.
    - path: models
      md5: 0316b065e00a0ff4721b532e8e124a27.dir
      size: 15617069
      nfiles: 6
  # Runs bin/evaluate-areas.sh with the single argument "LILLE METZ PARIS"
  # (the same string passed to bin/train-areas.sh in the `train` stage) and
  # records metrics.json and precision.json as outputs.
  evaluate:
    cmd:
    - bin/evaluate-areas.sh "LILLE METZ PARIS"
    deps:
    - path: bin/evaluate-areas.sh
      md5: 6c7cbe18d0f52b876a7bc8cdc9f9ad41
      size: 1125
    # Same hash and file count as the `models` output recorded by `train`.
    - path: models
      md5: 0316b065e00a0ff4721b532e8e124a27.dir
      size: 15617069
      nfiles: 6
    outs:
    - path: metrics.json
      md5: 392281e19c45feb05a16f09ab519a3ef
      size: 74
    - path: precision.json
      md5: 7671b9c2c3db4383bd2e39bb9cc88ac2
      size: 167