schema: '2.0' stages: prepare: cmd: node bin/address-to-area.mjs > data/area-address.tsv deps: - path: bin/address-to-area.mjs md5: 586c0f15529591e46a6b22e2a05770d5 size: 942 - path: data/addresses-40-cnrs-rnsr-big-classes-train.txt md5: 612c9731294d230f5fd4ed8c10d67468 size: 3694626 - path: data/netscity-ville-aire-uniq.tsv md5: 85e405b8cc452b953e8dfd12dae5d8d6 size: 341275 - path: libs/geo.mjs md5: af499b4463e041aeb5f4ca58af486565 size: 2443 params: params.yaml: prepare.input: data/addresses-40-cnrs-rnsr-big-classes-train.txt outs: - path: data/area-address.tsv md5: 179a8eb09f2ddf72dd9aa2add136fb87 size: 4010846 extract-areas: cmd: cat data/area-address.tsv| cut -f1 | sort -u > data/areas.txt deps: - path: data/area-address.tsv md5: 179a8eb09f2ddf72dd9aa2add136fb87 size: 4010846 outs: - path: data/areas.txt md5: 6102ce8099fd61bccb6eb606b66ef260 size: 964 create-tree: cmd: - rm -rf areas - cat data/areas.txt | sed -e 's|^|areas/|' | xargs mkdir -p deps: - path: data/areas.txt md5: 6102ce8099fd61bccb6eb606b66ef260 size: 964 outs: - path: areas md5: d751713988987e9331980363e24189ce.dir size: 0 nfiles: 0 clean-tree: cmd: rm -rf areas split: cmd: - rm -rf areas - cat data/areas-train.txt | sed -e 's|^|areas/|' | xargs mkdir -p - bash bin/split-addresses.sh - bash bin/gather-little-areas.sh - bin/assign-to-areas.mjs data/area-address-test.tsv deps: - path: bin/assign-to-areas.mjs md5: d9343a744f33ccdd9ea79a9e97d33be8 size: 673 - path: bin/gather-little-areas.sh md5: f8363cdb0c36496eac9fcb89f367bffd size: 305 - path: bin/split-addresses.sh md5: e64b86f9e4c88ef778e19a7423e59b0b size: 176 - path: data/area-address-test.tsv md5: 72ff16c01ed5e62d7ec3e2daafc4f257 size: 3700326 - path: data/area-address-train.tsv md5: 179a8eb09f2ddf72dd9aa2add136fb87 size: 4010846 - path: data/areas-train.txt md5: 6102ce8099fd61bccb6eb606b66ef260 size: 964 outs: - path: areas md5: 025c79e8e821b743ea5f9a2820c8d9f5.dir size: 7104921 nfiles: 96 prepare@train: cmd: node bin/address-to-area.mjs "data/addresses-40-cnrs-rnsr-big-classes-train.txt" > data/area-address-train.tsv deps: - path: bin/address-to-area.mjs md5: 34802f4af057d735b12ea9aa8ddd70cf size: 889 - path: data/addresses-40-cnrs-rnsr-big-classes-train.txt md5: 612c9731294d230f5fd4ed8c10d67468 size: 3694626 - path: data/netscity-ville-aire-uniq.tsv md5: 85e405b8cc452b953e8dfd12dae5d8d6 size: 341275 - path: libs/geo.mjs md5: af499b4463e041aeb5f4ca58af486565 size: 2443 outs: - path: data/area-address-train.tsv md5: 179a8eb09f2ddf72dd9aa2add136fb87 size: 4010846 extract-areas@train: cmd: cat data/area-address-train.tsv| cut -f1 | sort -u > data/areas-train.txt deps: - path: data/area-address-train.tsv md5: 179a8eb09f2ddf72dd9aa2add136fb87 size: 4010846 outs: - path: data/areas-train.txt md5: 6102ce8099fd61bccb6eb606b66ef260 size: 964 prepare@test: cmd: node bin/address-to-area.mjs "data/addresses-40-cnrs-rnsr-big-classes-test.txt" > data/area-address-test.tsv deps: - path: bin/address-to-area.mjs md5: 34802f4af057d735b12ea9aa8ddd70cf size: 889 - path: data/addresses-40-cnrs-rnsr-big-classes-test.txt md5: 88402e9e874960f0b6f7eeb8e3c306d4 size: 3410283 - path: data/netscity-ville-aire-uniq.tsv md5: 85e405b8cc452b953e8dfd12dae5d8d6 size: 341275 - path: libs/geo.mjs md5: af499b4463e041aeb5f4ca58af486565 size: 2443 outs: - path: data/area-address-test.tsv md5: 72ff16c01ed5e62d7ec3e2daafc4f257 size: 3700326 extract-areas@test: cmd: cat data/area-address-test.tsv| cut -f1 | sort -u > data/areas-test.txt deps: - path: data/area-address-test.tsv md5: 72ff16c01ed5e62d7ec3e2daafc4f257 size: 3700326 outs: - path: data/areas-test.txt md5: b4464b712659fc2b0ebfaece2ba6b695 size: 962 test-to-areas: cmd: bin/assign-to-areas.mjs data/area-address-test.tsv deps: - path: areas md5: 7979ecda9047f54d236d01ad6ae3cc7f.dir size: 7104921 nfiles: 97 - path: bin/assign-to-areas.mjs md5: 6697688dfe19ee59dae600e49c0af4af size: 657 - path: data/area-address-test.tsv md5: 72ff16c01ed5e62d7ec3e2daafc4f257 size: 3700326 train: cmd: - mkdir -p models - bin/train-areas.sh 30 "*" 2M deps: - path: areas md5: 025c79e8e821b743ea5f9a2820c8d9f5.dir size: 7104921 nfiles: 96 - path: bin/fasttext md5: 2fbc2c71ba6e474327503d21206ec9b8 size: 462216 - path: bin/train-areas.sh md5: b51c7308c8fca67995e77f6fd6237639 size: 464 - path: data/areas-test.txt md5: b4464b712659fc2b0ebfaece2ba6b695 size: 962 - path: data/areas-train.txt md5: 6102ce8099fd61bccb6eb606b66ef260 size: 964 outs: - path: models md5: 4bc30dee7cb60406116610ceb54f066a.dir size: 102068604 nfiles: 96 evaluate: cmd: - bin/evaluate-areas.sh deps: - path: bin/evaluate-areas.sh md5: 6c7cbe18d0f52b876a7bc8cdc9f9ad41 size: 1125 - path: models md5: 4bc30dee7cb60406116610ceb54f066a.dir size: 102068604 nfiles: 96 outs: - path: metrics.json md5: 785ff44558467419eaca3f8a17cbf48d size: 74 - path: precision.json md5: 474d58e16b4f8c0d3f70f23a51a5cb28 size: 2873