schema: '2.0'
stages:
prepare:
cmd: node bin/address-to-area.mjs > data/area-address.tsv
deps:
- path: bin/address-to-area.mjs
md5: 586c0f15529591e46a6b22e2a05770d5
size: 942
- path: data/addresses-40-cnrs-rnsr-big-classes-train.txt
md5: 612c9731294d230f5fd4ed8c10d67468
size: 3694626
- path: data/netscity-ville-aire-uniq.tsv
md5: 85e405b8cc452b953e8dfd12dae5d8d6
size: 341275
- path: libs/geo.mjs
md5: af499b4463e041aeb5f4ca58af486565
size: 2443
params:
params.yaml:
prepare.input: data/addresses-40-cnrs-rnsr-big-classes-train.txt
outs:
- path: data/area-address.tsv
md5: 179a8eb09f2ddf72dd9aa2add136fb87
size: 4010846
extract-areas:
cmd: cat data/area-address.tsv| cut -f1 | sort -u > data/areas.txt
deps:
- path: data/area-address.tsv
md5: 179a8eb09f2ddf72dd9aa2add136fb87
size: 4010846
outs:
- path: data/areas.txt
md5: 6102ce8099fd61bccb6eb606b66ef260
size: 964
create-tree:
cmd:
- rm -rf areas
- cat data/areas.txt | sed -e 's|^|areas/|' | xargs mkdir -p
deps:
- path: data/areas.txt
md5: 6102ce8099fd61bccb6eb606b66ef260
size: 964
outs:
- path: areas
md5: d751713988987e9331980363e24189ce.dir
size: 0
nfiles: 0
clean-tree:
cmd: rm -rf areas
split:
cmd:
- rm -rf areas
- cat data/areas-train.txt | sed -e 's|^|areas/|' | xargs mkdir -p
- bash bin/split-addresses.sh
- bash bin/gather-little-areas.sh
- bin/assign-to-areas.mjs data/area-address-test.tsv
deps:
- path: bin/assign-to-areas.mjs
md5: d9343a744f33ccdd9ea79a9e97d33be8
size: 673
- path: bin/gather-little-areas.sh
md5: f8363cdb0c36496eac9fcb89f367bffd
size: 305
- path: bin/split-addresses.sh
md5: e64b86f9e4c88ef778e19a7423e59b0b
size: 176
- path: data/area-address-test.tsv
md5: 72ff16c01ed5e62d7ec3e2daafc4f257
size: 3700326
- path: data/area-address-train.tsv
md5: 179a8eb09f2ddf72dd9aa2add136fb87
size: 4010846
- path: data/areas-train.txt
md5: 6102ce8099fd61bccb6eb606b66ef260
size: 964
outs:
- path: areas
md5: 025c79e8e821b743ea5f9a2820c8d9f5.dir
size: 7104921
nfiles: 96
prepare@train:
cmd: node bin/address-to-area.mjs "data/addresses-40-cnrs-rnsr-big-classes-train.txt"
> data/area-address-train.tsv
deps:
- path: bin/address-to-area.mjs
md5: 34802f4af057d735b12ea9aa8ddd70cf
size: 889
- path: data/addresses-40-cnrs-rnsr-big-classes-train.txt
md5: 612c9731294d230f5fd4ed8c10d67468
size: 3694626
- path: data/netscity-ville-aire-uniq.tsv
md5: 85e405b8cc452b953e8dfd12dae5d8d6
size: 341275
- path: libs/geo.mjs
md5: af499b4463e041aeb5f4ca58af486565
size: 2443
outs:
- path: data/area-address-train.tsv
md5: 179a8eb09f2ddf72dd9aa2add136fb87
size: 4010846
extract-areas@train:
cmd: cat data/area-address-train.tsv| cut -f1 | sort -u > data/areas-train.txt
deps:
- path: data/area-address-train.tsv
md5: 179a8eb09f2ddf72dd9aa2add136fb87
size: 4010846
outs:
- path: data/areas-train.txt
md5: 6102ce8099fd61bccb6eb606b66ef260
size: 964
prepare@test:
cmd: node bin/address-to-area.mjs "data/addresses-40-cnrs-rnsr-big-classes-test.txt"
> data/area-address-test.tsv
deps:
- path: bin/address-to-area.mjs
md5: 34802f4af057d735b12ea9aa8ddd70cf
size: 889
- path: data/addresses-40-cnrs-rnsr-big-classes-test.txt
md5: 88402e9e874960f0b6f7eeb8e3c306d4
size: 3410283
- path: data/netscity-ville-aire-uniq.tsv
md5: 85e405b8cc452b953e8dfd12dae5d8d6
size: 341275
- path: libs/geo.mjs
md5: af499b4463e041aeb5f4ca58af486565
size: 2443
outs:
- path: data/area-address-test.tsv
md5: 72ff16c01ed5e62d7ec3e2daafc4f257
size: 3700326
extract-areas@test:
cmd: cat data/area-address-test.tsv| cut -f1 | sort -u > data/areas-test.txt
deps:
- path: data/area-address-test.tsv
md5: 72ff16c01ed5e62d7ec3e2daafc4f257
size: 3700326
outs:
- path: data/areas-test.txt
md5: b4464b712659fc2b0ebfaece2ba6b695
size: 962
test-to-areas:
cmd: bin/assign-to-areas.mjs data/area-address-test.tsv
deps:
- path: areas
md5: 7979ecda9047f54d236d01ad6ae3cc7f.dir
size: 7104921
nfiles: 97
- path: bin/assign-to-areas.mjs
md5: 6697688dfe19ee59dae600e49c0af4af
size: 657
- path: data/area-address-test.tsv
md5: 72ff16c01ed5e62d7ec3e2daafc4f257
size: 3700326
train:
cmd:
- mkdir -p models
- bin/train-areas.sh 30 "*" 4M
deps:
- path: areas
md5: 025c79e8e821b743ea5f9a2820c8d9f5.dir
size: 7104921
nfiles: 96
- path: bin/fasttext
md5: 2fbc2c71ba6e474327503d21206ec9b8
size: 462216
- path: bin/train-areas.sh
md5: b51c7308c8fca67995e77f6fd6237639
size: 464
- path: data/areas-test.txt
md5: b4464b712659fc2b0ebfaece2ba6b695
size: 962
- path: data/areas-train.txt
md5: 6102ce8099fd61bccb6eb606b66ef260
size: 964
outs:
- path: models
md5: 6195e3d34a21149180cd46363d049d6b.dir
size: 195517888
nfiles: 96
evaluate:
cmd:
- bin/evaluate-areas.sh
deps:
- path: bin/evaluate-areas.sh
md5: 6c7cbe18d0f52b876a7bc8cdc9f9ad41
size: 1125
- path: models
md5: 6195e3d34a21149180cd46363d049d6b.dir
size: 195517888
nfiles: 96
outs:
- path: metrics.json
md5: d872a11b4d335b589fa9ed25cf4b43dc
size: 74
- path: precision.json
md5: 364219f1383bd275d2db5d72fb24ce83
size: 2871