diff --git a/.gitignore b/.gitignore
index ab507f1..fbd9b73 100644
--- a/.gitignore
+++ b/.gitignore
@@ -85,3 +85,4 @@
 # LibreOffice temporary files
 .~lock.*
 /areas
+/models
diff --git a/bin/train-areas.sh b/bin/train-areas.sh
new file mode 100755
index 0000000..af8f4c2
--- /dev/null
+++ b/bin/train-areas.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+#
+# Train one fastText supervised model per area.
+#
+# Usage: bin/train-areas.sh [duration] [areas]
+#   duration  value passed to -autotune-duration, in seconds (default: 60)
+#   areas     glob matched against the entries of areas/ (default: *),
+#             e.g. "*" or ANGERS
+#
+# For each matching area, areas/<area>/addresses-train.txt is the training
+# input, areas/<area>/addresses-test.txt the autotune validation set, and
+# models/<area> the output prefix handed to fastText.
+#
+# Every matching area is attempted even if an earlier one fails. The exit
+# status is non-zero when no area matched or when any training run failed.
+
+# Without nullglob an unmatched pattern would be passed through literally and
+# fastText would be started on a path that does not exist.
+shopt -s nullglob
+
+duration=${1:-60} # seconds
+areas=${2:-*}     # * or ANGERS
+
+# Allow running the script on its own, outside the DVC stage.
+mkdir -p models
+
+total=0
+failed=""
+# ${areas} is left unquoted on purpose so the shell expands the glob.
+for areaPath in areas/${areas}
+do
+    # A literal name (no glob character) is not filtered by nullglob.
+    [ -d "${areaPath}" ] || continue
+    area="${areaPath#areas/}"
+    total=$((total + 1))
+    if ! bin/fasttext supervised \
+        -input "areas/${area}/addresses-train.txt" \
+        -output "models/${area}" \
+        -autotune-validation "areas/${area}/addresses-test.txt" \
+        -autotune-duration "${duration}"
+    then
+        echo "train-areas: training failed for area '${area}'" >&2
+        failed="${failed} ${area}"
+    fi
+done
+
+if [ "${total}" -eq 0 ]
+then
+    echo "train-areas: no area matches 'areas/${areas}'" >&2
+    exit 1
+fi
+
+if [ -n "${failed}" ]
+then
+    echo "train-areas: training failed for:${failed}" >&2
+    exit 1
+fi
diff --git a/dvc.lock b/dvc.lock
index 060b2d6..3e49e29 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -56,8 +56,8 @@
     - bin/assign-to-areas.mjs data/area-address-test.tsv
     deps:
     - path: bin/assign-to-areas.mjs
-      md5: 6697688dfe19ee59dae600e49c0af4af
-      size: 657
+      md5: d9343a744f33ccdd9ea79a9e97d33be8
+      size: 673
     - path: bin/gather-little-areas.sh
       md5: f8363cdb0c36496eac9fcb89f367bffd
       size: 305
@@ -75,9 +75,9 @@
       size: 964
     outs:
     - path: areas
-      md5: 7979ecda9047f54d236d01ad6ae3cc7f.dir
+      md5: 025c79e8e821b743ea5f9a2820c8d9f5.dir
       size: 7104921
-      nfiles: 97
+      nfiles: 96
   prepare@train:
     cmd: node bin/address-to-area.mjs "data/addresses-40-cnrs-rnsr-big-classes-train.txt"
       > data/area-address-train.tsv
@@ -151,3 +151,23 @@
     - path: data/area-address-test.tsv
       md5: 72ff16c01ed5e62d7ec3e2daafc4f257
       size: 3700326
+  train:
+    cmd:
+    - mkdir -p models
+    - bin/train-areas.sh 30 "*"
+    deps:
+    - path: areas
+      md5: 025c79e8e821b743ea5f9a2820c8d9f5.dir
+      size: 7104921
+      nfiles: 96
+    - path: bin/fasttext
+      md5: 2fbc2c71ba6e474327503d21206ec9b8
+      size: 462216
+    - path: data/areas-train.txt
+      md5: 6102ce8099fd61bccb6eb606b66ef260
+      size: 964
+    outs:
+    - path: models
+      md5: 6a3d10abfdd5f2c02620e816c71e0c2c.dir
+      size: 18803735951
+      nfiles: 94
diff --git a/dvc.yaml b/dvc.yaml
index 6e8a33f..8e871a4 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -39,3 +39,15 @@
       - data/area-address-test.tsv
     outs:
       - areas
+  train:
+    desc: Train a model in each area
+    cmd:
+      - mkdir -p models
+      - bin/train-areas.sh ${train.fasttext.duration} "${train.areas}"
+    deps:
+      - bin/fasttext
+      # - bin/train-areas.sh
+      - areas
+      - data/areas-train.txt
+    outs:
+      - models
diff --git a/params.yaml b/params.yaml
index baa7042..cfefe04 100644
--- a/params.yaml
+++ b/params.yaml
@@ -5,3 +5,7 @@
   test:
     input: data/addresses-40-cnrs-rnsr-big-classes-test.txt
     output: data/area-address-test.tsv
+train:
+  fasttext:
+    duration: 30 # seconds
+  areas: "*" # "*" or any of ANGERS, ALBI, ...