diff --git a/data/.gitignore b/data/.gitignore index 1e4d7f1..7509b97 100644 --- a/data/.gitignore +++ b/data/.gitignore @@ -2,3 +2,4 @@ /addresses-40-cnrs-rnsr-big-classes-test.txt /addresses-40-cnrs-rnsr-big-classes-train.txt /area-address.tsv +/areas.txt diff --git a/dvc.lock b/dvc.lock index a247e23..43aab92 100644 --- a/dvc.lock +++ b/dvc.lock @@ -1,12 +1,11 @@ schema: '2.0' stages: prepare: - cmd: node bin/address-to-area.mjs data/addresses-40-cnrs-rnsr-big-classes-train.txt - > data/area-address.tsv + cmd: node bin/address-to-area.mjs > data/area-address.tsv deps: - path: bin/address-to-area.mjs - md5: 92569f4ef93c42d6659b43c7a93abf1d - size: 1093 + md5: 586c0f15529591e46a6b22e2a05770d5 + size: 942 - path: data/addresses-40-cnrs-rnsr-big-classes-train.txt md5: 612c9731294d230f5fd4ed8c10d67468 size: 3694626 @@ -16,7 +15,20 @@ - path: libs/geo.mjs md5: af499b4463e041aeb5f4ca58af486565 size: 2443 + params: + params.yaml: + prepare.input: data/addresses-40-cnrs-rnsr-big-classes-train.txt outs: - path: data/area-address.tsv md5: 179a8eb09f2ddf72dd9aa2add136fb87 size: 4010846 + extract-areas: + cmd: cat data/area-address.tsv| cut -f1 | sort -u > data/areas.txt + deps: + - path: data/area-address.tsv + md5: 179a8eb09f2ddf72dd9aa2add136fb87 + size: 4010846 + outs: + - path: data/areas.txt + md5: 6102ce8099fd61bccb6eb606b66ef260 + size: 964 diff --git a/dvc.yaml b/dvc.yaml index 3264443..8ede10d 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -11,3 +11,10 @@ - prepare.input outs: - data/area-address.tsv + extract-areas: + desc: Extract geographic areas + cmd: cat data/area-address.tsv| cut -f1 | sort -u > data/areas.txt + deps: + - data/area-address.tsv + outs: + - data/areas.txt