diff --git a/README.md b/README.md index 146a040..d7ec021 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ ### Experiments -Pour initialiser le fichier `dvc.yaml`, contenant les +Pour initialiser le fichier `dvc.yaml`, générant les [expériences](https://dvc.org/doc/start/experiments), on peut utiliser `dvc exp init --interactive`. diff --git a/bin/address-to-area.mjs b/bin/address-to-area.mjs index 4b60cf7..5f28d13 100755 --- a/bin/address-to-area.mjs +++ b/bin/address-to-area.mjs @@ -9,17 +9,16 @@ import { getParam } from "../libs/params.mjs"; import { getArea, getMapping, getSpacedMapping } from "../libs/geo.mjs"; -/** @type {string} */ -const inputFilePath = await getParam(["prepare", "input"]); +const [, , inputFilePath] = process.argv; console.error({ inputFilePath }) const mapping = await getMapping(); const spacedMapping = getSpacedMapping(mapping); -const trainingAddresses = (await fs.readFile(inputFilePath, "utf-8")) +const addresses = (await fs.readFile(inputFilePath, "utf-8")) .split("\n"); -for (const address of trainingAddresses) { +for (const address of addresses) { const area = getArea(address, mapping, spacedMapping); if (Array.isArray(area)) { console.error(`${address} => ${area}`); diff --git a/bin/gather-little-areas.sh b/bin/gather-little-areas.sh index 337027e..9964bc2 100755 --- a/bin/gather-little-areas.sh +++ b/bin/gather-little-areas.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -littleAreas=$(wc -l areas/*/addresses.txt | \ +littleAreas=$(wc -l areas/*/addresses-train.txt | \ grep -E "\s[123]?[0-9] " | \ cut -d/ -f2) @@ -8,6 +8,6 @@ for area in $littleAreas do - cat "areas/${area}/addresses.txt" >> areas/GatheredLittleAreas/addresses.txt + cat "areas/${area}/addresses-train.txt" >> areas/GatheredLittleAreas/addresses-train.txt rm -rf "areas/${area}" done diff --git a/bin/split-addresses.sh b/bin/split-addresses.sh index 1059711..933dbbe 100755 --- a/bin/split-addresses.sh +++ b/bin/split-addresses.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -for area in $(cat data/areas.txt) +for area in $(cat data/areas-train.txt) do - grep -E "^${area}\s" data/area-address.tsv > "areas/${area}/addresses.txt" + grep -E "^${area}\s" data/area-address-train.tsv > "areas/${area}/addresses-train.txt" done diff --git a/data/.gitignore b/data/.gitignore index 7509b97..07f0325 100644 --- a/data/.gitignore +++ b/data/.gitignore @@ -3,3 +3,7 @@ /addresses-40-cnrs-rnsr-big-classes-train.txt /area-address.tsv /areas.txt +/area-address-train.tsv +/areas-train.txt +/area-address-test.tsv +/areas-test.txt diff --git a/dvc.lock b/dvc.lock index 4c8338b..9240cdc 100644 --- a/dvc.lock +++ b/dvc.lock @@ -50,21 +50,84 @@ split: cmd: - rm -rf areas - - cat data/areas.txt | sed -e 's|^|areas/|' | xargs mkdir -p + - cat data/areas-train.txt | sed -e 's|^|areas/|' | xargs mkdir -p - bash bin/split-addresses.sh - bash bin/gather-little-areas.sh deps: - path: bin/gather-little-areas.sh - md5: 951f07ec08ce75ba7187f41c570d7cf7 - size: 287 + md5: f8363cdb0c36496eac9fcb89f367bffd + size: 305 - path: bin/split-addresses.sh - md5: c378294952b517901437b33b2a5cefea - size: 142 - - path: data/areas.txt + md5: a673fe80ebde2c236ede30c4ed57ce8c + size: 160 + - path: data/area-address-train.tsv + md5: 179a8eb09f2ddf72dd9aa2add136fb87 + size: 4010846 + - path: data/areas-train.txt md5: 6102ce8099fd61bccb6eb606b66ef260 size: 964 outs: - path: areas - md5: 940c9151775b378b99f7add87531dd82.dir + md5: 880febe8c0d19ec9ae40a29f5bf026ba.dir size: 4010846 nfiles: 48 + prepare@train: + cmd: node bin/address-to-area.mjs "data/addresses-40-cnrs-rnsr-big-classes-train.txt" + > data/area-address-train.tsv + deps: + - path: bin/address-to-area.mjs + md5: 1b7878a5dda36bc9b67b49778aac702a + size: 886 + - path: data/addresses-40-cnrs-rnsr-big-classes-train.txt + md5: 612c9731294d230f5fd4ed8c10d67468 + size: 3694626 + - path: data/netscity-ville-aire-uniq.tsv + md5: 85e405b8cc452b953e8dfd12dae5d8d6 + size: 341275 + - path: libs/geo.mjs + md5: af499b4463e041aeb5f4ca58af486565 + size: 2443 + outs: + - path: data/area-address-train.tsv + md5: 179a8eb09f2ddf72dd9aa2add136fb87 + size: 4010846 + extract-areas@train: + cmd: cat data/area-address-train.tsv| cut -f1 | sort -u > data/areas-train.txt + deps: + - path: data/area-address-train.tsv + md5: 179a8eb09f2ddf72dd9aa2add136fb87 + size: 4010846 + outs: + - path: data/areas-train.txt + md5: 6102ce8099fd61bccb6eb606b66ef260 + size: 964 + prepare@test: + cmd: node bin/address-to-area.mjs "data/addresses-40-cnrs-rnsr-big-classes-test.txt" + > data/area-address-test.tsv + deps: + - path: bin/address-to-area.mjs + md5: 1b7878a5dda36bc9b67b49778aac702a + size: 886 + - path: data/addresses-40-cnrs-rnsr-big-classes-test.txt + md5: 88402e9e874960f0b6f7eeb8e3c306d4 + size: 3410283 + - path: data/netscity-ville-aire-uniq.tsv + md5: 85e405b8cc452b953e8dfd12dae5d8d6 + size: 341275 + - path: libs/geo.mjs + md5: af499b4463e041aeb5f4ca58af486565 + size: 2443 + outs: + - path: data/area-address-test.tsv + md5: 72ff16c01ed5e62d7ec3e2daafc4f257 + size: 3700326 + extract-areas@test: + cmd: cat data/area-address-test.tsv| cut -f1 | sort -u > data/areas-test.txt + deps: + - path: data/area-address-test.tsv + md5: 72ff16c01ed5e62d7ec3e2daafc4f257 + size: 3700326 + outs: + - path: data/areas-test.txt + md5: b4464b712659fc2b0ebfaece2ba6b695 + size: 962 diff --git a/dvc.yaml b/dvc.yaml index a780897..f0ba632 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -1,32 +1,37 @@ stages: prepare: - desc: Assign a geographical area to each address - cmd: node bin/address-to-area.mjs > data/area-address.tsv - deps: - - data/addresses-40-cnrs-rnsr-big-classes-train.txt - - data/netscity-ville-aire-uniq.tsv - - bin/address-to-area.mjs - - libs/geo.mjs - params: - - prepare.input - outs: - - data/area-address.tsv + foreach: ${prepare} + do: + desc: Assign a geographical area to each address + cmd: node bin/address-to-area.mjs "${item.input}" > ${item.output} + deps: + - ${item.input} # - data/addresses-40-cnrs-rnsr-big-classes-${item}.txt + - data/netscity-ville-aire-uniq.tsv + - bin/address-to-area.mjs + - libs/geo.mjs + outs: + - ${item.output} # data/area-address-${item}.tsv extract-areas: - desc: Extract geographic areas - cmd: cat data/area-address.tsv| cut -f1 | sort -u > data/areas.txt - deps: - - data/area-address.tsv - outs: - - data/areas.txt + foreach: + - train + - test + do: + desc: Extract geographic areas + cmd: cat data/area-address-${item}.tsv| cut -f1 | sort -u > data/areas-${item}.txt + deps: + - data/area-address-${item}.tsv + outs: + - data/areas-${item}.txt split: desc: Split the adresses into the tree of areas cmd: - rm -rf areas - - cat data/areas.txt | sed -e 's|^|areas/|' | xargs mkdir -p + - cat data/areas-train.txt | sed -e 's|^|areas/|' | xargs mkdir -p - bash bin/split-addresses.sh - bash bin/gather-little-areas.sh deps: - - data/areas.txt + - data/areas-train.txt + - data/area-address-train.tsv - bin/split-addresses.sh - bin/gather-little-areas.sh outs: diff --git a/params.yaml b/params.yaml index 2445ce4..baa7042 100644 --- a/params.yaml +++ b/params.yaml @@ -1,2 +1,7 @@ prepare: - input: data/addresses-40-cnrs-rnsr-big-classes-train.txt \ No newline at end of file + train: + input: data/addresses-40-cnrs-rnsr-big-classes-train.txt + output: data/area-address-train.tsv + test: + input: data/addresses-40-cnrs-rnsr-big-classes-test.txt + output: data/area-address-test.tsv