# Review notes. Leading commentary like this is skipped by patch and git apply.
# - Line breaks and indentation were restored from a whitespace-collapsed copy
#   of this patch. Confirm the context lines against the target files.
# - Both scripts were rewritten and data/area-address.tsv was added to the deps
#   of the split stage. The md5 and size values in the dvc.lock hunk and the
#   blob ids on the "index" lines therefore still describe the previous
#   contents. Run `dvc repro split` to refresh dvc.lock before committing.
diff --git a/bin/gather-little-areas.sh b/bin/gather-little-areas.sh
new file mode 100755
index 0000000..337027e
--- /dev/null
+++ b/bin/gather-little-areas.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+#
+# Merge every area holding fewer than MAX_LINES address lines into
+# areas/GatheredLittleAreas/addresses.txt, then delete that area directory.
+# Run from the repository root, after bin/split-addresses.sh.
+
+set -euo pipefail
+shopt -s nullglob # no area at all means zero iterations, not a literal pattern
+
+readonly MAX_LINES=40
+readonly GATHERED_DIR="areas/GatheredLittleAreas"
+
+mkdir -p "${GATHERED_DIR}"
+
+for addresses in areas/*/addresses.txt
+do
+  area_dir="${addresses%/*}"
+
+  # Never fold the gathering area into itself when the script is re-run.
+  if [[ "${area_dir}" == "${GATHERED_DIR}" ]]
+  then
+    continue
+  fi
+
+  # Count each file on its own and compare numerically. The layout of a
+  # combined wc listing (padding, trailing "total" row) varies with the number
+  # of files, and unquoted word splitting breaks area names containing blanks.
+  line_count=$(wc -l < "${addresses}")
+  if [[ "${line_count}" -lt "${MAX_LINES}" ]]
+  then
+    cat -- "${addresses}" >> "${GATHERED_DIR}/addresses.txt"
+    rm -rf -- "${area_dir}"
+  fi
+done
diff --git a/bin/split-addresses.sh b/bin/split-addresses.sh
new file mode 100755
index 0000000..1059711
--- /dev/null
+++ b/bin/split-addresses.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+#
+# For every area named in data/areas.txt, copy the rows of
+# data/area-address.tsv that belong to it into areas/AREA/addresses.txt.
+# Run from the repository root.
+
+set -euo pipefail
+
+# read trims surrounding blanks from each name but keeps inner ones. The extra
+# test keeps a final line that has no trailing newline.
+while read -r area || [[ -n "${area}" ]]
+do
+  if [[ -z "${area}" ]]
+  then
+    continue
+  fi
+
+  mkdir -p "areas/${area}"
+
+  # Keep the rows that start with the area name followed by whitespace. The
+  # name travels through the environment and is compared as a literal string,
+  # so regex metacharacters in it cannot select rows of another area.
+  AREA="${area}" awk '
+    BEGIN { name = ENVIRON["AREA"]; len = length(name) }
+    index($0, name) == 1 && substr($0, len + 1, 1) ~ /[[:space:]]/
+  ' data/area-address.tsv > "areas/${area}/addresses.txt"
+done < data/areas.txt
diff --git a/dvc.lock b/dvc.lock
index 8bfd102..4c8338b 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -47,3 +47,24 @@
       nfiles: 0
   clean-tree:
     cmd: rm -rf areas
+  split:
+    cmd:
+    - rm -rf areas
+    - cat data/areas.txt | sed -e 's|^|areas/|' | xargs mkdir -p
+    - bash bin/split-addresses.sh
+    - bash bin/gather-little-areas.sh
+    deps:
+    - path: bin/gather-little-areas.sh
+      md5: 951f07ec08ce75ba7187f41c570d7cf7
+      size: 287
+    - path: bin/split-addresses.sh
+      md5: c378294952b517901437b33b2a5cefea
+      size: 142
+    - path: data/areas.txt
+      md5: 6102ce8099fd61bccb6eb606b66ef260
+      size: 964
+    outs:
+    - path: areas
+      md5: 940c9151775b378b99f7add87531dd82.dir
+      size: 4010846
+      nfiles: 48
diff --git a/dvc.yaml b/dvc.yaml
index 33e82b8..a780897 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -18,12 +18,17 @@
       - data/area-address.tsv
     outs:
       - data/areas.txt
-  create-tree:
-    desc: Create the tree of areas
+  split:
+    desc: Split the addresses into the tree of areas
     cmd:
       - rm -rf areas
       - cat data/areas.txt | sed -e 's|^|areas/|' | xargs mkdir -p
+      - bash bin/split-addresses.sh
+      - bash bin/gather-little-areas.sh
     deps:
       - data/areas.txt
+      - data/area-address.tsv
+      - bin/split-addresses.sh
+      - bin/gather-little-areas.sh
     outs:
-      - areas # Unfortunately, does not take into account the subdirectories
\ No newline at end of file
+      - areas
\ No newline at end of file