-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataset_resource_inconsistencies.sh
executable file
·30 lines (24 loc) · 1.18 KB
/
dataset_resource_inconsistencies.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#!/bin/bash
#
# Downloads the dump files from data.gouv.fr (if not already present), then
# looks for inconsistencies between the datasets and the dataset ids referenced by resources
#
# Depends on the comm, csvkit (csvcut), sed, sort and wc command-line tools
# Abort on the first failing command, unset variable or failing pipeline stage,
# so that a csvcut error cannot silently produce empty id lists and bogus counts.
set -euo pipefail

# Maximum CSV field size (bytes) handed to csvcut through -z (--maxfieldsize).
# NOTE(review): presumably raised because some dump fields exceed csvkit's
# default limit -- confirm.
readonly MAX_LINE_LEN=1000000

# Intermediate files: one unique, sorted id per line.
readonly DATASET_IDS_FILE=work/datasets_unique_id.txt
readonly RESOURCE_DATASET_IDS_FILE=work/resources_ref_dataset_unique_id.txt

# Report listing the dataset ids referenced by resources but absent from datasets.
readonly STATS_FILE=dist/dataset_ids_not_found.csv

# Fetch data/datasets.csv and data/resources.csv; nothing can be done without them.
./download_dumps.sh || exit 1

echo "Work on it..."
mkdir -p work

# Extract the id column of each dump, drop the CSV header line (sed 1d) and keep
# unique ids. LC_ALL=C pins sort to byte order so that it always agrees with the
# ordering comm expects below, whatever the caller's locale is.
csvcut -d ';' -c 'id' -z "$MAX_LINE_LEN" data/datasets.csv \
  | sed -e '1d' \
  | LC_ALL=C sort -u > "$DATASET_IDS_FILE"

# Same field-size limit as for datasets.csv: the whole row is parsed even though
# only one column is kept, so an oversized field anywhere would abort csvcut.
csvcut -d ';' -c 'dataset.id' -z "$MAX_LINE_LEN" data/resources.csv \
  | sed -e '1d' \
  | LC_ALL=C sort -u > "$RESOURCE_DATASET_IDS_FILE"

echo "Generating stats file $STATS_FILE"
mkdir -p dist

# comm -13 keeps only the lines unique to the second file, i.e. the dataset ids
# referenced by resources that do not exist in the datasets dump.
{
  echo "unknown dataset.id"
  LC_ALL=C comm -13 "$DATASET_IDS_FILE" "$RESOURCE_DATASET_IDS_FILE"
} > "$STATS_FILE"

dataset_nb=$(wc -l < "$DATASET_IDS_FILE")
resource_dataset_nb=$(wc -l < "$RESOURCE_DATASET_IDS_FILE")
# Skip the header line of the stats file before counting.
unknown_dataset_nb=$(sed -e '1d' "$STATS_FILE" | wc -l)

echo
echo "Dataset unique ids count in datasets table: $dataset_nb"
echo "Dataset unique ids number in resources.csv: $resource_dataset_nb"
echo "Unknown dataset ids number in resources.csv: $unknown_dataset_nb"