#!/bin/bash
#
# Fetches rvl-cdip.tar.gz and labels_only.tar.gz and converts the images
# listed as invoices into PNG files below invoices/.

# errexit/pipefail: stop on the first failing command or pipeline stage.
# nounset: treat the use of an unset variable as an error.
# noglob: disable pathname expansion for the whole script.
set -o errexit -o nounset -o noglob -o pipefail

# Tell the user up front what the run is going to cost.
printf '%s\n' \
    "This script will require 80GB of disk space and might run for serveral hours" \
    "It will download 37GB" \
    ""

# Becomes true as soon as an archive is downloaded in this run; the checksums
# in download.md5 are only verified in that case.
check_md5sum=false

if [ ! -f rvl-cdip.tar.gz ]; then
    # Google Drive shows a virus-scan warning for this file instead of serving
    # it directly. The first request is made only to collect the cookies; the
    # confirmation code is read from the cookie file and sent back with the
    # second request.
    wget --save-cookies cookies.txt -O /dev/null "https://drive.google.com/uc?export=download&id=0Bz1dfcnrpXM-MUt4cHNzUEFXcmc"
    google_virus_confirmcode=$(awk '/download/ {print $NF}' cookies.txt)
    # Download under a temporary name and rename only on success, so that an
    # interrupted transfer is never mistaken for a complete archive (and
    # skipped without verification) on the next run.
    wget --load-cookies cookies.txt -O rvl-cdip.tar.gz.part "https://drive.google.com/uc?export=download&confirm=${google_virus_confirmcode}&id=0Bz1dfcnrpXM-MUt4cHNzUEFXcmc"
    mv rvl-cdip.tar.gz.part rvl-cdip.tar.gz
    # The cookie file is only needed for the two requests above.
    rm -f cookies.txt
    check_md5sum=true
fi
if [ ! -f labels_only.tar.gz ]; then
    # Same temporary-name scheme as above; -c asks wget to continue a
    # partially downloaded file.
    wget -c -O labels_only.tar.gz.part "https://drive.google.com/uc?export=download&id=0B0NKIRwUL9KYcXo3bV9LU0t3SGs"
    mv labels_only.tar.gz.part labels_only.tar.gz
    check_md5sum=true
fi

if $check_md5sum; then
    if ! md5sum -c download.md5; then
        # A bad archive would otherwise be reused silently on the next run,
        # because existing files are neither downloaded nor checked again.
        echo "Checksum verification failed: remove the archive(s) reported as FAILED above and run this script again" >&2
        exit 1
    fi
fi

# Get the list of invoices
echo "Build invoices.csv"
tar xzf labels_only.tar.gz

# Print the image path of every entry labelled 11 in the given label files.
# Each line is expected to look like "<image path> <label>", separated by a
# single space; label 11 is the one this script collects as invoices.
# Arguments: $@ - label files to scan
# Outputs:   one image path per line on stdout
# Returns:   non-zero when nothing matches or a file cannot be read
list_invoices() {
    # The leading space anchors the match to the whole label field, so a
    # line that merely ends in "11" (for example a label of 111) is ignored.
    grep -h ' 11$' -- "$@" | cut -d " " -f 1
}

list_invoices labels/test.txt labels/train.txt labels/val.txt > labels/invoices.csv

# Unpack the image archive only when there is no images directory yet.
[ -d images ] || {
    echo "Extract images"
    tar xzf rvl-cdip.tar.gz
}

# Make sure the output directory for the converted files exists.
mkdir -p invoices

# Map an image path from labels/invoices.csv to the flat file name used for
# the converted image below invoices/.
# Arguments: $1 - image path relative to images/
# Outputs:   the target file name on stdout
invoice_png_name() {
    # Flatten the directory structure: every "/" becomes "_".
    local name="invoices/${1//\//_}"
    # Swap the extension; the match is anchored at the end so a ".tif" that
    # appears earlier in the name is left untouched.
    name=${name/%.tif/.png}
    # Drop the first run of three single-character components ("_x_y_z");
    # presumably these are the one-letter directory levels of the archive.
    name=${name/_?_?_?/}
    # Remove the first "+" from the name.
    name=${name/\+/}
    printf '%s\n' "$name"
}

# Convert every listed image unless its target file already exists. The list
# is read line by line on file descriptor 3 so that commands inside the loop
# cannot consume it through stdin.
while IFS= read -r i <&3; do
    # Ignore blank lines in the list.
    [ -n "$i" ] || continue
    new_filename=$(invoice_png_name "$i")
    if [ -f "${new_filename}" ]; then
        echo "Skipping ${new_filename}"
    else
        convert "images/${i}" "${new_filename}"
        echo "Created ${new_filename}"
    fi
done 3< labels/invoices.csv


