diff --git a/scripts/download-pdf-file-list-files.sh b/scripts/download-pdf-file-list-files.sh
new file mode 100755
index 0000000000000000000000000000000000000000..31d6982c9e637e74581ed6a29133b10b71ebe3a3
--- /dev/null
+++ b/scripts/download-pdf-file-list-files.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+#
+# Downloads every entry named in a cloud-hosted file list into a local
+# directory, then decompresses any *.gz files found in that directory.
+#
+# Usage:
+#   download-pdf-file-list-files.sh [CLOUD_FILE_LIST_PATH] [PDF_DIR]
+#
+# Each argument falls back to the environment variable of the same name:
+#   CLOUD_FILE_LIST_PATH  object read with "gsutil cat"; its output is piped
+#                         to "gsutil cp -I" as the list of things to copy.
+#   PDF_DIR               local destination directory (created if missing).
+#
+# Exits with status 1 when either value is missing or empty.
+
+# pipefail: without it, a failing "gsutil cat" would be hidden by the exit
+# status of the "gsutil cp" on the right-hand side of the pipe.
+# nounset: catch typos in variable names instead of expanding them to "".
+set -euo pipefail
+
+# The inner ":-" keeps nounset from aborting before the friendly checks below.
+CLOUD_FILE_LIST_PATH=${1:-${CLOUD_FILE_LIST_PATH:-}}
+PDF_DIR=${2:-${PDF_DIR:-}}
+
+if [ -z "${CLOUD_FILE_LIST_PATH}" ]; then
+  echo "Error: CLOUD_FILE_LIST_PATH required" >&2
+  exit 1
+fi
+
+if [ -z "${PDF_DIR}" ]; then
+  echo "Error: PDF_DIR required" >&2
+  exit 1
+fi
+
+echo "downloading dataset pdf from ${CLOUD_FILE_LIST_PATH} to ${PDF_DIR}"
+
+mkdir -p "${PDF_DIR}"
+gsutil cat "${CLOUD_FILE_LIST_PATH}" | gsutil -m cp -I "${PDF_DIR}/"
+
+# Collect the archives with nullglob so that "no .gz files" is an empty list
+# rather than a literal "*.gz" argument that makes gunzip fail.
+shopt -s nullglob
+gz_files=("${PDF_DIR}/"*.gz)
+shopt -u nullglob
+
+# Decompression stays best-effort (it never aborts the script), but a failure
+# is now reported on stderr instead of being silently discarded.
+if [ "${#gz_files[@]}" -gt 0 ] && ! gunzip -f -- "${gz_files[@]}"; then
+  echo "Warning: could not decompress every .gz file in ${PDF_DIR}" >&2
+fi
+
+ls -l "${PDF_DIR}/"