#!/bin/bash
#
# Download the files named in a cloud-hosted file list into a local directory
# and decompress any gzip archives among them.
#
# Usage:
#   download-pdf-file-list-files.sh [CLOUD_FILE_LIST_PATH] [PDF_DIR]
#
# Arguments (each falls back to the environment variable of the same name):
#   CLOUD_FILE_LIST_PATH  Object read with `gsutil cat`; its content is piped
#                         to `gsutil cp -I` as the list of sources to copy.
#   PDF_DIR               Local target directory (created if missing).
#
# Exit status:
#   1 if a required value is missing; non-zero if mkdir, either gsutil stage,
#   or the final listing fails. A gunzip failure only produces a warning.
#
# Provenance: scripts/download-pdf-file-list-files.sh, introduced by commit
# 855cf7ee16986127fb3b3c632d98ccd5fbfa91e4
# ("added pdf download using file list (#56)").

# pipefail matters here: without it a failing `gsutil cat` would be hidden by
# the exit status of the `gsutil cp` stage at the end of the pipeline.
set -euo pipefail

# Print a message to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

main() {
  # Positional arguments win; otherwise use the environment (may be unset).
  local file_list_path=${1:-${CLOUD_FILE_LIST_PATH:-}}
  local pdf_dir=${2:-${PDF_DIR:-}}

  if [[ -z "${file_list_path}" ]]; then
    die "Error: CLOUD_FILE_LIST_PATH required"
  fi

  if [[ -z "${pdf_dir}" ]]; then
    die "Error: PDF_DIR required"
  fi

  echo "downloading dataset pdf from ${file_list_path} to ${pdf_dir}"

  mkdir -p -- "${pdf_dir}"
  gsutil cat "${file_list_path}" \
    | gsutil -m cp -I "${pdf_dir}/"

  # Collect archives with nullglob so that "no .gz files" yields an empty
  # list instead of passing the literal pattern to gunzip.
  local -a gz_files=()
  shopt -s nullglob
  gz_files=("${pdf_dir}"/*.gz)
  shopt -u nullglob

  if (( ${#gz_files[@]} > 0 )); then
    # Decompression stays best-effort (the download itself succeeded), but a
    # failure is reported rather than silently swallowed.
    if ! gunzip -f -- "${gz_files[@]}"; then
      printf 'Warning: gunzip failed for one or more files in %s\n' \
        "${pdf_dir}" >&2
    fi
  fi

  ls -l -- "${pdf_dir}/"
}

main "$@"