From 855cf7ee16986127fb3b3c632d98ccd5fbfa91e4 Mon Sep 17 00:00:00 2001
From: Daniel Ecer <de-code@users.noreply.github.com>
Date: Sat, 25 Jul 2020 14:52:37 +0200
Subject: [PATCH] added pdf download using file list (#56)

---
 scripts/download-pdf-file-list-files.sh | 45 +++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100755 scripts/download-pdf-file-list-files.sh

diff --git a/scripts/download-pdf-file-list-files.sh b/scripts/download-pdf-file-list-files.sh
new file mode 100755
index 0000000..31d6982
--- /dev/null
+++ b/scripts/download-pdf-file-list-files.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+#
+# Downloads the files named in a cloud-hosted file list into a local directory
+# and decompresses any *.gz files found in that directory afterwards.
+#
+# Usage: download-pdf-file-list-files.sh [CLOUD_FILE_LIST_PATH] [PDF_DIR]
+#   CLOUD_FILE_LIST_PATH  location of the file list, read with "gsutil cat"
+#   PDF_DIR               local target directory (created if missing)
+# Each argument falls back to the environment variable of the same name.
+
+# pipefail: a failing "gsutil cat" must abort the script instead of being
+# masked by the exit status of the "gsutil cp" that consumes its output.
+set -euo pipefail
+
+CLOUD_FILE_LIST_PATH=${1:-${CLOUD_FILE_LIST_PATH:-}}
+PDF_DIR=${2:-${PDF_DIR:-}}
+
+if [ -z "${CLOUD_FILE_LIST_PATH}" ]; then
+    echo "Error: CLOUD_FILE_LIST_PATH required" >&2
+    exit 1
+fi
+
+if [ -z "${PDF_DIR}" ]; then
+    echo "Error: PDF_DIR required" >&2
+    exit 1
+fi
+
+echo "downloading dataset pdf from ${CLOUD_FILE_LIST_PATH} to ${PDF_DIR}"
+
+mkdir -p -- "${PDF_DIR}"
+gsutil cat "${CLOUD_FILE_LIST_PATH}" | gsutil -m cp -I "${PDF_DIR}/"
+
+# Collect the archives first so gunzip is not handed a literal "*.gz" when
+# nothing matches.
+shopt -s nullglob
+gz_files=("${PDF_DIR}/"*.gz)
+shopt -u nullglob
+
+# Decompression stays best-effort (non-fatal), but a failure is reported.
+if [ "${#gz_files[@]}" -gt 0 ]; then
+    gunzip -f -- "${gz_files[@]}" \
+        || echo "Warning: could not decompress all .gz files in ${PDF_DIR}" >&2
+fi
+
+ls -l -- "${PDF_DIR}/"
-- 
GitLab