From 98256f2c651516f4ef5a25f81f256736034b54b0 Mon Sep 17 00:00:00 2001
From: Daniel Ecer <de-code@users.noreply.github.com>
Date: Fri, 23 Aug 2019 14:32:16 +0100
Subject: [PATCH] copy and upload segmentation training data (#8)

---
 ...opy-raw-training-data-to-file-structure.sh | 49 ++++++++++++++-----
 scripts/upload-dataset.sh                     |  4 ++
 2 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/scripts/copy-raw-training-data-to-file-structure.sh b/scripts/copy-raw-training-data-to-file-structure.sh
index 7c2cf08..73f1100 100755
--- a/scripts/copy-raw-training-data-to-file-structure.sh
+++ b/scripts/copy-raw-training-data-to-file-structure.sh
@@ -20,22 +20,45 @@ fi
 echo "RAW_TRAINING_DATA_DIR=${RAW_TRAINING_DATA_DIR}"
 echo "DATASET_DIR=${DATASET_DIR}"
 
-header_headers_dir="$DATASET_DIR/header/corpus/headers"
-header_tei_dir="$DATASET_DIR/header/corpus/tei-raw"
+mkdir_clean() {  # args: directories to create if missing and then empty of files
+    local dir; for dir in "$@"; do  # keep the loop variable out of the global scope
+        echo "creating or cleaning directory: ${dir}"
+        mkdir -p -- "${dir}"
+        rm -f -- "${dir:?}"/* || true  # :? aborts on an empty name (never "rm /*"); best-effort, non-recursive
+    done
+}
 
-mkdir -p "$header_headers_dir"
-mkdir -p "$header_tei_dir"
+copy_segmentation_files() {  # copy *.segmentation and *.segmentation.tei.xml files into $DATASET_DIR/segmentation/corpus
+    local segmentation_raw_dir="$DATASET_DIR/segmentation/corpus/raw"
+    local segmentation_tei_dir="$DATASET_DIR/segmentation/corpus/tei-raw"
+    mkdir_clean "$segmentation_raw_dir" "$segmentation_tei_dir"

-rm "${header_headers_dir}"/* || true
-rm "${header_tei_dir}"/* || true
+    echo "copying files from $RAW_TRAINING_DATA_DIR to $segmentation_raw_dir"
+    cp -a "$RAW_TRAINING_DATA_DIR/"*.segmentation "$segmentation_raw_dir"
+    echo "renaming files $segmentation_raw_dir"
+    rename 's#\.training\.#\.#' "$segmentation_raw_dir"/*  # collapse ".training." in file names to "."

-echo "copying files from $RAW_TRAINING_DATA_DIR to $header_headers_dir"
-cp -a "$RAW_TRAINING_DATA_DIR/"*.header "$header_headers_dir"
-echo "renaming files $header_headers_dir"
-rename 's#\.training\.#\.#' "$header_headers_dir"/*
+    echo "copying files from $RAW_TRAINING_DATA_DIR to $segmentation_tei_dir"
+    cp -a "$RAW_TRAINING_DATA_DIR/"*.segmentation.tei.xml "$segmentation_tei_dir"
+    rename 's#\.training\.#\.#' "$segmentation_tei_dir"/*  # same rename for the TEI files
+}
 
-echo "copying files from $RAW_TRAINING_DATA_DIR to $header_tei_dir"
-cp -a "$RAW_TRAINING_DATA_DIR/"*.header.tei.xml "$header_tei_dir"
-rename 's#\.training\.#\.#' "$header_tei_dir"/*
+copy_header_files() {  # copy *.header and *.header.tei.xml files into $DATASET_DIR/header/corpus
+    local header_headers_dir="$DATASET_DIR/header/corpus/headers"
+    local header_tei_dir="$DATASET_DIR/header/corpus/tei-raw"
+    mkdir_clean "$header_headers_dir" "$header_tei_dir"
+
+    echo "copying files from $RAW_TRAINING_DATA_DIR to $header_headers_dir"
+    cp -a "$RAW_TRAINING_DATA_DIR/"*.header "$header_headers_dir"
+    echo "renaming files $header_headers_dir"
+    rename 's#\.training\.#\.#' "$header_headers_dir"/*  # collapse ".training." in file names to "."
+
+    echo "copying files from $RAW_TRAINING_DATA_DIR to $header_tei_dir"
+    cp -a "$RAW_TRAINING_DATA_DIR/"*.header.tei.xml "$header_tei_dir"
+    rename 's#\.training\.#\.#' "$header_tei_dir"/*  # same rename for the TEI files
+}
+
+copy_segmentation_files  # fills segmentation/corpus/raw and segmentation/corpus/tei-raw under DATASET_DIR
+copy_header_files  # fills header/corpus/headers and header/corpus/tei-raw under DATASET_DIR
 
 ls -l --recursive "${DATASET_DIR}"
diff --git a/scripts/upload-dataset.sh b/scripts/upload-dataset.sh
index 268881e..860b307 100755
--- a/scripts/upload-dataset.sh
+++ b/scripts/upload-dataset.sh
@@ -37,6 +37,10 @@ echo "CLOUD_DATATSET_PATH=${CLOUD_DATATSET_PATH}"
 
 
 sub_dirs=(
+    "segmentation/corpus/raw"
+    "segmentation/corpus/tei"
+    "segmentation/corpus/tei-raw"
+    "segmentation/corpus/tei-auto"
     "header/corpus/headers"
     "header/corpus/tei"
     "header/corpus/tei-raw"
-- 
GitLab