From 98256f2c651516f4ef5a25f81f256736034b54b0 Mon Sep 17 00:00:00 2001
From: Daniel Ecer <de-code@users.noreply.github.com>
Date: Fri, 23 Aug 2019 14:32:16 +0100
Subject: [PATCH] copy and upload segmentation training data (#8)

---
 ...opy-raw-training-data-to-file-structure.sh | 49 ++++++++++++++-----
 scripts/upload-dataset.sh                     |  4 ++
 2 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/scripts/copy-raw-training-data-to-file-structure.sh b/scripts/copy-raw-training-data-to-file-structure.sh
index 7c2cf08..73f1100 100755
--- a/scripts/copy-raw-training-data-to-file-structure.sh
+++ b/scripts/copy-raw-training-data-to-file-structure.sh
@@ -20,22 +20,45 @@ fi
 echo "RAW_TRAINING_DATA_DIR=${RAW_TRAINING_DATA_DIR}"
 echo "DATASET_DIR=${DATASET_DIR}"
 
-header_headers_dir="$DATASET_DIR/header/corpus/headers"
-header_tei_dir="$DATASET_DIR/header/corpus/tei-raw"
+mkdir_clean() {
+    for dir in "$@"; do
+        echo "creating or cleaning directory: ${dir}"
+        mkdir -p "${dir}"
+        rm "${dir}"/* || true
+    done
+}
 
-mkdir -p "$header_headers_dir"
-mkdir -p "$header_tei_dir"
+copy_segmentation_files() {
+    segmentation_raw_dir="$DATASET_DIR/segmentation/corpus/raw"
+    segmentation_tei_dir="$DATASET_DIR/segmentation/corpus/tei-raw"
+    mkdir_clean "$segmentation_raw_dir" "${segmentation_tei_dir}"
 
-rm "${header_headers_dir}"/* || true
-rm "${header_tei_dir}"/* || true
+    echo "copying files from $RAW_TRAINING_DATA_DIR to $segmentation_raw_dir"
+    cp -a "$RAW_TRAINING_DATA_DIR/"*.segmentation "$segmentation_raw_dir"
+    echo "renaming files $segmentation_raw_dir"
+    rename 's#\.training\.#\.#' "$segmentation_raw_dir"/*
 
-echo "copying files from $RAW_TRAINING_DATA_DIR to $header_headers_dir"
-cp -a "$RAW_TRAINING_DATA_DIR/"*.header "$header_headers_dir"
-echo "renaming files $header_headers_dir"
-rename 's#\.training\.#\.#' "$header_headers_dir"/*
+    echo "copying files from $RAW_TRAINING_DATA_DIR to $segmentation_tei_dir"
+    cp -a "$RAW_TRAINING_DATA_DIR/"*.segmentation.tei.xml "$segmentation_tei_dir"
+    rename 's#\.training\.#\.#' "$segmentation_tei_dir"/*
+}
 
-echo "copying files from $RAW_TRAINING_DATA_DIR to $header_tei_dir"
-cp -a "$RAW_TRAINING_DATA_DIR/"*.header.tei.xml "$header_tei_dir"
-rename 's#\.training\.#\.#' "$header_tei_dir"/*
+copy_header_files() {
+    header_headers_dir="$DATASET_DIR/header/corpus/headers"
+    header_tei_dir="$DATASET_DIR/header/corpus/tei-raw"
+    mkdir_clean "$header_headers_dir" "${header_tei_dir}"
+
+    echo "copying files from $RAW_TRAINING_DATA_DIR to $header_headers_dir"
+    cp -a "$RAW_TRAINING_DATA_DIR/"*.header "$header_headers_dir"
+    echo "renaming files $header_headers_dir"
+    rename 's#\.training\.#\.#' "$header_headers_dir"/*
+
+    echo "copying files from $RAW_TRAINING_DATA_DIR to $header_tei_dir"
+    cp -a "$RAW_TRAINING_DATA_DIR/"*.header.tei.xml "$header_tei_dir"
+    rename 's#\.training\.#\.#' "$header_tei_dir"/*
+}
+
+copy_segmentation_files
+copy_header_files
 
 ls -l --recursive "${DATASET_DIR}"
diff --git a/scripts/upload-dataset.sh b/scripts/upload-dataset.sh
index 268881e..860b307 100755
--- a/scripts/upload-dataset.sh
+++ b/scripts/upload-dataset.sh
@@ -37,6 +37,10 @@ echo "CLOUD_DATATSET_PATH=${CLOUD_DATATSET_PATH}"
 
 
 sub_dirs=(
+    "segmentation/corpus/raw"
+    "segmentation/corpus/tei"
+    "segmentation/corpus/tei-raw"
+    "segmentation/corpus/tei-auto"
     "header/corpus/headers"
     "header/corpus/tei"
     "header/corpus/tei-raw"
-- 
GitLab