# Patch for two scripts under scripts/. The line structure below was restored
# from a copy in which every newline had been collapsed into a space; the
# hunk line counts were re-checked against the @@ headers (22/45 and 6/10).
#
# 1. copy-raw-training-data-to-file-structure.sh
#    The inline, header-only copy steps are replaced by three functions:
#      mkdir_clean              - for each argument: mkdir -p, then rm its
#                                 direct entries (rm failure is ignored).
#      copy_segmentation_files  - copies *.segmentation and
#                                 *.segmentation.tei.xml from
#                                 RAW_TRAINING_DATA_DIR into
#                                 DATASET_DIR/segmentation/corpus/{raw,tei-raw}
#                                 and strips ".training." from the names.
#      copy_header_files        - the previous header logic, now calling
#                                 mkdir_clean.
#    Both copy functions are then invoked before the final recursive ls.
# 2. upload-dataset.sh
#    Four segmentation/corpus/* entries are added at the top of sub_dirs.
#
# NOTE(review): runs of whitespace were collapsed in the copy this was
# rebuilt from, so the indentation width (4 spaces assumed) and the blank
# context lines are reconstructed -- confirm against the target files, or
# apply with `git apply --ignore-whitespace`.
diff --git a/scripts/copy-raw-training-data-to-file-structure.sh b/scripts/copy-raw-training-data-to-file-structure.sh
index 7c2cf088cb3824686855a1a9608a86c2a4d1f5e2..73f110023f5a4dd7fdf2bfc0065d7862e268cada 100755
--- a/scripts/copy-raw-training-data-to-file-structure.sh
+++ b/scripts/copy-raw-training-data-to-file-structure.sh
@@ -20,22 +20,45 @@ fi
 echo "RAW_TRAINING_DATA_DIR=${RAW_TRAINING_DATA_DIR}"
 echo "DATASET_DIR=${DATASET_DIR}"
 
-header_headers_dir="$DATASET_DIR/header/corpus/headers"
-header_tei_dir="$DATASET_DIR/header/corpus/tei-raw"
+mkdir_clean() {
+    for dir in "$@"; do
+        echo "creating or cleaning directory: ${dir}"
+        mkdir -p "${dir}"
+        rm "${dir}"/* || true
+    done
+}
 
-mkdir -p "$header_headers_dir"
-mkdir -p "$header_tei_dir"
+copy_segmentation_files() {
+    segmentation_raw_dir="$DATASET_DIR/segmentation/corpus/raw"
+    segmentation_tei_dir="$DATASET_DIR/segmentation/corpus/tei-raw"
+    mkdir_clean "$segmentation_raw_dir" "${segmentation_tei_dir}"
 
-rm "${header_headers_dir}"/* || true
-rm "${header_tei_dir}"/* || true
+    echo "copying files from $RAW_TRAINING_DATA_DIR to $segmentation_raw_dir"
+    cp -a "$RAW_TRAINING_DATA_DIR/"*.segmentation "$segmentation_raw_dir"
+    echo "renaming files $segmentation_raw_dir"
+    rename 's#\.training\.#\.#' "$segmentation_raw_dir"/*
 
-echo "copying files from $RAW_TRAINING_DATA_DIR to $header_headers_dir"
-cp -a "$RAW_TRAINING_DATA_DIR/"*.header "$header_headers_dir"
-echo "renaming files $header_headers_dir"
-rename 's#\.training\.#\.#' "$header_headers_dir"/*
+    echo "copying files from $RAW_TRAINING_DATA_DIR to $segmentation_tei_dir"
+    cp -a "$RAW_TRAINING_DATA_DIR/"*.segmentation.tei.xml "$segmentation_tei_dir"
+    rename 's#\.training\.#\.#' "$segmentation_tei_dir"/*
+}
 
-echo "copying files from $RAW_TRAINING_DATA_DIR to $header_tei_dir"
-cp -a "$RAW_TRAINING_DATA_DIR/"*.header.tei.xml "$header_tei_dir"
-rename 's#\.training\.#\.#' "$header_tei_dir"/*
+copy_header_files() {
+    header_headers_dir="$DATASET_DIR/header/corpus/headers"
+    header_tei_dir="$DATASET_DIR/header/corpus/tei-raw"
+    mkdir_clean "$header_headers_dir" "${header_tei_dir}"
+
+    echo "copying files from $RAW_TRAINING_DATA_DIR to $header_headers_dir"
+    cp -a "$RAW_TRAINING_DATA_DIR/"*.header "$header_headers_dir"
+    echo "renaming files $header_headers_dir"
+    rename 's#\.training\.#\.#' "$header_headers_dir"/*
+
+    echo "copying files from $RAW_TRAINING_DATA_DIR to $header_tei_dir"
+    cp -a "$RAW_TRAINING_DATA_DIR/"*.header.tei.xml "$header_tei_dir"
+    rename 's#\.training\.#\.#' "$header_tei_dir"/*
+}
+
+copy_segmentation_files
+copy_header_files
 
 ls -l --recursive "${DATASET_DIR}"
diff --git a/scripts/upload-dataset.sh b/scripts/upload-dataset.sh
index 268881e72ff8aa458a76c2a4aff83f44c53acc32..860b3074cc63bf431eda27fe4c1e12fd09aa3534 100755
--- a/scripts/upload-dataset.sh
+++ b/scripts/upload-dataset.sh
@@ -37,6 +37,10 @@ echo "CLOUD_DATATSET_PATH=${CLOUD_DATATSET_PATH}"
 
 
 sub_dirs=(
+    "segmentation/corpus/raw"
+    "segmentation/corpus/tei"
+    "segmentation/corpus/tei-raw"
+    "segmentation/corpus/tei-auto"
     "header/corpus/headers"
     "header/corpus/tei"
     "header/corpus/tei-raw"